class ValidationSummaryHandler(logging.Handler):

    """Handler that tallies validation messages for a grouped summary.

    Every record handled is counted under its formatted message text in
    one of two buckets: errors (level ERROR or higher, so CRITICAL
    records are included) or warnings (level WARNING up to, but not
    including, ERROR). Records below WARNING are ignored.
    get_summary() renders both buckets as text, with the most frequent
    message first in each.
    """

    def __init__(self):
        """Initialize the handler with empty per-message tallies."""
        super(ValidationSummaryHandler, self).__init__()
        # both dictionaries map formatted message text -> occurrence count
        self.error_counts = {}
        self.warning_counts = {}

    def emit(self, record):
        """Count one log record under the bucket matching its severity.

        The level is compared by threshold rather than by equality, so
        that records logged above ERROR are not dropped from the summary.
        A record whose message cannot be formatted is passed to
        handleError() and left uncounted, instead of raising into the
        code that issued the logging call.
        """
        if record.levelno >= logging.ERROR:
            counts = self.error_counts
        elif record.levelno >= logging.WARNING:
            counts = self.warning_counts
        else:
            # below WARNING: not part of the summary, skip formatting
            return
        try:
            msg = record.getMessage()
        except Exception:
            # same convention as the standard library handlers: report
            # the failure through the logging machinery and carry on
            self.handleError(record)
            return
        counts[msg] = counts.get(msg, 0) + 1

    @staticmethod
    def _format_section(title, counts):
        """Return the summary lines for one severity bucket.

        The result is a header with the bucket total, one line per
        distinct message ordered by descending count, and a trailing
        empty line. An empty list is returned when the total is zero.
        """
        total = sum(counts.values())
        if total == 0:
            return []
        # sorted() is stable, also with reverse=True, so messages with
        # equal counts stay in the order in which they were first seen
        ranked = sorted(counts.items(),
                        key=lambda item: item[1], reverse=True)
        section = ['%s (%d):' % (title, total)]
        section.extend(' [%d] %s' % (count, msg) for msg, count in ranked)
        section.append('')
        return section

    def get_summary(self):
        """Return the collected tallies as a formatted, multi-line string.

        The text starts with a ruled 'VALIDATION SUMMARY' banner. If
        nothing was collected, a single 'No errors or warnings.' line
        follows; otherwise the ERRORS section, then the WARNINGS section
        (each only if non-empty), then a closing rule.
        """
        rule = '-' * 60
        lines = [rule, 'VALIDATION SUMMARY', rule]
        sections = (
            self._format_section('ERRORS', self.error_counts) +
            self._format_section('WARNINGS', self.warning_counts))
        if not sections:
            lines.extend([' No errors or warnings.', ''])
        else:
            lines.extend(sections)
            lines.append(rule)
        return '\n'.join(lines)
+ # flush collapsing handlers so any messages they still hold are written out + # before the summary (summary_handler itself is fed directly by the logger) + collapsing_text_handler.flush() + if collapsing_html_handler is not None: + collapsing_html_handler.flush() + + # print grouped validation summary to stderr + print(summary_handler.get_summary(), file=sys.stderr) + return exit_status_handler.get_exit_status()