From fe7bd36ed15d0316eefc30a8b08077914dc3d002 Mon Sep 17 00:00:00 2001 From: Rujuta Shinde Date: Tue, 24 Feb 2026 09:47:41 -0800 Subject: [PATCH] Add gene panel matrix checks against _sequenced case list --- scripts/importer/validateData.py | 42 ++++++++++++++++++- .../data_gene_matrix_duplicate_sample.txt | 2 +- ...a_gene_matrix_missing_sequenced_sample.txt | 2 + .../data_gene_matrix_sequenced_na.txt | 3 ++ .../study_es_0/data_gene_panel_matrix.txt | 24 ++++++++--- tests/test_data/study_es_0/result_report.html | 2 +- tests/unit_tests_validate_data.py | 41 ++++++++++++++++++ 7 files changed, 107 insertions(+), 9 deletions(-) create mode 100644 tests/test_data/data_gene_matrix_missing_sequenced_sample.txt create mode 100644 tests/test_data/data_gene_matrix_sequenced_na.txt diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 1b846af7..59fd2a63 100644 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -3350,6 +3350,7 @@ def __init__(self, *args, **kwargs): self.mutation_profile_column = None self.gene_panel_sample_ids = {} self.mutation_stable_id_index = None + self.mutation_panel_by_sample = {} def checkHeader(self, data): num_errors = super(GenePanelMatrixValidator, self).checkHeader(data) @@ -3413,9 +3414,11 @@ def checkLine(self, data): # If stable id is mutation and value not NA, check whether sample ID is in sequenced case list if self.mutation_stable_id_index is not None: - sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1] + mutation_panel = data[self.mutation_stable_id_index - 1] + sample_ids_panel_dict[sample_id] = mutation_panel + self.mutation_panel_by_sample[sample_id] = mutation_panel # Sample ID has been removed from list, so subtract 1 position. - if data[self.mutation_stable_id_index - 1] != 'NA': + if mutation_panel != 'NA': if mutation_sample_ids is not None and sample_id not in mutation_sample_ids: self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list', extra={'line_number': self.line_number, @@ -3430,6 +3433,24 @@ def checkLine(self, data): extra={'line_number': self.line_number, 'cause': gene_panel_id}) + def onComplete(self): + # Every mutation-profiled sample in the `_sequenced` case list must map + # to a non-NA mutation panel in the gene panel matrix. + if self.mutation_stable_id_index is not None and mutation_sample_ids is not None: + for sample_id in mutation_sample_ids: + if DEFINED_SAMPLE_IDS is not None and sample_id not in DEFINED_SAMPLE_IDS: + continue + if sample_id not in self.mutation_panel_by_sample: + self.logger.error( + 'Sample ID is in the sequenced case list but missing in the mutation column of the gene panel matrix', + extra={'cause': sample_id}) + elif self.mutation_panel_by_sample[sample_id] == 'NA': + self.logger.error( + 'Sample ID is in the sequenced case list but has NA mutation gene panel in the gene panel matrix', + extra={'cause': sample_id}) + + super(GenePanelMatrixValidator, self).onComplete() + class ProteinLevelValidator(FeaturewiseFileValidator): @@ -4717,6 +4738,9 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str 'No meta files found in ' + directory +'. Please make sure the directory '\ 'is the path to the folder containing the files.') + global study_meta_dictionary + study_meta_dictionary = {} + study_id = None study_cancer_type = None study_data_types = [] @@ -5401,6 +5425,13 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ global PATIENTS_WITH_SAMPLES global RESOURCE_DEFINITION_DICTIONARY global RESOURCE_PATIENTS_WITH_SAMPLES + global mutation_sample_ids + global mutation_file_sample_ids + global sample_ids_panel_dict + + mutation_sample_ids = None + mutation_file_sample_ids = set() + sample_ids_panel_dict = {} if portal_instance.cancer_type_dict is None: logger.warning('Skipping validations relating to cancer types ' @@ -5606,6 +5637,13 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks): + global mutation_sample_ids + global mutation_file_sample_ids + global sample_ids_panel_dict + mutation_sample_ids = None + mutation_file_sample_ids = set() + sample_ids_panel_dict = {} + # walk over the meta files in the dir and get properties of the study validators_by_meta_type, *_ = process_metadata_files(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) for meta_file_type, validators in validators_by_meta_type.items(): diff --git a/tests/test_data/data_gene_matrix_duplicate_sample.txt b/tests/test_data/data_gene_matrix_duplicate_sample.txt index 20b2a27d..ab0e27fd 100644 --- a/tests/test_data/data_gene_matrix_duplicate_sample.txt +++ b/tests/test_data/data_gene_matrix_duplicate_sample.txt @@ -1,4 +1,4 @@ SAMPLE_ID mutations gistic TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1 -TCGA-A1-A0SD-01 NA TESTPANEL1 +TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1 TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1 diff --git a/tests/test_data/data_gene_matrix_missing_sequenced_sample.txt b/tests/test_data/data_gene_matrix_missing_sequenced_sample.txt new file mode 100644 index 00000000..35de2176 --- /dev/null +++ b/tests/test_data/data_gene_matrix_missing_sequenced_sample.txt @@ -0,0 +1,2 @@ +SAMPLE_ID mutations gistic +TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1 diff --git a/tests/test_data/data_gene_matrix_sequenced_na.txt b/tests/test_data/data_gene_matrix_sequenced_na.txt new file mode 100644 index 00000000..17546691 --- /dev/null +++ b/tests/test_data/data_gene_matrix_sequenced_na.txt @@ -0,0 +1,3 @@ +SAMPLE_ID mutations gistic +TCGA-A1-A0SB-01 NA TESTPANEL1 +TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1 diff --git a/tests/test_data/study_es_0/data_gene_panel_matrix.txt b/tests/test_data/study_es_0/data_gene_panel_matrix.txt index fda09dee..c2178132 100644 --- a/tests/test_data/study_es_0/data_gene_panel_matrix.txt +++ b/tests/test_data/study_es_0/data_gene_panel_matrix.txt @@ -1,21 +1,35 @@ SAMPLE_ID mutations gistic TCGA-A1-A0SB-01 TESTPANEL1 TESTPANEL1 TCGA-A1-A0SB-02 TESTPANEL2 NA +TCGA-A1-A0SK-01 TESTPANEL1 NA +TCGA-A2-A04P-01 TESTPANEL1 NA +TCGA-A2-A0CM-01 TESTPANEL1 NA +TCGA-AR-A1AR-01 TESTPANEL1 NA +TCGA-B6-A0I6-01 TESTPANEL1 NA +TCGA-B6-A0WX-01 TESTPANEL1 NA +TCGA-BH-A0E0-01 TESTPANEL1 NA +TCGA-BH-A0HL-01 TESTPANEL1 NA +TCGA-BH-A18K-01 TESTPANEL1 NA +TCGA-BH-A18V-01 TESTPANEL1 NA +TCGA-BH-A1F0-01 TESTPANEL1 NA TEST-A2B8-01 TESTPANEL1 TESTPANEL1 TEST-A2FF-01 TESTPANEL1 TESTPANEL1 TCGA-GI-A2C8-01 TESTPANEL1 TESTPANEL1 -TEST_SAMPLE_1 NA NA +TEST_SAMPLE_1 TESTPANEL1 NA TEST_SAMPLE_2 TESTPANEL1 NA -TEST_SAMPLE_3 NA TESTPANEL1 +TEST_SAMPLE_3 TESTPANEL1 TESTPANEL1 TEST_SAMPLE_4 TESTPANEL1 TESTPANEL1 -TEST_SAMPLE_5 NA NA +TEST_SAMPLE_5 TESTPANEL1 NA TEST_SAMPLE_6 TESTPANEL1 NA TEST_SAMPLE_7 NA NA TEST_SAMPLE_8 NA TESTPANEL1 TEST_SAMPLE_9 NA NA -TEST_SAMPLE_10 NA NA +TEST_SAMPLE_10 TESTPANEL1 NA TEST_SAMPLE_11 TESTPANEL1 NA -TEST_SAMPLE_12 NA NA +TEST_SAMPLE_12 TESTPANEL1 NA TEST_SAMPLE_13 TESTPANEL1 TESTPANEL1 TEST_SAMPLE_14 TESTPANEL2 TESTPANEL1 TEST_SAMPLE_15 TESTPANEL2 TESTPANEL1 +TEST_SAMPLE_SOMATIC_HOMOZYGOUS TESTPANEL1 NA +TEST_SAMPLE_SOMATIC_HETEROZYGOUS TESTPANEL1 NA +TEST_SAMPLE_SOMATIC_UNDEFINED TESTPANEL1 NA diff --git a/tests/test_data/study_es_0/result_report.html b/tests/test_data/study_es_0/result_report.html index c983adc6..7d143412 100644 --- a/tests/test_data/study_es_0/result_report.html +++ b/tests/test_data/study_es_0/result_report.html @@ -1002,7 +1002,7 @@

data_gene_panel_matrix.txt

Info – – - Read 21 lines. Lines with warning: 0. Lines with error: 0 + Read 35 lines. Lines with warning: 0. Lines with error: 0 – diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index 2ae21efc..f4c5de38 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -2286,6 +2286,29 @@ class GenePanelMatrixValidationTestCase(PostClinicalDataFileTestCase): """Test for validations in Gene Panel Matrix.""" + def setUp(self): + super(GenePanelMatrixValidationTestCase, self).setUp() + self.original_study_meta_dictionary = validateData.study_meta_dictionary + self.original_sample_ids_panel_dict = validateData.sample_ids_panel_dict + validateData.study_meta_dictionary = { + 'mutations': { + 'genetic_alteration_type': 'MUTATION_EXTENDED', + 'datatype': 'MAF', + 'stable_id': 'mutations' + }, + 'gistic': { + 'genetic_alteration_type': 'COPY_NUMBER_ALTERATION', + 'datatype': 'DISCRETE', + 'stable_id': 'gistic' + } + } + validateData.sample_ids_panel_dict = {} + + def tearDown(self): + validateData.study_meta_dictionary = self.original_study_meta_dictionary + validateData.sample_ids_panel_dict = self.original_sample_ids_panel_dict + super(GenePanelMatrixValidationTestCase, self).tearDown() + def test_duplicate_sample(self): """Test if duplicate samples are detected""" # set level according to this test case: @@ -2296,6 +2319,24 @@ def test_duplicate_sample(self): self.assertEqual(1, len(record_list)) self.assertIn("duplicated sample id.", record_list[0].getMessage().lower()) + def test_sequenced_sample_cannot_have_na_mutation_panel(self): + self.logger.setLevel(logging.ERROR) + record_list = self.validate('data_gene_matrix_sequenced_na.txt', + validateData.GenePanelMatrixValidator) + + self.assertEqual(1, len(record_list)) + self.assertIn("has na mutation gene panel", record_list[0].getMessage().lower()) + self.assertEqual("TCGA-A1-A0SB-01", record_list[0].cause) + + def test_sequenced_sample_must_exist_in_mutation_column(self): + self.logger.setLevel(logging.ERROR) + record_list = self.validate('data_gene_matrix_missing_sequenced_sample.txt', + validateData.GenePanelMatrixValidator) + + self.assertEqual(1, len(record_list)) + self.assertIn("missing in the mutation column", record_list[0].getMessage().lower()) + self.assertEqual("TCGA-A1-A0SB-01", record_list[0].cause) + class StudyCompositionTestCase(LogBufferTestCase): """Tests for validations of the number of files of certain types."""