Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -3350,6 +3350,7 @@ def __init__(self, *args, **kwargs):
self.mutation_profile_column = None
self.gene_panel_sample_ids = {}
self.mutation_stable_id_index = None
self.mutation_panel_by_sample = {}

def checkHeader(self, data):
num_errors = super(GenePanelMatrixValidator, self).checkHeader(data)
Expand Down Expand Up @@ -3413,9 +3414,11 @@ def checkLine(self, data):

# If stable id is mutation and value not NA, check whether sample ID is in sequenced case list
if self.mutation_stable_id_index is not None:
sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1]
mutation_panel = data[self.mutation_stable_id_index - 1]
sample_ids_panel_dict[sample_id] = mutation_panel
self.mutation_panel_by_sample[sample_id] = mutation_panel
# Sample ID has been removed from list, so subtract 1 position.
if data[self.mutation_stable_id_index - 1] != 'NA':
if mutation_panel != 'NA':
if mutation_sample_ids is not None and sample_id not in mutation_sample_ids:
self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list',
extra={'line_number': self.line_number,
Expand All @@ -3430,6 +3433,24 @@ def checkLine(self, data):
extra={'line_number': self.line_number,
'cause': gene_panel_id})

def onComplete(self):
    """Cross-check the sequenced case list against the matrix's mutation column.

    The check only runs when a mutation column index was recorded and the
    module-level ``mutation_sample_ids`` is set. Each sample ID in
    ``mutation_sample_ids`` is then expected to be present in
    ``self.mutation_panel_by_sample`` with a value other than 'NA'; one
    error is logged per sample that is missing or mapped to 'NA'. Sample
    IDs that are not in ``DEFINED_SAMPLE_IDS`` (when that is set) are
    skipped. The parent class's ``onComplete`` is always called last.
    """
    has_mutation_column = self.mutation_stable_id_index is not None
    if has_mutation_column and mutation_sample_ids is not None:
        panel_lookup = self.mutation_panel_by_sample
        for sequenced_id in mutation_sample_ids:
            # Samples outside the defined sample set are not checked here.
            if DEFINED_SAMPLE_IDS is not None and sequenced_id not in DEFINED_SAMPLE_IDS:
                continue
            if sequenced_id not in panel_lookup:
                problem = 'Sample ID is in the sequenced case list but missing in the mutation column of the gene panel matrix'
            elif panel_lookup[sequenced_id] == 'NA':
                problem = 'Sample ID is in the sequenced case list but has NA mutation gene panel in the gene panel matrix'
            else:
                # Sample has a real mutation panel: nothing to report.
                continue
            self.logger.error(problem, extra={'cause': sequenced_id})

    super(GenePanelMatrixValidator, self).onComplete()


class ProteinLevelValidator(FeaturewiseFileValidator):

Expand Down Expand Up @@ -4717,6 +4738,9 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str
'No meta files found in ' + directory +'. Please make sure the directory '\
'is the path to the folder containing the files.')

global study_meta_dictionary
study_meta_dictionary = {}

study_id = None
study_cancer_type = None
study_data_types = []
Expand Down Expand Up @@ -5401,6 +5425,13 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_
global PATIENTS_WITH_SAMPLES
global RESOURCE_DEFINITION_DICTIONARY
global RESOURCE_PATIENTS_WITH_SAMPLES
global mutation_sample_ids
global mutation_file_sample_ids
global sample_ids_panel_dict

mutation_sample_ids = None
mutation_file_sample_ids = set()
sample_ids_panel_dict = {}

if portal_instance.cancer_type_dict is None:
logger.warning('Skipping validations relating to cancer types '
Expand Down Expand Up @@ -5606,6 +5637,13 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_


def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks):
global mutation_sample_ids
global mutation_file_sample_ids
global sample_ids_panel_dict
mutation_sample_ids = None
mutation_file_sample_ids = set()
sample_ids_panel_dict = {}

# walk over the meta files in the dir and get properties of the study
validators_by_meta_type, *_ = process_metadata_files(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks)
for meta_file_type, validators in validators_by_meta_type.items():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/data_gene_matrix_duplicate_sample.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
SAMPLE_ID mutations gistic
TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1
TCGA-A1-A0SD-01 NA TESTPANEL1
TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1
TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1
2 changes: 2 additions & 0 deletions tests/test_data/data_gene_matrix_missing_sequenced_sample.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SAMPLE_ID mutations gistic
TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1
3 changes: 3 additions & 0 deletions tests/test_data/data_gene_matrix_sequenced_na.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SAMPLE_ID mutations gistic
TCGA-A1-A0SB-01 NA TESTPANEL1
TCGA-A1-A0SD-01 TESTPANEL1 TESTPANEL1
24 changes: 19 additions & 5 deletions tests/test_data/study_es_0/data_gene_panel_matrix.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
SAMPLE_ID mutations gistic
TCGA-A1-A0SB-01 TESTPANEL1 TESTPANEL1
TCGA-A1-A0SB-02 TESTPANEL2 NA
TCGA-A1-A0SK-01 TESTPANEL1 NA
TCGA-A2-A04P-01 TESTPANEL1 NA
TCGA-A2-A0CM-01 TESTPANEL1 NA
TCGA-AR-A1AR-01 TESTPANEL1 NA
TCGA-B6-A0I6-01 TESTPANEL1 NA
TCGA-B6-A0WX-01 TESTPANEL1 NA
TCGA-BH-A0E0-01 TESTPANEL1 NA
TCGA-BH-A0HL-01 TESTPANEL1 NA
TCGA-BH-A18K-01 TESTPANEL1 NA
TCGA-BH-A18V-01 TESTPANEL1 NA
TCGA-BH-A1F0-01 TESTPANEL1 NA
TEST-A2B8-01 TESTPANEL1 TESTPANEL1
TEST-A2FF-01 TESTPANEL1 TESTPANEL1
TCGA-GI-A2C8-01 TESTPANEL1 TESTPANEL1
TEST_SAMPLE_1 NA NA
TEST_SAMPLE_1 TESTPANEL1 NA
TEST_SAMPLE_2 TESTPANEL1 NA
TEST_SAMPLE_3 NA TESTPANEL1
TEST_SAMPLE_3 TESTPANEL1 TESTPANEL1
TEST_SAMPLE_4 TESTPANEL1 TESTPANEL1
TEST_SAMPLE_5 NA NA
TEST_SAMPLE_5 TESTPANEL1 NA
TEST_SAMPLE_6 TESTPANEL1 NA
TEST_SAMPLE_7 NA NA
TEST_SAMPLE_8 NA TESTPANEL1
TEST_SAMPLE_9 NA NA
TEST_SAMPLE_10 NA NA
TEST_SAMPLE_10 TESTPANEL1 NA
TEST_SAMPLE_11 TESTPANEL1 NA
TEST_SAMPLE_12 NA NA
TEST_SAMPLE_12 TESTPANEL1 NA
TEST_SAMPLE_13 TESTPANEL1 TESTPANEL1
TEST_SAMPLE_14 TESTPANEL2 TESTPANEL1
TEST_SAMPLE_15 TESTPANEL2 TESTPANEL1
TEST_SAMPLE_SOMATIC_HOMOZYGOUS TESTPANEL1 NA
TEST_SAMPLE_SOMATIC_HETEROZYGOUS TESTPANEL1 NA
TEST_SAMPLE_SOMATIC_UNDEFINED TESTPANEL1 NA
2 changes: 1 addition & 1 deletion tests/test_data/study_es_0/result_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -1002,7 +1002,7 @@ <h4 class="panel-title">data_gene_panel_matrix.txt</h4>
<td><span class="glyphicon glyphicon-ok" aria-hidden="true"></span><span class="sr-only">Info</span></td>
<td>&ndash;</td>
<td>&ndash;</td>
<td>Read 21 lines. Lines with warning: 0. Lines with error: 0</td>
<td>Read 35 lines. Lines with warning: 0. Lines with error: 0</td>
<td>&ndash;</td>
<td></td>
</tr>
Expand Down
41 changes: 41 additions & 0 deletions tests/unit_tests_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2286,6 +2286,29 @@ class GenePanelMatrixValidationTestCase(PostClinicalDataFileTestCase):

"""Test for validations in Gene Panel Matrix."""

def setUp(self):
    """Save validateData's module-level state and install a test fixture.

    Replaces ``validateData.study_meta_dictionary`` with a dictionary
    holding a 'mutations' and a 'gistic' entry, and resets
    ``validateData.sample_ids_panel_dict`` to an empty dict. The previous
    values are kept on the test instance.
    """
    super(GenePanelMatrixValidationTestCase, self).setUp()
    # Keep the current module globals on the instance before overwriting them.
    self.original_study_meta_dictionary = validateData.study_meta_dictionary
    self.original_sample_ids_panel_dict = validateData.sample_ids_panel_dict

    mutations_meta = {
        'genetic_alteration_type': 'MUTATION_EXTENDED',
        'datatype': 'MAF',
        'stable_id': 'mutations'
    }
    gistic_meta = {
        'genetic_alteration_type': 'COPY_NUMBER_ALTERATION',
        'datatype': 'DISCRETE',
        'stable_id': 'gistic'
    }
    validateData.study_meta_dictionary = {
        'mutations': mutations_meta,
        'gistic': gistic_meta
    }
    validateData.sample_ids_panel_dict = {}

def tearDown(self):
    """Put back the validateData module globals saved on this instance."""
    validateData.sample_ids_panel_dict = self.original_sample_ids_panel_dict
    validateData.study_meta_dictionary = self.original_study_meta_dictionary
    super(GenePanelMatrixValidationTestCase, self).tearDown()

def test_duplicate_sample(self):
"""Test if duplicate samples are detected"""
# set level according to this test case:
Expand All @@ -2296,6 +2319,24 @@ def test_duplicate_sample(self):
self.assertEqual(1, len(record_list))
self.assertIn("duplicated sample id.", record_list[0].getMessage().lower())

def test_sequenced_sample_cannot_have_na_mutation_panel(self):
    """Expect one error about an NA mutation gene panel for TCGA-A1-A0SB-01."""
    # Only ERROR-level records should be collected for this test.
    self.logger.setLevel(logging.ERROR)
    errors = self.validate(
        'data_gene_matrix_sequenced_na.txt',
        validateData.GenePanelMatrixValidator)

    self.assertEqual(1, len(errors))
    only_error = errors[0]
    self.assertIn("has na mutation gene panel", only_error.getMessage().lower())
    self.assertEqual("TCGA-A1-A0SB-01", only_error.cause)

def test_sequenced_sample_must_exist_in_mutation_column(self):
    """Expect one error about TCGA-A1-A0SB-01 missing in the mutation column."""
    # Only ERROR-level records should be collected for this test.
    self.logger.setLevel(logging.ERROR)
    errors = self.validate(
        'data_gene_matrix_missing_sequenced_sample.txt',
        validateData.GenePanelMatrixValidator)

    self.assertEqual(1, len(errors))
    only_error = errors[0]
    self.assertIn("missing in the mutation column", only_error.getMessage().lower())
    self.assertEqual("TCGA-A1-A0SB-01", only_error.cause)

class StudyCompositionTestCase(LogBufferTestCase):

"""Tests for validations of the number of files of certain types."""
Expand Down