diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py old mode 100644 new mode 100755 index 1b846af7..6c443814 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -3492,6 +3492,10 @@ class TimelineValidator(Validator): REQUIRE_COLUMN_ORDER = True ALLOW_BLANKS = True + def __init__(self, *args, **kwargs): + super(TimelineValidator, self).__init__(*args, **kwargs) + self.timeline_entries = {} + def checkLine(self, data): super(TimelineValidator, self).checkLine(data) # TODO check the values @@ -3522,6 +3526,17 @@ def checkLine(self, data): extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) + # validate the uniqueness of timeline records + timeline_entry = ", ".join(data) + if timeline_entry in self.timeline_entries: + self.logger.error( + 'Duplicate entry in timeline data', + extra = {'line_number': self.line_number, + 'cause': '%s (already defined on line %d)' % ( + timeline_entry, + self.timeline_entries[timeline_entry])}) + else: + self.timeline_entries[timeline_entry] = self.line_number class CancerTypeValidator(Validator): diff --git a/tests/test_data/data_timeline_duplicated.txt b/tests/test_data/data_timeline_duplicated.txt new file mode 100644 index 00000000..7f26d475 --- /dev/null +++ b/tests/test_data/data_timeline_duplicated.txt @@ -0,0 +1,5 @@ +PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE +TCGA-BH-A18K 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-BH-A18K 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-BH-A18K 10 20 STATUS test_source_4 +TCGA-BH-NEW 100 200 STATUS test_source_1 diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index 2ae21efc..a1f7a781 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -498,6 +498,19 @@ def test_start_date_validation_TimelineValidator(self): self.assertEqual("ERROR", error.levelname) self.assertIn("Invalid START_DATE", error.getMessage()) + def test_duplicated_timeline_lines(self): + """Raise an error if timeline lines are duplicated. + """ + # set level according to this test case: + self.logger.setLevel(logging.ERROR) + record_list = self.validate('data_timeline_duplicated.txt', + validateData.TimelineValidator) + self.assertEqual(1, len(record_list)) + for error in record_list: + self.assertEqual("ERROR", error.levelname) + self.assertIn("Duplicate entry in timeline data", error.getMessage()) + + # TODO: make tests in this testcase check the number of properly defined types class CancerTypeFileValidationTestCase(DataFileTestCase):