diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 1b846af7..821126b7 100644 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -3879,10 +3879,40 @@ def checkLine(self, data): 'column_number': col_index + 1, 'cause': value}) + if col_name == 'PRIORITY': + if value.strip() and value.strip().lower() not in self.NULL_VALUES: + try: + int(value.strip()) + except ValueError: + self.logger.error( + 'wrong value of PRIORITY, the value should be an integer', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + + if col_name == 'METADATA': + metadata_value = value.strip() + if metadata_value and metadata_value.lower() not in self.NULL_VALUES: + try: + json.loads(metadata_value) + except json.JSONDecodeError: + self.logger.error( + 'Invalid JSON in METADATA column.', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + except Exception: + self.logger.error( + 'Error processing METADATA column.', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + class SampleResourceValidator(ResourceValidator): """Validator for files defining and setting sample-level attributes.""" REQUIRED_HEADERS = ['SAMPLE_ID', 'PATIENT_ID', 'RESOURCE_ID', 'URL'] + OPTIONAL_HEADERS = ['DISPLAY_NAME', 'TYPE', 'GROUP_PATH', 'METADATA', 'PRIORITY'] def __init__(self, *args, **kwargs): """Initialize a SampleResourceValidator with the given parameters.""" @@ -3946,6 +3976,7 @@ def checkLine(self, data): class PatientResourceValidator(ResourceValidator): REQUIRED_HEADERS = ['PATIENT_ID', 'RESOURCE_ID', 'URL'] + OPTIONAL_HEADERS = ['DISPLAY_NAME', 'TYPE', 'GROUP_PATH', 'METADATA', 'PRIORITY'] def __init__(self, *args, **kwargs): """Initialize a PatientResourceValidator with the given parameters.""" @@ -3998,6 +4029,7 @@ def checkLine(self, data): class StudyResourceValidator(ResourceValidator): REQUIRED_HEADERS = 
['RESOURCE_ID', 'URL'] + OPTIONAL_HEADERS = ['DISPLAY_NAME', 'TYPE', 'GROUP_PATH', 'METADATA', 'PRIORITY'] def __init__(self, *args, **kwargs): """Initialize a StudyResourceValidator with the given parameters.""" diff --git a/tests/test_data/data_resource_patient_invalid_metadata.txt b/tests/test_data/data_resource_patient_invalid_metadata.txt new file mode 100644 index 00000000..30318cbc --- /dev/null +++ b/tests/test_data/data_resource_patient_invalid_metadata.txt @@ -0,0 +1,4 @@ +PATIENT_ID RESOURCE_ID URL DISPLAY_NAME TYPE GROUP_PATH METADATA PRIORITY +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 Valid JSON CT 2023 {"modality":"CT","slices":120} 0 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 Invalid JSON CT 2023 {"modality":"CT","slices":120,} 0 +TCGA-A2-A0CM PATIENT_NOTES http://url-to-patient-notes-patient3 Empty Metadata CT 2023 0 diff --git a/tests/test_data/data_resource_patient_invalid_priority.txt b/tests/test_data/data_resource_patient_invalid_priority.txt new file mode 100644 index 00000000..df5d935c --- /dev/null +++ b/tests/test_data/data_resource_patient_invalid_priority.txt @@ -0,0 +1,4 @@ +PATIENT_ID RESOURCE_ID URL DISPLAY_NAME TYPE GROUP_PATH METADATA PRIORITY +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 Valid Priority CT 2023 0 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 Invalid Priority CT 2023 not_an_integer +TCGA-A2-A0CM PATIENT_NOTES http://url-to-patient-notes-patient3 Empty Priority CT 2023 diff --git a/tests/test_data/data_resource_patient_valid.txt b/tests/test_data/data_resource_patient_valid.txt index ad9b93a8..bae67891 100644 --- a/tests/test_data/data_resource_patient_valid.txt +++ b/tests/test_data/data_resource_patient_valid.txt @@ -1,4 +1,4 @@ -PATIENT_ID RESOURCE_ID URL -TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 -TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 -TCGA-A2-A0CM PATIENT_NOTES 
http://url-to-patient-notes-patient3 \ No newline at end of file +PATIENT_ID RESOURCE_ID URL DISPLAY_NAME TYPE GROUP_PATH METADATA PRIORITY +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 Biopsy Report PATH_REPORT 2023 0 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 CT Instance CT CT 2023-01-15/Series 1: Axial T2 {"modality":"CT","slices":120} 0 +TCGA-A2-A0CM PATIENT_NOTES http://url-to-patient-notes-patient3 0 diff --git a/tests/test_data/data_resource_sample_valid.txt b/tests/test_data/data_resource_sample_valid.txt index 241b0330..d2b08552 100644 --- a/tests/test_data/data_resource_sample_valid.txt +++ b/tests/test_data/data_resource_sample_valid.txt @@ -1,4 +1,4 @@ -PATIENT_ID SAMPLE_ID RESOURCE_ID URL -TCGA-A2-A04P TCGA-A2-A04P-01 PATHOLOGY_SLIDE http://url-to-slide-sample1 -TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 -TCGA-A2-A0CM TCGA-A2-A0CM-01 PATHOLOGY_SLIDE http://url-to-slide-sample3 \ No newline at end of file +PATIENT_ID SAMPLE_ID RESOURCE_ID URL DISPLAY_NAME TYPE GROUP_PATH METADATA PRIORITY +TCGA-A2-A04P TCGA-A2-A04P-01 PATHOLOGY_SLIDE http://url-to-slide-sample1 H&E H_AND_E Block A – Primary Tumor {"stain":"hematoxylin","magnification":"20x"} 0 +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 IHC CD3 IHC Block A – Primary Tumor {"antibody":"CD3","clone":"SP7"} 1 +TCGA-A2-A0CM TCGA-A2-A0CM-01 PATHOLOGY_SLIDE http://url-to-slide-sample3 H&E H_AND_E 0 diff --git a/tests/test_data/data_resource_study_valid.txt b/tests/test_data/data_resource_study_valid.txt index 9502811d..45f4ea99 100644 --- a/tests/test_data/data_resource_study_valid.txt +++ b/tests/test_data/data_resource_study_valid.txt @@ -1,3 +1,3 @@ -RESOURCE_ID URL -STUDY_SPONSORS http://url-to-study-sponsors1 -STUDY_SPONSORS http://url-to-study-sponsors2 \ No newline at end of file +RESOURCE_ID URL DISPLAY_NAME TYPE GROUP_PATH METADATA PRIORITY +STUDY_SPONSORS http://url-to-study-sponsors1 Study Sponsors 
0 +STUDY_SPONSORS http://url-to-study-sponsors2 Study Sponsors 2 1
diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py
index 2ae21efc..498406bc 100755
--- a/tests/unit_tests_validate_data.py
+++ b/tests/unit_tests_validate_data.py
@@ -3027,6 +3027,39 @@ def test_study_resource_has_duplication(self):
         record = record_list.pop()
         self.assertEqual(logging.ERROR, record.levelno)
         self.assertIn('Duplicated resources found', record.getMessage())
+
+    # optional column tests
+    def test_patient_resource_invalid_metadata(self):
+        """A METADATA cell that is not valid JSON is reported as one error."""
+        validateData.RESOURCE_DEFINITION_DICTIONARY = {'PATIENT_NOTES': ['PATIENT']}
+        # Registered before validating so the module-level dictionary is
+        # reset even if an assertion below fails.
+        self.addCleanup(setattr, validateData, 'RESOURCE_DEFINITION_DICTIONARY', {})
+        self.logger.setLevel(logging.ERROR)
+        record_list = self.validate('data_resource_patient_invalid_metadata.txt',
+                                    validateData.PatientResourceValidator)
+
+        self.assertEqual(1, len(record_list))
+        record = record_list.pop()
+        self.assertEqual(logging.ERROR, record.levelno)
+        self.assertIn('Invalid JSON in METADATA column', record.getMessage())
+        self.assertEqual('{"modality":"CT","slices":120,}', record.cause)
+
+    def test_patient_resource_invalid_priority(self):
+        """A non-integer PRIORITY cell is reported as one error."""
+        validateData.RESOURCE_DEFINITION_DICTIONARY = {'PATIENT_NOTES': ['PATIENT']}
+        # Reset the module-level dictionary even if an assertion below fails.
+        self.addCleanup(setattr, validateData, 'RESOURCE_DEFINITION_DICTIONARY', {})
+        self.logger.setLevel(logging.ERROR)
+        record_list = self.validate('data_resource_patient_invalid_priority.txt',
+                                    validateData.PatientResourceValidator)
+
+        self.assertEqual(1, len(record_list))
+        record = record_list.pop()
+        self.assertEqual(logging.ERROR, record.levelno)
+        self.assertIn('wrong value of PRIORITY', record.getMessage())
+        self.assertEqual('not_an_integer', record.cause)
+
     # -------------------------- end resource definition wise test ----------------------------
 
     # --------------------------- generic assay test ------------------------------