diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 913f7069..e945a596 100644 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -91,6 +91,11 @@ # global character limit on sample stable ids MAX_SAMPLE_STABLE_ID_LENGTH = 63 +# global character limit on clinical attribute values; matches the +# `clinical_patient`/`clinical_sample` ATTR_VALUE varchar(255) database columns, +# beyond which values would be silently truncated on load +MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH = 255 + # global variable that defines the invalid ID characters INVALID_ID_CHARACTERS = r"[^A-Za-z0-9._()\[\]',+\-:;]" @@ -2750,6 +2755,18 @@ def checkLine(self, data): value = data[col_index].strip() data_type = self.attr_defs[col_index]['datatype'] + # values are stored in a varchar(255) column and would be silently + # truncated on load if they are longer than that + if len(value) > MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH: + self.logger.error( + 'Value exceeds the maximum length of %d characters and ' + 'would be truncated when loaded' % + MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH, + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'column_name': col_name, + 'cause': value[:50] + '...'}) + # if not blank, check if values match the datatype if value.strip().lower() in self.NULL_VALUES: pass diff --git a/tests/test_data/data_clin_value_too_long.txt b/tests/test_data/data_clin_value_too_long.txt new file mode 100644 index 00000000..307aa86b --- /dev/null +++ b/tests/test_data/data_clin_value_too_long.txt @@ -0,0 +1,7 @@ +#Patient Identifier Sample Identifier Subtype +#Identifier to uniquely specify a patient. A unique sample identifier. Subtype description. +#STRING STRING STRING +#1 1 1 +PATIENT_ID SAMPLE_ID SUBTYPE +TEST-PAT1 TEST-PAT1-SAMPLE1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +TEST-PAT2 TEST-PAT2-SAMPLE1 BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index 2ae21efc..947c131b 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -368,6 +368,25 @@ def test_sample_with_invalid_characters_in_sample_id(self): self.assertEqual(record.line_number, 11) self.assertIn('PATIENT_ID and SAMPLE_ID can only contain letters, numbers, points, underscores and/or hyphens', record.getMessage()) + def test_clinical_attribute_value_too_long(self): + """Test that a value exceeding the maximum length raises an error. + + Clinical attribute values are stored in a varchar(255) column, so a + longer value would be silently truncated on load. The boundary value of + exactly 255 characters is accepted, while 256 characters is rejected. + """ + self.logger.setLevel(logging.ERROR) + record_list = self.validate('data_clin_value_too_long.txt', + validateData.SampleClinicalValidator) + # only the 256-character value (line 7) should be flagged; the + # 255-character value on line 6 is allowed + self.assertEqual(1, len(record_list)) + record = record_list.pop() + self.assertEqual(logging.ERROR, record.levelno) + self.assertEqual(7, record.line_number) + self.assertEqual(3, record.column_number) + self.assertIn('maximum length', record.getMessage()) + class PatientAttrFileTestCase(PostClinicalDataFileTestCase):