Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@
# maximum number of characters allowed in a sample stable id
MAX_SAMPLE_STABLE_ID_LENGTH = 63

# maximum number of characters allowed in a clinical attribute value; matches
# the `clinical_patient`/`clinical_sample` ATTR_VALUE varchar(255) database
# columns, beyond which values would be silently truncated on load
MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH = 255

# regular expression (negated character class) matching any single character
# that is NOT allowed in an ID, i.e. anything other than ASCII letters, digits
# and the punctuation characters . _ ( ) [ ] ' , + - : ;
INVALID_ID_CHARACTERS = r"[^A-Za-z0-9._()\[\]',+\-:;]"

Expand Down Expand Up @@ -2750,6 +2755,18 @@ def checkLine(self, data):
value = data[col_index].strip()
data_type = self.attr_defs[col_index]['datatype']

# values are stored in a varchar(255) column and would be silently
# truncated on load if they are longer than that
if len(value) > MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH:
self.logger.error(
'Value exceeds the maximum length of %d characters and '
'would be truncated when loaded' %
MAX_CLINICAL_ATTRIBUTE_VALUE_LENGTH,
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'column_name': col_name,
'cause': value[:50] + '...'})

# if not blank, check if values match the datatype
if value.strip().lower() in self.NULL_VALUES:
pass
Expand Down
7 changes: 7 additions & 0 deletions tests/test_data/data_clin_value_too_long.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#Patient Identifier Sample Identifier Subtype
#Identifier to uniquely specify a patient. A unique sample identifier. Subtype description.
#STRING STRING STRING
#1 1 1
PATIENT_ID SAMPLE_ID SUBTYPE
TEST-PAT1 TEST-PAT1-SAMPLE1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
TEST-PAT2 TEST-PAT2-SAMPLE1 BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
19 changes: 19 additions & 0 deletions tests/unit_tests_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,25 @@ def test_sample_with_invalid_characters_in_sample_id(self):
self.assertEqual(record.line_number, 11)
self.assertIn('PATIENT_ID and SAMPLE_ID can only contain letters, numbers, points, underscores and/or hyphens', record.getMessage())

def test_clinical_attribute_value_too_long(self):
    """Check that an over-long clinical attribute value is logged as an error.

    Clinical attribute values are stored in a varchar(255) column, so
    anything longer would be silently truncated on load. The fixture file
    holds one value at the 255-character boundary (line 6), which must be
    accepted, and one 256-character value (line 7), which must be reported.
    """
    self.logger.setLevel(logging.ERROR)
    logged_errors = self.validate(
        'data_clin_value_too_long.txt',
        validateData.SampleClinicalValidator)
    # exactly one record is expected: the 256-character value on line 7;
    # the 255-character value on line 6 must not be flagged
    self.assertEqual(1, len(logged_errors))
    (error_record,) = logged_errors
    self.assertEqual(logging.ERROR, error_record.levelno)
    # the error must point at the offending cell: line 7, third column
    self.assertEqual(7, error_record.line_number)
    self.assertEqual(3, error_record.column_number)
    self.assertIn('maximum length', error_record.getMessage())



class PatientAttrFileTestCase(PostClinicalDataFileTestCase):
Expand Down