From e74703c8a20140f6b4abc538fef1c4e0de6b54fc Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 30 Oct 2024 15:43:34 +0100 Subject: [PATCH 01/10] Cover with test data mutation filtering lines in the code --- .../scripts/TestImportExtendedMutationData.java | 14 ++++++++++++++ src/test/resources/data_mutations_filtering.txt | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 src/test/resources/data_mutations_filtering.txt diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportExtendedMutationData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportExtendedMutationData.java index b76c33d8..d7a8c8dd 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportExtendedMutationData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportExtendedMutationData.java @@ -411,4 +411,18 @@ file, testGeneticProfileId, null, null, newHashSet("foo-namespace", "bar-namespa assertNull(mutations.get(0).getAnnotationJson()); } + /** + * Check that filtering works + */ + @Test + public void testImportExtendedMutationFiltering() throws IOException, DaoException { + File file = new File("src/test/resources/data_mutations_filtering.txt"); + ImportExtendedMutationData parser = new ImportExtendedMutationData(file, geneticProfileId, null); + parser.importData(); + MySQLbulkLoader.flushAll(); + + ArrayList mutationList = DaoMutation.getAllMutations(geneticProfileId); + + assertEquals(1, mutationList.size()); + } } diff --git a/src/test/resources/data_mutations_filtering.txt b/src/test/resources/data_mutations_filtering.txt new file mode 100644 index 00000000..0cb0441b --- /dev/null +++ b/src/test/resources/data_mutations_filtering.txt @@ -0,0 +1,16 @@ +#sequenced_samples: TCGA-XX-0800-01 TCGA-XX-0900-01 +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele 
Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU transcript_error_WU MA:variant MA:GE.rank MA:CNA MA:OV.variant.samples MA:OV.gene.samples MA:mapping.issue MA:FImpact MA:FI.score MA:Func.region MA:bindsite.protein MA:bindsite.DNA/RNA MA:bindsite.sm.mol MA:CancerGenes MA:TS MA:OG MA:COSMIC.mutations MA:COSMIC.cancers MA:Uniprot.regions MA:Pfam.domain MA:link.var MA:link.MSA MA:link.PDB ONCOTATOR_VARIANT_CLASSIFICATION ONCOTATOR_PROTEIN_CHANGE ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_GENE_SYMBOL +FAM90A1 55138 genome.wustl.edu 37 12 8374781 8374782 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374782 8374783 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Wildtype Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 EXCLUDE 8374783 
8374784 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374784 8374785 + silent INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374785 8374786 + 3'utr INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374786 8374787 + 5'utr INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374787 8374788 + 5'flank INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374788 8374789 + igr INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS 
FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374789 8374790 + rna INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 55138 genome.wustl.edu 37 12 8374790 8374791 + INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors rna p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 -1 genome.wustl.edu 37 12 8374792 8374793 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +FAM90A1 9999999 genome.wustl.edu 37 12 8374793 8374794 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +Unknown 0 genome.wustl.edu 37 12 8374794 8374795 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors 
In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 +UNEXISTING 0 genome.wustl.edu 37 12 8374795 8374796 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 From 2dd3f7c4939bff9979e7baa3c05cb268f2e6c238 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 30 Oct 2024 16:54:46 +0100 Subject: [PATCH 02/10] Remove filtering records Validation_Status=Wildtype I am a bit puzzled by finding Validation_Status set to Wildtype. Typically, only Mutation_Status is marked as Wildtype, not Validation_Status. It seems redundant since records with Mutation_Status=Wildtype are already filtered out by MutationFilter. If there are messy records that do have Validation_Status=Wildtype we should consider accepting only records that have Validation_Status values among valid ones {Valid, Unknown, Not Valid, Untested, Redacted?} --- .../portal/scripts/ImportExtendedMutationData.java | 10 ---------- src/test/resources/data_mutations_filtering.txt | 1 - 2 files changed, 11 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 4cc8fb20..e131e693 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -222,16 +222,6 @@ public void importData() throws IOException, DaoException { processedSamples.add(sample.getInternalId()); } - String validationStatus = record.getValidationStatus(); - - if (validationStatus == null || - validationStatus.equalsIgnoreCase("Wildtype")) - { - ProgressMonitor.logWarning("Skipping entry with Validation_Status: Wildtype"); - 
entriesSkipped++; - continue; - } - String chr = DaoGeneOptimized.normalizeChr(record.getChr().toUpperCase()); if (chr==null) { ProgressMonitor.logWarning("Skipping entry with chromosome value: " + record.getChr()); diff --git a/src/test/resources/data_mutations_filtering.txt b/src/test/resources/data_mutations_filtering.txt index 0cb0441b..5eb2b281 100644 --- a/src/test/resources/data_mutations_filtering.txt +++ b/src/test/resources/data_mutations_filtering.txt @@ -1,7 +1,6 @@ #sequenced_samples: TCGA-XX-0800-01 TCGA-XX-0900-01 Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU transcript_error_WU MA:variant MA:GE.rank MA:CNA MA:OV.variant.samples MA:OV.gene.samples MA:mapping.issue MA:FImpact MA:FI.score MA:Func.region MA:bindsite.protein MA:bindsite.DNA/RNA MA:bindsite.sm.mol MA:CancerGenes MA:TS MA:OG MA:COSMIC.mutations MA:COSMIC.cancers MA:Uniprot.regions MA:Pfam.domain MA:link.var MA:link.MSA MA:link.PDB ONCOTATOR_VARIANT_CLASSIFICATION ONCOTATOR_PROTEIN_CHANGE ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_GENE_SYMBOL FAM90A1 55138 genome.wustl.edu 37 12 8374781 8374782 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic 
Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 -FAM90A1 55138 genome.wustl.edu 37 12 8374782 8374783 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Wildtype Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 FAM90A1 55138 genome.wustl.edu 37 EXCLUDE 8374783 8374784 + In_Frame_Ins INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors In_Frame_Ins p.344_345insV rs138330958;rs149653769 FAM90A1 FAM90A1 55138 genome.wustl.edu 37 12 8374784 8374785 + silent INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 FAM90A1 55138 genome.wustl.edu 37 12 8374785 8374786 + 3'utr INS - - ACG TCGA-XX-0800-01 TCGA-XX-0800-01 - - - - Unknown Unknown Somatic Capture Illumina GAIIx 12 8266048 8266049 - ACG INS FAM90A1 NM_018088 human genbank 54_36p -1 validated in_frame_ins c.1032_1031 p.345in_frame_insV 0.000:0.001 NULL NULL - no_errors p.344_345insV rs138330958;rs149653769 FAM90A1 From 335d01c9c1d40593c64335b9a2520cf84979c695 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 30 Oct 2024 17:12:36 +0100 Subject: [PATCH 03/10] Move filtering out mutation with invalid chromosome from loader to filter class --- 
.../cbio/portal/dao/DaoGeneOptimized.java | 24 ------------ .../scripts/ImportExtendedMutationData.java | 8 ---- .../cbio/portal/scripts/MutationFilter.java | 37 +++++++++++++++++-- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index d625e724..ea7041ba 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -343,30 +343,6 @@ public List guessGene(String geneId, String chr) { return Collections.emptyList(); } - - private static Map validChrValues = null; - public static String normalizeChr(String strChr) { - if (strChr==null) { - return null; - } - - if (validChrValues==null) { - validChrValues = new HashMap(); - for (int lc = 1; lc<=24; lc++) { - validChrValues.put(Integer.toString(lc),Integer.toString(lc)); - validChrValues.put("CHR" + Integer.toString(lc),Integer.toString(lc)); - } - validChrValues.put("X","23"); - validChrValues.put("CHRX","23"); - validChrValues.put("Y","24"); - validChrValues.put("CHRY","24"); - validChrValues.put("NA","NA"); - validChrValues.put("MT","MT"); // mitochondria - } - - return validChrValues.get(strChr); - } - private static String getChrFromCytoband(String cytoband) { if (cytoband==null) { return null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index e131e693..20449ccb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -222,14 +222,6 @@ public void importData() throws IOException, DaoException { processedSamples.add(sample.getInternalId()); } - String chr = DaoGeneOptimized.normalizeChr(record.getChr().toUpperCase()); - if (chr==null) { - 
ProgressMonitor.logWarning("Skipping entry with chromosome value: " + record.getChr()); - entriesSkipped++; - continue; - } - record.setChr(chr); - if (record.getStartPosition() < 0) record.setStartPosition(0); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index f006f652..eff733c5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -32,7 +32,6 @@ package org.mskcc.cbio.portal.scripts; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -55,12 +54,27 @@ public class MutationFilter { private int unknownAccepts=0; public int decisions=0; private int mutationStatusNoneRejects=0; + private int invalidChromosome=0; private int lohOrWildTypeRejects=0; private int emptyAnnotationRejects=0; private int missenseGermlineRejects=0; private int redactedRejects=0; public Map rejectionMap = new HashMap(); + private static final Map VALID_CHR_VALUES = new HashMap<>(); + static { + for (int lc = 1; lc<=24; lc++) { + VALID_CHR_VALUES.put(Integer.toString(lc),Integer.toString(lc)); + VALID_CHR_VALUES.put("CHR" + Integer.toString(lc),Integer.toString(lc)); + } + VALID_CHR_VALUES.put("X","23"); + VALID_CHR_VALUES.put("CHRX","23"); + VALID_CHR_VALUES.put("Y","24"); + VALID_CHR_VALUES.put("CHRY","24"); + VALID_CHR_VALUES.put("NA","NA"); + VALID_CHR_VALUES.put("MT","MT"); // mitochondria + } + /** * Construct a MutationFilter with no white lists. 
* This filter will @@ -104,7 +118,11 @@ public boolean acceptMutation(ExtendedMutation mutation, Set filteredMut | Translation_Start_Site | +------------------------+ */ - + // Do not accept mutations with invalid chromosome symbol + if (normalizeChr(mutation.getChr()) == null) { + invalidChromosome++; + return false; + } // Do not accept mutations with Mutation_Status of None if (safeStringTest( mutation.getMutationStatus(), "None" )) { mutationStatusNoneRejects++; @@ -160,7 +178,14 @@ public boolean acceptMutation(ExtendedMutation mutation, Set filteredMut return true; } } - + + public static String normalizeChr(String strChr) { + if (strChr == null) { + return null; + } + return VALID_CHR_VALUES.get(strChr.toUpperCase()); + } + /** * Provide number of decisions made by this MutationFilter. * @return the number of decisions made by this MutationFilter @@ -221,7 +246,11 @@ public int getSomaticWhitelistAccepts(){ return this.somaticWhitelistAccepts; } - /** + public int getInvalidChromosome() { + return invalidChromosome; + } + + /** * Provide number of unknown whitelist ACCEPT (return true) decisions made by this MutationFilter. 
* @return the number of unknown ACCEPT (return true) decisions made by this MutationFilter */ From dcd37321d2921b44b43e9ecf81633121333d8fc5 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 30 Oct 2024 17:47:11 +0100 Subject: [PATCH 04/10] Move skipping RNA type of mutations from loader to filter --- .../scripts/ImportExtendedMutationData.java | 49 ++++--------------- .../cbio/portal/scripts/MutationFilter.java | 5 +- .../portal/util/ExtendedMutationUtil.java | 38 +++----------- 3 files changed, 19 insertions(+), 73 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 20449ccb..e398b266 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -228,48 +228,17 @@ public void importData() throws IOException, DaoException { if (record.getEndPosition() < 0) record.setEndPosition(0); - String mutationType, - proteinChange, - aaChange, - codonChange, - refseqMrnaId, - uniprotAccession; - - int proteinPosStart, - proteinPosEnd; - - // determine whether to use canonical or best effect transcript - - // try canonical first - if (ExtendedMutationUtil.isAcceptableMutation(record.getVariantClassification())) - { - mutationType = record.getVariantClassification(); - } - // if not acceptable either, use the default value - else - { - mutationType = ExtendedMutationUtil.getMutationType(record); - } - - // skip RNA mutations - if (mutationType != null && mutationType.equalsIgnoreCase("rna")) - { - ProgressMonitor.logWarning("Skipping entry with mutation type: RNA"); - entriesSkipped++; - continue; - } - - proteinChange = ExtendedMutationUtil.getProteinChange(parts, record); + String proteinChange = ExtendedMutationUtil.getProteinChange(parts, record); //proteinChange = record.getProteinChange(); - aaChange = 
record.getAminoAcidChange(); - codonChange = record.getCodons(); - refseqMrnaId = record.getRefSeq(); + String aaChange = record.getAminoAcidChange(); + String codonChange = record.getCodons(); + String refseqMrnaId = record.getRefSeq(); //always uniprot accession - uniprotAccession = record.getSwissprot(); + String uniprotAccession = record.getSwissprot(); - proteinPosStart = ExtendedMutationUtil.getProteinPosStart( + int proteinPosStart = ExtendedMutationUtil.getProteinPosStart( record.getProteinPosition(), proteinChange); - proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd( + int proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd( record.getProteinPosition(), proteinChange); // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) @@ -313,13 +282,13 @@ public void importData() throws IOException, DaoException { gene = daoGene.getNonAmbiguousGene(geneSymbol, true); } + String mutationType = ExtendedMutationUtil.getMutationType(record); // assume symbol=Unknown and entrez=0 (or missing Entrez column) to imply an // intergenic, irrespective of what the column Variant_Classification says if (geneSymbol.equals("Unknown") && (entrezIdString.equals("0") || mafUtil.getEntrezGeneIdIndex() == -1)) { // give extra warning if mutationType is something different from IGR: - if (mutationType != null && - !mutationType.equalsIgnoreCase("IGR")) { + if (!"IGR".equalsIgnoreCase(mutationType)) { ProgressMonitor.logWarning( "Treating mutation with gene symbol 'Unknown' " + (mafUtil.getEntrezGeneIdIndex() == -1 ? 
"" : "and Entrez gene ID 0") + " as intergenic ('IGR') " + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index eff733c5..3eac1225 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -154,13 +154,14 @@ public boolean acceptMutation(ExtendedMutation mutation, Set filteredMut return true; } } else { - // Do not accept Silent, Intronic, 3'UTR, 5'UTR or IGR Mutations + // Do not accept Silent, Intronic, 3'UTR, 5'UTR, IGR or RNA Mutations if( safeStringTest( mutation.getMutationType(), "Silent" ) || safeStringTest( mutation.getMutationType(), "Intron" ) || safeStringTest( mutation.getMutationType(), "3'UTR" ) || safeStringTest( mutation.getMutationType(), "3'Flank" ) || safeStringTest( mutation.getMutationType(), "5'UTR" ) || - safeStringTest( mutation.getMutationType(), "IGR") ){ + safeStringTest( mutation.getMutationType(), "IGR" ) || + safeStringTest( mutation.getMutationType(), "RNA")){ addRejectedVariant(rejectionMap, mutation.getMutationType()); return false; } diff --git a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java index f4f61b59..6e533c64 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java @@ -172,39 +172,15 @@ public static boolean isValidProteinChange(String proteinChange) { return !invalid; } - public static boolean isAcceptableMutation(String mutationType) { - // check for null or NA values - if (mutationType == null || - mutationType.length() == 0 || - mutationType.equals("NULL") || - mutationType.equals(TabDelimitedFileUtil.NA_STRING)) { - return false; - } - - // check for the type - boolean silent = mutationType.toLowerCase().startsWith("silent"); - boolean loh = 
mutationType.toLowerCase().startsWith("loh"); - boolean wildtype = mutationType.toLowerCase().startsWith("wildtype"); - boolean utr3 = mutationType.toLowerCase().startsWith("3'utr"); - boolean utr5 = mutationType.toLowerCase().startsWith("5'utr"); - boolean flank5 = mutationType.toLowerCase().startsWith("5'flank"); - boolean igr = mutationType.toLowerCase().startsWith("igr"); - boolean rna = mutationType.equalsIgnoreCase("rna"); - - return !(silent || loh || wildtype || utr3 || utr5 || flank5 || igr || rna); - } - public static String getMutationType(MafRecord record) { - String mutationType = record.getMafVariantClassification(); - - if (mutationType == null || - mutationType.length() == 0 || - mutationType.equals("NULL") || - mutationType.equals(TabDelimitedFileUtil.NA_STRING)) { - mutationType = record.getVariantClassification(); - } + return isBlank(record.getVariantClassification()) ? record.getMafVariantClassification() : record.getVariantClassification(); + } - return mutationType; + private static boolean isBlank(String value) { + return value == null || + value.length() == 0 || + value.equals("NULL") || + value.equals(TabDelimitedFileUtil.NA_STRING); } public static Integer getTumorAltCount(MafRecord record) { From e9c8c748d61aea65598e4eea07eb4f221bdcea12 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 31 Oct 2024 20:55:39 +0100 Subject: [PATCH 05/10] Make filter accept maf record instead of extended mutation --- .../scripts/ImportExtendedMutationData.java | 215 +++++++++--------- .../cbio/portal/scripts/MutationFilter.java | 57 ++--- .../portal/util/ExtendedMutationUtil.java | 9 +- .../org/mskcc/cbio/portal/util/TsvUtil.java | 10 + .../scripts/TestMutationFilter.java | 51 +++-- 5 files changed, 177 insertions(+), 165 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index e398b266..45c42d8a 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -197,6 +197,10 @@ public void importData() throws IOException, DaoException { { String[] parts = TsvUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); + if (!myMutationFilter.acceptMutation(record, this.filteredMutations)) { + entriesSkipped++; + continue; + } if (!record.getNcbiBuild().equalsIgnoreCase(genomeBuildName)) { ProgressMonitor.logWarning("Genome Build Name does not match, expecting " + genomeBuildName); @@ -311,118 +315,111 @@ public void importData() throws IOException, DaoException { "and all mutation data associated with it!"); entriesSkipped++; continue; + } + ExtendedMutation mutation = new ExtendedMutation(); + + mutation.setGeneticProfileId(geneticProfileId); + mutation.setSampleId(sample.getInternalId()); + mutation.setGene(gene); + mutation.setSequencingCenter(record.getCenter()); + mutation.setSequencer(record.getSequencer()); + mutation.setProteinChange(proteinChange); + mutation.setAminoAcidChange(aaChange); + mutation.setMutationType(mutationType); + mutation.setChr(record.getChr()); + mutation.setStartPosition(record.getStartPosition()); + mutation.setEndPosition(record.getEndPosition()); + mutation.setValidationStatus(record.getValidationStatus()); + mutation.setMutationStatus(record.getMutationStatus()); + mutation.setNcbiBuild(record.getNcbiBuild()); + mutation.setStrand(record.getStrand()); + mutation.setVariantType(record.getVariantType()); + mutation.setAllele(record.getTumorSeqAllele1(), record.getTumorSeqAllele2(), record.getReferenceAllele()); + // log whether tumor seq allele is empty (failed to resolve tumor seq allele because of invalid data values) + if (mutation.getTumorSeqAllele().isEmpty()) { + ProgressMonitor.logWarning("Tumor allele could not be resolved for sample '" + sample.getStableId() + + "' (chr,start,end,ref,tum1,tum2) = (" + 
record.getChr() + "," + record.getStartPosition() + "," + + record.getEndPosition() + "," + record.getReferenceAllele() + "," + record.getTumorSeqAllele1() + + "," + record.getTumorSeqAllele2() + ")"); + } + mutation.setDbSnpRs(record.getDbSNP_RS()); + mutation.setDbSnpValStatus(record.getDbSnpValStatus()); + mutation.setMatchedNormSampleBarcode(record.getMatchedNormSampleBarcode()); + mutation.setMatchNormSeqAllele1(record.getMatchNormSeqAllele1()); + mutation.setMatchNormSeqAllele2(record.getMatchNormSeqAllele2()); + mutation.setTumorValidationAllele1(record.getTumorValidationAllele1()); + mutation.setTumorValidationAllele2(record.getTumorValidationAllele2()); + mutation.setMatchNormValidationAllele1(record.getMatchNormValidationAllele1()); + mutation.setMatchNormValidationAllele2(record.getMatchNormValidationAllele2()); + mutation.setVerificationStatus(record.getVerificationStatus()); + mutation.setSequencingPhase(record.getSequencingPhase()); + mutation.setSequenceSource(record.getSequenceSource()); + mutation.setValidationMethod(record.getValidationMethod()); + mutation.setScore(record.getScore()); + mutation.setBamFile(record.getBamFile()); + mutation.setTumorAltCount(ExtendedMutationUtil.getTumorAltCount(record)); + mutation.setTumorRefCount(ExtendedMutationUtil.getTumorRefCount(record)); + mutation.setNormalAltCount(ExtendedMutationUtil.getNormalAltCount(record)); + mutation.setNormalRefCount(ExtendedMutationUtil.getNormalRefCount(record)); + + // renamed the oncotator column names to mutation + mutation.setCodonChange(codonChange); + mutation.setRefseqMrnaId(refseqMrnaId); + mutation.setUniprotAccession(uniprotAccession); + mutation.setProteinPosStart(proteinPosStart); + mutation.setProteinPosEnd(proteinPosEnd); + + mutation.setDriverFilter(record.getDriverFilter()); + mutation.setDriverFilterAnn(record.getDriverFilterAnn()); + mutation.setDriverTiersFilter(record.getDriverTiersFilter()); + 
mutation.setDriverTiersFilterAnn(record.getDriverTiersFilterAnn()); + + // TODO we don't use this info right now... + mutation.setCanonicalTranscript(true); + + AlleleSpecificCopyNumber ascn = null; + if (namespaces != null && namespaces.contains(ASCN_NAMESPACE)) { + Map ascnData = record.getNamespacesMap().remove(ASCN_NAMESPACE); + // The AlleleSpecificCopyNumber constructor will construct the record from + // the ascnData hashmap and the ascnData will simultaneously be removed from + // the record's namespaces map since it is going into its own table + ascn = new AlleleSpecificCopyNumber(ascnData); + } + mutation.setAnnotationJson( + mafUtil.getNamespaceColumnParser().writeValueAsString(record.getNamespacesMap()) + ); + + sequencedCaseSet.add(sample.getStableId()); + + MutationEvent event = + existingEvents.containsKey(mutation.getEvent()) ? + existingEvents.get(mutation.getEvent()) : + DaoMutation.getMutationEvent(mutation.getEvent()); + if (event!=null) { + mutation.setEvent(event); } else { - ExtendedMutation mutation = new ExtendedMutation(); - - mutation.setGeneticProfileId(geneticProfileId); - mutation.setSampleId(sample.getInternalId()); - mutation.setGene(gene); - mutation.setSequencingCenter(record.getCenter()); - mutation.setSequencer(record.getSequencer()); - mutation.setProteinChange(proteinChange); - mutation.setAminoAcidChange(aaChange); - mutation.setMutationType(mutationType); - mutation.setChr(record.getChr()); - mutation.setStartPosition(record.getStartPosition()); - mutation.setEndPosition(record.getEndPosition()); - mutation.setValidationStatus(record.getValidationStatus()); - mutation.setMutationStatus(record.getMutationStatus()); - mutation.setNcbiBuild(record.getNcbiBuild()); - mutation.setStrand(record.getStrand()); - mutation.setVariantType(record.getVariantType()); - mutation.setAllele(record.getTumorSeqAllele1(), record.getTumorSeqAllele2(), record.getReferenceAllele()); - // log whether tumor seq allele is empty (failed to resolve 
tumor seq allele because of invalid data values) - if (mutation.getTumorSeqAllele().isEmpty()) { - ProgressMonitor.logWarning("Tumor allele could not be resolved for sample '" + sample.getStableId() + - "' (chr,start,end,ref,tum1,tum2) = (" + record.getChr() + "," + record.getStartPosition() + "," + - record.getEndPosition() + "," + record.getReferenceAllele() + "," + record.getTumorSeqAllele1() + - "," + record.getTumorSeqAllele2() + ")"); - } - mutation.setDbSnpRs(record.getDbSNP_RS()); - mutation.setDbSnpValStatus(record.getDbSnpValStatus()); - mutation.setMatchedNormSampleBarcode(record.getMatchedNormSampleBarcode()); - mutation.setMatchNormSeqAllele1(record.getMatchNormSeqAllele1()); - mutation.setMatchNormSeqAllele2(record.getMatchNormSeqAllele2()); - mutation.setTumorValidationAllele1(record.getTumorValidationAllele1()); - mutation.setTumorValidationAllele2(record.getTumorValidationAllele2()); - mutation.setMatchNormValidationAllele1(record.getMatchNormValidationAllele1()); - mutation.setMatchNormValidationAllele2(record.getMatchNormValidationAllele2()); - mutation.setVerificationStatus(record.getVerificationStatus()); - mutation.setSequencingPhase(record.getSequencingPhase()); - mutation.setSequenceSource(record.getSequenceSource()); - mutation.setValidationMethod(record.getValidationMethod()); - mutation.setScore(record.getScore()); - mutation.setBamFile(record.getBamFile()); - mutation.setTumorAltCount(ExtendedMutationUtil.getTumorAltCount(record)); - mutation.setTumorRefCount(ExtendedMutationUtil.getTumorRefCount(record)); - mutation.setNormalAltCount(ExtendedMutationUtil.getNormalAltCount(record)); - mutation.setNormalRefCount(ExtendedMutationUtil.getNormalRefCount(record)); - - // renamed the oncotator column names to mutation - mutation.setCodonChange(codonChange); - mutation.setRefseqMrnaId(refseqMrnaId); - mutation.setUniprotAccession(uniprotAccession); - mutation.setProteinPosStart(proteinPosStart); - mutation.setProteinPosEnd(proteinPosEnd); - - 
mutation.setDriverFilter(record.getDriverFilter()); - mutation.setDriverFilterAnn(record.getDriverFilterAnn()); - mutation.setDriverTiersFilter(record.getDriverTiersFilter()); - mutation.setDriverTiersFilterAnn(record.getDriverTiersFilterAnn()); - - // TODO we don't use this info right now... - mutation.setCanonicalTranscript(true); - - AlleleSpecificCopyNumber ascn = null; - if (namespaces != null && namespaces.contains(ASCN_NAMESPACE)) { - Map ascnData = record.getNamespacesMap().remove(ASCN_NAMESPACE); - // The AlleleSpecificCopyNumber constructor will construct the record from - // the ascnData hashmap and the ascnData will simultaneously be removed from - // the record's namespaces map since it is going into its own table - ascn = new AlleleSpecificCopyNumber(ascnData); - } - mutation.setAnnotationJson( - mafUtil.getNamespaceColumnParser().writeValueAsString(record.getNamespacesMap()) - ); - - sequencedCaseSet.add(sample.getStableId()); - - // Filter out Mutations - if( myMutationFilter.acceptMutation( mutation, this.filteredMutations )) { - MutationEvent event = - existingEvents.containsKey(mutation.getEvent()) ? 
- existingEvents.get(mutation.getEvent()) : - DaoMutation.getMutationEvent(mutation.getEvent()); - if (event!=null) { - mutation.setEvent(event); - } else { - mutation.setMutationEventId(++mutationEventId); - existingEvents.put(mutation.getEvent(), mutation.getEvent()); - newEvents.add(mutation.getEvent()); - } - - ExtendedMutation exist = mutations.get(mutation); - if (exist!=null) { - ExtendedMutation merged = mergeMutationData(exist, mutation); - mutations.put(merged, merged); - } else { - mutations.put(mutation,mutation); - } - // update ascn object with mutation unique key details - if (ascn != null){ - ascn.updateAscnUniqueKeyDetails(mutation); - ascnRecords.add(ascn); - } + mutation.setMutationEventId(++mutationEventId); + existingEvents.put(mutation.getEvent(), mutation.getEvent()); + newEvents.add(mutation.getEvent()); + } - //keep track: - sampleSet.add(sample.getStableId()); - internalSampleIds.add(sample.getInternalId()); - geneSet.add(mutation.getEntrezGeneId()+""); - } - else { - entriesSkipped++; - } + ExtendedMutation exist = mutations.get(mutation); + if (exist!=null) { + ExtendedMutation merged = mergeMutationData(exist, mutation); + mutations.put(merged, merged); + } else { + mutations.put(mutation,mutation); } + // update ascn object with mutation unique key details + if (ascn != null){ + ascn.updateAscnUniqueKeyDetails(mutation); + ascnRecords.add(ascn); + } + + //keep track: + sampleSet.add(sample.getStableId()); + internalSampleIds.add(sample.getInternalId()); + geneSet.add(mutation.getEntrezGeneId()+""); } } DaoSampleProfile.upsertSampleToProfileMapping(internalSampleIds, geneticProfileId, genePanelId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index 3eac1225..aa65f8be 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -37,7 +37,11 @@ import 
java.util.Map; import java.util.Set; +import org.mskcc.cbio.maf.MafRecord; +import org.mskcc.cbio.maf.TabDelimitedFileUtil; import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.util.ExtendedMutationUtil; +import org.mskcc.cbio.portal.util.TsvUtil; /** * Filter mutations as they're imported into the CGDS dbms. @@ -89,15 +93,15 @@ public MutationFilter() throws IllegalArgumentException{ } /** - * Indicate whether the specified mutation should be accepted as input to + * Indicate whether the specified mafRecord should be accepted as input to * the CGDS Database. *

- * @param mutation - * an ExtendedMutation. + * @param mafRecord + * a MAF line/record. *
- * @return true if the mutation should be imported into the dbms + * @return true if the mafRecord should be imported into the dbms */ - public boolean acceptMutation(ExtendedMutation mutation, Set filteredMutations) { + public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations) { this.decisions++; /* @@ -119,58 +123,59 @@ public boolean acceptMutation(ExtendedMutation mutation, Set filteredMut +------------------------+ */ // Do not accept mutations with invalid chromosome symbol - if (normalizeChr(mutation.getChr()) == null) { + if (normalizeChr(mafRecord.getChr()) == null) { invalidChromosome++; return false; } // Do not accept mutations with Mutation_Status of None - if (safeStringTest( mutation.getMutationStatus(), "None" )) { + if (safeStringTest( mafRecord.getMutationStatus(), "None" )) { mutationStatusNoneRejects++; return false; } // Do not accept LOH or Wildtype Mutations - if( safeStringTest( mutation.getMutationStatus(), "LOH" ) || - safeStringTest( mutation.getMutationStatus(), "Wildtype" ) ){ + if( safeStringTest( mafRecord.getMutationStatus(), "LOH" ) || + safeStringTest( mafRecord.getMutationStatus(), "Wildtype" ) ){ lohOrWildTypeRejects++; return false; } // Do not accept Redacted mutations - if (safeStringTest(mutation.getValidationStatus(), "Redacted")) { + if (safeStringTest(mafRecord.getValidationStatus(), "Redacted")) { redactedRejects++; return false; } //Filter by types if specified in the meta file, else filter for the default types + String mutationType = ExtendedMutationUtil.getMutationType(mafRecord); if (filteredMutations != null) { - if (filteredMutations.contains(mutation.getMutationType())) { - addRejectedVariant(rejectionMap, mutation.getMutationType()); + if (filteredMutations.contains(mutationType)) { + addRejectedVariant(rejectionMap, mutationType); return false; } else { - if( safeStringTest( mutation.getMutationType(), "5'Flank" ) ) { - mutation.setProteinChange("Promoter"); + if( safeStringTest( mutationType, 
"5'Flank" ) ) { + mafRecord.setProteinChange("Promoter"); } return true; } } else { // Do not accept Silent, Intronic, 3'UTR, 5'UTR, IGR or RNA Mutations - if( safeStringTest( mutation.getMutationType(), "Silent" ) || - safeStringTest( mutation.getMutationType(), "Intron" ) || - safeStringTest( mutation.getMutationType(), "3'UTR" ) || - safeStringTest( mutation.getMutationType(), "3'Flank" ) || - safeStringTest( mutation.getMutationType(), "5'UTR" ) || - safeStringTest( mutation.getMutationType(), "IGR" ) || - safeStringTest( mutation.getMutationType(), "RNA")){ - addRejectedVariant(rejectionMap, mutation.getMutationType()); + if( safeStringTest( mutationType, "Silent" ) || + safeStringTest( mutationType, "Intron" ) || + safeStringTest( mutationType, "3'UTR" ) || + safeStringTest( mutationType, "3'Flank" ) || + safeStringTest( mutationType, "5'UTR" ) || + safeStringTest( mutationType, "IGR" ) || + safeStringTest( mutationType, "RNA")){ + addRejectedVariant(rejectionMap, mutationType); return false; } - if( safeStringTest( mutation.getMutationType(), "5'Flank" ) ) { - if (whiteListGenesForPromoterMutations.contains(mutation.getEntrezGeneId())){ - mutation.setProteinChange("Promoter"); + if( safeStringTest( mutationType, "5'Flank" ) ) { + if (whiteListGenesForPromoterMutations.contains(mafRecord.getGivenEntrezGeneId())){ + mafRecord.setProteinChange("Promoter"); } else { - addRejectedVariant(rejectionMap, mutation.getMutationType()); + addRejectedVariant(rejectionMap, mutationType); return false; } } diff --git a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java index 6e533c64..614d6236 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java @@ -173,14 +173,7 @@ public static boolean isValidProteinChange(String proteinChange) { } public static String getMutationType(MafRecord record) { - 
return isBlank(record.getVariantClassification()) ? record.getMafVariantClassification() : record.getVariantClassification(); - } - - private static boolean isBlank(String value) { - return value == null || - value.length() == 0 || - value.equals("NULL") || - value.equals(TabDelimitedFileUtil.NA_STRING); + return TsvUtil.isBlank(record.getVariantClassification()) ? record.getMafVariantClassification() : record.getVariantClassification(); } public static Integer getTumorAltCount(MafRecord record) { diff --git a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java index 0c2e61a2..29191d8f 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java @@ -1,5 +1,7 @@ package org.mskcc.cbio.portal.util; +import org.mskcc.cbio.maf.TabDelimitedFileUtil; + /** * Utils to parse and validate TSV lines * @author Ruslan Forostianov @@ -40,4 +42,12 @@ public static void ensureHeaderAndRowMatch(String[] headerParts, String[] rowPar + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); } } + + public static boolean isBlank(String value) { + return value == null || + value.isEmpty() || + value.equals("NULL") || + value.equals(TabDelimitedFileUtil.NA_STRING); + } } + diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java index 2c193723..b66f46ab 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java @@ -35,6 +35,7 @@ import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; +import org.mskcc.cbio.maf.MafRecord; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; @@ 
-77,7 +78,7 @@ public void testNoWhitelists( ){ nowTestAcceptMutation( myMutationFilter, true, - 4L, + "2", "Unknown", // validationStatus, "Unknown", // mutationStatus, "Unknown" // mutationType @@ -87,7 +88,7 @@ public void testNoWhitelists( ){ nowTestAcceptMutation( myMutationFilter, true, - 4L, + "3", "Valid", // validationStatus, "Unknown", // mutationStatus, "Unknown" // mutationType @@ -97,28 +98,28 @@ public void testNoWhitelists( ){ nowTestAcceptMutation( myMutationFilter, true, - 4L, + "4", "Unknown", // validationStatus, "Somatic", // mutationStatus, "Unknown" // mutationType ); // valid && somatic - nowTestAcceptMutation( + nowTestAcceptMutation( myMutationFilter, - true, - 4L, + true, + "5", "Valid", // validationStatus, "Somatic", // mutationStatus, "Unknown" // mutationType - ); + ); // valid && somatic // testing safeStringTest() nowTestAcceptMutation( myMutationFilter, true, - 4L, + "6", "vALid_as_hell", // validationStatus, "SOMatic_for_sure", // mutationStatus, "Unknown" // mutationType @@ -130,22 +131,20 @@ public void testNoWhitelists( ){ private void nowTestAcceptMutation( MutationFilter myMutationFilter, boolean expectedResult, - long entrezGeneId, + String chr, String validationStatus, String mutationStatus, String mutationType ) { - CanonicalGene gene = new CanonicalGene(entrezGeneId, "XXX"); - ExtendedMutation anExtendedMutation = new ExtendedMutation( - gene, // gene, - validationStatus, // validationStatus, - mutationStatus, // mutationStatus, - mutationType // mutationType - ); + MafRecord mafRecord = new MafRecord(); + mafRecord.setChr(chr); + mafRecord.setValidationStatus(validationStatus); + mafRecord.setMutationStatus(mutationStatus); + mafRecord.setVariantClassification(mutationType); if (expectedResult) { - assertTrue(myMutationFilter.acceptMutation(anExtendedMutation, null)); + assertTrue(myMutationFilter.acceptMutation(mafRecord, null)); } else { - assertFalse(myMutationFilter.acceptMutation(anExtendedMutation, null)); + 
assertFalse(myMutationFilter.acceptMutation(mafRecord, null)); } } @@ -155,7 +154,7 @@ private void alwaysRejectTheseMutations(MutationFilter myMutationFilter){ nowTestAcceptMutation( myMutationFilter, false, - 1L, + "1", "Unknown", "Unknown", "Silent" @@ -163,7 +162,7 @@ private void alwaysRejectTheseMutations(MutationFilter myMutationFilter){ nowTestAcceptMutation( myMutationFilter, false, - 1L, + "2", "Unknown", "Unknown", "Intron" @@ -171,7 +170,7 @@ private void alwaysRejectTheseMutations(MutationFilter myMutationFilter){ nowTestAcceptMutation( myMutationFilter, false, - 1L, + "3", "Unknown", "LOH", "Unknown" @@ -179,11 +178,19 @@ private void alwaysRejectTheseMutations(MutationFilter myMutationFilter){ nowTestAcceptMutation( myMutationFilter, false, - 1L, + "4", "Unknown", "Wildtype", "Unknown" ); + nowTestAcceptMutation( + myMutationFilter, + false, + "28", + "Valid", + "Somatic", + "Unknown" + ); } From c22089f4b7648b5b3ba96892cc4edb069c74553b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 31 Oct 2024 22:28:37 +0100 Subject: [PATCH 06/10] Move mutation filtering by gene id and symbol from loader to filter --- .../scripts/ImportExtendedMutationData.java | 68 ++---------- .../cbio/portal/scripts/MutationFilter.java | 105 +++++++----------- .../portal/util/ExtendedMutationUtil.java | 22 ++++ 3 files changed, 70 insertions(+), 125 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 45c42d8a..1a2f56f4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -245,67 +245,15 @@ public void importData() throws IOException, DaoException { int proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd( record.getProteinPosition(), proteinChange); - // Assume we are dealing with Entrez Gene Ids (this is the best 
/ most stable option) - String geneSymbol = record.getHugoGeneSymbol(); - String entrezIdString = record.getGivenEntrezGeneId(); - + String geneSymbol = ExtendedMutationUtil.normalizeGeneSymbol(record.getHugoGeneSymbol()); + Long entrezGeneId = ExtendedMutationUtil.parseEntrezGeneId(record.getGivenEntrezGeneId()); CanonicalGene gene = null; - // try to parse entrez if it is not empty nor 0: - if (!(entrezIdString.isEmpty() || - entrezIdString.equals("0"))) { - Long entrezGeneId; - try { - entrezGeneId = Long.parseLong(entrezIdString); - } catch (NumberFormatException e) { - entrezGeneId = null; - } - //non numeric values or negative values should not be allowed: - if (entrezGeneId == null || entrezGeneId < 0) { - ProgressMonitor.logWarning( - "Ignoring line with invalid Entrez_Id " + - entrezIdString); - entriesSkipped++; - continue; - } else { - gene = daoGene.getGene(entrezGeneId); - if (gene == null) { - //skip if not in DB: - ProgressMonitor.logWarning( - "Entrez gene ID " + entrezGeneId + - " not found. Record will be skipped."); - entriesSkipped++; - continue; - } - } - } - - // If Entrez Gene ID Fails, try Symbol. 
- if (gene == null && - !(geneSymbol.equals("") || - geneSymbol.equals("Unknown"))) { + // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) + if (entrezGeneId != null) { + gene = daoGene.getGene(entrezGeneId); + } else if (geneSymbol != null) { gene = daoGene.getNonAmbiguousGene(geneSymbol, true); } - - String mutationType = ExtendedMutationUtil.getMutationType(record); - // assume symbol=Unknown and entrez=0 (or missing Entrez column) to imply an - // intergenic, irrespective of what the column Variant_Classification says - if (geneSymbol.equals("Unknown") && - (entrezIdString.equals("0") || mafUtil.getEntrezGeneIdIndex() == -1)) { - // give extra warning if mutationType is something different from IGR: - if (!"IRG".equalsIgnoreCase(mutationType)) { - ProgressMonitor.logWarning( - "Treating mutation with gene symbol 'Unknown' " + - (mafUtil.getEntrezGeneIdIndex() == -1 ? "" : "and Entrez gene ID 0") + " as intergenic ('IGR') " + - "instead of '" + mutationType + "'. 
Entry filtered/skipped."); - } - // treat as IGR: - myMutationFilter.decisions++; - myMutationFilter.addRejectedVariant(myMutationFilter.rejectionMap, "IGR"); - // skip entry: - entriesSkipped++; - continue; - } - // skip the record if a gene was expected but not identified if (gene == null) { ProgressMonitor.logWarning( @@ -316,8 +264,8 @@ public void importData() throws IOException, DaoException { entriesSkipped++; continue; } - ExtendedMutation mutation = new ExtendedMutation(); + ExtendedMutation mutation = new ExtendedMutation(); mutation.setGeneticProfileId(geneticProfileId); mutation.setSampleId(sample.getInternalId()); mutation.setGene(gene); @@ -325,7 +273,7 @@ public void importData() throws IOException, DaoException { mutation.setSequencer(record.getSequencer()); mutation.setProteinChange(proteinChange); mutation.setAminoAcidChange(aaChange); - mutation.setMutationType(mutationType); + mutation.setMutationType(ExtendedMutationUtil.getMutationType(record)); mutation.setChr(record.getChr()); mutation.setStartPosition(record.getStartPosition()); mutation.setEndPosition(record.getEndPosition()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index aa65f8be..8f8c9344 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -38,10 +38,7 @@ import java.util.Set; import org.mskcc.cbio.maf.MafRecord; -import org.mskcc.cbio.maf.TabDelimitedFileUtil; -import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.util.ExtendedMutationUtil; -import org.mskcc.cbio.portal.util.TsvUtil; /** * Filter mutations as they're imported into the CGDS dbms. 
@@ -50,19 +47,15 @@ */ public class MutationFilter { - private Set whiteListGenesForPromoterMutations; + private final Set whiteListGenesForPromoterMutations; private int accepts=0; - private int germlineWhitelistAccepts=0; - private int somaticWhitelistAccepts=0; - private int unknownAccepts=0; - public int decisions=0; + public int decisions=0; private int mutationStatusNoneRejects=0; private int invalidChromosome=0; + private int invalidGeneInfo=0; private int lohOrWildTypeRejects=0; - private int emptyAnnotationRejects=0; - private int missenseGermlineRejects=0; - private int redactedRejects=0; + private int redactedOrWildTypeRejects =0; public Map rejectionMap = new HashMap(); private static final Map VALID_CHR_VALUES = new HashMap<>(); @@ -89,7 +82,7 @@ public class MutationFilter { */ public MutationFilter() throws IllegalArgumentException{ whiteListGenesForPromoterMutations = new HashSet(); - whiteListGenesForPromoterMutations.add(Long.valueOf(7015)); // TERT + whiteListGenesForPromoterMutations.add(7015L); // TERT } /** @@ -122,6 +115,22 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations | Translation_Start_Site | +------------------------+ */ + if (ExtendedMutationUtil.isBlankEntrezGeneId(mafRecord.getGivenEntrezGeneId()) + && ExtendedMutationUtil.isBlankHugoGeneSymbol(mafRecord.getHugoGeneSymbol())) { + invalidGeneInfo++; + return false; + } + long entrezGeneId; + try { + entrezGeneId = Long.parseLong(mafRecord.getGivenEntrezGeneId()); + if (entrezGeneId < 0) { + invalidGeneInfo++; + return false; + } + } catch (NumberFormatException e) { + invalidGeneInfo++; + return false; + } // Do not accept mutations with invalid chromosome symbol if (normalizeChr(mafRecord.getChr()) == null) { invalidChromosome++; @@ -140,9 +149,10 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations return false; } - // Do not accept Redacted mutations - if (safeStringTest(mafRecord.getValidationStatus(), "Redacted")) { - 
redactedRejects++; + // Do not accept Redacted or Wildtype mutations + if (safeStringTest(mafRecord.getValidationStatus(), "Redacted") || + safeStringTest( mafRecord.getValidationStatus(), "Wildtype" )) { + redactedOrWildTypeRejects++; return false; } @@ -150,7 +160,7 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations String mutationType = ExtendedMutationUtil.getMutationType(mafRecord); if (filteredMutations != null) { if (filteredMutations.contains(mutationType)) { - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } else { if( safeStringTest( mutationType, "5'Flank" ) ) { @@ -167,15 +177,15 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations safeStringTest( mutationType, "5'UTR" ) || safeStringTest( mutationType, "IGR" ) || safeStringTest( mutationType, "RNA")){ - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } if( safeStringTest( mutationType, "5'Flank" ) ) { - if (whiteListGenesForPromoterMutations.contains(mafRecord.getGivenEntrezGeneId())){ + if (whiteListGenesForPromoterMutations.contains(entrezGeneId)){ mafRecord.setProteinChange("Promoter"); } else { - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } } @@ -220,61 +230,25 @@ public int getLohOrWildTypeRejects() { return this.lohOrWildTypeRejects; } - /** - * Provide number of REJECT decisions for Emtpy Annotation Mutations. - * @return number of REJECT decisions for Empty Annotation Mutations. - */ - public int getEmptyAnnotationRejects() { - return this.emptyAnnotationRejects; - } - - /** - * Provide number of REJECT decisions for Missense Germline Mutations. - * @return number of REJECT decisions for Missense Germline Mutations. 
- */ - public int getMissenseGermlineRejects() { - return this.missenseGermlineRejects; - } - - /** - * Provide number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter - */ - public int getGermlineWhitelistAccepts(){ - return this.germlineWhitelistAccepts; - } - - /** - * Provide number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter - */ - public int getSomaticWhitelistAccepts(){ - return this.somaticWhitelistAccepts; - } - public int getInvalidChromosome() { return invalidChromosome; } - /** - * Provide number of unknown whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of unknown ACCEPT (return true) decisions made by this MutationFilter - */ - public int getUnknownAccepts(){ - return this.unknownAccepts; + public int getInvalidGeneInfo() { + return invalidGeneInfo; } - public int getRedactedRejects() + public int getRedactedOrWildTypeRejects() { - return this.redactedRejects; + return this.redactedOrWildTypeRejects; } public Map getRejectionMap() { return this.rejectionMap; } - public void addRejectedVariant(Map rejectionMap, String mutation) { - this.rejectionMap.computeIfAbsent(mutation, (k) -> 0); + public void addRejectedVariant(String mutation) { + this.rejectionMap.putIfAbsent(mutation, 0); this.rejectionMap.computeIfPresent(mutation, (k, v) -> v + 1); } @@ -290,10 +264,11 @@ public String getStatistics(){ String statistics = "Mutation filter decisions: " + this.getDecisions() + "\nRejects: " + this.getRejects() + "\nMutation Status 'None' Rejects: " + this.getMutationStatusNoneRejects() + - "\nLOH or Wild Type Rejects: " + this.getLohOrWildTypeRejects() + - "\nEmpty Annotation Rejects: " + this.getEmptyAnnotationRejects() + - "\nMissense 
Germline Rejects: " + this.getMissenseGermlineRejects(); - + "\nLOH or Wild Type Mutation Status Rejects: " + this.getLohOrWildTypeRejects() + + "\nRedacted or Wild Type Validation Status Rejects: " + this.getRedactedOrWildTypeRejects() + + "\nInvalid Choromosome Rejects: " + this.getInvalidChromosome() + + "\nInvalid Gene Info Rejects: " + this.getInvalidGeneInfo(); + Map variantsRejected = this.getRejectionMap(); for (Map.Entry variant : variantsRejected.entrySet()) { statistics = statistics + "\n" + variant.getKey() + " Rejects: " + variant.getValue(); diff --git a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java index 614d6236..a7119ac2 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java @@ -394,4 +394,26 @@ private static Map annotateProteinChange(String proteinChange) annotation.put("end", end); return annotation; } + + public static Long parseEntrezGeneId(String givenEntrezGeneId) { + if (isBlankEntrezGeneId(givenEntrezGeneId)) { + return null; + } + return Long.parseLong(givenEntrezGeneId); + } + + public static boolean isBlankEntrezGeneId(String givenEntrezGeneId) { + return givenEntrezGeneId == null || givenEntrezGeneId.trim().isEmpty() || "0".equals(givenEntrezGeneId); + } + + public static String normalizeGeneSymbol(String hugoGeneSymbol) { + if (isBlankHugoGeneSymbol(hugoGeneSymbol)) { + return null; + } + return hugoGeneSymbol.trim(); + } + + public static boolean isBlankHugoGeneSymbol(String hugoGeneSymbol) { + return hugoGeneSymbol == null || hugoGeneSymbol.trim().isEmpty() || "Unknown".equals(hugoGeneSymbol); + } } From a17d0ec0546cdc12580ad3ad0250393f8415e91b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 1 Nov 2024 13:11:12 +0100 Subject: [PATCH 07/10] Fix mutation filter and write more tests --- .../cbio/portal/scripts/MutationFilter.java | 409 
+++++++++--------- .../scripts/TestMutationFilter.java | 201 --------- .../portal/scripts/TestMutationFilter.java | 254 +++++++++++ 3 files changed, 456 insertions(+), 408 deletions(-) delete mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/TestMutationFilter.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index 8f8c9344..fa16604a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -28,7 +28,7 @@ * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . -*/ + */ package org.mskcc.cbio.portal.scripts; @@ -43,59 +43,47 @@ /** * Filter mutations as they're imported into the CGDS dbms. *

+ * * @author Arthur Goldberg goldberg@cbio.mskcc.org */ public class MutationFilter { - - private final Set whiteListGenesForPromoterMutations; - private int accepts=0; - public int decisions=0; - private int mutationStatusNoneRejects=0; - private int invalidChromosome=0; - private int invalidGeneInfo=0; - private int lohOrWildTypeRejects=0; - private int redactedOrWildTypeRejects =0; - public Map rejectionMap = new HashMap(); + private static final long TERT_ENTREZ_GENE_ID = 7015; + private int accepts = 0; + public int decisions = 0; + private int mutationStatusNoneRejects = 0; + private int invalidChromosome = 0; + private int invalidGeneInfo = 0; + private int lohOrWildTypeRejects = 0; + private int redactedOrWildTypeRejects = 0; + public Map rejectionMap = new HashMap(); + + private static final Map VALID_CHR_VALUES = new HashMap<>(); - private static final Map VALID_CHR_VALUES = new HashMap<>(); - static { - for (int lc = 1; lc<=24; lc++) { - VALID_CHR_VALUES.put(Integer.toString(lc),Integer.toString(lc)); - VALID_CHR_VALUES.put("CHR" + Integer.toString(lc),Integer.toString(lc)); - } - VALID_CHR_VALUES.put("X","23"); - VALID_CHR_VALUES.put("CHRX","23"); - VALID_CHR_VALUES.put("Y","24"); - VALID_CHR_VALUES.put("CHRY","24"); - VALID_CHR_VALUES.put("NA","NA"); - VALID_CHR_VALUES.put("MT","MT"); // mitochondria - } + static { + for (int lc = 1; lc <= 24; lc++) { + VALID_CHR_VALUES.put(Integer.toString(lc), Integer.toString(lc)); + VALID_CHR_VALUES.put("CHR" + Integer.toString(lc), Integer.toString(lc)); + } + VALID_CHR_VALUES.put("X", "23"); + VALID_CHR_VALUES.put("CHRX", "23"); + VALID_CHR_VALUES.put("Y", "24"); + VALID_CHR_VALUES.put("CHRY", "24"); + VALID_CHR_VALUES.put("NA", "NA"); + VALID_CHR_VALUES.put("MT", "MT"); // mitochondria + } - /** - * Construct a MutationFilter with no white lists. - * This filter will - *
- * REJECT Silent, LOH, Intron and Wildtype mutations, and - *
- * KEEP all other mutations. - */ - public MutationFilter() throws IllegalArgumentException{ - whiteListGenesForPromoterMutations = new HashSet(); - whiteListGenesForPromoterMutations.add(7015L); // TERT - } - - /** - * Indicate whether the specified mafRecord should be accepted as input to - * the CGDS Database. - *

- * @param mafRecord - * a MAF line/record. - *
- * @return true if the mafRecord should be imported into the dbms - */ - public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations) { - this.decisions++; + /** + * Indicate whether the specified mafRecord should be accepted as input to + * the CGDS Database. + *

+ * + * @param mafRecord a MAF line/record. + *
+ * @return true if the mafRecord should be imported into the dbms + */ + public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations) { + this.decisions++; /* * Mutation types from Firehose: @@ -115,108 +103,113 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations | Translation_Start_Site | +------------------------+ */ - if (ExtendedMutationUtil.isBlankEntrezGeneId(mafRecord.getGivenEntrezGeneId()) - && ExtendedMutationUtil.isBlankHugoGeneSymbol(mafRecord.getHugoGeneSymbol())) { - invalidGeneInfo++; - return false; - } - long entrezGeneId; - try { - entrezGeneId = Long.parseLong(mafRecord.getGivenEntrezGeneId()); - if (entrezGeneId < 0) { - invalidGeneInfo++; - return false; - } - } catch (NumberFormatException e) { - invalidGeneInfo++; - return false; - } - // Do not accept mutations with invalid chromosome symbol - if (normalizeChr(mafRecord.getChr()) == null) { - invalidChromosome++; - return false; - } - // Do not accept mutations with Mutation_Status of None - if (safeStringTest( mafRecord.getMutationStatus(), "None" )) { - mutationStatusNoneRejects++; - return false; - } - - // Do not accept LOH or Wildtype Mutations - if( safeStringTest( mafRecord.getMutationStatus(), "LOH" ) || - safeStringTest( mafRecord.getMutationStatus(), "Wildtype" ) ){ - lohOrWildTypeRejects++; - return false; - } - - // Do not accept Redacted or Wildtype mutations - if (safeStringTest(mafRecord.getValidationStatus(), "Redacted") || - safeStringTest( mafRecord.getValidationStatus(), "Wildtype" )) { - redactedOrWildTypeRejects++; - return false; - } - - //Filter by types if specified in the meta file, else filter for the default types - String mutationType = ExtendedMutationUtil.getMutationType(mafRecord); - if (filteredMutations != null) { - if (filteredMutations.contains(mutationType)) { - addRejectedVariant(mutationType); - return false; - } else { - if( safeStringTest( mutationType, "5'Flank" ) ) { - mafRecord.setProteinChange("Promoter"); - } 
- return true; - } - } else { - // Do not accept Silent, Intronic, 3'UTR, 5'UTR, IGR or RNA Mutations - if( safeStringTest( mutationType, "Silent" ) || - safeStringTest( mutationType, "Intron" ) || - safeStringTest( mutationType, "3'UTR" ) || - safeStringTest( mutationType, "3'Flank" ) || - safeStringTest( mutationType, "5'UTR" ) || - safeStringTest( mutationType, "IGR" ) || - safeStringTest( mutationType, "RNA")){ - addRejectedVariant(mutationType); - return false; - } - - if( safeStringTest( mutationType, "5'Flank" ) ) { - if (whiteListGenesForPromoterMutations.contains(entrezGeneId)){ - mafRecord.setProteinChange("Promoter"); + boolean blankEntrezGeneId = ExtendedMutationUtil.isBlankEntrezGeneId(mafRecord.getGivenEntrezGeneId()); + boolean blankHugoGeneSymbol = ExtendedMutationUtil.isBlankHugoGeneSymbol(mafRecord.getHugoGeneSymbol()); + if (blankEntrezGeneId + && blankHugoGeneSymbol) { + invalidGeneInfo++; + return false; + } + long entrezGeneId = 0; + if (!blankEntrezGeneId) { + try { + entrezGeneId = Long.parseLong(mafRecord.getGivenEntrezGeneId()); + if (entrezGeneId < 0) { + invalidGeneInfo++; + return false; + } + } catch (NumberFormatException e) { + invalidGeneInfo++; + return false; + } + } + // Do not accept mutations with invalid chromosome symbol + if (normalizeChr(mafRecord.getChr()) == null) { + invalidChromosome++; + return false; + } + // Do not accept mutations with Mutation_Status of None + if (safeStringTest(mafRecord.getMutationStatus(), "None")) { + mutationStatusNoneRejects++; + return false; + } + + // Do not accept LOH or Wildtype Mutations + if (safeStringTest(mafRecord.getMutationStatus(), "LOH") || + safeStringTest(mafRecord.getMutationStatus(), "Wildtype")) { + lohOrWildTypeRejects++; + return false; + } + + // Do not accept Redacted or Wildtype mutations + if (safeStringTest(mafRecord.getValidationStatus(), "Redacted") || + safeStringTest(mafRecord.getValidationStatus(), "Wildtype")) { + redactedOrWildTypeRejects++; + return false; + 
} + + //Filter by types if specified in the meta file, else filter for the default types + String mutationType = ExtendedMutationUtil.getMutationType(mafRecord); + if (filteredMutations != null) { + if (filteredMutations.contains(mutationType)) { + addRejectedVariant(mutationType); + return false; + } else { + if (safeStringTest(mutationType, "5'Flank")) { + mafRecord.setProteinChange("Promoter"); + } + } + } else { + // Do not accept Silent, Intronic, 3'UTR, 5'UTR, IGR or RNA Mutations + if (safeStringTest(mutationType, "Silent") || + safeStringTest(mutationType, "Intron") || + safeStringTest(mutationType, "3'UTR") || + safeStringTest(mutationType, "3'Flank") || + safeStringTest(mutationType, "5'UTR") || + safeStringTest(mutationType, "IGR") || + safeStringTest(mutationType, "RNA")) { + addRejectedVariant(mutationType); + return false; + } + + if (safeStringTest(mutationType, "5'Flank")) { + if (entrezGeneId == TERT_ENTREZ_GENE_ID) { + mafRecord.setProteinChange("Promoter"); } else { addRejectedVariant(mutationType); return false; } - } - - this.accepts++; - return true; - } - } + } - public static String normalizeChr(String strChr) { - if (strChr == null) { - return null; - } - return VALID_CHR_VALUES.get(strChr.toUpperCase()); - } + } + this.accepts++; + return true; + } - /** - * Provide number of decisions made by this MutationFilter. - * @return the number of decisions made by this MutationFilter - */ - public int getDecisions(){ - return this.decisions; - } + public static String normalizeChr(String strChr) { + if (strChr == null) { + return null; + } + return VALID_CHR_VALUES.get(strChr.toUpperCase()); + } - /** - * Provide number of ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of ACCEPT (return true) decisions made by this MutationFilter - */ - public int getAccepts(){ - return this.accepts; - } + /** + * Provide number of decisions made by this MutationFilter. 
+ * + * @return the number of decisions made by this MutationFilter + */ + public int getDecisions() { + return this.decisions; + } + + /** + * Provide number of ACCEPT (return true) decisions made by this MutationFilter. + * + * @return the number of ACCEPT (return true) decisions made by this MutationFilter + */ + public int getAccepts() { + return this.accepts; + } public int getMutationStatusNoneRejects() { return mutationStatusNoneRejects; @@ -224,76 +217,78 @@ public int getMutationStatusNoneRejects() { /** * Provide number of REJECT decisions for LOH or Wild Type Mutations. + * * @return number of REJECT decisions for LOH or Wild Type Mutations. */ - public int getLohOrWildTypeRejects() { - return this.lohOrWildTypeRejects; - } + public int getLohOrWildTypeRejects() { + return this.lohOrWildTypeRejects; + } - public int getInvalidChromosome() { - return invalidChromosome; - } + public int getInvalidChromosome() { + return invalidChromosome; + } - public int getInvalidGeneInfo() { - return invalidGeneInfo; - } + public int getInvalidGeneInfo() { + return invalidGeneInfo; + } - public int getRedactedOrWildTypeRejects() - { - return this.redactedOrWildTypeRejects; - } - - public Map getRejectionMap() { - return this.rejectionMap; - } - - public void addRejectedVariant(String mutation) { - this.rejectionMap.putIfAbsent(mutation, 0); - this.rejectionMap.computeIfPresent(mutation, (k, v) -> v + 1); - } + public int getRedactedOrWildTypeRejects() { + return this.redactedOrWildTypeRejects; + } - /** - * Provide number of REJECT (return false) decisions made by this MutationFilter. 
- * @return the number of REJECT (return false) decisions made by this MutationFilter - */ - public int getRejects(){ - return this.decisions - this.accepts; - } - - public String getStatistics(){ - String statistics = "Mutation filter decisions: " + this.getDecisions() + - "\nRejects: " + this.getRejects() + - "\nMutation Status 'None' Rejects: " + this.getMutationStatusNoneRejects() + - "\nLOH or Wild Type Mutation Status Rejects: " + this.getLohOrWildTypeRejects() + - "\nRedacted or Wild Type Validation Status Rejects: " + this.getRedactedOrWildTypeRejects() + - "\nInvalid Choromosome Rejects: " + this.getInvalidChromosome() + - "\nInvalid Gene Info Rejects: " + this.getInvalidGeneInfo(); + public Map getRejectionMap() { + return this.rejectionMap; + } - Map variantsRejected = this.getRejectionMap(); - for (Map.Entry variant : variantsRejected.entrySet()) { - statistics = statistics + "\n" + variant.getKey() + " Rejects: " + variant.getValue(); - } - - return statistics; - } + public void addRejectedVariant(String mutation) { + this.rejectionMap.putIfAbsent(mutation, 0); + this.rejectionMap.computeIfPresent(mutation, (k, v) -> v + 1); + } + + /** + * Provide number of REJECT (return false) decisions made by this MutationFilter. 
+ * + * @return the number of REJECT (return false) decisions made by this MutationFilter + */ + public int getRejects() { + return this.decisions - this.accepts; + } + + public String getStatistics() { + String statistics = "Mutation filter decisions: " + this.getDecisions() + + "\nRejects: " + this.getRejects() + + "\nMutation Status 'None' Rejects: " + this.getMutationStatusNoneRejects() + + "\nLOH or Wild Type Mutation Status Rejects: " + this.getLohOrWildTypeRejects() + + "\nRedacted or Wild Type Validation Status Rejects: " + this.getRedactedOrWildTypeRejects() + + "\nInvalid Choromosome Rejects: " + this.getInvalidChromosome() + + "\nInvalid Gene Info Rejects: " + this.getInvalidGeneInfo(); + + Map variantsRejected = this.getRejectionMap(); + for (Map.Entry variant : variantsRejected.entrySet()) { + statistics = statistics + "\n" + variant.getKey() + " Rejects: " + variant.getValue(); + } + + return statistics; + } - /** - * Carefully look for pattern in data. - *

- * @param data - * @param pattern - * @return false if data is null; true if data starts with pattern, independent of case - */ - private boolean safeStringTest( String data, String pattern ){ - if( null == data){ - return false; - } - return data.toLowerCase().startsWith( pattern.toLowerCase() ); - } - - @Override - public String toString(){ - StringBuffer sb = new StringBuffer(); - return( sb.toString() ); - } + /** + * Carefully look for pattern in data. + *

+ * + * @param data + * @param pattern + * @return false if data is null; true if data starts with pattern, independent of case + */ + private boolean safeStringTest(String data, String pattern) { + if (null == data) { + return false; + } + return data.toLowerCase().startsWith(pattern.toLowerCase()); + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + return (sb.toString()); + } } \ No newline at end of file diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java deleted file mode 100644 index b66f46ab..00000000 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestMutationFilter.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - -/* - * This file is part of cBioPortal. - * - * cBioPortal is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . -*/ - -package org.mskcc.cbio.portal.integrationTest.scripts; - -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mskcc.cbio.maf.MafRecord; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.model.ExtendedMutation; -import org.mskcc.cbio.portal.scripts.MutationFilter; -import org.springframework.test.annotation.Rollback; -import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; -import org.springframework.transaction.annotation.Transactional; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -/** - * JUnit tests for MutationFilter class. 
- */ -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) -@Rollback -@Transactional -public class TestMutationFilter { - - @Before - public void setUp() throws DaoException { - // load genes - loadGene( "FOO", 3L ); - loadGene( "BAR", 234L ); - loadGene( "BIG", 234234L ); - } - - - @Test - public void testNoWhitelists( ){ - MutationFilter myMutationFilter = new MutationFilter( ); - alwaysRejectTheseMutations( myMutationFilter ); - - // accept all of these, because a MutationFilter without whitelists - // accepts all mutations other than Silent, LOH, Intron and Wildtype mutations - // not valid && somatic - nowTestAcceptMutation( - myMutationFilter, - true, - "2", - "Unknown", // validationStatus, - "Unknown", // mutationStatus, - "Unknown" // mutationType - ); - - // valid but not somatic - nowTestAcceptMutation( - myMutationFilter, - true, - "3", - "Valid", // validationStatus, - "Unknown", // mutationStatus, - "Unknown" // mutationType - ); - - // not valid but somatic - nowTestAcceptMutation( - myMutationFilter, - true, - "4", - "Unknown", // validationStatus, - "Somatic", // mutationStatus, - "Unknown" // mutationType - ); - - // valid && somatic - nowTestAcceptMutation( - myMutationFilter, - true, - "5", - "Valid", // validationStatus, - "Somatic", // mutationStatus, - "Unknown" // mutationType - ); - - // valid && somatic - // testing safeStringTest() - nowTestAcceptMutation( - myMutationFilter, - true, - "6", - "vALid_as_hell", // validationStatus, - "SOMatic_for_sure", // mutationStatus, - "Unknown" // mutationType - ); - - } - - - private void nowTestAcceptMutation( - MutationFilter myMutationFilter, - boolean expectedResult, - String chr, - String validationStatus, - String mutationStatus, - String mutationType - ) { - MafRecord mafRecord = new MafRecord(); - mafRecord.setChr(chr); - mafRecord.setValidationStatus(validationStatus); - mafRecord.setMutationStatus(mutationStatus); - 
mafRecord.setVariantClassification(mutationType); - if (expectedResult) { - assertTrue(myMutationFilter.acceptMutation(mafRecord, null)); - } else { - assertFalse(myMutationFilter.acceptMutation(mafRecord, null)); - } - } - - private void alwaysRejectTheseMutations(MutationFilter myMutationFilter){ - - // REJECT: Silent, LOH, Intron and Wildtype mutations - nowTestAcceptMutation( - myMutationFilter, - false, - "1", - "Unknown", - "Unknown", - "Silent" - ); - nowTestAcceptMutation( - myMutationFilter, - false, - "2", - "Unknown", - "Unknown", - "Intron" - ); - nowTestAcceptMutation( - myMutationFilter, - false, - "3", - "Unknown", - "LOH", - "Unknown" - ); - nowTestAcceptMutation( - myMutationFilter, - false, - "4", - "Unknown", - "Wildtype", - "Unknown" - ); - nowTestAcceptMutation( - myMutationFilter, - false, - "28", - "Valid", - "Somatic", - "Unknown" - ); - - } - - private void loadGene( String geneSymbol, long geneID ) throws DaoException { - DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); - daoGene.addGene(new CanonicalGene( geneID, geneSymbol )); - } -} diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/TestMutationFilter.java b/src/test/java/org/mskcc/cbio/portal/scripts/TestMutationFilter.java new file mode 100644 index 00000000..b2192978 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/TestMutationFilter.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. 
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package org.mskcc.cbio.portal.scripts; + +import org.junit.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.NullSource; +import org.junit.jupiter.params.provider.ValueSource; +import org.mskcc.cbio.maf.MafRecord; + +import java.util.Set; +import java.util.stream.Stream; + +import static org.junit.Assert.*; + +/** + * JUnit tests for MutationFilter class. 
+ */ +public class TestMutationFilter { + + MutationFilter testee = new MutationFilter(); + MafRecord mafRecord = new MafRecord(); + { + mafRecord.setChr("17"); + mafRecord.setGivenEntrezGeneId("7157"); + mafRecord.setHugoGeneSymbol("TP53"); + mafRecord.setMutationStatus("Somatic"); + mafRecord.setValidationStatus("Valid"); + mafRecord.setVariantClassification("In_Frame_Ins"); + } + + @Test + public void testInitialCounters() { + assertEquals(0, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(0, testee.getRejects()); + assertEquals(0, testee.getInvalidGeneInfo()); + assertEquals(0, testee.getInvalidChromosome()); + assertEquals(0, testee.getLohOrWildTypeRejects()); + assertEquals(0, testee.getRedactedOrWildTypeRejects()); + assertEquals(0, testee.getMutationStatusNoneRejects()); + } + + @Test + public void testAccept() { + assertTrue(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(1, testee.getAccepts()); + assertEquals(0, testee.getRejects()); + } + + @ParameterizedTest + @NullSource + @ValueSource(strings = {"", " ", "0"}) + public void testAcceptWhenEntrezGeneIdIsNotSpecifiedButHugoGeneSymbolIsKnown(String givenEntrezGeneId) { + mafRecord.setGivenEntrezGeneId(givenEntrezGeneId); + assertTrue(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(1, testee.getAccepts()); + assertEquals(0, testee.getRejects()); + } + + static Stream provideAllEmptyEntrezGeneIdAndHugoGeneSymbolCombinations() { + String[] entrezGeneIds = {null, "", " ", "0"}; + String[] hugoGeneSymbol = {null, "", " ", "Unknown"}; + + return Stream.of(entrezGeneIds) + .flatMap(a -> Stream.of(hugoGeneSymbol) + .map(b -> Arguments.of(a, b))); + } + @ParameterizedTest + @MethodSource("provideAllEmptyEntrezGeneIdAndHugoGeneSymbolCombinations") + public void testRejectWhenGeneInfoIsNotSpecified(String givenEntrezGeneId, String hugoGeneSymbol) { + 
mafRecord.setGivenEntrezGeneId(givenEntrezGeneId); + mafRecord.setHugoGeneSymbol(hugoGeneSymbol); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getInvalidGeneInfo()); + } + + @ParameterizedTest + @ValueSource(strings = {"-1", "ABC"}) + public void testRejectWhenEntrezGeneIdIsNegativeOrNotNumeric(String givenEntrezGeneId) { + mafRecord.setGivenEntrezGeneId(givenEntrezGeneId); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getInvalidGeneInfo()); + } + + @ParameterizedTest + @NullSource + @ValueSource(strings = {"", "-1", "0", "Z"}) + public void testRejectWhenChromosomeIsInvalid(String chr) { + mafRecord.setChr(chr); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getInvalidChromosome()); + } + + @Test + public void testRejectWhenMutationStatusIsNone() { + mafRecord.setMutationStatus("None"); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getMutationStatusNoneRejects()); + } + + @ParameterizedTest + @ValueSource(strings = {"LOH", "Wildtype"}) + public void testRejectWhenMutationStatusIsLohOrWildtype(String mutationStatus) { + mafRecord.setMutationStatus(mutationStatus); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getLohOrWildTypeRejects()); + } + + @ParameterizedTest + @ValueSource(strings = {"Redacted", 
"Wildtype"}) + public void testRejectWhenValidationStatusIsRedactedOrWildtype(String validationStatus) { + mafRecord.setValidationStatus(validationStatus); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(1, testee.getRedactedOrWildTypeRejects()); + } + + final static String[] FILTERED_OUT_MUTATION_TYPES = {"Silent", "Intron", "3'UTR", "3'Flank", "5'UTR", "IGR", "RNA"}; + static Stream filteredOutMutationTypesProvider() { + return Stream.of(FILTERED_OUT_MUTATION_TYPES); + } + @ParameterizedTest + @MethodSource("filteredOutMutationTypesProvider") + public void testRejectWhenVariantClassificationInTheDefaultBlackList(String mutationType) { + mafRecord.setVariantClassification(mutationType); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(Set.of(mutationType), testee.getRejectionMap().keySet()); + } + + static Stream provideAllVariantClassificationEmptyValuesAndFilteredOutMutationTypeCombinations() { + String[] variantClassificationEmptyValues = {null, "", "NULL", "NA"}; + + return Stream.of(variantClassificationEmptyValues) + .flatMap(a -> Stream.of(FILTERED_OUT_MUTATION_TYPES) + .map(b -> Arguments.of(a, b))); + } + @ParameterizedTest + @MethodSource("provideAllVariantClassificationEmptyValuesAndFilteredOutMutationTypeCombinations") + public void testRejectWhenMafVariantClassificationInTheDefaultBlackList(String varianClassificatinEmptyValue, String mutationType) { + mafRecord.setVariantClassification(varianClassificatinEmptyValue); + mafRecord.setMafVariantClassification(mutationType); + assertFalse(testee.acceptMutation(mafRecord, null)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + 
assertEquals(Set.of(mutationType), testee.getRejectionMap().keySet()); + } + + @Test + public void testRejectWhenVariantClassificationInTheCustomBlackList() { + Set filteredMutations = Set.of("In_Frame_Ins"); + assertFalse(testee.acceptMutation(mafRecord, filteredMutations)); + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(filteredMutations, testee.getRejectionMap().keySet()); + } + + @Test + public void testAccept5FlankMutationWhenNotInTheCustomBlackList() { + assertNull(mafRecord.getProteinChange()); + + mafRecord.setVariantClassification("5'Flank"); + Set filteredMutations = Set.of("In_Frame_Ins"); + assertTrue(testee.acceptMutation(mafRecord, filteredMutations)); + + assertEquals("Promoter", mafRecord.getProteinChange()); + assertEquals(1, testee.getDecisions()); + assertEquals(1, testee.getAccepts()); + assertEquals(0, testee.getRejects()); + assertEquals(Set.of(), testee.getRejectionMap().keySet()); + } + + @Test + public void testReject5FlankMutationGene() { + mafRecord.setVariantClassification("5'Flank"); + + assertFalse(testee.acceptMutation(mafRecord, null)); + + assertEquals(1, testee.getDecisions()); + assertEquals(0, testee.getAccepts()); + assertEquals(1, testee.getRejects()); + assertEquals(Set.of("5'Flank"), testee.getRejectionMap().keySet()); + } + + @Test + public void testAccept5FlankMutationGeneWhenTERTEntrezGeneIdIsSpecified() { + mafRecord.setVariantClassification("5'Flank"); + mafRecord.setGivenEntrezGeneId("7015"); + + assertTrue(testee.acceptMutation(mafRecord, null)); + + assertEquals("Promoter", mafRecord.getProteinChange()); + assertEquals(1, testee.getDecisions()); + assertEquals(1, testee.getAccepts()); + assertEquals(0, testee.getRejects()); + assertEquals(Set.of(), testee.getRejectionMap().keySet()); + } +} From 8e99eabbf5b1828a45655a55add77b25ed398960 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 1 Nov 2024 16:30:48 +0100 Subject: 
[PATCH 08/10] Add script to filter mutations in MAF file This script uses the same filters that are used during the load (except filter by gene that required connection to the database) --- .../portal/scripts/FilterMutationData.java | 137 ++++++++++++++++++ .../scripts/TestFilterMutationData.java | 100 +++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java new file mode 100644 index 00000000..6934bf67 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2015 - 2022 Memorial Sloan Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package org.mskcc.cbio.portal.scripts; + +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.mskcc.cbio.maf.MafRecord; +import org.mskcc.cbio.maf.MafUtil; +import org.mskcc.cbio.portal.util.*; + +import java.io.*; +import java.util.*; + +/** + * Read MAF records, filter records of interest and writes back to the file. The script backs up original file under {filename with extension}_backup. + * + * @author Ruslan Forostianov + */ +public class FilterMutationData extends ConsoleRunnable { + + /** + * Instantiates a ConsoleRunnable to run with the given command line args. 
+ * + * @param args the command line arguments to be used + * @see {@link #run()} + */ + public FilterMutationData(String[] args) { + super(args); + } + + public void run() { + String description = "Filter MAF file for records of interest and rewrites it with selected mutations."; + OptionParser parser = new OptionParser(); + OptionSpec data = parser.accepts( "data", + "MAF data file" ).withRequiredArg().describedAs( "data_mutations.txt" ).ofType( String.class ); + OptionSpec meta = parser.accepts( "meta", + "meta (description) file" ).withOptionalArg().describedAs( "meta_mutations.txt" ).ofType( String.class ); + + OptionSet options = null; + File originalMutationFile; + Set namespaces = null; + Set filteredMutations = null; + + try { + options = parser.parse( args ); + originalMutationFile = new File((String) options.valueOf("data")); + if (options.has("meta")) { + File descriptorFile = new File((String) options.valueOf( "meta" ) ); + filteredMutations = GeneticProfileReader.getVariantClassificationFilter(descriptorFile); + namespaces = GeneticProfileReader.getNamespaces(descriptorFile); + } + } catch (OptionException e) { + throw new UsageException( + this.getClass().getName(), description, parser, + e.getMessage()); + } catch (Exception e) { + throw new RuntimeException(e); + } + ProgressMonitor.setCurrentMessage("Start filtering mutation records in the MAF file ..."); + File resultMutationFile = new File(originalMutationFile.getAbsolutePath() + "_filtered"); + final MutationFilter mutationFilter = new MutationFilter(); + try ( + BufferedReader originalFileBufferedReader = new BufferedReader(new FileReader(originalMutationFile)); + BufferedWriter resultFileBufferedWriter = new BufferedWriter(new FileWriter(resultMutationFile)) + ) { + String line; + MafUtil mafUtil = null; + while ((line = originalFileBufferedReader.readLine()) != null) { + ProgressMonitor.incrementCurValue(); + ConsoleUtil.showProgress(); + + if (TsvUtil.isDataLine(line)) { + if (mafUtil == 
null) { + mafUtil = new MafUtil(line, namespaces); + } else { + MafRecord record = mafUtil.parseRecord(line); + if (!mutationFilter.acceptMutation(record, filteredMutations)) { + continue; + } + } + } + resultFileBufferedWriter.write(line); + resultFileBufferedWriter.write(System.lineSeparator()); + } + } catch (IOException e) { + e.printStackTrace(); + } + File backupMutationFile = new File(originalMutationFile.getAbsolutePath() + "_backup"); + if (originalMutationFile.renameTo(backupMutationFile)) { + ProgressMonitor.setCurrentMessage("The original file is backed up to:" + + backupMutationFile.getAbsolutePath()); + if (resultMutationFile.renameTo(originalMutationFile)) { + ProgressMonitor.setCurrentMessage("The MAF file has been overwritten with filtered records."); + } else { + throw new RuntimeException("Failed to rename the filtered MAF file (" + + resultMutationFile.getAbsolutePath() + ") to the input MAF file (" + + originalMutationFile.getAbsolutePath() + ")."); + } + } else { + throw new RuntimeException("Failed to rename MAF file (" + + originalMutationFile.getAbsolutePath() + ") for backup."); + } + ProgressMonitor.setCurrentMessage(mutationFilter.getStatistics()); + } +} \ No newline at end of file diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java b/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java new file mode 100644 index 00000000..6a4b9613 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/TestFilterMutationData.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.scripts; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.BeforeEach; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.Comparator; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * JUnit tests for FilterMutationData step: runs the filter on temp copies of the MAF data/meta fixtures and checks the backup and the filtered output + */ +public class TestFilterMutationData { + + public static final String SRC_MAF_DATA_FILE_PATH = "src/test/resources/data_mutations_extended.txt"; + private Path tempDir; + + @BeforeEach + public void setUp() throws IOException { + // Create a temporary directory for each test + tempDir = Files.createTempDirectory("tempTestDir"); + + // Copy files to the temporary directory + Path dataFile = Paths.get(SRC_MAF_DATA_FILE_PATH); + Path copiedDataFile = tempDir.resolve(dataFile.getFileName()); + Files.copy(dataFile, copiedDataFile, StandardCopyOption.REPLACE_EXISTING); + + Path metaFile = Paths.get("src/test/resources/meta_mutations_extended.txt"); + Path copiedMetaFile = tempDir.resolve(metaFile.getFileName()); + Files.copy(metaFile, copiedMetaFile, StandardCopyOption.REPLACE_EXISTING); + } + + @AfterEach + public void tearDown() throws IOException { + // Delete the temporary directory and files after each test; try-with-resources closes the directory handles held by the walk stream + try (java.util.stream.Stream<Path> paths = Files.walk(tempDir)) { + paths.sorted(Comparator.reverseOrder()) + .map(Path::toFile).forEach(File::delete); + } + } + + @Test + public void testFilterMutationData() throws IOException { + String mafFile = tempDir + "/data_mutations_extended.txt"; + String[] args = { + "--data", mafFile, + "--meta", tempDir + "/meta_mutations_extended.txt" + }; + FilterMutationData runner = new FilterMutationData(args); + runner.run(); + + List<String> filteredDataFileLines = Files.readAllLines(Paths.get(mafFile)); + List<String> backedUpDataFileLines = Files.readAllLines(Paths.get(mafFile +
"_backup")); + List<String> originalDataFileLines = Files.readAllLines(Paths.get(SRC_MAF_DATA_FILE_PATH)); + assertEquals(originalDataFileLines, backedUpDataFileLines); + assertFalse(filteredDataFileLines.isEmpty()); + assertTrue(originalDataFileLines.size() > filteredDataFileLines.size()); + assertTrue(originalDataFileLines.containsAll(filteredDataFileLines)); + } +} From 3e8e2a7141d95cf9524a4fdbffac09019d0cd1d0 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 1 Nov 2024 17:23:48 +0100 Subject: [PATCH 09/10] Make FilterMutationData executable --- .../org/mskcc/cbio/portal/scripts/FilterMutationData.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java index 6934bf67..b9346646 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/FilterMutationData.java @@ -134,4 +134,9 @@ public void run() { } ProgressMonitor.setCurrentMessage(mutationFilter.getStatistics()); } + + public static void main(String[] args) { + ConsoleRunnable runner = new FilterMutationData(args); + runner.runInConsole(); + } } \ No newline at end of file From e4ea9c93d81bffdc82af0c4de1bafc7269b7b838 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 1 Nov 2024 17:13:51 +0100 Subject: [PATCH 10/10] Install python packages for docker image into venv environment --- Dockerfile | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8e850d68..a45ce5f0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,45 +1,57 @@ +# Stage 1: Build the Java application JAR FROM maven:3-eclipse-temurin-21 as jar_builder # Set the working directory in the Maven image WORKDIR /app -# Copy the java source files and the pom.xml file into the image +# Copy the Java source files and the pom.xml file into the image COPY src
./src COPY pom.xml . # Build the application RUN mvn clean package -DskipTests +# Stage 2: Prepare the final image FROM maven:3-eclipse-temurin-21 -# download system dependencies first to take advantage of docker caching -RUN apt-get update; apt-get install -y --no-install-recommends \ +# Download system dependencies first to take advantage of Docker caching +RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ default-mysql-client \ default-libmysqlclient-dev \ python3 \ - python3-setuptools \ + python3-venv \ python3-dev \ - python3-pip \ unzip \ perl \ - && rm -rf /var/lib/apt/lists/* \ - && pip3 install wheel + && rm -rf /var/lib/apt/lists/* -# Install any needed packages specified in requirements.txt +# Set up a Python virtual environment +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install Python packages in the virtual environment COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt -RUN ln -s $(which python3) /usr/local/bin/python || true +# Link the virtual environment's python to /usr/local/bin/python for compatibility +RUN ln -s /opt/venv/bin/python /usr/local/bin/python || true +# Copy the built JAR from the first stage COPY --from=jar_builder /app/core-*.jar / + +# Copy and set permissions for scripts COPY scripts/ scripts/ RUN chmod -R a+x /scripts/ # Set the working directory in the container WORKDIR /scripts/ +# Set PORTAL_HOME to the image root, where application.properties is created below ENV PORTAL_HOME=/ -# This file is empty. It has to be overriden by bind mounting the actual application.properties +# Create an empty application.properties file. It has to be overridden by bind mounting the actual application.properties RUN touch /application.properties + +# Entry command. NOTE(review): "your_script.py" looks like a placeholder; confirm such a script exists under /scripts/ or drop this CMD +CMD ["python", "your_script.py"]