From 8b9d361fbf5ba97834201a41652fae5ce0b81d6d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 24 Nov 2020 06:11:22 -0800 Subject: [PATCH 1/3] Experiments in exploring the taxonomy of contaminants, mark 2. (#149) (#155) * update to output more detailed contam info * add system exit * fix template notebook names * make numpy not a req for import * make sourmash not a req for import * fix nbconvert command line * update README From 71265a92a2c77b2cc2430eaafa424ec5619b2b6a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 24 Nov 2020 06:31:46 -0800 Subject: [PATCH 2/3] Fix tests (#156) * switch to using sig.name in sourmash * fix contam json * yay finished fixing tests --- charcoal/compare_taxonomy.py | 6 +++--- charcoal/contigs_search.py | 2 +- charcoal/utils.py | 2 +- tests/test-data/loomba/contam.json | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charcoal/compare_taxonomy.py b/charcoal/compare_taxonomy.py index 1d06d9f..51d495f 100644 --- a/charcoal/compare_taxonomy.py +++ b/charcoal/compare_taxonomy.py @@ -148,10 +148,10 @@ def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage, new_siglist.append(ss) else: if provided_lineage and provided_lineage != 'NA': - print(f'found exact match: {ss.name()}. removing.') + print(f'found exact match: {ss.name}. removing.') else: - print(f'found exact match: {ss.name()}. but no provided lineage!') - comment = f'found exact match: {ss.name()}. but no provided lineage! cannot analyze.' + print(f'found exact match: {ss.name}. but no provided lineage!') + comment = f'found exact match: {ss.name}. but no provided lineage! cannot analyze.' return None, comment, True, 1.0, 1.0 # ...but leave exact matches in if they're the only matches, I guess! 
diff --git a/charcoal/contigs_search.py b/charcoal/contigs_search.py index b500f62..8927b24 100644 --- a/charcoal/contigs_search.py +++ b/charcoal/contigs_search.py @@ -44,7 +44,7 @@ def main(args): new_siglist = [] for ss in siglist: if genome_sig.similarity(ss) == 1.0: - print(f'removing an identical match: {ss.name()}') + print(f'removing an identical match: {ss.name}') else: new_siglist.append(ss) siglist = new_siglist diff --git a/charcoal/utils.py b/charcoal/utils.py index 62cc233..e726084 100644 --- a/charcoal/utils.py +++ b/charcoal/utils.py @@ -180,7 +180,7 @@ def summarize_at_rank(lincounts, rank): def get_ident(sig): "Hack and slash identifiers." - ident = sig.name() + ident = sig.name ident = ident.split()[0] ident = ident.split('.')[0] return ident diff --git a/tests/test-data/loomba/contam.json b/tests/test-data/loomba/contam.json index daf58fe..578ca41 100644 --- a/tests/test-data/loomba/contam.json +++ b/tests/test-data/loomba/contam.json @@ -1 +1 @@ -[[[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"]], 14], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"]], 12], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"]], 24], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], 
["family", "f__Oscillospiraceae"]], 17], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], 1503]] \ No newline at end of file +{"loomba": [[[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Anaerotignaceae"], ["genus", "g__Anaerotignum"]], 7], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__An200"]], 3], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Flavonifractor"]], 1], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], [["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", 
"o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Flavonifractor"]], 3]]} \ No newline at end of file From f6fd132b10b8859294fd78ac10d7cf62d37a6585 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 24 Nov 2020 06:40:38 -0800 Subject: [PATCH 3/3] various attempted fixes --- charcoal/compare_taxonomy.py | 40 ------------------------------------ charcoal/gather_taxonomy.py | 2 +- charcoal/just_taxonomy.py | 4 ++-- tests/test_decontam.py | 2 +- 4 files changed, 4 insertions(+), 44 deletions(-) diff --git a/charcoal/compare_taxonomy.py b/charcoal/compare_taxonomy.py index 428f3dd..ee373e1 100644 --- a/charcoal/compare_taxonomy.py +++ b/charcoal/compare_taxonomy.py @@ -120,46 +120,6 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank, def get_genome_taxonomy(genome_name, genome_gather_json_filename, provided_lineage, tax_assign, match_rank, min_f_ident, min_f_major): - with open(matches_filename, 'rt') as fp: - try: - siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True)) - except sourmash.exceptions.SourmashError: - siglist = None - - if not siglist: - comment = 'no matches for this genome.' - print(comment) - return None, comment, False, 0.0, 0.0 - - # construct a template minhash object that we can use to create new 'uns - empty_mh = siglist[0].minhash.copy_and_clear() - ksize = empty_mh.ksize - scaled = empty_mh.scaled - moltype = empty_mh.moltype - - genome_sig = sourmash.load_one_signature(genome_sig_filename) - entire_mh = genome_sig.minhash - - assert entire_mh.scaled == scaled - - # Hack for examining members of our search database: remove exact matches. - new_siglist = [] - for ss in siglist: - if entire_mh.similarity(ss.minhash) < 1.0: - new_siglist.append(ss) - else: - if provided_lineage and provided_lineage != 'NA': - print(f'found exact match: {ss.name}. removing.') - else: - print(f'found exact match: {ss.name}. 
but no provided lineage!') - comment = f'found exact match: {ss.name}. but no provided lineage! cannot analyze.' - return None, comment, True, 1.0, 1.0 - - # ...but leave exact matches in if they're the only matches, I guess! - if new_siglist: - siglist = new_siglist ->>>>>>> 71265a92a2c77b2cc2430eaafa424ec5619b2b6a - guessed_genome_lineage, f_major, f_ident = "", 0.0, 0.0 # did we get gather results? genome_info = utils.load_contigs_gather_json(genome_gather_json_filename) diff --git a/charcoal/gather_taxonomy.py b/charcoal/gather_taxonomy.py index 57f8053..40f77e9 100644 --- a/charcoal/gather_taxonomy.py +++ b/charcoal/gather_taxonomy.py @@ -110,7 +110,7 @@ def main(args): # genome search genome_len=0 entire_mh = genome_sig.minhash - genome_name = os.path.basename(genome_sig.name()) + genome_name = os.path.basename(genome_sig.name) num_hashes = len(entire_mh.hashes) if not genome_len: for record in screed_iter: diff --git a/charcoal/just_taxonomy.py b/charcoal/just_taxonomy.py index afbf3f9..5087d40 100644 --- a/charcoal/just_taxonomy.py +++ b/charcoal/just_taxonomy.py @@ -386,10 +386,10 @@ def report(*args): new_siglist.append(ss) else: if args.lineage and args.lineage != 'NA': - report(f'found exact match: {ss.name()}. removing.') + report(f'found exact match: {ss.name}. removing.') identical_match_removed = True else: - report(f'found exact match: {ss.name()}. but no provided lineage! exiting.') + report(f'found exact match: {ss.name}. but no provided lineage! exiting.') comment = "Exact match in matches, but no provided lineage." create_empty_output(genomebase, comment, args.summary, None, args.contig_report, diff --git a/tests/test_decontam.py b/tests/test_decontam.py index 585c37a..2f9c63b 100644 --- a/tests/test_decontam.py +++ b/tests/test_decontam.py @@ -70,7 +70,7 @@ def make_lca_and_lineages(match_files, lineages_csv, scaled, ksize, # build database of matches & lineages! 
for ss in siglist: - print(ss.name(), ss.minhash.scaled) + print(ss.name, ss.minhash.scaled) ident = just_taxonomy.get_ident(ss) lineage = tax_assign[ident]