diff --git a/charcoal/Snakefile b/charcoal/Snakefile index d682840..8f576c3 100644 --- a/charcoal/Snakefile +++ b/charcoal/Snakefile @@ -217,21 +217,23 @@ rule search_all: """ # generate contigs taxonomy -rule make_contigs_taxonomy_json: +rule make_taxonomy_json: input: genome = genome_dir + '/{f}', genome_sig = output_dir + '/{f}.sig', matches = output_dir + '/{f}.matches.sig', lineages = config['lineages_csv'] output: - json=output_dir + '/{f}.contigs-tax.json', + contig_json=output_dir + '/{f}.contigs-tax.json', + genome_json=output_dir + '/{f}.genome-tax.json', conda: 'conf/env-sourmash.yml' shell: """ - python -m charcoal.contigs_search \ + python -m charcoal.gather_taxonomy \ --genome {input.genome} --lineages-csv {input.lineages} \ --genome-sig {input.genome_sig} \ --matches-sig {input.matches} \ - --json-out {output.json} + --contigs-json-out {output.contig_json} \ + --genome-json-out {output.genome_json} """ # actually do cleaning @@ -301,8 +303,8 @@ rule combined_summary: # compare all the genomes. rule compare_taxonomy: input: - all_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list), - all_sig=expand(output_dir + '/{g}.matches.sig', g=genome_list), + all_contigs_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list), + all_genome_json=expand(output_dir + '/{g}.genome-tax.json', g=genome_list), lineages = config['lineages_csv'], provided_lineages = provided_lineages_file, genome_list = genome_list_file diff --git a/charcoal/compare_taxonomy.py b/charcoal/compare_taxonomy.py index 1d06d9f..47176cd 100644 --- a/charcoal/compare_taxonomy.py +++ b/charcoal/compare_taxonomy.py @@ -63,14 +63,15 @@ def calculate_clean(genome_lin, contigs_d, rank): return (good_names, bad_names) -def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp): +def guess_tax_by_gather(gather_results, num_hashes, match_rank, report_fp, minimum_matches=GATHER_MIN_MATCHES): "Guess likely taxonomy using gather." sum_ident = 0 first_lin = () first_count = 0 - - for lin, count in gather_at_rank(entire_mh, lca_db, lin_db, match_rank): - if count >= GATHER_MIN_MATCHES: + # if match_rank is genus, this should be same as using gather_results + rank_gather = summarize_at_rank(gather_results, match_rank) + for lin, count in rank_gather: + if count >= minimum_matches: # record the first lineage we come across as likely lineage. if not first_lin: first_lin = lin @@ -78,7 +79,7 @@ def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp): sum_ident += count - f_ident = sum_ident / len(entire_mh) + f_ident = sum_ident / num_hashes f_major = first_count / sum_ident return first_lin, f_ident, f_major @@ -117,71 +118,28 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank, return genome_lineage, comment, needs_lineage -def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage, +def get_genome_taxonomy(genome_name, genome_gather_json_filename, provided_lineage, tax_assign, match_rank, min_f_ident, min_f_major): - with open(matches_filename, 'rt') as fp: - try: - siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True)) - except sourmash.exceptions.SourmashError: - siglist = None - - if not siglist: - comment = 'no matches for this genome.' - print(comment) - return None, comment, False, 0.0, 0.0 - - # construct a template minhash object that we can use to create new 'uns - empty_mh = siglist[0].minhash.copy_and_clear() - ksize = empty_mh.ksize - scaled = empty_mh.scaled - moltype = empty_mh.moltype - - genome_sig = sourmash.load_one_signature(genome_sig_filename) - entire_mh = genome_sig.minhash - - assert entire_mh.scaled == scaled - - # Hack for examining members of our search database: remove exact matches. - new_siglist = [] - for ss in siglist: - if entire_mh.similarity(ss.minhash) < 1.0: - new_siglist.append(ss) - else: - if provided_lineage and provided_lineage != 'NA': - print(f'found exact match: {ss.name()}. removing.') - else: - print(f'found exact match: {ss.name()}. but no provided lineage!') - comment = f'found exact match: {ss.name()}. but no provided lineage! cannot analyze.' - return None, comment, True, 1.0, 1.0 - - # ...but leave exact matches in if they're the only matches, I guess! - if new_siglist: - siglist = new_siglist - # create empty LCA database to populate... - lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype) - lin_db = LineageDB() + guessed_genome_lineage, f_major, f_ident = "", 0.0, 0.0 + # did we get gather results? + genome_info = utils.load_contigs_gather_json(genome_gather_json_filename) - # ...with specific matches. - for ss in siglist: - ident = get_ident(ss) - lineage = tax_assign[ident] + if genome_info: + gather_results = genome_info[genome_name].gather_tax + genome_len = genome_info[genome_name].length + genome_hashes = genome_info[genome_name].num_hashes - lca_db.insert(ss, ident=ident) - lin_db.insert(ident, lineage) - print(f'loaded {len(siglist)} signatures & created LCA Database') + # calculate lineage from majority vote on LCA + guessed_genome_lineage, f_major, f_ident = guess_tax_by_gather(gather_results, genome_hashes, match_rank, sys.stdout) - # calculate lineage from majority vote on LCA - guessed_genome_lineage, f_major, f_ident = \ - guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, sys.stdout) + print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}') - print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}') - - if f_major == 1.0 and f_ident == 1.0: - comment = "All genome hashes belong to one lineage! Nothing to do." - print(comment) - return guessed_genome_lineage, comment, False, f_major, f_ident + if f_major == 1.0 and f_ident == 1.0: + comment = "All genome hashes belong to one lineage! Nothing to do." + print(comment) + return guessed_genome_lineage, comment, False, f_major, f_ident # did we get a passed-in lineage assignment? provided_lin = "" @@ -236,13 +194,12 @@ def main(args): # process every genome individually. summary_d = {} for n, genome_name in enumerate(genome_names): - matches_filename = os.path.join(dirname, genome_name + '.matches.sig') - genome_sig = os.path.join(dirname, genome_name + '.sig') lineage = provided_lineages.get(genome_name, '') + genome_json = os.path.join(dirname, genome_name + '.genome-tax.json') contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json') - x = get_genome_taxonomy(matches_filename, - genome_sig, + x = get_genome_taxonomy(genome_name, + genome_json, lineage, tax_assign, match_rank, args.min_f_ident, @@ -354,7 +311,7 @@ def main(args): # output a sorted hit list CSV fp = open(args.hit_list, 'wt') hitlist_w = csv.writer(fp) - + hitlist_w.writerow(['genome', 'filter_at', 'override_filter_at', 'total_bad_bp', 'superkingdom_bad_bp', 'phylum_bad_bp', 'class_bad_bp', 'order_bad_bp', 'family_bad_bp', 'genus_bad_bp', diff --git a/charcoal/contigs_search.py b/charcoal/gather_taxonomy.py similarity index 77% rename from charcoal/contigs_search.py rename to charcoal/gather_taxonomy.py index b500f62..e5ba93b 100644 --- a/charcoal/contigs_search.py +++ b/charcoal/gather_taxonomy.py @@ -54,9 +54,11 @@ def main(args): # complain.) if not siglist: print('no non-identical matches for this genome, exiting.') - contigs_tax = {} - with open(args.json_out, 'wt') as fp: + contigs_tax, genome_tax = {},{} + with open(args.contigs_json_out, 'wt') as fp: fp.write(json.dumps(contigs_tax)) + with open(args.genome_json_out, 'wt') as fp: + fp.write(json.dumps(genome_tax)) return 0 # construct a template minhash object that we can use to create new 'uns @@ -83,7 +85,8 @@ def main(args): print(f'reading contigs from {genomebase}') screed_iter = screed.open(args.genome) - contigs_tax = {} + contigs_tax, genome_tax = {},{} + # contigs search for n, record in enumerate(screed_iter): # look at each contig individually mh = empty_mh.copy_and_clear() @@ -100,9 +103,25 @@ def main(args): print(f"Processed {len(contigs_tax)} contigs.") # save! - with open(args.json_out, 'wt') as fp: + with open(args.contigs_json_out, 'wt') as fp: fp.write(json.dumps(contigs_tax)) + + # genome search + genome_len=0 + entire_mh = genome_sig.minhash + genome_name = os.path.basename(genome_sig.name()) + num_hashes = len(entire_mh.hashes) + if not genome_len: + for record in screed_iter: + genome_len+=len(record.sequence) + gather_results = list(gather_at_rank(entire_mh, lca_db, lin_db, match_rank)) + # currently treats genome as a giant contig, since we need the same info. Any reason to include addl info? + genome_gather_info = ContigGatherInfo(genome_len, len(entire_mh), gather_results) + genome_tax[genome_name] = genome_gather_info + with open(args.genome_json_out, 'wt') as fp: + fp.write(json.dumps(genome_tax)) + return 0 @@ -116,7 +135,10 @@ def cmdline(sys_args): p.add_argument('--force', help='continue past survivable errors', action='store_true') - p.add_argument('--json-out', + p.add_argument('--contigs-json-out', + help='JSON-format output file of all contig tax results', + required=True) + p.add_argument('--genome-json-out', help='JSON-format output file of all tax results', required=True) args = p.parse_args() diff --git a/tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json b/tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json new file mode 100644 index 0000000..75cffc7 --- /dev/null +++ b/tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json @@ -0,0 +1 @@ +{"LoombaR_2017__SID1050_bax__bin.11.fa.gz": [0, 2723, [[[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], 1995], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Angelakisella"]], 18], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__OEMR01"]], 14], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Gemmiger_A"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__An200"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Anaerotignaceae"], ["genus", "g__Anaerotignum"]], 9], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Lawsonibacter"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Flavonifractor"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__Blautia_A"]], 3], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__GCA-900066575"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Fournierella"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Pseudoflavonifractor"]], 1], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__Merdibacter"]], 1]]]} \ No newline at end of file diff --git a/tests/test_contigs_search.py b/tests/test_gather_taxonomy.py similarity index 56% rename from tests/test_contigs_search.py rename to tests/test_gather_taxonomy.py index 76fff3b..36add68 100644 --- a/tests/test_contigs_search.py +++ b/tests/test_gather_taxonomy.py @@ -2,7 +2,7 @@ from . import pytest_utils as utils import json -from charcoal import contigs_search +from charcoal import gather_taxonomy @utils.in_tempdir @@ -13,14 +13,19 @@ def test_1(location): args.genome_sig = utils.relative_file("tests/test-data/genomes/2.fa.gz.sig") args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.sig.gz") args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv") - args.json_out = os.path.join(location, 'tax.json') + args.contigs_json_out = os.path.join(location, 'contigs-tax.json') + args.genome_json_out = os.path.join(location, 'genome-tax.json') - status = contigs_search.main(args) + status = gather_taxonomy.main(args) assert status == 0 - assert os.path.exists(args.json_out) + assert os.path.exists(args.contigs_json_out) + assert os.path.exists(args.genome_json_out) - with open(args.json_out, 'rt') as fp: + with open(args.contigs_json_out, 'rt') as fp: + results = json.load(fp) + assert results == {} + with open(args.genome_json_out, 'rt') as fp: results = json.load(fp) assert results == {} @@ -33,14 +38,16 @@ def test_2_loomba(location): args.genome_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig") args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig") args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv") - args.json_out = os.path.join(location, 'tax.json') + args.contigs_json_out = os.path.join(location, 'contigs-tax.json') + args.genome_json_out = os.path.join(location, 'genome-tax.json') - status = contigs_search.main(args) + status = gather_taxonomy.main(args) assert status == 0 - assert os.path.exists(args.json_out) + assert os.path.exists(args.contigs_json_out) - with open(args.json_out, 'rt') as fp: + # check contig results + with open(args.contigs_json_out, 'rt') as fp: this_results = json.load(fp) saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.contigs-tax.json") @@ -48,3 +55,13 @@ def test_2_loomba(location): saved_results = json.load(fp) assert this_results == saved_results + + # check genome results + with open(args.genome_json_out, 'rt') as fp: + this_results = json.load(fp) + + saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json") + with open(saved_results_file, 'rt') as fp: + saved_results = json.load(fp) + + assert this_results == saved_results