From b6bc4f598f864f8c147426106ccd25b50680d096 Mon Sep 17 00:00:00 2001 From: Evguenia Kopylova Date: Fri, 8 Jul 2016 07:04:04 -0400 Subject: [PATCH 1/8] Initial commit --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..803a16c --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# tcga +TCGA analysis From 58858fad03319c35f597ca9f24a995251910c524 Mon Sep 17 00:00:00 2001 From: System Administrator Date: Fri, 8 Jul 2016 07:23:02 -0400 Subject: [PATCH 2/8] scripts --- scripts/compute_taxonomy_breakdown.py | 76 +++++++++++++++++ scripts/filter_fasta.py | 42 ++++++++++ scripts/generate_kraken_db.py | 116 ++++++++++++++++++++++++++ 3 files changed, 234 insertions(+) create mode 100644 scripts/compute_taxonomy_breakdown.py create mode 100644 scripts/filter_fasta.py create mode 100644 scripts/generate_kraken_db.py diff --git a/scripts/compute_taxonomy_breakdown.py b/scripts/compute_taxonomy_breakdown.py new file mode 100644 index 0000000..65f2d68 --- /dev/null +++ b/scripts/compute_taxonomy_breakdown.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +# Compute number of unique taxonomies and their counts + +import sys + + +def get_genome_paths(scores_average_fp, scores_repophlan_fp): + """Return taxonomy strings and their counts. + """ + genomes = [] + # Get genome IDs + with open(scores_average_fp) as scores_average_f: + next(scores_average_f) + for line in scores_average_f: + line = line.strip().split('\t') + genome_id = line[0] + if genome_id not in genomes: + genomes.append(genome_id) + else: + raise ValueError("Duplicate genome IDs %s" % genome_id) + # Get taxonomies based on used genome IDs + taxonomies = {} + genomes_without_taxonomy = [] + with open(scores_repophlan_fp) as scores_repophlan_f: + # header + line = scores_repophlan_f.readline() + line = line.strip().split('\t') + tax_idx = line.index('taxonomy') + for line in scores_repophlan_f: + line = line.strip().split('\t') + genome_id = line[0] + # only want tax_ids for genomes passing quality filter + if genome_id in genomes: + taxonomy = line[tax_idx] + if (('k__Bacteria' not in taxonomy) and + ('k__Archaea' not in taxonomy) and + ('k__Viruses' not in taxonomy) and + ('k__Viroids' not in taxonomy)): + genomes_without_taxonomy.append(genome_id) + if taxonomy not in taxonomies: + taxonomies[taxonomy] = 1 + else: + taxonomies[taxonomy] += 1 + return ([(k, taxonomies[k]) for k in sorted(taxonomies, key=taxonomies.get, reverse=True)], + genomes_without_taxonomy) + + +def main(): + """Parse output of RepoPhlAn's repophlan_get_microbes.txt to obtain + unique taxonomies and their counts. + """ + # Output of RepoPhlAn's repophlan_get_microbes.py + repophlan_scores_fp = sys.argv[1] + # Output of Jon's score average script + repophlan_scores_average_fp = sys.argv[2] + + taxonomies, genomes_without_taxonomy = get_genome_paths( + repophlan_scores_average_fp, repophlan_scores_fp) + if len(genomes_without_taxonomy) != 0: + print("Genomes without taxonomy: %s" % len(genomes_without_taxonomy)) + + for tup in taxonomies: + print("%s\t%s" % (tup[0], tup[1])) + + +if __name__ == '__main__': + main() diff --git a/scripts/filter_fasta.py b/scripts/filter_fasta.py new file mode 100644 index 0000000..1b6d74c --- /dev/null +++ b/scripts/filter_fasta.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +import sys + +import skbio +from skbio import Sequence + +def main(): + """Output FASTA file using sequences in Kraken output. + """ + kraken_results_fp = sys.argv[1] + fasta_fp = sys.argv[2] + output_fp = sys.argv[3] + fasta_d = {} + # Load all FASTA seqs into dictionary + for seq in skbio.io.read(fasta_fp, format='fasta'): + label = seq.metadata['id'] + description = seq.metadata['description'] + if label not in fasta_d: + fasta_d[label] = [description, str(seq)] + else: + raise ValueError("Duplicate FASTA labels %s" % label) + # Output sequences to FASTA. + with open(output_fp, 'w') as output_f: + with open(kraken_results_fp) as kraken_results_f: + for line in kraken_results_f: + label = line.strip().split('\t')[1] + if label in fasta_d: + output_f.write(">%s%s\n%s\n" % (label, fasta_d[label][0], + fasta_d[label][1])) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/generate_kraken_db.py b/scripts/generate_kraken_db.py new file mode 100644 index 0000000..491266d --- /dev/null +++ b/scripts/generate_kraken_db.py @@ -0,0 +1,116 @@ +#!/bin/python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +# Edit the FASTA label to Kraken format: +# >sequence16|kraken:taxid|32630 Adapter sequence +# CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +import sys +import bz2 +from skbio import Sequence +import skbio + +from os import walk +from os.path import join, splitext, basename, isfile + + +def get_genome_paths(scores_average_fp, scores_repophlan_fp): + """Return genome ID, tax_id and .fna.bz2 path + """ + genomes = {} + # Get quality filtered genome ID and filepath to .fna.bz2 + with open(scores_average_fp) as scores_average_f: + # header + line = scores_average_f.readline() + line = line.strip().split('\t') + fna_idx = line.index('fna_lname') + for line in scores_average_f: + line = line.strip().split('\t') + genome_id = line[0] + fna_fp = line[fna_idx] + if genome_id not in genomes: + genomes[genome_id] = [fna_fp] + else: + raise ValueError("Duplicate genome IDs %s" % genome_id) + # Get tax_id + genomes_without_tax_id = [] + with open(scores_repophlan_fp) as scores_repophlan_f: + # header + line = scores_repophlan_f.readline() + line = line.strip().split('\t') + tax_id_idx = line.index('taxid') + for line in scores_repophlan_f: + line = line.strip().split('\t') + genome_id = line[0] + # only want tax_ids for genomes passing quality filter + if genome_id in genomes: + tax_id = line[tax_id_idx] + # tax_id must be an integer, if not check the field + # immediately prior (bug in repophlan parsing) + if not tax_id.isdigit(): + tax_id = line[tax_id_idx-1] + if not tax_id.isdigit(): + genomes_without_tax_id.append(genome_id) + continue + genomes[genome_id].append(int(tax_id)) + return genomes, genomes_without_tax_id + + +def format_labels(genomes, repophlan_scores_filtered_genomes_dp): + """Format FASTA labels in qualified genomes. + """ + kraken_db_str = "|kraken:taxid|" + for genome_id, info in genomes.items(): + genome_fp = info[0] + taxid = info[1] + genome_fp_name = basename(splitext(genome_fp)[0]) + # check FNA file exists for genome + if genome_fp_name != "": + output_fp = join(repophlan_scores_filtered_genomes_dp, + genome_fp_name) + # skip files already modified (e.g. from previous run) + if not isfile(output_fp): + with open(output_fp, 'w') as output_f: + for seq in skbio.io.read( + genome_fp, format='fasta', compression='bz2'): + id_ = seq.metadata['id'] + description = seq.metadata['description'] + new_id_ = "%s%s%s" % (id_, kraken_db_str, taxid) + seq.metadata['id'] = new_id_ + output_f.write( + ">%s%s%s %s\n%s\n" % (id_, kraken_db_str, + taxid, description, + str(seq))) + + +def main(): + """Edit qualified genomes' labels to Kraken format. + """ + # .fna.bz2 genomes folder + all_genomes_bz2_dp = sys.argv[1] + # Output of RepoPhlAn's repophlan_get_microbes.py + repophlan_scores_fp = sys.argv[2] + # Output of Jon's score average script + # https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py + repophlan_scores_average_fp = sys.argv[3] + # Output directory path + repophlan_scores_filtered_genomes_dp = sys.argv[4] + + genomes, genomes_without_tax_id = get_genome_paths( + repophlan_scores_average_fp, repophlan_scores_fp) + if len(genomes_without_tax_id) != 0: + raise ValueError( + "Some genomes do not have a taxid: %s" % genomes_without_tax_id) + + format_labels(genomes, repophlan_scores_filtered_genomes_dp) + + +if __name__ == '__main__': + main() \ No newline at end of file From 04668bc28a1584467cfeab9d18e7b2c58c5c3ce2 Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Tue, 1 Nov 2016 17:44:10 -0700 Subject: [PATCH 3/8] adding to python scripts --- python_scripts/cgc_bowtie2_build.py | 165 ++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 python_scripts/cgc_bowtie2_build.py diff --git a/python_scripts/cgc_bowtie2_build.py b/python_scripts/cgc_bowtie2_build.py new file mode 100644 index 0000000..a0c70b6 --- /dev/null +++ b/python_scripts/cgc_bowtie2_build.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +""" +Build Bowtie2 database on all reference genomes in Kraken report. +""" + +import click +import collections + +def load_kraken_mpa_report(kraken_mpa_report_fp, + taxonomic_rank, + taxa_levels, + taxa_levels_idx, + read_per_taxa): + """Absolute abundance of number of reads matching a defined taxa level. + Parameters + ---------- + kraken_translate_report_fp: str + filepath to output of "kraken translate" + taxonomic_rank: str + taxonomy level (e.g., genus or species) + taxa_levels: dict + keys are full name taxonomic ranks and values are abbreviated ranks + taxa_levels_idx: dict + 2-way dict storing integer depths for abbreviated taxonomic ranks + read_per_taxa: in + integer of number of minimum number of reads to keep per taxa + + Returns + ------- + taxonomies: set + set of taxonomies from kraken_translate_report_fp + split_on_level: str + string that determines the level to split the taxonomy to be used in + create_db_folder function + repo_file_name: str + string with filename suffix from kraken_translate_report_fp to be used + in create_db_folder function + + """ + repo_file_name = kraken_mpa_report_fp.strip().split('/')[-1].split('mpa')[0] + + taxonomic_rank_level_str = taxa_levels[taxonomic_rank] + taxonomic_rank_level_int = taxa_levels_idx[taxonomic_rank_level_str] + + if taxonomic_rank_level_int < 6: + split_on_level = taxa_levels_idx[str(taxonomic_rank_level_int + 1)] + else: + # keep full string (to species level) + split_on_level = '\n' + taxonomic_abundances= [] + with open(kraken_mpa_report_fp) as kraken_translate_report_f: + for line in kraken_translate_report_f: + label, taxonomy = line.strip().split('\t') + if taxonomic_rank_level_str in taxonomy: + # keep taxonomy string up to specified level + taxonomy_parse = taxonomy.split(split_on_level)[0] + taxonomy_parse = taxonomy_parse.replace('d__','k__') + taxonomic_abundances.append(taxonomy_parse) + + taxonomies = set([k for k, v in + collections.Counter(taxonomic_abundances).iteritems() + if v > read_per_taxa]) + + return taxonomies, split_on_level, repo_file_name + +def create_db_folder(repophlan_genomeid_taxonomy_fp, + genome_tax, + split_on_level, + repo_file_name): + + """Return sets for sample IDs and taxonomy strings. + Parameters + ---------- + repophlan_genomeid_taxonomy_fp: str + filepath to output of repophlan file genome IDs and associated taxa + genome_tax: set + set of taxonomies from kraken_translate_report_fp + split_on_level: str + string that determines the level to split the taxonomy in genome_tax + repo_file_name + string with filename suffix from kraken_translate_report_fp + + Returns + ------- + Writes a text file with genome IDs, directory to genomes, and associated + taxa from repophlan_genomeid_taxonomy_fp containned with the set + genome_tax + """ + repo_file_name = repo_file_name+'repophlan_genomeID_taxonomy.good' + + with open(repo_file_name, 'w') as f: + with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: + for line in repophlan_genomeID_taxonomy_f: + tax_id_repo, directory_repo, taxonomy_repo =\ + line.strip().split('\t') + taxonomy_repo = taxonomy_repo.split(split_on_level)[0] + if taxonomy_repo in genome_tax: + f.writelines(line) + + +@click.command() +@click.option('--kraken-mpa-report-fp', required=True, + type=click.Path(resolve_path=True, readable=True, exists=True, + file_okay=True), + help='Filepath to Kraken report') +@click.option('--repophlan-genomeID-taxonomy-fp', required=True, + type=click.Path(resolve_path=True, readable=True, exists=False, + file_okay=True), + help='Filepath to RepoPhlAn genome ID and taxonomy list') +@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species', + 'family', 'order', + 'class', 'phylum', + 'domain']), + required=False, default=['genus'], show_default=True, + help="Taxonomic rank at which to generate summary") + +@click.option('--read-per-taxa', required=False, + default=10, show_default=True, + help="Minimum number of reads needed for each taxa") + + +def main(kraken_mpa_report_fp, + repophlan_genomeid_taxonomy_fp, + taxonomic_rank, + read_per_taxa): + + taxa_levels = {"domain": "d__", + "phylum": "|p__", + "class": "|c__", + "order": "|o__", + "family": "|f__", + "genus": "|g__", + "species": "|s__"} + + taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2, + "|o__": 3, "|f__": 4, "|g__": 5, + "|s__": 6, "6": "|s__", "5": "|g__", + "4": "|f__", "3": "|o__", "2": "|c__", + "1": "|p__", "0": "d__"} + + taxonomic_set, spit_on_level, repo_file_name =\ + load_kraken_mpa_report( + kraken_mpa_report_fp=kraken_mpa_report_fp, + taxonomic_rank=taxonomic_rank, + taxa_levels=taxa_levels, + taxa_levels_idx=taxa_levels_idx, + read_per_taxa=read_per_taxa) + + create_db_folder( + repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, + genome_tax=taxonomic_set, + split_on_level=spit_on_level, + repo_file_name=repo_file_name) + +if __name__ == "__main__": + main() From b79a084f7d08bb6f99fa06688a7927b05b13e5c6 Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Wed, 2 Nov 2016 22:41:19 -0700 Subject: [PATCH 4/8] Made changes to add tuple of input kraken mpa reports --- cgc_bowtie2_build.py | 91 +++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 52 deletions(-) diff --git a/cgc_bowtie2_build.py b/cgc_bowtie2_build.py index a0c70b6..33de9bd 100644 --- a/cgc_bowtie2_build.py +++ b/cgc_bowtie2_build.py @@ -16,66 +16,45 @@ import collections def load_kraken_mpa_report(kraken_mpa_report_fp, - taxonomic_rank, taxa_levels, - taxa_levels_idx, read_per_taxa): """Absolute abundance of number of reads matching a defined taxa level. Parameters ---------- kraken_translate_report_fp: str filepath to output of "kraken translate" - taxonomic_rank: str - taxonomy level (e.g., genus or species) - taxa_levels: dict - keys are full name taxonomic ranks and values are abbreviated ranks - taxa_levels_idx: dict - 2-way dict storing integer depths for abbreviated taxonomic ranks - read_per_taxa: in + taxa_levels: list + list of two elements that includes the taxonomic rank at which + to generate summary and rank below to split by + read_per_taxa: int integer of number of minimum number of reads to keep per taxa Returns ------- taxonomies: set set of taxonomies from kraken_translate_report_fp - split_on_level: str - string that determines the level to split the taxonomy to be used in - create_db_folder function - repo_file_name: str - string with filename suffix from kraken_translate_report_fp to be used - in create_db_folder function - """ - repo_file_name = kraken_mpa_report_fp.strip().split('/')[-1].split('mpa')[0] - - taxonomic_rank_level_str = taxa_levels[taxonomic_rank] - taxonomic_rank_level_int = taxa_levels_idx[taxonomic_rank_level_str] - - if taxonomic_rank_level_int < 6: - split_on_level = taxa_levels_idx[str(taxonomic_rank_level_int + 1)] - else: - # keep full string (to species level) - split_on_level = '\n' taxonomic_abundances= [] - with open(kraken_mpa_report_fp) as kraken_translate_report_f: - for line in kraken_translate_report_f: - label, taxonomy = line.strip().split('\t') - if taxonomic_rank_level_str in taxonomy: - # keep taxonomy string up to specified level - taxonomy_parse = taxonomy.split(split_on_level)[0] - taxonomy_parse = taxonomy_parse.replace('d__','k__') - taxonomic_abundances.append(taxonomy_parse) + for report_fp in kraken_mpa_report_fp: + with open(report_fp) as report_fp: + for line in report_fp: + label, taxonomy = line.strip().split('\t') + if taxa_levels[0] in taxonomy: + # keep taxonomy string up to specified level + taxonomy_parse = taxonomy.split(taxa_levels[1])[0] + taxonomy_parse = taxonomy_parse.replace('d__','k__') + taxonomic_abundances.append(taxonomy_parse) taxonomies = set([k for k, v in collections.Counter(taxonomic_abundances).iteritems() if v > read_per_taxa]) - return taxonomies, split_on_level, repo_file_name + return taxonomies def create_db_folder(repophlan_genomeid_taxonomy_fp, genome_tax, split_on_level, - repo_file_name): + output_filename): """Return sets for sample IDs and taxonomy strings. Parameters @@ -86,8 +65,8 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, set of taxonomies from kraken_translate_report_fp split_on_level: str string that determines the level to split the taxonomy in genome_tax - repo_file_name - string with filename suffix from kraken_translate_report_fp + output_filename: str + string with output filename Returns ------- @@ -95,9 +74,7 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, taxa from repophlan_genomeid_taxonomy_fp containned with the set genome_tax """ - repo_file_name = repo_file_name+'repophlan_genomeID_taxonomy.good' - - with open(repo_file_name, 'w') as f: + with open(output_filename, 'w') as f: with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: for line in repophlan_genomeID_taxonomy_f: tax_id_repo, directory_repo, taxonomy_repo =\ @@ -108,7 +85,8 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, @click.command() -@click.option('--kraken-mpa-report-fp', required=True, + +@click.option('--kraken-mpa-report-fp', required=True, multiple=True, type=click.Path(resolve_path=True, readable=True, exists=True, file_okay=True), help='Filepath to Kraken report') @@ -122,16 +100,19 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, 'domain']), required=False, default=['genus'], show_default=True, help="Taxonomic rank at which to generate summary") - @click.option('--read-per-taxa', required=False, default=10, show_default=True, - help="Minimum number of reads needed for each taxa") - + help="Minimum number of reads for each taxa") +@click.option('--output-filename', required=False, + default='subset_repophlan_genomeID_taxonomy.good', + show_default=True, + help="Output filename for RepoPhlAn genome ID and taxonomy list") def main(kraken_mpa_report_fp, repophlan_genomeid_taxonomy_fp, taxonomic_rank, - read_per_taxa): + read_per_taxa, + output_filename): taxa_levels = {"domain": "d__", "phylum": "|p__", @@ -147,19 +128,25 @@ def main(kraken_mpa_report_fp, "4": "|f__", "3": "|o__", "2": "|c__", "1": "|p__", "0": "d__"} - taxonomic_set, spit_on_level, repo_file_name =\ + taxa_levels_str=taxa_levels[taxonomic_rank] + taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str] + + if taxa_levels_idx_int < 6: + split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)] + else: + split_on_level = '\t' + + taxonomic_set =\ load_kraken_mpa_report( kraken_mpa_report_fp=kraken_mpa_report_fp, - taxonomic_rank=taxonomic_rank, - taxa_levels=taxa_levels, - taxa_levels_idx=taxa_levels_idx, + taxa_levels=[taxa_levels_str,split_on_level], read_per_taxa=read_per_taxa) create_db_folder( repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, genome_tax=taxonomic_set, - split_on_level=spit_on_level, - repo_file_name=repo_file_name) + split_on_level=split_on_level, + output_filename=output_filename) if __name__ == "__main__": main() From 6c4628a18d1a2b0251cb811b6d54034ed486e72a Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Wed, 2 Nov 2016 22:46:17 -0700 Subject: [PATCH 5/8] Delete cgc_bowtie2_build.py --- cgc_bowtie2_build.py | 152 ------------------------------------------- 1 file changed, 152 deletions(-) delete mode 100644 cgc_bowtie2_build.py diff --git a/cgc_bowtie2_build.py b/cgc_bowtie2_build.py deleted file mode 100644 index 33de9bd..0000000 --- a/cgc_bowtie2_build.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python - -#----------------------------------------------------------------------------- -# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file COPYING.txt, distributed with this software. -#----------------------------------------------------------------------------- - -""" -Build Bowtie2 database on all reference genomes in Kraken report. -""" - -import click -import collections - -def load_kraken_mpa_report(kraken_mpa_report_fp, - taxa_levels, - read_per_taxa): - """Absolute abundance of number of reads matching a defined taxa level. - Parameters - ---------- - kraken_translate_report_fp: str - filepath to output of "kraken translate" - taxa_levels: list - list of two elements that includes the taxonomic rank at which - to generate summary and rank below to split by - read_per_taxa: int - integer of number of minimum number of reads to keep per taxa - - Returns - ------- - taxonomies: set - set of taxonomies from kraken_translate_report_fp - """ - taxonomic_abundances= [] - for report_fp in kraken_mpa_report_fp: - with open(report_fp) as report_fp: - for line in report_fp: - label, taxonomy = line.strip().split('\t') - if taxa_levels[0] in taxonomy: - # keep taxonomy string up to specified level - taxonomy_parse = taxonomy.split(taxa_levels[1])[0] - taxonomy_parse = taxonomy_parse.replace('d__','k__') - taxonomic_abundances.append(taxonomy_parse) - - taxonomies = set([k for k, v in - collections.Counter(taxonomic_abundances).iteritems() - if v > read_per_taxa]) - - return taxonomies - -def create_db_folder(repophlan_genomeid_taxonomy_fp, - genome_tax, - split_on_level, - output_filename): - - """Return sets for sample IDs and taxonomy strings. - Parameters - ---------- - repophlan_genomeid_taxonomy_fp: str - filepath to output of repophlan file genome IDs and associated taxa - genome_tax: set - set of taxonomies from kraken_translate_report_fp - split_on_level: str - string that determines the level to split the taxonomy in genome_tax - output_filename: str - string with output filename - - Returns - ------- - Writes a text file with genome IDs, directory to genomes, and associated - taxa from repophlan_genomeid_taxonomy_fp containned with the set - genome_tax - """ - with open(output_filename, 'w') as f: - with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: - for line in repophlan_genomeID_taxonomy_f: - tax_id_repo, directory_repo, taxonomy_repo =\ - line.strip().split('\t') - taxonomy_repo = taxonomy_repo.split(split_on_level)[0] - if taxonomy_repo in genome_tax: - f.writelines(line) - - -@click.command() - -@click.option('--kraken-mpa-report-fp', required=True, multiple=True, - type=click.Path(resolve_path=True, readable=True, exists=True, - file_okay=True), - help='Filepath to Kraken report') -@click.option('--repophlan-genomeID-taxonomy-fp', required=True, - type=click.Path(resolve_path=True, readable=True, exists=False, - file_okay=True), - help='Filepath to RepoPhlAn genome ID and taxonomy list') -@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species', - 'family', 'order', - 'class', 'phylum', - 'domain']), - required=False, default=['genus'], show_default=True, - help="Taxonomic rank at which to generate summary") -@click.option('--read-per-taxa', required=False, - default=10, show_default=True, - help="Minimum number of reads for each taxa") -@click.option('--output-filename', required=False, - default='subset_repophlan_genomeID_taxonomy.good', - show_default=True, - help="Output filename for RepoPhlAn genome ID and taxonomy list") - -def main(kraken_mpa_report_fp, - repophlan_genomeid_taxonomy_fp, - taxonomic_rank, - read_per_taxa, - output_filename): - - taxa_levels = {"domain": "d__", - "phylum": "|p__", - "class": "|c__", - "order": "|o__", - "family": "|f__", - "genus": "|g__", - "species": "|s__"} - - taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2, - "|o__": 3, "|f__": 4, "|g__": 5, - "|s__": 6, "6": "|s__", "5": "|g__", - "4": "|f__", "3": "|o__", "2": "|c__", - "1": "|p__", "0": "d__"} - - taxa_levels_str=taxa_levels[taxonomic_rank] - taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str] - - if taxa_levels_idx_int < 6: - split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)] - else: - split_on_level = '\t' - - taxonomic_set =\ - load_kraken_mpa_report( - kraken_mpa_report_fp=kraken_mpa_report_fp, - taxa_levels=[taxa_levels_str,split_on_level], - read_per_taxa=read_per_taxa) - - create_db_folder( - repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, - genome_tax=taxonomic_set, - split_on_level=split_on_level, - output_filename=output_filename) - -if __name__ == "__main__": - main() From 87971fc7ca6acc320255ffe65c3d9beb68cc2f67 Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Wed, 2 Nov 2016 22:46:36 -0700 Subject: [PATCH 6/8] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 2e8723e..2859232 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ -<<<<<<< HEAD ## TCGA Analysis Workflow to rapidly search TCGA data for microbial presence. ======= # tcga TCGA analysis ->>>>>>> 3382f26eb9271f9457eb9c7246c95b1117437a6b From 85ef3a2fb7c0ccf9f15775720b83745dabd2690f Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Wed, 2 Nov 2016 22:47:24 -0700 Subject: [PATCH 7/8] Can now add input tuple of krakne mpa reports --- python_scripts/cgc_bowtie2_build.py | 91 +++++++++++++---------------- 1 file changed, 39 insertions(+), 52 deletions(-) diff --git a/python_scripts/cgc_bowtie2_build.py b/python_scripts/cgc_bowtie2_build.py index a0c70b6..33de9bd 100644 --- a/python_scripts/cgc_bowtie2_build.py +++ b/python_scripts/cgc_bowtie2_build.py @@ -16,66 +16,45 @@ import collections def load_kraken_mpa_report(kraken_mpa_report_fp, - taxonomic_rank, taxa_levels, - taxa_levels_idx, read_per_taxa): """Absolute abundance of number of reads matching a defined taxa level. Parameters ---------- kraken_translate_report_fp: str filepath to output of "kraken translate" - taxonomic_rank: str - taxonomy level (e.g., genus or species) - taxa_levels: dict - keys are full name taxonomic ranks and values are abbreviated ranks - taxa_levels_idx: dict - 2-way dict storing integer depths for abbreviated taxonomic ranks - read_per_taxa: in + taxa_levels: list + list of two elements that includes the taxonomic rank at which + to generate summary and rank below to split by + read_per_taxa: int integer of number of minimum number of reads to keep per taxa Returns ------- taxonomies: set set of taxonomies from kraken_translate_report_fp - split_on_level: str - string that determines the level to split the taxonomy to be used in - create_db_folder function - repo_file_name: str - string with filename suffix from kraken_translate_report_fp to be used - in create_db_folder function - """ - repo_file_name = kraken_mpa_report_fp.strip().split('/')[-1].split('mpa')[0] - - taxonomic_rank_level_str = taxa_levels[taxonomic_rank] - taxonomic_rank_level_int = taxa_levels_idx[taxonomic_rank_level_str] - - if taxonomic_rank_level_int < 6: - split_on_level = taxa_levels_idx[str(taxonomic_rank_level_int + 1)] - else: - # keep full string (to species level) - split_on_level = '\n' taxonomic_abundances= [] - with open(kraken_mpa_report_fp) as kraken_translate_report_f: - for line in kraken_translate_report_f: - label, taxonomy = line.strip().split('\t') - if taxonomic_rank_level_str in taxonomy: - # keep taxonomy string up to specified level - taxonomy_parse = taxonomy.split(split_on_level)[0] - taxonomy_parse = taxonomy_parse.replace('d__','k__') - taxonomic_abundances.append(taxonomy_parse) + for report_fp in kraken_mpa_report_fp: + with open(report_fp) as report_fp: + for line in report_fp: + label, taxonomy = line.strip().split('\t') + if taxa_levels[0] in taxonomy: + # keep taxonomy string up to specified level + taxonomy_parse = taxonomy.split(taxa_levels[1])[0] + taxonomy_parse = taxonomy_parse.replace('d__','k__') + taxonomic_abundances.append(taxonomy_parse) taxonomies = set([k for k, v in collections.Counter(taxonomic_abundances).iteritems() if v > read_per_taxa]) - return taxonomies, split_on_level, repo_file_name + return taxonomies def create_db_folder(repophlan_genomeid_taxonomy_fp, genome_tax, split_on_level, - repo_file_name): + output_filename): """Return sets for sample IDs and taxonomy strings. Parameters @@ -86,8 +65,8 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, set of taxonomies from kraken_translate_report_fp split_on_level: str string that determines the level to split the taxonomy in genome_tax - repo_file_name - string with filename suffix from kraken_translate_report_fp + output_filename: str + string with output filename Returns ------- @@ -95,9 +74,7 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, taxa from repophlan_genomeid_taxonomy_fp containned with the set genome_tax """ - repo_file_name = repo_file_name+'repophlan_genomeID_taxonomy.good' - - with open(repo_file_name, 'w') as f: + with open(output_filename, 'w') as f: with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: for line in repophlan_genomeID_taxonomy_f: tax_id_repo, directory_repo, taxonomy_repo =\ @@ -108,7 +85,8 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, @click.command() -@click.option('--kraken-mpa-report-fp', required=True, + +@click.option('--kraken-mpa-report-fp', required=True, multiple=True, type=click.Path(resolve_path=True, readable=True, exists=True, file_okay=True), help='Filepath to Kraken report') @@ -122,16 +100,19 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, 'domain']), required=False, default=['genus'], show_default=True, help="Taxonomic rank at which to generate summary") - @click.option('--read-per-taxa', required=False, default=10, show_default=True, - help="Minimum number of reads needed for each taxa") - + help="Minimum number of reads for each taxa") +@click.option('--output-filename', required=False, + default='subset_repophlan_genomeID_taxonomy.good', + show_default=True, + help="Output filename for RepoPhlAn genome ID and taxonomy list") def main(kraken_mpa_report_fp, repophlan_genomeid_taxonomy_fp, taxonomic_rank, - read_per_taxa): + read_per_taxa, + output_filename): taxa_levels = {"domain": "d__", "phylum": "|p__", @@ -147,19 +128,25 @@ def main(kraken_mpa_report_fp, "4": "|f__", "3": "|o__", "2": "|c__", "1": "|p__", "0": "d__"} - taxonomic_set, spit_on_level, repo_file_name =\ + taxa_levels_str=taxa_levels[taxonomic_rank] + taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str] + + if taxa_levels_idx_int < 6: + split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)] + else: + split_on_level = '\t' + + taxonomic_set =\ load_kraken_mpa_report( kraken_mpa_report_fp=kraken_mpa_report_fp, - taxonomic_rank=taxonomic_rank, - taxa_levels=taxa_levels, - taxa_levels_idx=taxa_levels_idx, + taxa_levels=[taxa_levels_str,split_on_level], read_per_taxa=read_per_taxa) create_db_folder( repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, genome_tax=taxonomic_set, - split_on_level=spit_on_level, - repo_file_name=repo_file_name) + split_on_level=split_on_level, + output_filename=output_filename) if __name__ == "__main__": main() From 85997102707a368d0b9ad39664cec4846166aa8b Mon Sep 17 00:00:00 2001 From: Jad Kanbar Date: Wed, 2 Nov 2016 23:24:20 -0700 Subject: [PATCH 8/8] pull request ready for review --- python_scripts/cgc_bowtie2_build.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python_scripts/cgc_bowtie2_build.py b/python_scripts/cgc_bowtie2_build.py index 33de9bd..fc21496 100644 --- a/python_scripts/cgc_bowtie2_build.py +++ b/python_scripts/cgc_bowtie2_build.py @@ -21,8 +21,8 @@ def load_kraken_mpa_report(kraken_mpa_report_fp, """Absolute abundance of number of reads matching a defined taxa level. Parameters ---------- - kraken_translate_report_fp: str - filepath to output of "kraken translate" + kraken_mpa_report_fp: str + filepath to output of "kraken mpa report" taxa_levels: list list of two elements that includes the taxonomic rank at which to generate summary and rank below to split by @@ -32,7 +32,7 @@ def load_kraken_mpa_report(kraken_mpa_report_fp, Returns ------- taxonomies: set - set of taxonomies from kraken_translate_report_fp + set of taxonomies from kraken_mpa_report_fp """ taxonomic_abundances= [] for report_fp in kraken_mpa_report_fp: @@ -62,7 +62,7 @@ def create_db_folder(repophlan_genomeid_taxonomy_fp, repophlan_genomeid_taxonomy_fp: str filepath to output of repophlan file genome IDs and associated taxa genome_tax: set - set of taxonomies from kraken_translate_report_fp + set of taxonomies from kraken_mpa_report_fp split_on_level: str string that determines the level to split the taxonomy in genome_tax output_filename: str