-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMetagenome_generation.py
302 lines (258 loc) · 15.5 KB
/
Metagenome_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import os
import argparse
import yaml
import subprocess
import warnings
import pandas as pd
import numpy as np
from typing import List
from Bio import Entrez, BiopythonDeprecationWarning
from pysat.examples.hitman import Hitman
from scripts.Genomes_db_update import update_genomes, write_multifasta
GENOMES_DIR = 'genomes'
def parse_args():
parser = argparse.ArgumentParser(usage='Metagenome_generation.py [PHENO] ...',
description='''Generate Illumina reads for the described metagenome in the FILE. ''')
parser.add_argument('-p', '--phenotype', default='2_species', nargs='?',
help='the base phenotype for metagenome construction ("Health", "HIV")')
parser.add_argument('-m', '--metagenome_file', default=None, nargs='?',
help='read metagenome composition from the file (tsv with species and abundances)')
parser.add_argument('--pathways', default=None, nargs='?',
help='read matebolic pathways to account from the file (each pathway on the new line')
parser.add_argument('--metabolites', default=None, nargs='?',
help='read metabolites, format: KEGG Compound ID (e.g. C07274)')
parser.add_argument('--c_model', default='primitive', nargs='?',
help='model for core metagenome selection ("primitive", "random", "weighted", "weighted_lognormal", "weighted_exponential", "shannon")')
parser.add_argument('--a_model', default='mean', nargs='?',
help='model for species abundances selection ("mean", "exponential", "normal", "lognormal")')
parser.add_argument('-c', '--n_core', default=None, nargs='?', help='number of core species to leave in metagenome')
parser.add_argument('-t', '--threads', default=1, help='number of threads (cores)')
parser.add_argument('-n', '--n_samples', default=1, nargs='?', help='number of generated metagenome samples')
parser.add_argument('-r', '--n_reads', default=None, nargs='?',
help='number of reads to generate (if set, overwrites the number present in iss_params.yml) ')
parser.add_argument('-o', '--out_dir', default='results', nargs='?',
help='path to directory to save generated files')
parser.add_argument('--email', default='[email protected]', nargs='?', help='Email address for Entrez requests')
parser.add_argument('--api_key', default=None, nargs='?', help='NCBI API key for Entrez requests (if any)')
return parser.parse_args()
def drop_missing_genomes(metagenome, missing_genomes):
"""
Drop missing genomes from the baseline metagenome.
Args:
metagenome (pd.DataFrame): Metagenome dataframe (with columns ['tax_id', 'species', 'mean_abundance', 'sd_abundance'])
missing_genomes: A pandas DataFrame containing the tax_id and name for species without an assembly in NCBI.
Returns:
A pandas DataFrame with the missing genomes dropped.
"""
metagenome = metagenome[~metagenome['tax_id'].isin(missing_genomes['tax_id'])]
return metagenome
def generate_core_metagenome(total_metagenome: pd.DataFrame, metagenome_size: int, c_model: str,
a_model: str) -> pd.DataFrame:
"""
Generate a core metagenome by selecting the most abundant species from the given genomes.
Args:
total_metagenome (pd.DataFrame): Metagenome dataframe (with columns ['tax_id', 'species', 'mean_abundance', 'sd_abundance'])
metagenome_size: The number of species to select for the core metagenome.
c_model: The model to use for core metagenome selection.
Can be one of 'primitive', 'random', 'weighted', 'weighted_lognormal', 'weighted_exponential', 'shannon'.
a_model: The model to use for species abundances selection.
Can be one of 'mean', 'exponential', 'normal', 'lognormal'.
Returns:
A pandas DataFrame with the core metagenome.
"""
match c_model:
case 'primitive':
core = total_metagenome.sort_values(by='mean_abundance', ascending=False).head(metagenome_size)
case 'random':
core = total_metagenome.sample(n=metagenome_size)
case 'weighted':
core = total_metagenome.sample(n=metagenome_size, weights='mean_abundance')
case 'weighted_lognormal':
core = total_metagenome.sample(n=metagenome_size,
weights=np.random.lognormal(total_metagenome['mean_abundance'],
total_metagenome['sd_abundance']))
case 'weighted_exponential':
core = total_metagenome.sample(n=metagenome_size,
weights=np.random.exponential(scale=total_metagenome['mean_abundance']))
case 'shannon':
total_metagenome['shannon_index'] = - total_metagenome['mean_abundance'] * np.log(
total_metagenome['mean_abundance'])
core = total_metagenome.sample(n=metagenome_size, weights='shannon_index').drop(columns=('shannon_index'))
case _:
raise ValueError(
f"Unknown model for core metagenome selection: {c_model}\nCan be one of 'primitive', 'random', 'weighted', 'weighted_lognormal', 'weighted_exponential', 'shannon'.")
match a_model:
case 'mean':
core['abundance'] = core['mean_abundance']
case 'exponential':
core['abundance'] = np.random.exponential(scale=core['mean_abundance'])
case 'normal':
core['abundance'] = np.abs(np.random.normal(core['mean_abundance'], core['sd_abundance']))
case 'lognormal':
core['abundance'] = np.random.lognormal(mean=core['mean_abundance'], sigma=core['sd_abundance'])
case _:
raise ValueError(
f"Unknown model for species abundances selection: {a_model}\nCan be one of 'mean', 'exponential', 'normal', 'lognormal'.")
return core
def do_hits(metagenome: pd.DataFrame, metabolic_needs: list[list], total_metagenome: pd.DataFrame) -> list:
"""
This function uses the Hitman package to calculate minimal refill required for metabolic pathways specified
in a given metabolic database.
Args:
metagenome (pd.DataFrame): Metagenome dataframe (with columns ['tax_id', 'species', 'mean_abundance', 'sd_abundance'])
metabolic_needs (List[List[str]]): A list of lists of metabolites required for each metabolic pathway.
total_metagenome (pd.DataFrame): Total baseline metagenome
Returns:
List[str]: A list representing the species needed to account for the specified metabolites.
"""
metagenome_species = set(metagenome.species.to_list())
needs_to_hit = []
for metabolic_need in metabolic_needs:
metabolic_need = set(metabolic_need)
if not set.intersection(metabolic_need, metagenome_species):
needs_to_hit.append(metabolic_need)
possible_hits = []
with Hitman(bootstrap_with=needs_to_hit, htype='sorted') as hitman:
for hs in hitman.enumerate():
possible_hits.append(hs)
hits_scores = []
for hit in possible_hits:
hits_scores.append(total_metagenome[total_metagenome['species'].isin(hit)]['abundance'].sum() / len(hit))
max_index = max(range(len(hits_scores)), key=lambda i: hits_scores[i])
best_hit = possible_hits[max_index]
return best_hit
def find_minimal_refill(metagenome: pd.DataFrame, metabolites_specified: List[str], pathways_db: pd.DataFrame,
total_metagenome) -> List[str]:
"""
Given a metagenome composition, a list of metabolites, and a pathways database,
find the minimal set of additional species that are needed to account for the given metabolites.
Args:
metagenome (pd.DataFrame): Metagenome dataframe (with columns ['tax_id', 'species', 'mean_abundance', 'sd_abundance'])
metabolites_specified (List[str]): A list of metabolite names that need to be accounted for.
pathways_db (pd.DataFrame): A pandas DataFrame containing the pathways database,
where rows represent metabolites and columns represent pathways.
The values are boolean indicating whether a metabolite is present in a pathway.
Returns:
List[str]: A list representing the species needed to account for the specified metabolites.
"""
cols = pathways_db.columns
missing_pathways = set(metabolites_specified) - set(pathways_db.index)
if missing_pathways:
raise KeyError(f"The following elements are missing: {list(missing_pathways)}")
selected_pathways = pathways_db.loc[metabolites_specified].astype('bool')
metabolic_needs = selected_pathways.apply(lambda x: list(cols[x.values]), axis=1).to_list()
return do_hits(metagenome, metabolic_needs, total_metagenome)
def append_species_refill(metagenome: pd.DataFrame, species_to_refill: set) -> pd.DataFrame:
"""
Append species to an existing dataframe of abundances and adjust abundance levels to maintain normalization.
Args:
metagenome (pd.DataFrame): Metagenome dataframe (with columns ['tax_id', 'species', 'mean_abundance', 'sd_abundance'])
species_to_refill (set): Species ids and names to add to the abundance dataframe.
Returns:
A pandas DataFrame with the new species added and abundance levels adjusted to maintain normalization.
Raises:
ValueError: If the input dataframe does not have the expected two columns, or if the second column
does not contain numeric data.
"""
# TO-DO: check columns and names
abundance_level = metagenome.abundance.mean()
species_to_refill = pd.DataFrame(species_to_refill)
species_to_refill['abundance'] = [abundance_level] * len(to_refill)
species_to_refill.columns = metagenome.columns
metagenome_new = pd.concat([metagenome, species_to_refill])
metagenome_new['abundance'] = metagenome_new.abundance / metagenome_new.abundance.sum()
return metagenome_new
def read_pathways(pathways_input: str) -> List[str]:
"""Reads metabolic pathways from a file or a comma-separated string.
Args:
pathways_input (str): Path to a file or a comma-separated string containing metabolic pathways.
Returns:
List of strings: A list of metabolic pathway names.
Raises:
ValueError: If the input is not a valid path to a file or a comma-separated string.
"""
if os.path.isfile(pathways_input):
with open(pathways_input, 'r') as f:
pathways = [line.strip() for line in f.readlines()]
elif ',' in pathways_input:
pathways = pathways_input.split(',')
else:
raise ValueError('Invalid input. Please provide a path to a file or a comma-separated string.')
return pathways
if __name__ == '__main__':
pheno = parse_args().phenotype
metagenome_file = parse_args().metagenome_file
pathways = parse_args().pathways
metabolites = parse_args().metabolites
n_core = int(parse_args().n_core)
n_samples = int(parse_args().n_samples)
n_threads = int(parse_args().threads)
n_reads = parse_args().n_reads
RESULTS_DIR = parse_args().out_dir
core_selection_model = parse_args().c_model
abundance_selection_model = parse_args().a_model
email = parse_args().email
api_key = parse_args().api_key
Entrez.email = email
if api_key is not None:
Entrez.api_key = api_key
for sample in range(1, n_samples + 1):
path = os.path.join(RESULTS_DIR, f'sample_{sample}')
os.makedirs(path, exist_ok=True)
baseline_abundances = pd.read_csv(os.path.join('baseline_phenotypes', pheno + '.csv'), header=None, sep=',')
missing_genomes = pd.read_csv(os.path.join('baseline_phenotypes', 'missing_genomes', 'missing_genomes.csv'),
header=1, sep=',', names=['tax_id', 'species'])
baseline_abundances.columns = ['tax_id', 'species', 'mean_abundance', 'sd_abundance']
baseline_abundances = drop_missing_genomes(baseline_abundances, missing_genomes)
baseline_abundances['mean_abundance'] = baseline_abundances['mean_abundance'] / baseline_abundances[
'mean_abundance'].sum()
baseline_abundances['mean_abundance'].fillna(baseline_abundances['mean_abundance'].mean(), inplace=True)
baseline_abundances['sd_abundance'].fillna(baseline_abundances['sd_abundance'].mean(), inplace=True)
pathways_db = pd.read_csv(os.path.join('Databases', 'MetaCyc_pathways_by_species.csv'), sep=';',
index_col='Pathways')
metagenome_size = min(n_core, len(baseline_abundances)) if n_core else len(baseline_abundances)
core_metagenome = generate_core_metagenome(baseline_abundances, metagenome_size, core_selection_model,
abundance_selection_model)
with open('iss_params.yml', 'r') as f:
yaml_iss_params = yaml.safe_load(f)
for sample in range(1, n_samples + 1):
dir = os.path.join(RESULTS_DIR, f'sample_{sample}')
# Metagenome update
species_to_refill = []
# Core metagenome update from metabolites
if metabolites is not None:
for metabolite in metabolites:
metabol_cmd = f'python clusters.py -M {metabolite} -P {pheno}'
result = subprocess.run(metabol_cmd.split())
if os.path.isfile('bacteria_metab.txt'):
with open('bacteria_metab.txt', 'r') as file:
for taxid_specie in file:
species_to_refill.append(tuple(taxid_specie.split(",")))
# Core metagenome update from pathways
if pathways is not None:
print('Reading required pathways...')
pathways_specified = read_pathways(pathways)
species_to_refill = find_minimal_refill(core_metagenome, pathways_specified,
pathways_db, baseline_abundances)
if species_to_refill:
core_metagenome = append_species_refill(core_metagenome, species_to_refill)
prepared_metagenome = update_genomes(core_metagenome, GENOMES_DIR,
os.path.join(RESULTS_DIR, f'sample_{sample}'),
n_threads)
wr_code = write_multifasta(prepared_metagenome, GENOMES_DIR)
iss_params = {'-g': os.path.join(GENOMES_DIR, 'multifasta.fna'),
'--abundance_file': os.path.join(RESULTS_DIR, f'sample_{sample}', 'abundances_for_iss.txt'),
'-m': 'miseq', '-o': os.path.join(RESULTS_DIR, f'sample_{sample}', 'miseq_reads'),
'--cpus': n_threads}
iss_params = iss_params | yaml_iss_params
if n_reads is not None:
iss_params['--n_reads'] = int(n_reads)
iss_cmd = ['iss', 'generate'] + [str(item) for pair in iss_params.items() for item in pair]
result = subprocess.run(iss_cmd)
if os.path.exists(os.path.join(GENOMES_DIR, 'multifasta.fna')):
os.remove(os.path.join(GENOMES_DIR, 'multifasta.fna'))
if result.returncode == 0:
print(f'\nThe sample_{sample} was successfully generated.')
else:
print(f'\nThe sample_{sample} generation completed with errors.')
print('\n Metagenomes generation finished!')