-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgbm_summarize.py
182 lines (160 loc) · 8.94 KB
/
gbm_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import requests
import csv
import StringIO
import urllib
import os
import sys
import pprint
class CaseSet(object):
genetic_profile_ids = ['gbm_tcga_gistic', 'gbm_tcga_mutations']
cases = {}
gene_meta_data = {}
meta_data = {}
summary = {}
def __init__(self, gene_list):
"""Initialized the case-set object by adding data for the GBM genetic profiles data for for each gene in the gene_list parameter"""
for genetic_profile_id in self.genetic_profile_ids:
self.add_genetic_profile(genetic_profile_id, gene_list, useLocalFile=False)
def add_genetic_profile(self, genetic_profile_id, gene_list, useLocalFile=False):
"""Adds the genetic profile data to for every gene provided."""
gene_meta_data_fields = ['COMMON', 'GENE_ID']
if type(gene_list) == list():
gene_list = ','.join(gene_list)
if useLocalFile:
file_obj = open(self.get_profile_data_file_name(genetic_profile_id, gene_list), 'r')
else:
url = self.get_profile_data_URL(genetic_profile_id, gene_list)
# create a object with same interface as open file to allow for the use of file-based parsing methods
file_obj = self.get_profile_data_API_response(url)
# create an array to store the position of each line the file, to allow for multiple rows of meta-data to proceed the column headers
header_row_begin_position = 0
for line in iter(file_obj.readline, ''):
# check if the line begins with "#" as this is the convention for providing meta-data in the text returned by the API
if line[0] == '#':
header_row_begin_position = file_obj.tell()
else:
break
# get the position before the header row (the file curesor will be at the end of the end of the header row when the loop exits)
file_obj.seek(header_row_begin_position)
reader = csv.DictReader(file_obj, delimiter='\t')
for row in reader:
# get the common abbreviation for the gene represented in the row of data
gene = row['COMMON']
for field in row:
# populate the meta data for gene represented in this row
if field in gene_meta_data_fields:
if not (gene in self.gene_meta_data):
self.gene_meta_data[gene] = {}
self.gene_meta_data[gene][field] = row[field]
else:
if not (field in self.cases):
self.cases[field] = {}
if not (gene in self.cases[field]):
self.cases[field][gene] = {}
self.cases[field][gene][genetic_profile_id] = row[field]
file_obj.close()
def display_case_set_summary(self, display_mode='verbose'):
""""Displays a summary of all the analysis done on the GBM genetic profile data available for all specfied genes."""
display = [{'analysis': 'profile_gbm_tcga_mutations_summary', 'text': '{gene} is mutated in {result_text}% of all cases.', 'summary_function': self.get_mutation_percent, 'display_modes': ['verbose']},
{'analysis': 'profile_gbm_tcga_gistic_summary', 'text': '{gene} is copy number altered in {result_text}% of all cases.', 'summary_function': self.get_copy_number_altered_percent, 'display_modes': ['verbose']},
{'analysis': 'aggregate', 'text': 'Total % of cases where {gene} is altered by either mutation or copy number alteration: {result_text}% of all cases.', 'summary_function': self.get_all_alterated_percent, 'display_modes': ['verbose', 'concise']}]
self.set_summary()
tokens = {}
for gene in self.summary:
tokens = {'gene': gene}
for metric in display:
if display_mode in metric['display_modes']:
result_percent = metric['summary_function'](gene)
tokens['result_text'] = '{0:.{1}f}'.format(result_percent*100, 0)
print metric['text'].format(**tokens)
def get_mutation_percent(self, gene):
"""Returns the perceantage (as decimal) of all cases where mutations were detected for a specific gene (includes cases with multiple alterations)."""
return float(self.summary[gene]['mutated_case_count']) / float(self.summary[gene]['total_case_count'])
def get_copy_number_altered_percent(self, gene):
"""Returns the perceantage (as decimal) of all cases where copy number alterations were detected for a specific gene (includes cases with multiple alterations)."""
return float(self.summary[gene]['copy_number_alterated_case_count']) / float(self.summary[gene]['total_case_count'])
def get_all_alterated_percent(self, gene):
"""Returns the perceantage (as decimal) of all cases where alterations (mutation or copy number alterations) where detected for a specific gene (includes cases with multiple alterations)."""
return float(self.summary[gene]['copy_number_alterated_case_count'] + self.summary[gene]['mutated_case_count'] - self.summary[gene]['multiple_alterations_case_count']) / float(self.summary[gene]['total_case_count'])
def is_mutated(self, case, gene):
"""Returns a boolean indicating if the a specific gene is mutated for the case in question."""
genetic_profile_id = 'gbm_tcga_mutations'
profile_result = case[gene][genetic_profile_id]
if profile_result == 'NaN' or str(profile_result) == '0':
return False
else:
return True
def is_copy_number_alterated(self, case, gene):
"""Returns a boolean indicating if the a specific gene has copy number alteration for the case in question."""
genetic_profile_id = 'gbm_tcga_gistic'
profile_result = case[gene][genetic_profile_id]
# @TODO: determine best way to handle 'NA' values
if int(profile_result) not in [0, 1, -1]:
return True
else:
return False
def set_summary(self):
"""Summarizes the alterations (mutations or copy number alterations) observerd for all cases in the case set,
looking at all genes specified at the creation of CaseSet objet."""
mutated_case_count = 0
copy_number_alterated_case_count = 0
multiple_alterations_case_count = 0
self.total_case_count = len(self.cases)
# for all genes for which data has been retrieved
for gene in self.gene_meta_data:
for case in self.cases:
if self.is_copy_number_alterated(self.cases[case], gene):
copy_number_alterated_case_count += 1
if self.is_mutated(self.cases[case], gene):
mutated_case_count += 1
if self.is_copy_number_alterated(self.cases[case], gene) and self.is_mutated(self.cases[case], gene):
multiple_alterations_case_count += 1
mutated_case_percent = float(mutated_case_count) / float(self.total_case_count)
copy_number_alterated_percent = float(copy_number_alterated_case_count) / float(self.total_case_count)
all_alterated_percent = float(copy_number_alterated_case_count + mutated_case_count - multiple_alterations_case_count) / float(self.total_case_count)
self.summary[gene] = {'total_case_count': self.total_case_count,
'mutated_case_count': mutated_case_count,
'copy_number_alterated_case_count': copy_number_alterated_case_count,
'multiple_alterations_case_count': multiple_alterations_case_count,
'all_alterated_percent': all_alterated_percent}
def get_request_payload(self, genetic_profile_id, gene_list):
"""Returns the correctly formatted paylod (query params) for making request the cBioPortal web API."""
payload = {}
payload['cmd'] = 'getProfileData'
payload['id_type'] = 'gene_symbol'
payload['case_set_id'] = 'gbm_tcga_cnaseq'
payload['genetic_profile_id'] = genetic_profile_id
payload['gene_list'] = gene_list
return payload
def get_profile_data_API_response(self, url):
"""Returns an object for interacting with data povided by the API and exposing the same interface as an open file object."""
response = requests.get(url)
file_obj = StringIO.StringIO(response.text)
return file_obj
def get_profile_data_file_name(self, genetic_profile_id, gene_list):
"""This function returns the file_path to data cahced from the web API.
Cached files simply use the URL-encoded querystring of a call to the web API for the file name (with .txt as an extension)"""
file_dir = '../data_for_cbioportal_cmd_util/'
payload = self.get_request_payload(genetic_profile_id, gene_list)
query = 'cmd={cmd}&genetic_profile_id={genetic_profile_id}&id_type={id_type}&gene_list={gene_list}&case_set_id={case_set_id}'.format(**payload)
file_name = file_dir + urllib.quote(query) + '.txt'
return file_name
def get_profile_data_URL(self, genetic_profile_id, gene_list):
"""Returns a the URL to be used to query the web API for the relevant data."""
payload = self.get_request_payload(genetic_profile_id, gene_list)
return 'http://www.cbioportal.org/webservice.do?cmd={cmd}&genetic_profile_id={genetic_profile_id}&id_type={id_type}&gene_list={gene_list}&case_set_id={case_set_id}'.format(**payload)
def main():
"""Executes whenever the script is executed directly (not imported). Handles arguments passed from the command line."""
if ( len(sys.argv) == 1 ) or ( len(sys.argv) > 4 ):
print('usage: gbm_summarize.py gene1 [gene2, gene3]')
else:
if len(sys.argv) > 2:
display_mode='concise'
else:
display_mode='verbose'
# get all arguments except the argv[0] (the script file), add them to a comma-delimited string
gene_list = ','.join(sys.argv[1:])
case_set = CaseSet(gene_list)
case_set.display_case_set_summary(display_mode)
if __name__ == '__main__':
main()