-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpreproc.py
237 lines (191 loc) · 9.19 KB
/
preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Process neurons, synapses, muscles etc. from Chen's excel(csv) for import into DB.
"""
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Filenames
data_folder = "data/"
chen_neurons_fnm = "ChenVarshney/NeuronType.xls"
chen_conns_fnm = "ChenVarshney/NeuronConnect.xls"
chen_muscles_fnm = "ChenVarshney/NeuronFixedPoints.xls"
kaiser_pos_fnm = "DynamicConnectome/celegans277/celegans277positions.csv"
kaiser_pos_lab_fnm = "DynamicConnectome/celegans277/celegans277labels.csv"
ow_neurons_fnm = ""
ww_neurons_fnm = "WormWeb/name_neurons.txt"
sensors_fnm = "Self/Sensors.tsv"
neuron_attrs = ["Neuron", "SomaPosition", "SomaRegion", "AYGanglionDesignation", "AYNbr"]
# ------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------
def zero_lead(num_str):
""" Add leading zero to single digits at the end of a string """
if num_str[-1].isdigit() and not num_str[-2].isdigit():
return num_str[0:-1] + "0" + num_str[-1]
else:
return num_str
def remove_leading_zero(num_str):
""" Remove leading zero from single digits at the end of a string """
if num_str[-1].isdigit() and num_str[-2] == '0':
return num_str[0:-2] + num_str[-1]
else:
return num_str
def sym_node_name(name):
""" For names of symmetric nodes ending in L or R, return the counterpart's name."""
suffix = name[-1]
if suffix == "L":
return name[0:-1] + "R"
elif suffix == "R":
return name[0:-1] + "L"
else:
return name
def expand_type_abbr(name):
""" Expand neuron type appreviation: se->sensory neuron, mo->motor neuron, in->interneuron, mu->muscle, bm->basement membrane,
# gln->gland cell, mc->marginal cell """
sep = ", "
types = []
if not name.find("se") == -1: types.append("sensory")
if not name.find("mo") == -1: types.append("motor")
if not name.find("in") == -1: types.append("inter")
if not name.find("mu") == -1: types.append("muscle")
if not name.find("bm") == -1: types.append("basement membrane")
if not name.find("gln") == -1: types.append("gland cell")
if not name.find("mc") == -1: types.append("marginal cell")
return sep.join(types)
def muscle_to_bodypart(name):
""" Return name of body part (head, neck or body) for given muscle name (e.g. MDL3 -> head) """
# Get number suffix: last two characters if double digit else last digit
if name[-2].isdigit():
num = int(name[-2:])
elif name[-1].isdigit():
num = int(name[-1:])
else:
return name
if num <= 4:
return "head"
elif num <= 8:
return "neck"
else:
return "body"
# ------------------------------------------------------------------------------------------------------------
# Importers
# ------------------------------------------------------------------------------------------------------------
def kaiser_positions_df():
""" Return a pandas DF containing 2d positions from Kaiser indexed by neuron label """
pos_labels = pd.io.parsers.read_csv(data_folder + kaiser_pos_lab_fnm, header=None)
positions = pd.io.parsers.read_csv(data_folder + kaiser_pos_fnm, header=None)
dfr = pd.concat([pos_labels, positions], axis=1, ignore_index=True)
dfr.columns = ["label", "kx", "ky"]
dfr = dfr.set_index("label")
return dfr
def chen_neurons_df():
""" Return a pandas DF containing neuron information from Chen indexed by neuron label """
dfr = pd.io.excel.read_excel(data_folder + chen_neurons_fnm, sheetname=0, index_col=None, header=0)
dfr.columns = [x.replace(" ", "") for x in dfr.columns]
dfr = dfr[neuron_attrs]
dfr["Neuron"] = [remove_leading_zero(x) for x in dfr["Neuron"]]
dfr.set_index("Neuron", drop=True, inplace=True)
return dfr
def ww_neurons_df():
""" Return WormWeb info about neuron class and type (from japanese CCEP group, itself based on White data) """
dfr = pd.io.parsers.read_csv(data_folder + ww_neurons_fnm, comment="#",
header=None, index_col=0, sep=" ", skipinitialspace=True, names=["name", "group", "type"])
dfr["type"] = [expand_type_abbr(x) for x in dfr["type"]]
return dfr
def wa_links_df():
""" Scrape following page for neuron single page links: http://www.wormatlas.org/neurons/Individual%20Neurons/Neuronframeset.html """
base_url = "http://www.wormatlas.org/neurons/Individual%20Neurons/"
table_url = base_url + "Neuronframeset.html"
header = {'User-agent' : "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}
res = requests.get(table_url, headers=header)
html = res.text
soup = bs(html)
def rem_slash(name):
""" Remove extra slashes from some neuron names """
if name[-2] == "/":
return name[:-2]
else:
return name
links = soup.find_all("table")[2].find_all("a")
hrefs = [base_url + l['href'] for l in links]
names = [rem_slash(l.string) for l in links]
dfr = pd.DataFrame()
dfr['link'] = hrefs
dfr.index = names
nans = dfr.index[dfr['link'].isnull()]
if len(nans) > 0:
print [dfr.loc[x] for x in nans]
return dfr
# Turns our own generated sensor file into df
def sensors_df():
df = pd.io.parsers.read_csv(data_folder + sensors_fnm, sep="\t", comment="#", header=0,
index_col=False, skipinitialspace=True, na_values=[''])
df = df.drop(["number", "location"], 1)
df.set_index("group", drop=True, inplace=True)
df.fillna('', inplace=True)
return df
def muscles_df():
""" Return a pandas DF containing muscle information from Chen indexed by muscle label """
df = pd.io.excel.read_excel(data_folder + chen_muscles_fnm, sheetname=0, index_col=None, header=0)
df.columns = [x.replace(" ", "") for x in df.columns]
df.Landmark = [remove_leading_zero(x) for x in df.Landmark]
#df.set_index("Neuron", drop=True, inplace=True)
# Extract only muscles
df = df[df.Landmark.str.startswith("M")]
muscles = df.Landmark.unique()
pos = [df[df.Landmark == m].LandmarkPosition.iloc[0] for m in muscles] # Extract position from df
mu = pd.DataFrame({'name':muscles, 'pos':pos})
mu['part'] = mu.name.apply(muscle_to_bodypart)
mu.set_index('name', drop=True, inplace=True)
return mu
# Kaiser's positions are missing for some left-right symmetric neurons (AIBL, AIYL, SMDVL),
# so we substitute the right "mirror" neuron, which is ok anyway, since the left-right
# coordinates are not provided anyway. Equally, Chen/Varshney's data doesn't have a VC06 neuron.
# In Varshney paper it is mentioned that it doesn't make synapses with other neurons
# (and neither does CANL/R) so fine to omit. The set therefore contains 279 of the 282 somatic
# nervous system neurons (of 302 total including pharyngeal NS).
def neurons_df():
""" Join all available neuron information into a single Pandas DF """
nodes = chen_neurons_df()
pos = kaiser_positions_df()
nodes_only = np.setdiff1d(nodes.index.values, pos.index.values)
pos_only = np.setdiff1d(pos.index.values, nodes.index.values)
print "Nodes but not pos: ", nodes_only
print "Pos but not nodes: ", pos_only
# Perform a left join with nodes on the left and positions on the right,
# so we only use those neurons provided by Chen
dfr = nodes.join(pos, how="left")
# Now add missing positions from symmetric partners
for node in nodes_only:
mirror = sym_node_name(node)
print "Copying position for ", node, " from ", mirror
dfr.loc[node, 'kx'] = dfr.loc[mirror, 'kx']
dfr.loc[node, 'ky'] = dfr.loc[mirror, 'ky']
# Join with info about neuron class and type
ww_nodes = ww_neurons_df()
dfr = dfr.join(ww_nodes, how="left")
# Join links from worm atlas website
wa_links = wa_links_df()
dfr = dfr.join(wa_links, how="left")
print dfr.info()
return dfr
# The original xsl file lists every synapse twice, once as "send" from n1 to n2,
# and once as "receive" by n2 from n1. Here we only keep the "send" copy.
def conns_df():
""" Return a pandas DF of connections in from->to format """
dfr = pd.io.excel.read_excel(data_folder + chen_conns_fnm, sheetname=0, index_col=None, header=0)
dfr.columns = [x.replace(" ", "") for x in dfr.columns]
dfr["Neuron1"] = [remove_leading_zero(x) for x in dfr["Neuron1"]]
dfr["Neuron2"] = [remove_leading_zero(x) for x in dfr["Neuron2"]]
dfr = dfr[(dfr['Type'] != 'R') & (dfr['Type'] != 'Rp') & (dfr['Type'] != 'NMJ')]
return dfr
def muscle_conns_df():
""" Return a pandas DF of connections between motor neurons and muscles in from->to format """
df = pd.io.excel.read_excel(data_folder + chen_muscles_fnm, sheetname=0, index_col=None, header=0)
df.drop('Landmark Position', axis=1, inplace=True)
df = df[df.Landmark.str.startswith("M")] # only muscles (also contains sensory landmarks)
df.Landmark = [remove_leading_zero(x) for x in df.Landmark]
df.Neuron = [remove_leading_zero(x) for x in df.Neuron]
df.rename(columns={'Landmark':'Muscle'}, inplace=True)
return df