-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathadduct_grouping.py
347 lines (287 loc) · 12.5 KB
/
adduct_grouping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import pdb
import emzed
from collections import defaultdict
from itertools import product
class MainPeak(object):
def __init__(self, id_, mz_main, rt_main, area_main, z, mzs, elements):
self.id_ = id_
self.mz_main = mz_main
self.rt_main = rt_main
self.area_main = area_main
self.z = z
self.mzs = mzs
self.elements = set(elements)
def __len__(self):
return len(self.mzs)
class AdductAssigner(object):
def __init__(self, mode, mz_tolerance=8e-4, rt_tolerance=5, cl_only_as_adduct=True,
allow_acetate=False):
assert mode == "negative_mode", "other modes not implemented yet"
self.mode = mode
self.mz_tolerance = mz_tolerance
self.rt_tolerance = rt_tolerance
self.cl_only_as_adduct = cl_only_as_adduct
self.allow_acetate = allow_acetate
def process(self, table):
col_names = table.getColNames()
assert "isotope_cluster_id" in col_names
assert "mz" in col_names
assert "rt" in col_names
assert "element_names" in col_names
assert "area" in col_names
assert "z" in col_names
peaks, peak_from_id = self._extract_main_peaks(table)
graph = self._build_graph(peaks)
groups = self._decompose(graph)
assigned_adducts = self._resolve_adducts(graph, groups)
self._enrich_table(table, groups, peak_from_id, assigned_adducts)
for j, adducts in assigned_adducts.items():
if len(adducts) > 1:
print "isotope_cluster=", j, "alternatives=", adducts
@staticmethod
def find_consistent_assignments(graph):
"""
we consider the nodes of the graph as variables and each connection
represents an constraint.
so we iterate over all possible variable assignments and check
the constraints.
we assume a symmetric graph, that is each starting node occurs in another
nodes target nodes and vice versa.
"""
def find_closed_components(graph):
nodes = set(graph.keys())
seen = set()
n = len(nodes)
while len(seen) < n and nodes:
start_with = nodes.pop()
if start_with not in seen:
# dfs search starting with 'start_with' node
stack = [start_with]
component = set()
while stack:
active_node = stack.pop()
seen.add(active_node)
component.add(active_node)
for follower in graph[active_node]:
if follower not in seen and follower not in stack:
stack.append(follower)
yield component
nodes = set(graph.keys())
# we iterativliy look for components which have max size and are consistent
# with the adduct asignment hypthesises
TO_OBSERVE = None
if TO_OBSERVE in nodes:
import pprint
pprint.pprint(graph)
# create map node -> list of poosible assignments
assignments = defaultdict(set)
for in_, v in graph.items():
if in_ in nodes:
for out, a0, a1 in v:
if out in nodes:
assignments[out].add(a1)
assignments[in_].add(a0)
if not(assignments):
# might happen if nodes are not connected
return
# now we iterate over each possible node assignment setting
# and try to find the best one, aka the one which creates the largest sub graph
partial_solutions = []
for assignment in product(*[assignments[k] for k in nodes]):
if len(set(assignment)) < len(assignment):
# assignment if same adduct to two isotope clustes
continue
if TO_OBSERVE in nodes:
print assignment
assignment = dict(zip(nodes, assignment))
# create sub graph which is consistent with the current assignemtn
# that is: only one edge per pair of nodes, not multipls
single_linked_graph = defaultdict(set)
for in_node, possibilities in graph.items():
for (out_node, a0, a1) in possibilities:
if assignment[in_node] == a0 and assignment[out_node] == a1:
single_linked_graph[in_node].add(out_node)
if TO_OBSERVE in nodes:
print
print single_linked_graph
partial_solution = []
max_comp_size = -1
for component in find_closed_components(single_linked_graph):
if TO_OBSERVE in nodes:
print " ", component
reduced_assignment = dict((k, v) for (k, v) in assignment.items() if k in
component)
partial_solution.append(reduced_assignment)
max_comp_size = max(max_comp_size, len(component))
if TO_OBSERVE in nodes:
print max_comp_size, partial_solution
partial_solutions.append((len(partial_solution), partial_solution))
if partial_solutions:
lmax, __ = max(partial_solutions)
for l, solution in partial_solutions:
if l == lmax:
if TO_OBSERVE in nodes:
print solution
for assignment in solution:
yield assignment
@staticmethod
def test_resolve_graph():
graph = dict(a=[("b", "l0", "l1"), ("c", "l0", "l4"), ("c", "l6", "l5")],
b=[("c", "l2", "l3"), ("c", "l1", "l4")],
c=[("b", "l4", "l1"), ("b", "l3", "l2"), ("a", "l4", "l0"),
("a", "l5", "l6")])
found, = AdductAssigner.find_consistent_assignments(graph)
assert found == dict(a="l0", b="l1", c="l4")
def _extract_main_peaks(self, table):
peaks = []
peak_from_id = dict()
table.sortBy("isotope_cluster_id")
for group in table.splitBy("isotope_cluster_id"):
if group.z.uniqueValue() == 0:
continue
id_ = group.isotope_cluster_id.uniqueValue()
if len(group) == 1:
main_peak = group
else:
main_peak = group.filter(group.isotope_rank == 0)
mz_main = main_peak.mz.values[0]
rt_main = main_peak.rt.values[0]
area_main = main_peak.area.values[0]
mzs = group.mz.values
z = group.z.values[0]
elements = [e.strip() for e in main_peak.element_names.values[0].split(",")]
peak = MainPeak(id_, mz_main, rt_main, area_main, z, mzs, elements)
peaks.append(peak)
peak_from_id[id_] = peak
return peaks, peak_from_id
def _build_graph(self, peaks):
"""
vertices in this graph connect isotope clusters which could represent the
same adduct
"""
graph = defaultdict(list)
if self.mode == "negative_mode":
adducts = emzed.adducts.negative.adducts
else:
adducts = emzed.adducts.positive.adducts
adducts = [(name, delta, abs(z)) for (name, delta, z) in adducts
if self.allow_acetate or name != "M+CH3COO"]
rt_tolerance = self.rt_tolerance
mz_tolerance = self.mz_tolerance
for i, peak_i in enumerate(peaks):
mz_i = peak_i.mz_main
id_i = peak_i.id_
for (name_i, delta_m_i, z_i) in adducts:
if self.cl_only_as_adduct and "Cl" in peak_i.elements and "Cl" not in name_i:
continue
if z_i != peak_i.z:
continue
m0_i = mz_i * z_i - delta_m_i
if m0_i < 0:
continue
for j, peak_j in enumerate(peaks):
if i <= j:
continue
id_j = peak_j.id_
mz_j = peak_j.mz_main
if abs(peak_j.rt_main - peak_i.rt_main) < rt_tolerance:
for (name_j, delta_m_j, z_j) in adducts:
if self.cl_only_as_adduct:
if "Cl" in peak_j.elements and "Cl" not in name_j:
continue
if z_j != peak_j.z:
continue
m0_j = mz_j * z_j - delta_m_j
if m0_j < 0:
continue
if abs(m0_i - m0_j) <= mz_tolerance:
graph[id_i].append((id_j, name_i, name_j))
graph[id_j].append((id_i, name_j, name_i)) # make graph symmetric
return graph
@staticmethod
def _decompose(graph):
"""
decomposes graph in connected components, asserts a symmetric graph
as built above.
"""
groups = []
to_start = set(graph.keys())
while to_start:
i = to_start.pop()
stack = [i]
group = set()
while stack:
i0 = stack.pop()
if i0 not in group:
group.add(i0)
if i0 in to_start:
to_start.remove(i0)
for j, __, __ in graph[i0]:
if j not in stack and j not in group:
stack.append(j)
groups.append(group)
return groups
def _resolve_adducts(self, graph, groups):
"""
the graph may contain multiple vertices between two nodes, where each vertex
represents an adduct assignment for theses nodes.
this method determines all consistent adduct assignments within each group.
"""
assigned_adducts = defaultdict(list)
for group in groups:
sub_graph = dict((k, values)
for (k, values) in graph.items()
if k in group and any(v in group for v, __, __ in values))
# we only consider assignments which annotate the full group:
full_assignments = []
for assignment in self.find_consistent_assignments(sub_graph):
if len(assignment) <= len(group):
full_assignments.append(assignment)
# if the assignemnt is unique we add this information
if len(full_assignments):
for full_assignment in full_assignments:
for k, v in full_assignment.items():
assigned_adducts[k].append(v)
return assigned_adducts
def _enrich_table(self, table, groups, peak_from_id, assigned_adducts):
"""
adds the calculated assignments as columns to the table
"""
id_to_group_map = dict()
for gid, group in enumerate(groups):
for i in group:
id_to_group_map[i] = gid
class AdductGroupAssigner(object):
"""mimmics stateful function"""
def __init__(self, next_id=len(groups)):
self.next_id = next_id
def __call__(self, id_):
group_id = id_to_group_map.get(id_)
if group_id is None:
group_id = self.next_id
id_to_group_map[id_] = group_id
self.next_id += 1
return group_id
def assign_adducts(id_):
return ", ".join(assigned_adducts.get(id_, ""))
table.addColumn("adduct_group",
table.isotope_cluster_id.apply(AdductGroupAssigner()),
insertBefore="element_names")
table.addColumn("possible_adducts",
table.isotope_cluster_id.apply(assign_adducts),
insertBefore="element_names")
if __name__ == "__main__":
#table = emzed.io.loadTable("S9_isotope_clustered.table")
#cnames1 = table.getColNames()
import glob
#for p in glob.glob("201311*.table"):
p = "20140122_adduct_grouping.table"
p = "isotope_clustered.table"
if p:
print p
table = emzed.io.loadTable(p)
#emzed.gui.inspect(table)
#table.dropColumns("adduct_group", "possible_adducts")
AdductAssigner("negative_mode").process(table)
emzed.gui.inspect(table)
#emzed.io.storeTable(table, p, True)