1
2
3
4
5
6 import re
7
8 line_floats_re = re.compile("-*\d+\.\d+")
9
10 try:
11 float("nan")
12 _nan_float = float
13 except ValueError:
14
16 try:
17 return float(text)
18 except ValueError:
19 if text.lower()=="nan":
20 import struct
21 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0]
22 else:
23 raise
24
25
27 """Parse the basic information that should be present in most codeml output files.
28 """
29
30
31 multi_models = False
32 multi_genes = False
33 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*")
34 model_re = re.compile("Model:\s+(.+)")
35 num_genes_re = re.compile("\(([0-9]+) genes: separate data\)")
36
37
38
39
40 codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)")
41 siteclass_re = re.compile("Site-class models:\s*([^\s]*)")
42 for line in lines:
43
44 line_floats_res = line_floats_re.findall(line)
45 line_floats = [_nan_float(val) for val in line_floats_res]
46
47 version_res = version_re.match(line)
48 if version_res is not None:
49 results["version"] = version_res.group(1)
50 continue
51
52 model_res = model_re.match(line)
53 if model_res is not None:
54 results["model"] = model_res.group(1)
55
56 num_genes_res = num_genes_re.search(line)
57 if num_genes_res is not None:
58 results["genes"] = []
59 num_genes = int(num_genes_res.group(1))
60 for n in range(num_genes):
61 results["genes"].append({})
62 multi_genes = True
63 continue
64
65 codon_freq_res = codon_freq_re.match(line)
66 if codon_freq_res is not None:
67 results["codon model"] = codon_freq_res.group(1)
68 continue
69
70
71
72
73
74 siteclass_res = siteclass_re.match(line)
75 if siteclass_res is not None:
76 siteclass_model = siteclass_res.group(1)
77 if siteclass_model != "":
78 results["site-class model"] = siteclass_model
79 multi_models = False
80 else:
81 multi_models = True
82
83 if "ln Lmax" in line and len(line_floats) > 0:
84 results["lnL max"] = line_floats[0]
85 return (results, multi_models, multi_genes)
86
87
89 """Determine which NSsites models are present and parse them.
90 """
91
92 ns_sites = {}
93 model_re = re.compile("Model (\d+):\s+(.+)")
94 gene_re = re.compile("Gene\s+([0-9]+)\s+.+")
95 siteclass_model = results.get("site-class model")
96 if not multi_models:
97
98
99 if siteclass_model is None:
100 siteclass_model = "one-ratio"
101 current_model = {"one-ratio" : 0,
102 "NearlyNeutral" : 1,
103 "PositiveSelection" : 2,
104 "discrete" : 3,
105 "beta" : 7,
106 "beta&w>1" : 8}[siteclass_model]
107 if multi_genes:
108 genes = results["genes"]
109 current_gene = None
110 gene_start = None
111 for line_num, line in enumerate(lines):
112 gene_res = gene_re.match(line)
113 if gene_res:
114 if current_gene is not None:
115 parse_model(lines[gene_start:line_num], model_results)
116 genes[current_gene-1] = model_results
117 gene_start = line_num
118 current_gene = int(gene_res.group(1))
119 model_results = {"description": siteclass_model}
120 if len(genes[current_gene-1]) == 0:
121 model_results = parse_model(lines[gene_start:], model_results)
122 genes[current_gene-1] = model_results
123 else:
124 model_results = {"description" : siteclass_model}
125 model_results = parse_model(lines, model_results)
126 ns_sites[current_model] = model_results
127 else:
128
129
130 current_model = None
131 model_start = None
132 for line_num, line in enumerate(lines):
133
134
135
136
137 model_res = model_re.match(line)
138 if model_res:
139 if current_model is not None:
140
141
142
143 parse_model(lines[model_start:line_num], model_results)
144 ns_sites[current_model] = model_results
145 model_start = line_num
146 current_model = int(model_res.group(1))
147 model_results = {"description":model_res.group(2)}
148 if ns_sites.get(current_model) is None:
149
150
151 model_results = parse_model(lines[model_start:], model_results)
152 ns_sites[current_model] = model_results
153
154
155
156 if len(ns_sites) == 1:
157 m0 = ns_sites.get(0)
158 if not m0 or len(m0) > 1:
159 results["NSsites"] = ns_sites
160 elif len(ns_sites) > 1:
161 results["NSsites"] = ns_sites
162 return results
163
164
166 """Parse an individual NSsites model's results.
167 """
168 parameters = {}
169 SEs_flag = False
170 dS_tree_flag = False
171 dN_tree_flag = False
172 w_tree_flag = False
173 num_params = None
174 tree_re = re.compile("\(\(+")
175 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+")
176 model_params_re = re.compile("(?<!\S)([a-z]\d?)\s*=\s+(\d+\.\d+)")
177 for line in lines:
178
179 line_floats_res = line_floats_re.findall(line)
180 line_floats = [_nan_float(val) for val in line_floats_res]
181
182 branch_res = branch_re.match(line)
183
184 model_params = model_params_re.findall(line)
185
186
187
188 if "lnL(ntime:" in line and len(line_floats) > 0:
189 results["lnL"] = line_floats[0]
190 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)",line)
191 if np_res is not None:
192 num_params = int(np_res.group(1))
193
194
195
196
197
198
199 elif len(line_floats) == num_params and not SEs_flag:
200 parameters["parameter list"] = line.strip()
201
202
203
204
205
206
207 elif "SEs for parameters:" in line:
208 SEs_flag = True
209 elif SEs_flag and len(line_floats) == num_params:
210 parameters["SEs"] = line.strip()
211 SEs_flag = False
212
213
214 elif "tree length =" in line and len(line_floats) > 0:
215 results["tree length"] = line_floats[0]
216
217
218 elif tree_re.match(line) is not None:
219 if ":" in line or "#" in line:
220 if dS_tree_flag:
221 results["dS tree"] = line.strip()
222 dS_tree_flag = False
223 elif dN_tree_flag:
224 results["dN tree"] = line.strip()
225 dN_tree_flag = False
226 elif w_tree_flag:
227 results["omega tree"] = line.strip()
228 w_tree_flag = False
229 else:
230 results["tree"] = line.strip()
231 elif "dS tree:" in line:
232 dS_tree_flag = True
233 elif "dN tree:" in line:
234 dN_tree_flag = True
235 elif "w ratios as labels for TreeView:" in line:
236 w_tree_flag = True
237
238
239 elif "rates for" in line and len(line_floats) > 0:
240 line_floats.insert(0, 1.0)
241 parameters["rates"] = line_floats
242
243
244 elif "kappa (ts/tv)" in line and len(line_floats) > 0:
245 parameters["kappa"] = line_floats[0]
246
247
248 elif "omega (dN/dS)" in line and len(line_floats) > 0:
249 parameters["omega"] = line_floats[0]
250 elif "w (dN/dS)" in line and len(line_floats) > 0:
251 parameters["omega"] = line_floats
252
253
254 elif "gene # " in line:
255 gene_num = int(re.match("gene # (\d+)", line).group(1))
256 if parameters.get("genes") is None:
257 parameters["genes"] = {}
258 parameters["genes"][gene_num] = {"kappa":line_floats[0],
259 "omega":line_floats[1]}
260
261
262 elif "tree length for dN" in line and len(line_floats) > 0:
263 parameters["dN"] = line_floats[0]
264
265
266 elif "tree length for dS" in line and len(line_floats) > 0:
267 parameters["dS"] = line_floats[0]
268
269
270
271
272
273 elif line[0:2] == "p:" or line[0:10] == "proportion":
274 site_classes = parse_siteclass_proportions(line_floats)
275 parameters["site classes"] = site_classes
276
277
278 elif line[0:2] == "w:":
279 site_classes = parameters.get("site classes")
280 site_classes = parse_siteclass_omegas(line, site_classes)
281 parameters["site classes"] = site_classes
282
283
284
285
286 elif "branch type " in line:
287 branch_type = re.match("branch type (\d)", line)
288 if branch_type:
289 site_classes = parameters.get("site classes")
290 branch_type_no = int(branch_type.group(1))
291 site_classes = parse_clademodelc(branch_type_no, line_floats,
292 site_classes)
293 parameters["site classes"] = site_classes
294
295
296
297
298 elif line[0:12] == "foreground w":
299 site_classes = parameters.get("site classes")
300 site_classes = parse_branch_site_a(True, line_floats, site_classes)
301 parameters["site classes"] = site_classes
302
303
304
305
306 elif line[0:12] == "background w":
307 site_classes = parameters.get("site classes")
308 site_classes = parse_branch_site_a(False, line_floats, site_classes)
309 parameters["site classes"] = site_classes
310
311
312
313
314
315 elif branch_res is not None and len(line_floats) > 0:
316 branch = branch_res.group(1)
317 if parameters.get("branches") is None:
318 parameters["branches"] = {}
319
320 line = line.replace(" -nan", " nan")
321 params = line.strip().split()[1:]
322 parameters["branches"][branch]= {
323 "t" : _nan_float(params[0].strip()),
324 "N" : _nan_float(params[1].strip()),
325 "S" : _nan_float(params[2].strip()),
326 "omega" :_nan_float(params[3].strip()),
327 "dN" : _nan_float(params[4].strip()),
328 "dS" : _nan_float(params[5].strip()),
329 "N*dN" : _nan_float(params[6].strip()),
330 "S*dS" : _nan_float(params[7].strip())}
331
332
333
334
335
336 elif len(model_params) > 0:
337 float_model_params = []
338 for param in model_params:
339 float_model_params.append((param[0], _nan_float(param[1])))
340 parameters = dict(parameters.items() + float_model_params)
341 if len(parameters) > 0:
342 results["parameters"] = parameters
343 return results
344
345
347 """For models which have multiple site classes, find the proportion of the alignment assigned to each class.
348 """
349 site_classes = {}
350 if len(line_floats) > 0:
351 for n in range(len(line_floats)):
352 site_classes[n] = {"proportion" : line_floats[n]}
353 return site_classes
354
355
357 """For models which have multiple site classes, find the omega estimated for each class.
358 """
359
360
361
362
363
364 line_floats = re.findall("\d{1,3}\.\d{5}", line)
365 if not site_classes or len(line_floats) == 0:
366 return
367 for n in range(len(line_floats)):
368 site_classes[n]["omega"] = line_floats[n]
369 return site_classes
370
371
373 """Parse results specific to the clade model C.
374 """
375 if not site_classes or len(line_floats) == 0:
376 return
377 for n in range(len(line_floats)):
378 if site_classes[n].get("branch types") is None:
379 site_classes[n]["branch types"] = {}
380 site_classes[n]["branch types"][branch_type_no] = line_floats[n]
381 return site_classes
382
383
385 """Parse results specific to the branch site A model.
386 """
387 if not site_classes or len(line_floats) == 0:
388 return
389 for n in range(len(line_floats)):
390 if site_classes[n].get("branch types") is None:
391 site_classes[n]["branch types"] = {}
392 if foreground:
393 site_classes[n]["branch types"]["foreground"] =\
394 line_floats[n]
395 else:
396 site_classes[n]["branch types"]["background"] =\
397 line_floats[n]
398 return site_classes
399
400
402 """Parse results from pairwise comparisons.
403 """
404
405
406
407
408
409
410
411 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)")
412 pairwise = {}
413 for line in lines:
414
415 line_floats_res = line_floats_re.findall(line)
416 line_floats = [_nan_float(val) for val in line_floats_res]
417 pair_res = pair_re.match(line)
418 if pair_res:
419 seq1 = pair_res.group(1)
420 seq2 = pair_res.group(2)
421 if pairwise.get(seq1) is None:
422 pairwise[seq1] = {}
423 if pairwise.get(seq2) is None:
424 pairwise[seq2] = {}
425 if len(line_floats) == 1:
426 pairwise[seq1][seq2] = {"lnL" : line_floats[0]}
427 pairwise[seq2][seq1] = pairwise[seq1][seq2]
428 elif len(line_floats) == 6:
429 pairwise[seq1][seq2] = {"t" : line_floats[0],
430 "S" : line_floats[1],
431 "N" : line_floats[2],
432 "omega" : line_floats[3],
433 "dN" : line_floats[4],
434 "dS" : line_floats[5]}
435 pairwise[seq2][seq1] = pairwise[seq1][seq2]
436 if len(pairwise) > 0:
437 results["pairwise"] = pairwise
438 return results
439
440
442 """Parse amino acid sequence distance results.
443 """
444 distances = {}
445 sequences = []
446 raw_aa_distances_flag = False
447 ml_aa_distances_flag = False
448 matrix_row_re = re.compile("(.+)\s{5,15}")
449 for line in lines:
450
451 line_floats_res = line_floats_re.findall(line)
452 line_floats = [_nan_float(val) for val in line_floats_res]
453 if "AA distances" in line:
454 raw_aa_distances_flag = True
455
456
457 ml_aa_distances_flag = False
458 elif "ML distances of aa seqs." in line:
459 ml_aa_distances_flag = True
460 raw_aa_distances_flag = False
461
462 matrix_row_res = matrix_row_re.match(line)
463 if matrix_row_res and (raw_aa_distances_flag or
464 ml_aa_distances_flag):
465 seq_name = matrix_row_res.group(1).strip()
466 if seq_name not in sequences:
467 sequences.append(seq_name)
468 if raw_aa_distances_flag:
469 if distances.get("raw") is None:
470 distances["raw"] = {}
471 distances["raw"][seq_name] = {}
472 for i in range(0, len(line_floats)):
473 distances["raw"][seq_name][sequences[i]] = line_floats[i]
474 distances["raw"][sequences[i]][seq_name] = line_floats[i]
475 else:
476 if distances.get("ml") is None:
477 distances["ml"] = {}
478 distances["ml"][seq_name] = {}
479 for i in range(0, len(line_floats)):
480 distances["ml"][seq_name][sequences[i]] = line_floats[i]
481 distances["ml"][sequences[i]][seq_name] = line_floats[i]
482 if len(distances) > 0:
483 results["distances"] = distances
484 return results
485