1
2
3
4
5
6 import re
7
8
# NOTE(review): the ``def`` line is missing from the text under review; the
# name and parameter order are reconstructed from the names the body uses
# (``lines``, ``results``) -- confirm against the callers.
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei-Gojobori results are organized in a lower triangular matrix, with
    the sequence names labeling the rows and statistics in the format::

        w (dN dS) per column

    Example row (2 columns)::

        0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Args:
        lines: Iterable of text lines to scan. A line counts as a matrix
            row when some text is followed by 5-15 whitespace characters.
        results: Dict that is filled in place. For every pair found,
            ``results[a][b]`` and ``results[b][a]`` are both set to
            ``{"NG86": {"omega": ..., "dN": ..., "dS": ...}}``.

    Returns:
        Tuple ``(results, sequences)`` where ``sequences`` lists the row
        labels in the order they were encountered.

    Raises:
        IndexError: If a row ends in an incomplete triple, or holds more
            triples than there are rows seen so far.
    """
    row_pattern = re.compile(r"(.+)\s{5,15}")
    float_pattern = re.compile(r"-*\d+\.\d+")
    sequences = []
    for line in lines:
        row = row_pattern.match(line)
        if row is None:
            continue
        seq_name = row.group(1).strip()
        sequences.append(seq_name)
        results[seq_name] = {}
        # Look for numbers only after the row label: a name such as
        # "gene1.1" would otherwise contribute a bogus float and shift
        # every triple that follows it.
        values = [float(v) for v in float_pattern.findall(line, row.end())]
        for start in range(0, len(values), 3):
            ng86 = {
                "omega": values[start],
                "dN": values[start + 1],
                "dS": values[start + 2],
            }
            # Column k of the triangular matrix refers to the k-th row label.
            other = sequences[start // 3]
            results[seq_name][other] = {"NG86": ng86}
            results[other][seq_name] = {"NG86": ng86}
    return (results, sequences)
35
36
# NOTE(review): the ``def`` line is missing from the text under review; the
# name and parameter order are reconstructed from the names the body uses
# (``lines``, ``results``, ``sequences``) -- confirm against the callers.
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with each row
    comprising one pairwise species comparison. Rows are labeled by
    sequence number rather than by sequence name.

    Args:
        lines: Iterable of text lines to scan. A line counts as a table
            row when it starts with whitespace followed by two integers.
        results: Nested dict updated in place; ``results[a][b]`` must
            already exist for every pair listed in the table.
        sequences: Sequence names; row number ``n`` refers to
            ``sequences[n - 1]``.

    Returns:
        The same ``results`` dict, with a ``"YN00"`` entry added to both
        ``results[a][b]`` and ``results[b][a]`` for every row.

    Raises:
        IndexError: If a row number is out of range for ``sequences`` or
            the row holds fewer than nine decimal values.
        KeyError: If ``results`` has no entry for the pair.
    """
    row_pattern = re.compile(r"\s+(\d+)\s+(\d+)")
    float_pattern = re.compile(r"-*\d+\.\d+")
    # Column order of the decimal values in a table row.
    fields = ("S", "N", "t", "kappa", "omega", "dN", "dN SE", "dS", "dS SE")
    for line in lines:
        row = row_pattern.match(line)
        if row is None:
            continue
        name1 = sequences[int(row.group(1)) - 1]
        name2 = sequences[int(row.group(2)) - 1]
        # The two leading row numbers are integers, so the decimal-only
        # pattern skips them and yields just the statistics.
        values = [float(v) for v in float_pattern.findall(line)]
        yn00 = {}
        for position, field in enumerate(fields):
            yn00[field] = values[position]
        results[name1][name2]["YN00"] = yn00
        results[name2][name1]["YN00"] = yn00
    return results
73
74
# NOTE(review): the ``def`` line is missing from the text under review; the
# name and parameters are reconstructed from the names the body uses
# (``lines``, ``results``). ``sequences`` is never read here; it is accepted
# (optionally) on the assumption that callers pass the same arguments as to
# the sibling parsers -- confirm against the callers.
def parse_others(lines, results, sequences=None):
    """Parse the results from the other methods.

    The remaining methods are grouped together. Statistics for all three
    (LWL85, LWL85m, LPB93) are listed for each of the pairwise species
    comparisons, with each method's results on its own line.

    The stats in this section are read as ``name = value`` pairs rather
    than with a decimal-number pattern because of the possible presence
    of NaN values, which a decimal-only pattern would not catch.

    Args:
        lines: Iterable of text lines to scan.
        results: Nested dict updated in place; ``results[a][b]`` must
            already exist for every pair named in a comparison header.
        sequences: Unused.

    Returns:
        The same ``results`` dict, with ``"LWL85"``, ``"LWL85m"`` and/or
        ``"LPB93"`` entries added for each pair, in both directions. A
        value that cannot be converted to float is stored as ``None``.

    Raises:
        IndexError: If a line containing ``"dS ="`` has no ``":"``.
        KeyError: If ``results`` has no entry for the pair.
    """
    pair_pattern = re.compile(r"\d+ \((.+)\) vs. \d+ \((.+)\)")
    # Capture the whole token after "=" instead of a fixed number of
    # characters, so wider fields are not cut short and the ")" closing
    # "(rho = ...)" does not end up inside the value.
    stat_pattern = re.compile(r"([dSNwrho]{1,3}) =\s*([^\s)]+)")
    # Checked in order; "LWL85:" (with colon) must not catch "LWL85m:".
    methods = (("LWL85:", "LWL85"), ("LWL85m", "LWL85m"), ("LPB93", "LPB93"))
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        pair = pair_pattern.match(line)
        if pair is not None:
            # Comparison header, e.g. "2 (name2) vs. 1 (name1)".
            seq_name1, seq_name2 = pair.groups()
            continue
        if seq_name1 is None or seq_name2 is None or "dS =" not in line:
            continue
        stats = {}
        line_stats = line.split(":")[1].strip()
        for stat, value in stat_pattern.findall(line_stats):
            try:
                stats[stat] = float(value)
            except ValueError:
                stats[stat] = None
        for marker, key in methods:
            if marker in line:
                results[seq_name1][seq_name2][key] = stats
                results[seq_name2][seq_name1][key] = stats
                break
    return results
133