Package Bio :: Package Phylo :: Package PAML :: Module _parse_yn00
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_yn00

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8   
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei_Gojobori results are organized in a lower triangular matrix, with
    the sequence names labeling the rows and statistics in the format::

        w (dN dS)

    per column. Example row (2 columns)::

        0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Arguments:
     - lines - iterable of text lines from the NG86 section.
     - results - dictionary that is updated in place; each matrix row
       (re)initialises ``results[row_name]`` to an empty dictionary.
     - Returns a tuple ``(results, sequences)`` where ``sequences`` lists
       the row labels in the order they were encountered.

    A line is treated as a matrix row when it starts with some text followed
    by a run of at least five whitespace characters. The n-th triple of
    floats on a row is paired with the n-th sequence name seen so far, and
    the same statistics dictionary is stored for both orderings of the pair.

    Raises IndexError if a row does not hold complete (w, dN, dS) triples or
    has more triples than there are previously seen sequence names.
    """
    row_pattern = re.compile(r"(.+)\s{5,15}")
    float_pattern = re.compile(r"-*\d+\.\d+")
    sequences = []
    for line in lines:
        matrix_row_res = row_pattern.match(line)
        if matrix_row_res is None:
            continue
        seq_name = matrix_row_res.group(1).strip()
        # Only search for statistics *after* the row label: sequence names
        # may themselves contain fragments that look like floating point
        # numbers (e.g. "seq1.5"), which must not be read as data.
        data_part = line[matrix_row_res.end(1):]
        line_floats = [float(val) for val in float_pattern.findall(data_part)]
        sequences.append(seq_name)
        results[seq_name] = {}
        for i in range(0, len(line_floats), 3):
            NG86 = {
                "omega": line_floats[i],
                "dN": line_floats[i + 1],
                "dS": line_floats[i + 2],
            }
            # Column i//3 of the triangular matrix is the (i//3)-th sequence.
            other_name = sequences[i // 3]
            results[seq_name][other_name] = {"NG86": NG86}
            results[other_name][seq_name] = {"NG86": NG86}
    return (results, sequences)
35 36
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with each row
    comprising one pairwise species comparison. Rows are labeled by
    sequence number rather than by sequence name.

    Arguments:
     - lines - iterable of text lines from the YN00 section.
     - results - nested dictionary that is updated in place;
       ``results[name1][name2]`` must already exist for every pair that
       appears in the table.
     - sequences - list of sequence names; the 1-based numbers in the
       table index into this list.
     - Returns the same ``results`` object.

    Raises IndexError if a table row carries fewer than nine floating point
    values, and KeyError if a pair is missing from ``results``.
    """
    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    # 2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150
    # +- 0.0151
    row_pattern = re.compile(r"\s+(\d+)\s+(\d+)")
    float_pattern = re.compile(r"-*\d+\.\d+")
    # Order of the floating point columns in a table row.
    columns = ("S", "N", "t", "kappa", "omega", "dN", "dN SE", "dS", "dS SE")
    for line in lines:
        row_res = row_pattern.match(line)
        if row_res is None:
            continue
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        # The two leading sequence numbers are integers, so they are never
        # picked up by the float pattern (it requires a decimal point).
        line_floats = [float(val) for val in float_pattern.findall(line)]
        YN00 = {}
        for position, column in enumerate(columns):
            YN00[column] = line_floats[position]
        # Both orderings of the pair share the same statistics dictionary.
        results[seq_name1][seq_name2]["YN00"] = YN00
        results[seq_name2][seq_name1]["YN00"] = YN00
    return results
73 74
def parse_others(lines, results, sequences):
    """Parse the results from the other methods (LWL85, LWL85m, LPB93).

    The remaining methods are grouped together. Statistics for all three
    are listed for each of the pairwise species comparisons, with each
    method's results on its own line.

    The stats in this section must be handled differently from the other
    sections due to the possible presence of NaN-like values, which a
    plain digits-and-decimal-point pattern would not catch.

    Arguments:
     - lines - iterable of text lines from this section.
     - results - nested dictionary that is updated in place;
       ``results[name1][name2]`` must already exist for every compared pair.
     - sequences - unused here; kept so the signature matches the other
       section parsers.
     - Returns the same ``results`` object.

    Each ``name = value`` pair on a statistics line becomes an entry in the
    method's dictionary. Values that cannot be converted with ``float()``
    (for example ``-1.#IND``) are stored as None.
    """
    # Example:
    #  2 (Pan_troglo) vs. 1 (Homo_sapie)
    #
    #  L(i):      143.0      51.0      28.0  sum=    222.0
    #  Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    #  Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    #  A(i):     0.0000    0.0200    0.0000
    #  B(i):    -0.0000   -0.0000   -0.0000
    #  LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    #  LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho = -nan)
    #  LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000
    comparison_pattern = re.compile(r"\d+ \((.+)\) vs. \d+ \((.+)\)")
    # A statistic is "name = value". The value is taken as the next
    # whitespace-delimited token instead of a fixed number of characters:
    # the dS/dN columns are one character wider than the others, and a
    # fixed-width slice would cut off their last digit.
    stat_pattern = re.compile(r"([dSNwrho]{1,3}) =\s*(\S+)")
    # (marker searched for in the line, key used in the results). Order
    # matters: the first marker found in the line wins.
    methods = (("LWL85:", "LWL85"), ("LWL85m", "LWL85m"), ("LPB93", "LPB93"))
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = comparison_pattern.match(line)
        if comp_res is not None:
            seq_name1 = comp_res.group(1)
            seq_name2 = comp_res.group(2)
            continue
        if seq_name1 is None or seq_name2 is None:
            # Statistics before the first "X vs. Y" header cannot be
            # attributed to a pair.
            continue
        if "dS =" not in line:
            continue
        line_stats = line.partition(":")[2].strip()
        stats = {}
        for stat, value in stat_pattern.findall(line_stats):
            try:
                # rho is printed inside parentheses, so drop a closing ")".
                stats[stat] = float(value.rstrip(")"))
            except ValueError:
                stats[stat] = None
        for marker, method in methods:
            if marker in line:
                # Both orderings of the pair share the same dictionary.
                results[seq_name1][seq_name2][method] = stats
                results[seq_name2][seq_name1][method] = stats
                break
    return results
133