Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.IUPACData

  1  # Information about the IUPAC alphabets 
  2   
  3  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
  4  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
  5  #   B = "Asx";  aspartic acid or asparagine (D or N) 
  6  #   X = "Xxx";  unknown or 'other' amino acid 
  7  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
  8  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
  9  # 
 10  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 11  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 12  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 13  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 14  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 15  # 
 16  #   Xle (J); Leucine or Isoleucine 
 17  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 18  #   (the one-letter abbreviation) are reserved for the case that cannot 
 19  #   experimentally distinguish leucine from isoleucine. 
 20  # 
 21  #   U = "Sec";  selenocysteine 
 22  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 23  # 
 24  #   O = "Pyl";  pyrrolysine 
 25  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 26   
 27  protein_letters_1to3  = { 
 28      'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 
 29      'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 
 30      'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 
 31      'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 
 32      'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 
 33      'Y': 'Tyr', 
 34  } 
 35  protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({ 
 36      'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 
 37      'U': 'Sel', 'O': 'Pyl', 
 38  }.items())) 
 39   
 40  protein_letters_3to1 = dict((x[1], x[0]) for x in 
 41                              protein_letters_1to3.items()) 
 42  protein_letters_3to1_extended = dict((x[1], x[0]) for x in 
 43                                       protein_letters_1to3_extended.items()) 
 44   
 45  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 46  unambiguous_dna_letters = "GATC" 
 47  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 48  unambiguous_rna_letters = "GAUC" 
 49   
 50  #   B == 5-bromouridine 
 51  #   D == 5,6-dihydrouridine 
 52  #   S == thiouridine 
 53  #   W == wyosine 
 54  extended_dna_letters = "GATCBDSW" 
 55   
 56  # are there extended forms? 
 57  #extended_rna_letters = "GAUCBDSW" 
 58   
 59  ambiguous_dna_values = { 
 60      "A": "A", 
 61      "C": "C", 
 62      "G": "G", 
 63      "T": "T", 
 64      "M": "AC", 
 65      "R": "AG", 
 66      "W": "AT", 
 67      "S": "CG", 
 68      "Y": "CT", 
 69      "K": "GT", 
 70      "V": "ACG", 
 71      "H": "ACT", 
 72      "D": "AGT", 
 73      "B": "CGT", 
 74      "X": "GATC", 
 75      "N": "GATC", 
 76      } 
 77  ambiguous_rna_values = { 
 78      "A": "A", 
 79      "C": "C", 
 80      "G": "G", 
 81      "U": "U", 
 82      "M": "AC", 
 83      "R": "AG", 
 84      "W": "AU", 
 85      "S": "CG", 
 86      "Y": "CU", 
 87      "K": "GU", 
 88      "V": "ACG", 
 89      "H": "ACU", 
 90      "D": "AGU", 
 91      "B": "CGU", 
 92      "X": "GAUC", 
 93      "N": "GAUC", 
 94      } 
 95   
 96  ambiguous_dna_complement = { 
 97      "A": "T", 
 98      "C": "G", 
 99      "G": "C", 
100      "T": "A", 
101      "M": "K", 
102      "R": "Y", 
103      "W": "W", 
104      "S": "S", 
105      "Y": "R", 
106      "K": "M", 
107      "V": "B", 
108      "H": "D", 
109      "D": "H", 
110      "B": "V", 
111      "X": "X", 
112      "N": "N", 
113      } 
114   
115  ambiguous_rna_complement = { 
116      "A": "U", 
117      "C": "G", 
118      "G": "C", 
119      "U": "A", 
120      "M": "K", 
121      "R": "Y", 
122      "W": "W", 
123      "S": "S", 
124      "Y": "R", 
125      "K": "M", 
126      "V": "B", 
127      "H": "D", 
128      "D": "H", 
129      "B": "V", 
130      "X": "X", 
131      "N": "N", 
132      } 
133   
134   
def _make_ranges(mydict):
    """Return a new dict mapping each key of mydict to (value, value).

    The pair has the same (low, high) shape as the ranges produced for
    ambiguous letters, with both ends equal to the single known value.
    """
    return dict((letter, (weight, weight)) for letter, weight in mydict.items())
# From bioperl's SeqStats.pm
unambiguous_dna_weights = dict(A=347.0, C=323.0, G=363.0, T=322.0)
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

# A, C and G reuse the DNA weight plus 16 for the oxygen; U is given directly.
unambiguous_rna_weights = dict(
    (base, unambiguous_dna_weights[base] + 16.0) for base in "ACG"
)
unambiguous_rna_weights["U"] = 340.0
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


def _make_ambiguous_ranges(mydict, weight_table):
    """Compute weight ranges and mean weights for (possibly ambiguous) letters.

    mydict maps each letter to the sequence of plain letters it can stand
    for; weight_table maps plain letters to their weights.

    Returns a pair (ranges, averages): ranges maps a letter to the
    (min, max) weight over its possible letters, and averages maps it to
    the arithmetic mean of those weights.
    """
    ranges = {}
    averages = {}
    for code, members in mydict.items():
        # Quick hack to skip undefined weights for U and O: a letter that
        # stands only for itself and has no entry in weight_table is left
        # out of both results.
        lone_unknown = len(members) == 1 and members[0] not in weight_table
        if not lone_unknown:
            member_weights = [weight_table.get(member) for member in members]
            ranges[code] = (min(member_weights), max(member_weights))
            # Plain left-to-right accumulation starting from 0.0.
            running_total = 0.0
            for weight in member_weights:
                running_total += weight
            averages[code] = running_total / len(member_weights)
    return ranges, averages
# Weight ranges and mean weights for every DNA / RNA letter, ambiguity
# codes included.
(ambiguous_dna_weight_ranges,
 avg_ambiguous_dna_weights) = _make_ambiguous_ranges(
    ambiguous_dna_values, unambiguous_dna_weights)

(ambiguous_rna_weight_ranges,
 avg_ambiguous_rna_weights) = _make_ambiguous_ranges(
    ambiguous_rna_values, unambiguous_rna_weights)

# Weights of the unambiguous amino acid letters.
protein_weights = {
    "A": 89.09, "C": 121.16, "D": 133.10, "E": 147.13,
    "F": 165.19, "G": 75.07, "H": 155.16, "I": 131.18,
    "K": 146.19, "L": 131.18, "M": 149.21, "N": 132.12,
    #"O": 0.0,  # Needs to be recorded!
    "P": 115.13, "Q": 146.15, "R": 174.20, "S": 105.09,
    "T": 119.12,
    #"U": 168.05,  # To be confirmed
    "V": 117.15, "W": 204.23, "Y": 181.19,
}

# Monoisotopic weights for the same letters.
monoisotopic_protein_weights = {
    "A": 89.05, "C": 121.02, "D": 133.04, "E": 147.05,
    "F": 165.08, "G": 75.03, "H": 155.07, "I": 131.09,
    "K": 146.11, "L": 131.09, "M": 149.05, "N": 132.05,
    "P": 115.06, "Q": 146.07, "R": 174.11, "S": 105.04,
    "T": 119.06, "V": 117.08, "W": 204.09, "Y": 181.07,
}

# Each extended protein letter mapped to the letters it can stand for.
extended_protein_values = {
    "A": "A", "B": "ND", "C": "C", "D": "D", "E": "E", "F": "F",
    "G": "G", "H": "H", "I": "I", "J": "IL", "K": "K", "L": "L",
    "M": "M", "N": "N", "O": "O", "P": "P", "Q": "Q", "R": "R",
    "S": "S", "T": "T", "U": "U", "V": "V", "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    #TODO - Include U and O in the possible values of X?
    #This could alter the extended_protein_weight_ranges ...
    "Y": "Y", "Z": "QE",
}

protein_weight_ranges = _make_ranges(protein_weights)

(extended_protein_weight_ranges,
 avg_extended_protein_weights) = _make_ambiguous_ranges(
    extended_protein_values, protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
    'H': 1.00794, 'He': 4.002602,
    'Li': 6.941, 'Be': 9.012182, 'B': 10.811, 'C': 12.0107,
    'N': 14.0067, 'O': 15.9994, 'F': 18.9984032, 'Ne': 20.1797,
    'Na': 22.989770, 'Mg': 24.3050, 'Al': 26.981538, 'Si': 28.0855,
    'P': 30.973761, 'S': 32.065, 'Cl': 35.453, 'Ar': 39.948,
    'K': 39.0983, 'Ca': 40.078, 'Sc': 44.955910, 'Ti': 47.867,
    'V': 50.9415, 'Cr': 51.9961, 'Mn': 54.938049, 'Fe': 55.845,
    'Co': 58.933200, 'Ni': 58.6934, 'Cu': 63.546, 'Zn': 65.39,
    'Ga': 69.723, 'Ge': 72.64, 'As': 74.92160, 'Se': 78.96,
    'Br': 79.904, 'Kr': 83.80,
    'Rb': 85.4678, 'Sr': 87.62, 'Y': 88.90585, 'Zr': 91.224,
    'Nb': 92.90638, 'Mo': 95.94, 'Tc': 98.0, 'Ru': 101.07,
    'Rh': 102.90550, 'Pd': 106.42, 'Ag': 107.8682, 'Cd': 112.411,
    'In': 114.818, 'Sn': 118.710, 'Sb': 121.760, 'Te': 127.60,
    'I': 126.90447, 'Xe': 131.293,
    'Cs': 132.90545, 'Ba': 137.327, 'La': 138.9055, 'Ce': 140.116,
    'Pr': 140.90765, 'Nd': 144.24, 'Pm': 145.0, 'Sm': 150.36,
    'Eu': 151.964, 'Gd': 157.25, 'Tb': 158.92534, 'Dy': 162.50,
    'Ho': 164.93032, 'Er': 167.259, 'Tm': 168.93421, 'Yb': 173.04,
    'Lu': 174.967, 'Hf': 178.49, 'Ta': 180.9479, 'W': 183.84,
    'Re': 186.207, 'Os': 190.23, 'Ir': 192.217, 'Pt': 195.078,
    'Au': 196.96655, 'Hg': 200.59, 'Tl': 204.3833, 'Pb': 207.2,
    'Bi': 208.98038, 'Po': 208.98, 'At': 209.99, 'Rn': 222.02,
    'Fr': 223.02, 'Ra': 226.03, 'Ac': 227.03, 'Th': 232.0381,
    'Pa': 231.03588, 'U': 238.02891, 'Np': 237.05, 'Pu': 244.06,
    'Am': 243.06, 'Cm': 247.07, 'Bk': 247.07, 'Cf': 251.08,
    'Es': 252.08, 'Fm': 257.10, 'Md': 258.10, 'No': 259.10,
    'Lr': 262.11, 'Rf': 261.11, 'Db': 262.11, 'Sg': 266.12,
    'Bh': 264.12, 'Hs': 269.13, 'Mt': 268.14,
}