Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # Revisions copyright 2014 by Markus Piotrowski. 
  6  # Revisions copyright 2014 by Peter Cock. 
  7  # All rights reserved. 
  8  # This code is part of the Biopython distribution and governed by its 
  9  # license.  Please see the LICENSE file that should have been included 
 10  # as part of this package. 
 11   
 12  """Miscellaneous functions for dealing with sequences.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  import re 
 17  from math import pi, sin, cos 
 18   
 19  from Bio.Seq import Seq, MutableSeq 
 20  from Bio import Alphabet 
 21  from Bio.Alphabet import IUPAC 
 22  from Bio.Data import IUPACData 
 23   
 24  __docformat__ = "restructuredtext en" 
 25   
 26  ###################################### 
 27  # DNA 
 28  ###################### 
 29  # {{{ 
 30   
 31   
32 -def GC(seq):
33 """Calculates G+C content, returns the percentage (float between 0 and 100). 34 35 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 36 when counting the G and C content. The percentage is calculated against 37 the full length, e.g.: 38 39 >>> from Bio.SeqUtils import GC 40 >>> GC("ACTGN") 41 40.0 42 43 Note that this will return zero for an empty sequence. 44 """ 45 try: 46 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 47 return gc*100.0/len(seq) 48 except ZeroDivisionError: 49 return 0.0
50 51
52 -def GC123(seq):
53 """Calculates total G+C content plus first, second and third positions. 54 55 Returns a tuple of four floats (percentages between 0 and 100) for the 56 entire sequence, and the three codon positions. e.g. 57 58 >>> from Bio.SeqUtils import GC123 59 >>> GC123("ACTGTN") 60 (40.0, 50.0, 50.0, 0.0) 61 62 Copes with mixed case sequences, but does NOT deal with ambiguous 63 nucleotides. 64 """ 65 d= {} 66 for nt in ['A', 'T', 'G', 'C']: 67 d[nt] = [0, 0, 0] 68 69 for i in range(0, len(seq), 3): 70 codon = seq[i:i+3] 71 if len(codon) < 3: 72 codon += ' ' 73 for pos in range(0, 3): 74 for nt in ['A', 'T', 'G', 'C']: 75 if codon[pos] == nt or codon[pos] == nt.lower(): 76 d[nt][pos] += 1 77 gc = {} 78 gcall = 0 79 nall = 0 80 for i in range(0, 3): 81 try: 82 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 83 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 84 except: 85 gc[i] = 0 86 87 gcall = gcall + d['G'][i] + d['C'][i] 88 nall = nall + n 89 90 gcall = 100.0*gcall/nall 91 return gcall, gc[0], gc[1], gc[2]
92 93
94 -def GC_skew(seq, window=100):
95 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 96 97 Returns a list of ratios (floats), controlled by the length of the sequence 98 and the size of the window. 99 100 Does NOT look at any ambiguous nucleotides. 101 """ 102 # 8/19/03: Iddo: added lowercase 103 values = [] 104 for i in range(0, len(seq), window): 105 s = seq[i: i + window] 106 g = s.count('G') + s.count('g') 107 c = s.count('C') + s.count('c') 108 skew = (g-c)/float(g+c) 109 values.append(skew) 110 return values
111 112
113 -def xGC_skew(seq, window=1000, zoom=100, 114 r=300, px=100, py=100):
115 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 116 try: 117 import Tkinter as tkinter # Python 2 118 except ImportError: 119 import tkinter # Python 3 120 121 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 122 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 123 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 124 xscrollcommand=xscroll.set, background='white') 125 win = canvas.winfo_toplevel() 126 win.geometry('700x700') 127 128 yscroll.config(command=canvas.yview) 129 xscroll.config(command=canvas.xview) 130 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 131 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 132 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 133 canvas.update() 134 135 X0, Y0 = r + px, r + py 136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 137 138 ty = Y0 139 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 140 ty += 20 141 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 142 ty += 20 143 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 144 ty += 20 145 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 146 ty += 20 147 canvas.create_oval(x1, y1, x2, y2) 148 149 acc = 0 150 start = 0 151 for gc in GC_skew(seq, window): 152 r1 = r 153 acc += gc 154 # GC skew 155 alpha = pi - (2*pi*start)/len(seq) 156 r2 = r1 - gc*zoom 157 x1 = X0 + r1 * sin(alpha) 158 y1 = Y0 + r1 * cos(alpha) 159 x2 = X0 + r2 * sin(alpha) 160 y2 = Y0 + r2 * cos(alpha) 161 canvas.create_line(x1, y1, x2, y2, fill='blue') 162 # accumulated GC skew 163 r1 = r - 50 164 r2 = r1 - acc 165 x1 = X0 + r1 * sin(alpha) 166 y1 = Y0 + r1 * cos(alpha) 167 x2 = X0 + r2 * sin(alpha) 168 y2 = Y0 + r2 * cos(alpha) 169 canvas.create_line(x1, y1, x2, y2, fill='magenta') 170 171 canvas.update() 172 start += window 173 174 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
175 176
177 -def nt_search(seq, subseq):
178 """Search for a DNA subseq in sequence. 179 180 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 181 searches only on forward strand 182 """ 183 pattern = '' 184 for nt in subseq: 185 value = IUPACData.ambiguous_dna_values[nt] 186 if len(value) == 1: 187 pattern += value 188 else: 189 pattern += '[%s]' % value 190 191 pos = -1 192 result = [pattern] 193 l = len(seq) 194 while True: 195 pos += 1 196 s = seq[pos:] 197 m = re.search(pattern, s) 198 if not m: 199 break 200 pos += int(m.start(0)) 201 result.append(pos) 202 return result
203 204 # }}} 205 206 ###################################### 207 # Protein 208 ###################### 209 # {{{ 210 211
212 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
213 """Turn a one letter code protein sequence into one with three letter codes. 214 215 The single input argument 'seq' should be a protein sequence using single 216 letter codes, either as a python string or as a Seq or MutableSeq object. 217 218 This function returns the amino acid sequence as a string using the three 219 letter amino acid codes. Output follows the IUPAC standard (including 220 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 221 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 222 Any unknown character (including possible gap characters), is changed into 223 'Xaa'. 224 225 e.g. 226 227 >>> from Bio.SeqUtils import seq3 228 >>> seq3("MAIVMGRWKGAR*") 229 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 230 231 You can set a custom translation of the codon termination code using the 232 "custom_map" argument, e.g. 233 234 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 235 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 236 237 You can also set a custom translation for non-amino acid characters, such 238 as '-', using the "undef_code" argument, e.g. 239 240 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 241 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 242 243 If not given, "undef_code" defaults to "Xaa", e.g. 244 245 >>> seq3("MAIVMGRWKGA--R*") 246 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 247 248 This function was inspired by BioPerl's seq3. 249 """ 250 # not doing .update() on IUPACData dict with custom_map dict 251 # to preserve its initial state (may be imported in other modules) 252 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 253 list(custom_map.items())) 254 # We use a default of 'Xaa' for undefined letters 255 # Note this will map '-' to 'Xaa' which may be undesirable! 256 return ''.join(threecode.get(aa, undef_code) for aa in seq)
257 258
259 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
260 """Turns a three-letter code protein sequence into one with single letter codes. 261 262 The single input argument 'seq' should be a protein sequence using three- 263 letter codes, either as a python string or as a Seq or MutableSeq object. 264 265 This function returns the amino acid sequence as a string using the one 266 letter amino acid codes. Output follows the IUPAC standard (including 267 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 268 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 269 Any unknown character (including possible gap characters), is changed into 270 '-'. 271 272 e.g. 273 274 >>> from Bio.SeqUtils import seq3 275 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 276 'MAIVMGRWKGAR*' 277 278 The input is case insensitive, e.g. 279 280 >>> from Bio.SeqUtils import seq3 281 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 282 'MAIVMGRWKGAR*' 283 284 You can set a custom translation of the codon termination code using the 285 "custom_map" argument, e.g. 286 287 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 288 'MAIVMGRWKGAR*' 289 290 You can also set a custom translation for non-amino acid characters, such 291 as '-', using the "undef_code" argument, e.g. 292 293 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 294 'MAIVMGRWKGA??R*' 295 296 If not given, "undef_code" defaults to "X", e.g. 297 298 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 299 'MAIVMGRWKGAXXR*' 300 301 """ 302 # reverse map of threecode 303 # upper() on all keys to enable caps-insensitive input seq handling 304 onecode = dict((k.upper(), v) for k, v in 305 IUPACData.protein_letters_3to1_extended.items()) 306 # add the given termination codon code and custom maps 307 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 308 seqlist = [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)] 309 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
310 311 312 # }}} 313 314 ###################################### 315 # Mixed ??? 316 ###################### 317 # {{{ 318 319
320 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 321 monoisotopic=False):
322 """Calculates the molecular weight of a DNA, RNA or protein sequence. 323 324 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 325 have a 5' phosphate. 326 327 - seq: String or Biopython sequence object. 328 - seq_type: The default (None) is to take the alphabet from the seq argument, 329 or assume DNA if the seq argument is a string. Override this with 330 a string 'DNA', 'RNA', or 'protein'. 331 - double_stranded: Calculate the mass for the double stranded molecule? 332 - circular: Is the molecule circular (has no ends)? 333 - monoisotopic: Use the monoisotopic mass tables? 334 335 Note that for backwards compatibility, if the seq argument is a string, 336 or Seq object with a generic alphabet, and no seq_type is specified 337 (i.e. left as None), then DNA is assumed. 338 339 >>> print("%0.2f" % molecular_weight("AGC")) 340 949.61 341 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 342 949.61 343 344 However, it is better to be explicit - for example with strings: 345 346 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 347 949.61 348 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 349 997.61 350 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 351 249.29 352 353 Or, with the sequence alphabet: 354 355 >>> from Bio.Seq import Seq 356 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 357 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 358 949.61 359 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 360 997.61 361 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 362 249.29 363 364 Also note that contradictory sequence alphabets and seq_type will also 365 give an exception: 366 367 >>> from Bio.Seq import Seq 368 >>> from Bio.Alphabet import generic_dna 369 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 370 Traceback (most recent call last): 371 ... 372 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 373 374 """ 375 # Rewritten by Markus Piotrowski, 2014 376 377 # Find the alphabet type 378 tmp_type = '' 379 if isinstance(seq, Seq) or isinstance(seq, MutableSeq): 380 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 381 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 382 tmp_type = 'DNA' 383 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 384 tmp_type = 'RNA' 385 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 386 tmp_type = 'protein' 387 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 388 tmp_type = 'protein' 389 # Convert to one-letter sequence. Have to use a string for seq1 390 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 391 elif not isinstance(base_alphabet, Alphabet.Alphabet): 392 raise TypeError("%s is not a valid alphabet for mass calculations" 393 % base_alphabet) 394 else: 395 tmp_type = "DNA" # backward compatibity 396 if seq_type and tmp_type and tmp_type != seq_type: 397 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 398 % (seq_type, tmp_type)) 399 seq_type = tmp_type 400 elif isinstance(seq, str): 401 if seq_type is None: 402 seq_type = "DNA" # backward compatibity 403 else: 404 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 405 406 seq = ''.join(str(seq).split()).upper() # Do the minimum formatting 407 408 if seq_type == 'DNA': 409 if monoisotopic: 410 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 411 else: 412 weight_table = IUPACData.unambiguous_dna_weights 413 elif seq_type == 'RNA': 414 if monoisotopic: 415 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 416 else: 417 weight_table = IUPACData.unambiguous_rna_weights 418 elif seq_type == 'protein': 419 if monoisotopic: 420 weight_table = IUPACData.monoisotopic_protein_weights 421 else: 422 weight_table = IUPACData.protein_weights 423 else: 424 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 425 % seq_type) 426 427 if monoisotopic: 428 water = 18.010565 429 else: 430 water = 18.0153 431 432 try: 433 weight = sum(weight_table[x] for x in seq) - (len(seq)-1) * water 434 if circular: 435 weight -= water 436 except KeyError as e: 437 raise ValueError('%s is not a valid unambiguous letter for %s' 438 %(e, seq_type)) 439 except: 440 raise 441 442 if seq_type in ('DNA', 'RNA') and double_stranded: 443 seq = str(Seq(seq).complement()) 444 weight += sum(weight_table[x] for x in seq) - (len(seq)-1) * water 445 if circular: 446 weight -= water 447 elif seq_type == 'protein' and double_stranded: 448 raise ValueError('double-stranded proteins await their discovery') 449 450 return weight
451 452
453 -def six_frame_translations(seq, genetic_code=1):
454 """Formatted string showing the 6 frame translations and GC content. 455 456 nice looking 6 frame translation with GC content - code from xbbtools 457 similar to DNA Striders six-frame translation 458 459 >>> from Bio.SeqUtils import six_frame_translations 460 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 461 GC_Frame: a:5 t:0 g:8 c:5 462 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 463 <BLANKLINE> 464 <BLANKLINE> 465 1/1 466 G H C N G P L 467 W P L * W A A 468 M A I V M G R * 469 auggccauuguaaugggccgcuga 54 % 470 uaccgguaacauuacccggcgacu 471 A M T I P R Q 472 H G N Y H A A S 473 P W Q L P G S 474 <BLANKLINE> 475 <BLANKLINE> 476 477 """ 478 from Bio.Seq import reverse_complement, translate 479 anti = reverse_complement(seq) 480 comp = anti[::-1] 481 length = len(seq) 482 frames = {} 483 for i in range(0, 3): 484 fragment_length = 3 * ((length-i) // 3) 485 frames[i+1] = translate(seq[i:i+fragment_length], genetic_code) 486 frames[-(i+1)] = translate(anti[i:i+fragment_length], genetic_code)[::-1] 487 488 # create header 489 if length > 20: 490 short = '%s ... %s' % (seq[:10], seq[-10:]) 491 else: 492 short = seq 493 header = 'GC_Frame: ' 494 for nt in ['a', 't', 'g', 'c']: 495 header += '%s:%d ' % (nt, seq.count(nt.upper())) 496 497 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 498 res = header 499 500 for i in range(0, length, 60): 501 subseq = seq[i:i+60] 502 csubseq = comp[i:i+60] 503 p = i//3 504 res += '%d/%d\n' % (i+1, i/3+1) 505 res += ' ' + ' '.join(frames[3][p:p+20]) + '\n' 506 res += ' ' + ' '.join(frames[2][p:p+20]) + '\n' 507 res += ' '.join(frames[1][p:p+20]) + '\n' 508 # seq 509 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 510 res += csubseq.lower() + '\n' 511 # - frames 512 res += ' '.join(frames[-2][p:p+20]) +' \n' 513 res += ' ' + ' '.join(frames[-1][p:p+20]) + '\n' 514 res += ' ' + ' '.join(frames[-3][p:p+20]) + '\n\n' 515 return res
516 517 # }}} 518 519 ###################################### 520 # FASTA file utilities 521 ###################### 522 # {{{ 523 524
525 -def quick_FASTA_reader(file):
526 """Simple FASTA reader, returning a list of string tuples (DEPRECATED). 527 528 The single argument 'file' should be the filename of a FASTA format file. 529 This function will open and read in the entire file, constructing a list 530 of all the records, each held as a tuple of strings (the sequence name or 531 title, and its sequence). 532 533 >>> seqs = quick_FASTA_reader("Fasta/dups.fasta") 534 >>> for title, sequence in seqs: 535 ... print("%s %s" % (title, sequence)) 536 alpha ACGTA 537 beta CGTC 538 gamma CCGCC 539 alpha (again - this is a duplicate entry to test the indexing code) ACGTA 540 delta CGCGC 541 542 This function was is fast, but because it returns the data as a single in 543 memory list, is unsuitable for large files where an iterator approach is 544 preferable. 545 546 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 547 allows you to iterate over the records one by one (avoiding having all the 548 records in memory at once). Using Bio.SeqIO also makes it easy to switch 549 between different input file formats. However, please note that rather 550 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 551 552 If you want to use simple strings, use the function SimpleFastaParser 553 added to Bio.SeqIO.FastaIO in Biopython 1.61 instead. 554 """ 555 import warnings 556 from Bio import BiopythonDeprecationWarning 557 warnings.warn("The quick_FASTA_reader has been deprecated and will be " 558 "removed in a future release of Biopython. Please try " 559 "function SimpleFastaParser from Bio.SeqIO.FastaIO " 560 "instead.", BiopythonDeprecationWarning) 561 from Bio.SeqIO.FastaIO import SimpleFastaParser 562 with open(file) as handle: 563 entries = list(SimpleFastaParser(handle)) 564 return entries
565 566 567 # }}} 568 569
570 -def _test():
571 """Run the module's doctests (PRIVATE).""" 572 import os 573 import doctest 574 if os.path.isdir(os.path.join("..", "Tests")): 575 print("Running doctests...") 576 cur_dir = os.path.abspath(os.curdir) 577 os.chdir(os.path.join("..", "Tests")) 578 doctest.testmod() 579 os.chdir(cur_dir) 580 del cur_dir 581 print("Done") 582 elif os.path.isdir(os.path.join("Tests")): 583 print("Running doctests...") 584 cur_dir = os.path.abspath(os.curdir) 585 os.chdir(os.path.join("Tests")) 586 doctest.testmod() 587 os.chdir(cur_dir) 588 del cur_dir 589 print("Done")
590 591 592 if __name__ == "__main__": 593 _test() 594