Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re 
 13  from math import pi, sin, cos 
 14   
 15  from Bio.Seq import Seq 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{ 
 24   
 25   
26 -def GC(seq):
27 """Calculates G+C content, returns the percentage (float between 0 and 100). 28 29 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 30 when counting the G and C content. The percentage is calculated against 31 the full length, e.g.: 32 33 >>> from Bio.SeqUtils import GC 34 >>> GC("ACTGN") 35 40.0 36 37 Note that this will return zero for an empty sequence. 38 """ 39 try: 40 gc = sum(map(seq.count, ['G', 'C', 'g', 'c', 'S', 's'])) 41 return gc*100.0/len(seq) 42 except ZeroDivisionError: 43 return 0.0
44 45
46 -def GC123(seq):
47 """Calculates total G+C content plus first, second and third positions. 48 49 Returns a tuple of four floats (percentages between 0 and 100) for the 50 entire sequence, and the three codon positions. e.g. 51 52 >>> from Bio.SeqUtils import GC123 53 >>> GC123("ACTGTN") 54 (40.0, 50.0, 50.0, 0.0) 55 56 Copes with mixed case sequences, but does NOT deal with ambiguous 57 nucleotides. 58 """ 59 d= {} 60 for nt in ['A', 'T', 'G', 'C']: 61 d[nt] = [0, 0, 0] 62 63 for i in range(0, len(seq), 3): 64 codon = seq[i:i+3] 65 if len(codon) < 3: 66 codon += ' ' 67 for pos in range(0, 3): 68 for nt in ['A', 'T', 'G', 'C']: 69 if codon[pos] == nt or codon[pos] == nt.lower(): 70 d[nt][pos] += 1 71 gc = {} 72 gcall = 0 73 nall = 0 74 for i in range(0, 3): 75 try: 76 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 77 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 78 except: 79 gc[i] = 0 80 81 gcall = gcall + d['G'][i] + d['C'][i] 82 nall = nall + n 83 84 gcall = 100.0*gcall/nall 85 return gcall, gc[0], gc[1], gc[2]
86 87
88 -def GC_skew(seq, window=100):
89 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 90 91 Returns a list of ratios (floats), controlled by the length of the sequence 92 and the size of the window. 93 94 Does NOT look at any ambiguous nucleotides. 95 """ 96 # 8/19/03: Iddo: added lowercase 97 values = [] 98 for i in range(0, len(seq), window): 99 s = seq[i: i + window] 100 g = s.count('G') + s.count('g') 101 c = s.count('C') + s.count('c') 102 skew = (g-c)/float(g+c) 103 values.append(skew) 104 return values
105 106
107 -def xGC_skew(seq, window=1000, zoom=100, 108 r=300, px=100, py=100):
109 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 110 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 111 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 112 yscroll = Scrollbar(orient=VERTICAL) 113 xscroll = Scrollbar(orient=HORIZONTAL) 114 canvas = Canvas(yscrollcommand=yscroll.set, 115 xscrollcommand=xscroll.set, background='white') 116 win = canvas.winfo_toplevel() 117 win.geometry('700x700') 118 119 yscroll.config(command=canvas.yview) 120 xscroll.config(command=canvas.xview) 121 yscroll.pack(side=RIGHT, fill=Y) 122 xscroll.pack(side=BOTTOM, fill=X) 123 canvas.pack(fill=BOTH, side=LEFT, expand=1) 124 canvas.update() 125 126 X0, Y0 = r + px, r + py 127 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 128 129 ty = Y0 130 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 131 ty += 20 132 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 133 ty += 20 134 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 135 ty += 20 136 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 137 ty += 20 138 canvas.create_oval(x1, y1, x2, y2) 139 140 acc = 0 141 start = 0 142 for gc in GC_skew(seq, window): 143 r1 = r 144 acc += gc 145 # GC skew 146 alpha = pi - (2*pi*start)/len(seq) 147 r2 = r1 - gc*zoom 148 x1 = X0 + r1 * sin(alpha) 149 y1 = Y0 + r1 * cos(alpha) 150 x2 = X0 + r2 * sin(alpha) 151 y2 = Y0 + r2 * cos(alpha) 152 canvas.create_line(x1, y1, x2, y2, fill='blue') 153 # accumulated GC skew 154 r1 = r - 50 155 r2 = r1 - acc 156 x1 = X0 + r1 * sin(alpha) 157 y1 = Y0 + r1 * cos(alpha) 158 x2 = X0 + r2 * sin(alpha) 159 y2 = Y0 + r2 * cos(alpha) 160 canvas.create_line(x1, y1, x2, y2, fill='magenta') 161 162 canvas.update() 163 start += window 164 165 canvas.configure(scrollregion=canvas.bbox(ALL))
166 167
168 -def molecular_weight(seq):
169 """Calculate the molecular weight of a DNA sequence.""" 170 if isinstance(seq, str): 171 seq = Seq(seq, IUPAC.unambiguous_dna) 172 weight_table = IUPACData.unambiguous_dna_weights 173 return sum(weight_table[x] for x in seq)
174 175
176 -def nt_search(seq, subseq):
177 """Search for a DNA subseq in sequence. 178 179 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 180 searches only on forward strand 181 """ 182 pattern = '' 183 for nt in subseq: 184 value = IUPACData.ambiguous_dna_values[nt] 185 if len(value) == 1: 186 pattern += value 187 else: 188 pattern += '[%s]' % value 189 190 pos = -1 191 result = [pattern] 192 l = len(seq) 193 while True: 194 pos += 1 195 s = seq[pos:] 196 m = re.search(pattern, s) 197 if not m: 198 break 199 pos += int(m.start(0)) 200 result.append(pos) 201 return result
202 203 # }}} 204 205 ###################################### 206 # Protein 207 ###################### 208 # {{{ 209 210 _THREECODE = {'A': 'Ala', 'B': 'Asx', 'C': 'Cys', 'D': 'Asp', 211 'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 212 'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 213 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 214 'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 215 'Y': 'Tyr', 'Z': 'Glx', 'X': 'Xaa', 216 'U': 'Sel', 'O': 'Pyl', 'J': 'Xle', 217 } 218 219
220 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
221 """Turn a one letter code protein sequence into one with three letter codes. 222 223 The single input argument 'seq' should be a protein sequence using single 224 letter codes, either as a python string or as a Seq or MutableSeq object. 225 226 This function returns the amino acid sequence as a string using the three 227 letter amino acid codes. Output follows the IUPAC standard (including 228 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 229 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 230 Any unknown character (including possible gap characters), is changed into 231 'Xaa'. 232 233 e.g. 234 235 >>> from Bio.SeqUtils import seq3 236 >>> seq3("MAIVMGRWKGAR*") 237 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 238 239 You can set a custom translation of the codon termination code using the 240 "custom_map" argument, e.g. 241 242 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 243 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 244 245 You can also set a custom translation for non-amino acid characters, such 246 as '-', using the "undef_code" argument, e.g. 247 248 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 249 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 250 251 If not given, "undef_code" defaults to "Xaa", e.g. 252 253 >>> seq3("MAIVMGRWKGA--R*") 254 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 255 256 This function was inspired by BioPerl's seq3. 257 """ 258 threecode = _THREECODE 259 # add the given termination codon code 260 threecode.update(custom_map) 261 #We use a default of 'Xaa' for undefined letters 262 #Note this will map '-' to 'Xaa' which may be undesirable! 263 return ''.join([threecode.get(aa, undef_code) for aa in seq])
264 265
266 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
267 """Turns a three-letter code protein sequence into one with single letter codes. 268 269 The single input argument 'seq' should be a protein sequence using three- 270 letter codes, either as a python string or as a Seq or MutableSeq object. 271 272 This function returns the amino acid sequence as a string using the one 273 letter amino acid codes. Output follows the IUPAC standard (including 274 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 275 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 276 Any unknown character (including possible gap characters), is changed into 277 '-'. 278 279 e.g. 280 281 >>> from Bio.SeqUtils import seq3 282 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 283 'MAIVMGRWKGAR*' 284 285 The input is case insensitive, e.g. 286 287 >>> from Bio.SeqUtils import seq3 288 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 289 'MAIVMGRWKGAR*' 290 291 You can set a custom translation of the codon termination code using the 292 "custom_map" argument, e.g. 293 294 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 295 'MAIVMGRWKGAR*' 296 297 You can also set a custom translation for non-amino acid characters, such 298 as '-', using the "undef_code" argument, e.g. 299 300 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 301 'MAIVMGRWKGA??R*' 302 303 If not given, "undef_code" defaults to "X", e.g. 304 305 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 306 'MAIVMGRWKGAXXR*' 307 308 """ 309 # reverse map of threecode 310 onecode = dict([(x[1].upper(), x[0]) for x in _THREECODE.items()]) 311 # add the given termination codon code and custom maps 312 onecode.update((k.upper(), v) for (k, v) in custom_map.iteritems()) 313 seqlist = [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)] 314 return ''.join([onecode.get(aa.upper(), undef_code) for aa in seqlist])
315 316 317 # }}} 318 319 ###################################### 320 # Mixed ??? 321 ###################### 322 # {{{ 323 324
325 -def six_frame_translations(seq, genetic_code=1):
326 """Formatted string showing the 6 frame translations and GC content. 327 328 nice looking 6 frame translation with GC content - code from xbbtools 329 similar to DNA Striders six-frame translation 330 331 >>> from Bio.SeqUtils import six_frame_translations 332 >>> print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 333 GC_Frame: a:5 t:0 g:8 c:5 334 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 335 <BLANKLINE> 336 <BLANKLINE> 337 1/1 338 G H C N G P L 339 W P L * W A A 340 M A I V M G R * 341 auggccauuguaaugggccgcuga 54 % 342 uaccgguaacauuacccggcgacu 343 A M T I P R Q 344 H G N Y H A A S 345 P W Q L P G S 346 <BLANKLINE> 347 <BLANKLINE> 348 349 """ 350 from Bio.Seq import reverse_complement, translate 351 anti = reverse_complement(seq) 352 comp = anti[::-1] 353 length = len(seq) 354 frames = {} 355 for i in range(0, 3): 356 frames[i+1] = translate(seq[i:], genetic_code) 357 frames[-(i+1)] = translate(anti[i:], genetic_code)[::-1] 358 359 # create header 360 if length > 20: 361 short = '%s ... %s' % (seq[:10], seq[-10:]) 362 else: 363 short = seq 364 header = 'GC_Frame: ' 365 for nt in ['a', 't', 'g', 'c']: 366 header += '%s:%d ' % (nt, seq.count(nt.upper())) 367 368 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 369 res = header 370 371 for i in range(0, length, 60): 372 subseq = seq[i:i+60] 373 csubseq = comp[i:i+60] 374 p = i//3 375 res += '%d/%d\n' % (i+1, i/3+1) 376 res += ' ' + ' '.join(map(None, frames[3][p:p+20])) + '\n' 377 res += ' ' + ' '.join(map(None, frames[2][p:p+20])) + '\n' 378 res += ' '.join(map(None, frames[1][p:p+20])) + '\n' 379 # seq 380 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 381 res += csubseq.lower() + '\n' 382 # - frames 383 res += ' '.join(map(None, frames[-2][p:p+20])) +' \n' 384 res += ' ' + ' '.join(map(None, frames[-1][p:p+20])) + '\n' 385 res += ' ' + ' '.join(map(None, frames[-3][p:p+20])) + '\n\n' 386 return res
387 388 # }}} 389 390 ###################################### 391 # FASTA file utilities 392 ###################### 393 # {{{ 394 395
396 -def quick_FASTA_reader(file):
397 """Simple FASTA reader, returning a list of string tuples (OBSOLETE). 398 399 The single argument 'file' should be the filename of a FASTA format file. 400 This function will open and read in the entire file, constructing a list 401 of all the records, each held as a tuple of strings (the sequence name or 402 title, and its sequence). 403 404 >>> seqs = quick_FASTA_reader("Fasta/dups.fasta") 405 >>> for title, sequence in seqs: 406 ... print title, sequence 407 alpha ACGTA 408 beta CGTC 409 gamma CCGCC 410 alpha (again - this is a duplicate entry to test the indexing code) ACGTA 411 delta CGCGC 412 413 This function was is fast, but because it returns the data as a single in 414 memory list, is unsuitable for large files where an iterator approach is 415 preferable. 416 417 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 418 allows you to iterate over the records one by one (avoiding having all the 419 records in memory at once). Using Bio.SeqIO also makes it easy to switch 420 between different input file formats. However, please note that rather 421 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 422 423 If you want to use simple strings, use the function SimpleFastaParser 424 added to Bio.SeqIO.FastaIO in Biopython 1.61 instead. 425 """ 426 handle = open(file) 427 entries = [] 428 from Bio.SeqIO.FastaIO import SimpleFastaParser 429 for title, sequence in SimpleFastaParser(handle): 430 entries.append((title, sequence)) 431 handle.close() 432 return entries
433 434 435 # }}} 436 437
438 -def _test():
439 """Run the module's doctests (PRIVATE).""" 440 import os 441 import doctest 442 if os.path.isdir(os.path.join("..", "Tests")): 443 print "Running doctests..." 444 cur_dir = os.path.abspath(os.curdir) 445 os.chdir(os.path.join("..", "Tests")) 446 doctest.testmod() 447 os.chdir(cur_dir) 448 del cur_dir 449 print "Done" 450 elif os.path.isdir(os.path.join("Tests")): 451 print "Running doctests..." 452 cur_dir = os.path.abspath(os.curdir) 453 os.chdir(os.path.join("Tests")) 454 doctest.testmod() 455 os.chdir(cur_dir) 456 del cur_dir 457 print "Done"
458 459 460 if __name__ == "__main__": 461 _test() 462