Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
15 16 17 -def create(instances, alphabet=None):
18 instances = Instances(instances, alphabet) 19 return Motif(instances=instances, alphabet=alphabet)
20
21 22 -def parse(handle, format):
23 """Parses an output file of motif finding programs. 24 25 Currently supported formats (case is ignored): 26 - AlignAce: AlignAce output file format 27 - MEME: MEME output file motif 28 - MAST: MAST output file motif 29 - TRANSFAC: TRANSFAC database file format 30 - pfm: JASPAR-style position-frequency matrix 31 - sites: JASPAR-style sites file 32 As files in the pfm and sites formats contain only a single motif, 33 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 34 for those. 35 36 For example: 37 38 >>> from Bio import motifs 39 >>> for m in motifs.parse(open("Motif/alignace.out"),"AlignAce"): 40 ... print m.consensus 41 TCTACGATTGAG 42 CTGCAGCTAGCTACGAGTGAG 43 GTGCTCTAAGCATAGTAGGCG 44 GCCACTAGCAGAGCAGGGGGC 45 CGACTCAGAGGTT 46 CCACGCTAAGAGAGGTGCCGGAG 47 GCGCGTCGCTGAGCA 48 GTCCATCGCAAAGCGTGGGGC 49 GGGATCAGAGGGCCG 50 TGGAGGCGGGG 51 GACCAGAGCTTCGCATGGGGG 52 GGCGTGCGTG 53 GCTGGTTGCTGTTCATTAGG 54 GCCGGCGGCAGCTAAAAGGG 55 GAGGCCGGGGAT 56 CGACTCGTGCTTAGAAGG 57 """ 58 format = format.lower() 59 if format=="alignace": 60 from Bio.motifs import alignace 61 record = alignace.read(handle) 62 return record 63 elif format=="meme": 64 from Bio.motifs import meme 65 record = meme.read(handle) 66 return record 67 elif format=="mast": 68 from Bio.motifs import mast 69 record = mast.read(handle) 70 return record 71 elif format=="transfac": 72 from Bio.motifs import transfac 73 record = transfac.read(handle) 74 return record 75 elif format in ('pfm', 'sites'): 76 from Bio.motifs import jaspar 77 motif = jaspar.read(handle, format) 78 motifs = [motif] 79 return motifs 80 else: 81 raise ValueError("Unknown format %s" % format)
82
83 84 -def read(handle, format):
85 """Reads a motif from a handle using a specified file-format. 86 87 This supports the same formats as Bio.motifs.parse(), but 88 only for files containing exactly one motif. For example, 89 reading a JASPAR-style pfm file: 90 91 >>> from Bio import motifs 92 >>> m = motifs.read(open("motifs/SRF.pfm"), "pfm") 93 >>> m.consensus 94 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 95 96 Or a single-motif MEME file, 97 98 >>> from Bio import motifs 99 >>> m = motifs.read(open("motifs/meme.out"),"meme") 100 >>> m.consensus 101 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 102 103 If the handle contains no records, or more than one record, 104 an exception is raised: 105 106 >>> from Bio import motifs 107 >>> motif = motifs.read(open("motifs/alignace.out"),"AlignAce") 108 Traceback (most recent call last): 109 ... 110 ValueError: More than one motif found in handle 111 112 If however you want the first motif from a file containing 113 multiple motifs this function would raise an exception (as 114 shown in the example above). Instead use: 115 116 >>> from Bio import motifs 117 >>> record = motifs.parse(open("motifs/alignace.out"),"alignace") 118 >>> motif = record[0] 119 >>> motif.consensus 120 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 121 122 Use the Bio.motifs.parse(handle, format) function if you want 123 to read multiple records from the handle. 124 """ 125 format = format.lower() 126 motifs = parse(handle, format) 127 if len(motifs)==0: 128 raise ValueError("No motifs found in handle") 129 if len(motifs) > 1: 130 raise ValueError("More than one motif found in handle") 131 motif = motifs[0] 132 return motif
133
134 135 -class Instances(list):
136 """ 137 A class representing instances of sequence motifs. 138 """
139 - def __init__(self, instances=[], alphabet=None):
140 from Bio.Alphabet import IUPAC 141 from Bio.Seq import Seq 142 self.length = None 143 for instance in instances: 144 if self.length is None: 145 self.length = len(instance) 146 elif self.length != len(instance): 147 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 148 raise ValueError(message) 149 try: 150 a = instance.alphabet 151 except AttributeError: 152 # The instance is a plain string 153 continue 154 if alphabet is None: 155 alphabet = a 156 elif alphabet != a: 157 raise ValueError("Alphabets are inconsistent") 158 if alphabet is None or alphabet.letters is None: 159 # If we didn't get a meaningful alphabet from the instances, 160 # assume it is DNA. 161 alphabet = IUPAC.unambiguous_dna 162 for instance in instances: 163 if not isinstance(instance, Seq): 164 sequence = str(instance) 165 instance = Seq(sequence, alphabet=alphabet) 166 self.append(instance) 167 self.alphabet = alphabet
168
169 - def __str__(self):
170 text = "" 171 for instance in self: 172 text += str(instance) + "\n" 173 return text
174
175 - def count(self):
176 counts = {} 177 for letter in self.alphabet.letters: 178 counts[letter] = [0] * self.length 179 for instance in self: 180 for position, letter in enumerate(instance): 181 counts[letter][position] += 1 182 return counts
183
184 - def search(self, sequence):
185 """ 186 a generator function, returning found positions of motif instances in a given sequence 187 """ 188 for pos in xrange(0,len(sequence)-self.length+1): 189 for instance in self: 190 if str(instance) == str(sequence[pos:pos+self.length]): 191 yield(pos,instance) 192 break # no other instance will fit (we don't want to return multiple hits)
193 - def reverse_complement(self):
194 instances = Instances(alphabet=self.alphabet) 195 instances.length = self.length 196 for instance in self: 197 instance = instance.reverse_complement() 198 instances.append(instance) 199 return instances
200
201 202 -class Motif(object):
203 """ 204 A class representing sequence motifs. 205 """
206 - def __init__(self, alphabet=None, instances=None, counts=None):
207 import matrix 208 from Bio.Alphabet import IUPAC 209 self.name="" 210 if counts is not None and instances is not None: 211 raise Exception(ValueError, 212 "Specify either instances or counts, don't specify both") 213 elif counts is not None: 214 if alphabet is None: 215 alphabet = IUPAC.unambiguous_dna 216 self.instances = None 217 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 218 self.length = self.counts.length 219 elif instances is not None: 220 self.instances = instances 221 alphabet = self.instances.alphabet 222 counts = self.instances.count() 223 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 224 self.length = self.counts.length 225 else: 226 self.counts = None 227 self.instances = None 228 self.length = None 229 if alphabet is None: 230 alphabet = IUPAC.unambiguous_dna 231 self.alphabet = alphabet 232 self.pseudocounts = None 233 self.background = None 234 self.mask = None
235
236 - def __get_mask(self):
237 return self.__mask
238
239 - def __set_mask(self, mask):
240 if self.length is None: 241 self.__mask = () 242 elif mask is None: 243 self.__mask = (1,) * self.length 244 elif len(mask)!=self.length: 245 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 246 elif isinstance(mask, str): 247 self.__mask=[] 248 for char in mask: 249 if char=="*": 250 self.__mask.append(1) 251 elif char==" ": 252 self.__mask.append(0) 253 else: 254 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char) 255 self.__mask = tuple(self.__mask) 256 else: 257 self.__mask = tuple(int(bool(c)) for c in mask)
258 259 mask = property(__get_mask, __set_mask) 260 del __get_mask 261 del __set_mask 262
263 - def __get_pseudocounts(self):
264 return self._pseudocounts
265
266 - def __set_pseudocounts(self, value):
267 self._pseudocounts = {} 268 if isinstance(value, dict): 269 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 270 else: 271 if value is None: 272 value = 0.0 273 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
274 275 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 276 del __get_pseudocounts 277 del __set_pseudocounts 278
279 - def __get_background(self):
280 return self._background
281
282 - def __set_background(self, value):
283 if isinstance(value, dict): 284 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 285 elif value is None: 286 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 287 else: 288 if sorted(self.alphabet.letters)!=["A", "C", "G", "T"]: 289 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 290 self._background['A'] = (1.0-value)/2.0 291 self._background['C'] = value/2.0 292 self._background['G'] = value/2.0 293 self._background['T'] = (1.0-value)/2.0 294 total = sum(self._background.values()) 295 for letter in self.alphabet.letters: 296 self._background[letter] /= total
297 298 background = property(__get_background, __set_background) 299 del __get_background 300 del __set_background 301 302 @property
303 - def pwm(self):
304 return self.counts.normalize(self._pseudocounts)
305 306 @property
307 - def pssm(self):
308 return self.pwm.log_odds(self._background)
309
310 - def __str__(self,masked=False):
311 """ string representation of a motif. 312 """ 313 text = "" 314 if self.instances is not None: 315 text += str(self.instances) 316 317 if masked: 318 for i in xrange(self.length): 319 if self.__mask[i]: 320 text += "*" 321 else: 322 text += " " 323 text += "\n" 324 return text
325
326 - def __len__(self):
327 """return the length of a motif 328 329 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 330 """ 331 if self.length is None: 332 return 0 333 else: 334 return self.length
335
336 - def reverse_complement(self):
337 """ 338 Gives the reverse complement of the motif 339 """ 340 alphabet = self.alphabet 341 if self.instances is not None: 342 instances = self.instances.reverse_complement() 343 res = Motif(instances=instances, alphabet=alphabet) 344 else: # has counts 345 res = Motif(alphabet) 346 res.counts={} 347 res.counts["A"]=self.counts["T"][::-1] 348 res.counts["T"]=self.counts["A"][::-1] 349 res.counts["G"]=self.counts["C"][::-1] 350 res.counts["C"]=self.counts["G"][::-1] 351 res.length=self.length 352 res.__mask = self.__mask[::-1] 353 return res
354 355 @property
356 - def consensus(self):
357 """Returns the consensus sequence. 358 """ 359 return self.counts.consensus
360 361 @property
362 - def anticonsensus(self):
363 """returns the least probable pattern to be generated from this motif. 364 """ 365 return self.counts.anticonsensus
366 367 @property
368 - def degenerate_consensus(self):
369 """Following the rules adapted from 370 D. R. Cavener: "Comparison of the consensus sequence flanking 371 translational start sites in Drosophila and vertebrates." 372 Nucleic Acids Research 15(4): 1353-1361. (1987). 373 The same rules are used by TRANSFAC.""" 374 return self.counts.degenerate_consensus
375
376 - def weblogo(self,fname,format="PNG",version="2.8.2", **kwds):
377 """ 378 uses the Berkeley weblogo service to download and save a weblogo of 379 itself 380 381 requires an internet connection. 382 The parameters from **kwds are passed directly to the weblogo server. 383 384 Currently, this method uses WebLogo version 3.3. 385 These are the arguments and their default values passed to 386 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 387 for more information: 388 389 'stack_width' : 'medium', 390 'stack_per_line' : '40', 391 'alphabet' : 'alphabet_dna', 392 'ignore_lower_case' : True, 393 'unit_name' : "bits", 394 'first_index' : '1', 395 'logo_start' : '1', 396 'logo_end': str(self.length), 397 'composition' : "comp_auto", 398 'percentCG' : '', 399 'scale_width' : True, 400 'show_errorbars' : True, 401 'logo_title' : '', 402 'logo_label' : '', 403 'show_xaxis': True, 404 'xaxis_label': '', 405 'show_yaxis': True, 406 'yaxis_label': '', 407 'yaxis_scale': 'auto', 408 'yaxis_tic_interval' : '1.0', 409 'show_ends' : True, 410 'show_fineprint' : True, 411 'symbols0': '', 412 'symbols1': '', 413 'symbols2': '', 414 'symbols3': '', 415 'symbols4': '', 416 'color0': '', 417 'color1': '', 418 'color2': '', 419 'color3': '', 420 'color4': '', 421 422 """ 423 import urllib 424 import urllib2 425 frequencies = self.format('transfac') 426 url = 'http://weblogo.threeplusone.com/create.cgi' 427 values = {'sequences' : frequencies, 428 'format' : format.lower(), 429 'stack_width' : 'medium', 430 'stack_per_line' : '40', 431 'alphabet' : 'alphabet_dna', 432 'ignore_lower_case' : True, 433 'unit_name' : "bits", 434 'first_index' : '1', 435 'logo_start' : '1', 436 'logo_end': str(self.length), 437 'composition' : "comp_auto", 438 'percentCG' : '', 439 'scale_width' : True, 440 'show_errorbars' : True, 441 'logo_title' : '', 442 'logo_label' : '', 443 'show_xaxis': True, 444 'xaxis_label': '', 445 'show_yaxis': True, 446 'yaxis_label': '', 447 'yaxis_scale': 'auto', 448 'yaxis_tic_interval' : '1.0', 449 'show_ends' : True, 450 'show_fineprint' : True, 451 'symbols0': '', 452 'symbols1': '', 453 'symbols2': '', 454 'symbols3': '', 455 'symbols4': '', 456 'color0': '', 457 'color1': '', 458 'color2': '', 459 'color3': '', 460 'color4': '', 461 } 462 for k,v in kwds.iteritems(): 463 if type(values[k])==bool: 464 if not v: 465 v = "" 466 values[k]=str(v) 467 468 data = urllib.urlencode(values) 469 req = urllib2.Request(url, data) 470 response = urllib2.urlopen(req) 471 f = open(fname,"w") 472 im = response.read() 473 474 f.write(im) 475 f.close()
476
477 - def format(self, format):
478 """Returns a string representation of the Motif in a given format 479 480 Currently supported fromats: 481 - pfm : JASPAR Position Frequency Matrix 482 - transfac : TRANSFAC like files 483 """ 484 485 if format=="pfm": 486 from Bio.motifs import jaspar 487 return jaspar.write(self) 488 elif format=="transfac": 489 from Bio.motifs import transfac 490 motifs = [self] 491 return transfac.write(motifs) 492 else: 493 raise ValueError("Unknown format type %s" % format)
494
495 496 -def write(motifs, format):
497 """Returns a string representation of motifs in a given format 498 499 Currently supported fromats (case is ignored): 500 - pfm : JASPAR Position Frequency Matrix 501 [only if len(motifs)==1] 502 - transfac : TRANSFAC like files 503 """ 504 505 format = format.lower() 506 if format=="pfm": 507 from Bio.motifs import jaspar 508 if len(motifs)!=1: 509 raise Exception("Only a single motif can be written in the JASPAR Position Frequency Matrix (pfm) format") 510 motif = motifs[0] 511 return jaspar.write(motif) 512 elif format=="transfac": 513 from Bio.motifs import transfac 514 return transfac.write(motifs) 515 else: 516 raise ValueError("Unknown format type %s" % format)
517