Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
 21   
 22  __docformat__ = "restructuredtext en" 
23 24 -def create(instances, alphabet=None):
25 instances = Instances(instances, alphabet) 26 return Motif(instances=instances, alphabet=alphabet)
27
28 29 -def parse(handle, format):
30 """Parses an output file of motif finding programs. 31 32 Currently supported formats (case is ignored): 33 34 - AlignAce: AlignAce output file format 35 - MEME: MEME output file motif 36 - MAST: MAST output file motif 37 - TRANSFAC: TRANSFAC database file format 38 - pfm: JASPAR-style position-frequency matrix 39 - jaspar: JASPAR-style multiple PFM format 40 - sites: JASPAR-style sites file 41 42 As files in the pfm and sites formats contain only a single motif, 43 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 44 for those. 45 46 For example: 47 48 >>> from Bio import motifs 49 >>> with open("Motif/alignace.out") as handle: 50 ... for m in motifs.parse(handle, "AlignAce"): 51 ... print(m.consensus) 52 ... 53 TCTACGATTGAG 54 CTGCAGCTAGCTACGAGTGAG 55 GTGCTCTAAGCATAGTAGGCG 56 GCCACTAGCAGAGCAGGGGGC 57 CGACTCAGAGGTT 58 CCACGCTAAGAGAGGTGCCGGAG 59 GCGCGTCGCTGAGCA 60 GTCCATCGCAAAGCGTGGGGC 61 GGGATCAGAGGGCCG 62 TGGAGGCGGGG 63 GACCAGAGCTTCGCATGGGGG 64 GGCGTGCGTG 65 GCTGGTTGCTGTTCATTAGG 66 GCCGGCGGCAGCTAAAAGGG 67 GAGGCCGGGGAT 68 CGACTCGTGCTTAGAAGG 69 """ 70 format = format.lower() 71 if format=="alignace": 72 from Bio.motifs import alignace 73 record = alignace.read(handle) 74 return record 75 elif format=="meme": 76 from Bio.motifs import meme 77 record = meme.read(handle) 78 return record 79 elif format=="mast": 80 from Bio.motifs import mast 81 record = mast.read(handle) 82 return record 83 elif format=="transfac": 84 from Bio.motifs import transfac 85 record = transfac.read(handle) 86 return record 87 elif format in ('pfm', 'sites', 'jaspar'): 88 from Bio.motifs import jaspar 89 record = jaspar.read(handle, format) 90 return record 91 else: 92 raise ValueError("Unknown format %s" % format)
93
94 95 -def read(handle, format):
96 """Reads a motif from a handle using a specified file-format. 97 98 This supports the same formats as Bio.motifs.parse(), but 99 only for files containing exactly one motif. For example, 100 reading a JASPAR-style pfm file: 101 102 >>> from Bio import motifs 103 >>> with open("motifs/SRF.pfm") as handle: 104 ... m = motifs.read(handle, "pfm") 105 >>> m.consensus 106 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 107 108 Or a single-motif MEME file, 109 110 >>> from Bio import motifs 111 >>> with open("motifs/meme.out") as handle: 112 ... m = motifs.read(handle, "meme") 113 >>> m.consensus 114 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 115 116 If the handle contains no records, or more than one record, 117 an exception is raised: 118 119 >>> from Bio import motifs 120 >>> with open("motifs/alignace.out") as handle: 121 ... motif = motifs.read(handle, "AlignAce") 122 Traceback (most recent call last): 123 ... 124 ValueError: More than one motif found in handle 125 126 If however you want the first motif from a file containing 127 multiple motifs this function would raise an exception (as 128 shown in the example above). Instead use: 129 130 >>> from Bio import motifs 131 >>> with open("motifs/alignace.out") as handle: 132 ... record = motifs.parse(handle, "alignace") 133 >>> motif = record[0] 134 >>> motif.consensus 135 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 136 137 Use the Bio.motifs.parse(handle, format) function if you want 138 to read multiple records from the handle. 139 """ 140 format = format.lower() 141 motifs = parse(handle, format) 142 if len(motifs)==0: 143 raise ValueError("No motifs found in handle") 144 if len(motifs) > 1: 145 raise ValueError("More than one motif found in handle") 146 motif = motifs[0] 147 return motif
148
149 150 -class Instances(list):
151 """ 152 A class representing instances of sequence motifs. 153 """
154 - def __init__(self, instances=[], alphabet=None):
155 from Bio.Alphabet import IUPAC 156 from Bio.Seq import Seq 157 self.length = None 158 for instance in instances: 159 if self.length is None: 160 self.length = len(instance) 161 elif self.length != len(instance): 162 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 163 raise ValueError(message) 164 try: 165 a = instance.alphabet 166 except AttributeError: 167 # The instance is a plain string 168 continue 169 if alphabet is None: 170 alphabet = a 171 elif alphabet != a: 172 raise ValueError("Alphabets are inconsistent") 173 if alphabet is None or alphabet.letters is None: 174 # If we didn't get a meaningful alphabet from the instances, 175 # assume it is DNA. 176 alphabet = IUPAC.unambiguous_dna 177 for instance in instances: 178 if not isinstance(instance, Seq): 179 sequence = str(instance) 180 instance = Seq(sequence, alphabet=alphabet) 181 self.append(instance) 182 self.alphabet = alphabet
183
184 - def __str__(self):
185 text = "" 186 for instance in self: 187 text += str(instance) + "\n" 188 return text
189
190 - def count(self):
191 counts = {} 192 for letter in self.alphabet.letters: 193 counts[letter] = [0] * self.length 194 for instance in self: 195 for position, letter in enumerate(instance): 196 counts[letter][position] += 1 197 return counts
198
199 - def search(self, sequence):
200 """ 201 a generator function, returning found positions of motif instances in a given sequence 202 """ 203 for pos in range(0, len(sequence)-self.length+1): 204 for instance in self: 205 if str(instance) == str(sequence[pos:pos+self.length]): 206 yield(pos, instance) 207 break # no other instance will fit (we don't want to return multiple hits)
208
209 - def reverse_complement(self):
210 instances = Instances(alphabet=self.alphabet) 211 instances.length = self.length 212 for instance in self: 213 instance = instance.reverse_complement() 214 instances.append(instance) 215 return instances
216
217 218 -class Motif(object):
219 """ 220 A class representing sequence motifs. 221 """
222 - def __init__(self, alphabet=None, instances=None, counts=None):
223 from . import matrix 224 from Bio.Alphabet import IUPAC 225 self.name="" 226 if counts is not None and instances is not None: 227 raise Exception(ValueError, 228 "Specify either instances or counts, don't specify both") 229 elif counts is not None: 230 if alphabet is None: 231 alphabet = IUPAC.unambiguous_dna 232 self.instances = None 233 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 234 self.length = self.counts.length 235 elif instances is not None: 236 self.instances = instances 237 alphabet = self.instances.alphabet 238 counts = self.instances.count() 239 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 240 self.length = self.counts.length 241 else: 242 self.counts = None 243 self.instances = None 244 self.length = None 245 if alphabet is None: 246 alphabet = IUPAC.unambiguous_dna 247 self.alphabet = alphabet 248 self.pseudocounts = None 249 self.background = None 250 self.mask = None
251
252 - def __get_mask(self):
253 return self.__mask
254
255 - def __set_mask(self, mask):
256 if self.length is None: 257 self.__mask = () 258 elif mask is None: 259 self.__mask = (1,) * self.length 260 elif len(mask)!=self.length: 261 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 262 elif isinstance(mask, str): 263 self.__mask=[] 264 for char in mask: 265 if char=="*": 266 self.__mask.append(1) 267 elif char==" ": 268 self.__mask.append(0) 269 else: 270 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char) 271 self.__mask = tuple(self.__mask) 272 else: 273 self.__mask = tuple(int(bool(c)) for c in mask)
274 275 mask = property(__get_mask, __set_mask) 276 del __get_mask 277 del __set_mask 278
279 - def __get_pseudocounts(self):
280 return self._pseudocounts
281
282 - def __set_pseudocounts(self, value):
283 self._pseudocounts = {} 284 if isinstance(value, dict): 285 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 286 else: 287 if value is None: 288 value = 0.0 289 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
290 291 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 292 del __get_pseudocounts 293 del __set_pseudocounts 294
295 - def __get_background(self):
296 return self._background
297
298 - def __set_background(self, value):
299 if isinstance(value, dict): 300 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 301 elif value is None: 302 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 303 else: 304 if sorted(self.alphabet.letters)!=["A", "C", "G", "T"]: 305 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 306 self._background['A'] = (1.0-value)/2.0 307 self._background['C'] = value/2.0 308 self._background['G'] = value/2.0 309 self._background['T'] = (1.0-value)/2.0 310 total = sum(self._background.values()) 311 for letter in self.alphabet.letters: 312 self._background[letter] /= total
313 314 background = property(__get_background, __set_background) 315 del __get_background 316 del __set_background 317 318 @property
319 - def pwm(self):
320 return self.counts.normalize(self._pseudocounts)
321 322 @property
323 - def pssm(self):
324 return self.pwm.log_odds(self._background)
325
326 - def __str__(self, masked=False):
327 """ string representation of a motif. 328 """ 329 text = "" 330 if self.instances is not None: 331 text += str(self.instances) 332 333 if masked: 334 for i in range(self.length): 335 if self.__mask[i]: 336 text += "*" 337 else: 338 text += " " 339 text += "\n" 340 return text
341
342 - def __len__(self):
343 """return the length of a motif 344 345 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 346 """ 347 if self.length is None: 348 return 0 349 else: 350 return self.length
351
352 - def reverse_complement(self):
353 """ 354 Gives the reverse complement of the motif 355 """ 356 alphabet = self.alphabet 357 if self.instances is not None: 358 instances = self.instances.reverse_complement() 359 res = Motif(instances=instances, alphabet=alphabet) 360 else: # has counts 361 res = Motif(alphabet) 362 res.counts={} 363 res.counts["A"]=self.counts["T"][::-1] 364 res.counts["T"]=self.counts["A"][::-1] 365 res.counts["G"]=self.counts["C"][::-1] 366 res.counts["C"]=self.counts["G"][::-1] 367 res.length=self.length 368 res.__mask = self.__mask[::-1] 369 return res
370 371 @property
372 - def consensus(self):
373 """Returns the consensus sequence. 374 """ 375 return self.counts.consensus
376 377 @property
378 - def anticonsensus(self):
379 """returns the least probable pattern to be generated from this motif. 380 """ 381 return self.counts.anticonsensus
382 383 @property
384 - def degenerate_consensus(self):
385 """Following the rules adapted from 386 D. R. Cavener: "Comparison of the consensus sequence flanking 387 translational start sites in Drosophila and vertebrates." 388 Nucleic Acids Research 15(4): 1353-1361. (1987). 389 The same rules are used by TRANSFAC.""" 390 return self.counts.degenerate_consensus
391
392 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
393 """ 394 uses the Berkeley weblogo service to download and save a weblogo of 395 itself 396 397 requires an internet connection. 398 The parameters from ``**kwds`` are passed directly to the weblogo server. 399 400 Currently, this method uses WebLogo version 3.3. 401 These are the arguments and their default values passed to 402 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 403 for more information: 404 405 'stack_width' : 'medium', 406 'stack_per_line' : '40', 407 'alphabet' : 'alphabet_dna', 408 'ignore_lower_case' : True, 409 'unit_name' : "bits", 410 'first_index' : '1', 411 'logo_start' : '1', 412 'logo_end': str(self.length), 413 'composition' : "comp_auto", 414 'percentCG' : '', 415 'scale_width' : True, 416 'show_errorbars' : True, 417 'logo_title' : '', 418 'logo_label' : '', 419 'show_xaxis': True, 420 'xaxis_label': '', 421 'show_yaxis': True, 422 'yaxis_label': '', 423 'yaxis_scale': 'auto', 424 'yaxis_tic_interval' : '1.0', 425 'show_ends' : True, 426 'show_fineprint' : True, 427 'color_scheme': 'color_auto', 428 'symbols0': '', 429 'symbols1': '', 430 'symbols2': '', 431 'symbols3': '', 432 'symbols4': '', 433 'color0': '', 434 'color1': '', 435 'color2': '', 436 'color3': '', 437 'color4': '', 438 439 """ 440 from Bio._py3k import urlopen, urlencode, Request 441 442 frequencies = self.format('transfac') 443 url = 'http://weblogo.threeplusone.com/create.cgi' 444 values = {'sequences': frequencies, 445 'format': format.lower(), 446 'stack_width': 'medium', 447 'stack_per_line': '40', 448 'alphabet': 'alphabet_dna', 449 'ignore_lower_case': True, 450 'unit_name': "bits", 451 'first_index': '1', 452 'logo_start': '1', 453 'logo_end': str(self.length), 454 'composition': "comp_auto", 455 'percentCG': '', 456 'scale_width': True, 457 'show_errorbars': True, 458 'logo_title': '', 459 'logo_label': '', 460 'show_xaxis': True, 461 'xaxis_label': '', 462 'show_yaxis': True, 463 'yaxis_label': '', 464 'yaxis_scale': 'auto', 465 'yaxis_tic_interval': '1.0', 466 'show_ends': True, 467 'show_fineprint': True, 468 'color_scheme': 'color_auto', 469 'symbols0': '', 470 'symbols1': '', 471 'symbols2': '', 472 'symbols3': '', 473 'symbols4': '', 474 'color0': '', 475 'color1': '', 476 'color2': '', 477 'color3': '', 478 'color4': '', 479 } 480 for k, v in kwds.items(): 481 if isinstance(values[k], bool): 482 if not v: 483 v = "" 484 values[k]=str(v) 485 486 data = urlencode(values) 487 req = Request(url, data) 488 response = urlopen(req) 489 with open(fname, "w") as f: 490 im = response.read() 491 f.write(im)
492
493 - def format(self, format):
494 """Returns a string representation of the Motif in a given format 495 496 Currently supported fromats: 497 - pfm : JASPAR single Position Frequency Matrix 498 - jaspar : JASPAR multiple Position Frequency Matrix 499 - transfac : TRANSFAC like files 500 """ 501 502 if format in ('pfm', 'jaspar'): 503 from Bio.motifs import jaspar 504 motifs = [self] 505 return jaspar.write(motifs, format) 506 elif format=="transfac": 507 from Bio.motifs import transfac 508 motifs = [self] 509 return transfac.write(motifs) 510 else: 511 raise ValueError("Unknown format type %s" % format)
512
513 514 -def write(motifs, format):
515 """Returns a string representation of motifs in a given format 516 517 Currently supported formats (case is ignored): 518 - pfm : JASPAR simple single Position Frequency Matrix 519 - jaspar : JASPAR multiple PFM format 520 - transfac : TRANSFAC like files 521 """ 522 523 format = format.lower() 524 if format in ("pfm", "jaspar"): 525 from Bio.motifs import jaspar 526 return jaspar.write(motifs, format) 527 elif format=="transfac": 528 from Bio.motifs import transfac 529 return transfac.write(motifs) 530 else: 531 raise ValueError("Unknown format type %s" % format)
532 533 534 if __name__ == "__main__": 535 from Bio._utils import run_doctest 536 run_doctest(verbose=0) 537