Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
21 22 23 -def create(instances, alphabet=None):
24 instances = Instances(instances, alphabet) 25 return Motif(instances=instances, alphabet=alphabet)
26
27 28 -def parse(handle, format):
29 """Parses an output file of motif finding programs. 30 31 Currently supported formats (case is ignored): 32 - AlignAce: AlignAce output file format 33 - MEME: MEME output file motif 34 - MAST: MAST output file motif 35 - TRANSFAC: TRANSFAC database file format 36 - pfm: JASPAR-style position-frequency matrix 37 - jaspar: JASPAR-style multiple PFM format 38 - sites: JASPAR-style sites file 39 As files in the pfm and sites formats contain only a single motif, 40 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 41 for those. 42 43 For example: 44 45 >>> from Bio import motifs 46 >>> with open("Motif/alignace.out") as handle: 47 ... for m in motifs.parse(handle, "AlignAce"): 48 ... print(m.consensus) 49 ... 50 TCTACGATTGAG 51 CTGCAGCTAGCTACGAGTGAG 52 GTGCTCTAAGCATAGTAGGCG 53 GCCACTAGCAGAGCAGGGGGC 54 CGACTCAGAGGTT 55 CCACGCTAAGAGAGGTGCCGGAG 56 GCGCGTCGCTGAGCA 57 GTCCATCGCAAAGCGTGGGGC 58 GGGATCAGAGGGCCG 59 TGGAGGCGGGG 60 GACCAGAGCTTCGCATGGGGG 61 GGCGTGCGTG 62 GCTGGTTGCTGTTCATTAGG 63 GCCGGCGGCAGCTAAAAGGG 64 GAGGCCGGGGAT 65 CGACTCGTGCTTAGAAGG 66 """ 67 format = format.lower() 68 if format=="alignace": 69 from Bio.motifs import alignace 70 record = alignace.read(handle) 71 return record 72 elif format=="meme": 73 from Bio.motifs import meme 74 record = meme.read(handle) 75 return record 76 elif format=="mast": 77 from Bio.motifs import mast 78 record = mast.read(handle) 79 return record 80 elif format=="transfac": 81 from Bio.motifs import transfac 82 record = transfac.read(handle) 83 return record 84 elif format in ('pfm', 'sites', 'jaspar'): 85 from Bio.motifs import jaspar 86 record = jaspar.read(handle, format) 87 return record 88 else: 89 raise ValueError("Unknown format %s" % format)
90
91 92 -def read(handle, format):
93 """Reads a motif from a handle using a specified file-format. 94 95 This supports the same formats as Bio.motifs.parse(), but 96 only for files containing exactly one motif. For example, 97 reading a JASPAR-style pfm file: 98 99 >>> from Bio import motifs 100 >>> with open("motifs/SRF.pfm") as handle: 101 ... m = motifs.read(handle, "pfm") 102 >>> m.consensus 103 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 104 105 Or a single-motif MEME file, 106 107 >>> from Bio import motifs 108 >>> with open("motifs/meme.out") as handle: 109 ... m = motifs.read(handle, "meme") 110 >>> m.consensus 111 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 112 113 If the handle contains no records, or more than one record, 114 an exception is raised: 115 116 >>> from Bio import motifs 117 >>> with open("motifs/alignace.out") as handle: 118 ... motif = motifs.read(handle, "AlignAce") 119 Traceback (most recent call last): 120 ... 121 ValueError: More than one motif found in handle 122 123 If however you want the first motif from a file containing 124 multiple motifs this function would raise an exception (as 125 shown in the example above). Instead use: 126 127 >>> from Bio import motifs 128 >>> with open("motifs/alignace.out") as handle: 129 ... record = motifs.parse(handle, "alignace") 130 >>> motif = record[0] 131 >>> motif.consensus 132 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 133 134 Use the Bio.motifs.parse(handle, format) function if you want 135 to read multiple records from the handle. 136 """ 137 format = format.lower() 138 motifs = parse(handle, format) 139 if len(motifs)==0: 140 raise ValueError("No motifs found in handle") 141 if len(motifs) > 1: 142 raise ValueError("More than one motif found in handle") 143 motif = motifs[0] 144 return motif
145
146 147 -class Instances(list):
148 """ 149 A class representing instances of sequence motifs. 150 """
151 - def __init__(self, instances=[], alphabet=None):
152 from Bio.Alphabet import IUPAC 153 from Bio.Seq import Seq 154 self.length = None 155 for instance in instances: 156 if self.length is None: 157 self.length = len(instance) 158 elif self.length != len(instance): 159 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 160 raise ValueError(message) 161 try: 162 a = instance.alphabet 163 except AttributeError: 164 # The instance is a plain string 165 continue 166 if alphabet is None: 167 alphabet = a 168 elif alphabet != a: 169 raise ValueError("Alphabets are inconsistent") 170 if alphabet is None or alphabet.letters is None: 171 # If we didn't get a meaningful alphabet from the instances, 172 # assume it is DNA. 173 alphabet = IUPAC.unambiguous_dna 174 for instance in instances: 175 if not isinstance(instance, Seq): 176 sequence = str(instance) 177 instance = Seq(sequence, alphabet=alphabet) 178 self.append(instance) 179 self.alphabet = alphabet
180
181 - def __str__(self):
182 text = "" 183 for instance in self: 184 text += str(instance) + "\n" 185 return text
186
187 - def count(self):
188 counts = {} 189 for letter in self.alphabet.letters: 190 counts[letter] = [0] * self.length 191 for instance in self: 192 for position, letter in enumerate(instance): 193 counts[letter][position] += 1 194 return counts
195
196 - def search(self, sequence):
197 """ 198 a generator function, returning found positions of motif instances in a given sequence 199 """ 200 for pos in range(0, len(sequence)-self.length+1): 201 for instance in self: 202 if str(instance) == str(sequence[pos:pos+self.length]): 203 yield(pos, instance) 204 break # no other instance will fit (we don't want to return multiple hits)
205 - def reverse_complement(self):
206 instances = Instances(alphabet=self.alphabet) 207 instances.length = self.length 208 for instance in self: 209 instance = instance.reverse_complement() 210 instances.append(instance) 211 return instances
212
213 214 -class Motif(object):
215 """ 216 A class representing sequence motifs. 217 """
218 - def __init__(self, alphabet=None, instances=None, counts=None):
219 from . import matrix 220 from Bio.Alphabet import IUPAC 221 self.name="" 222 if counts is not None and instances is not None: 223 raise Exception(ValueError, 224 "Specify either instances or counts, don't specify both") 225 elif counts is not None: 226 if alphabet is None: 227 alphabet = IUPAC.unambiguous_dna 228 self.instances = None 229 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 230 self.length = self.counts.length 231 elif instances is not None: 232 self.instances = instances 233 alphabet = self.instances.alphabet 234 counts = self.instances.count() 235 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 236 self.length = self.counts.length 237 else: 238 self.counts = None 239 self.instances = None 240 self.length = None 241 if alphabet is None: 242 alphabet = IUPAC.unambiguous_dna 243 self.alphabet = alphabet 244 self.pseudocounts = None 245 self.background = None 246 self.mask = None
247
248 - def __get_mask(self):
249 return self.__mask
250
251 - def __set_mask(self, mask):
252 if self.length is None: 253 self.__mask = () 254 elif mask is None: 255 self.__mask = (1,) * self.length 256 elif len(mask)!=self.length: 257 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 258 elif isinstance(mask, str): 259 self.__mask=[] 260 for char in mask: 261 if char=="*": 262 self.__mask.append(1) 263 elif char==" ": 264 self.__mask.append(0) 265 else: 266 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char) 267 self.__mask = tuple(self.__mask) 268 else: 269 self.__mask = tuple(int(bool(c)) for c in mask)
270 271 mask = property(__get_mask, __set_mask) 272 del __get_mask 273 del __set_mask 274
275 - def __get_pseudocounts(self):
276 return self._pseudocounts
277
278 - def __set_pseudocounts(self, value):
279 self._pseudocounts = {} 280 if isinstance(value, dict): 281 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 282 else: 283 if value is None: 284 value = 0.0 285 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
286 287 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 288 del __get_pseudocounts 289 del __set_pseudocounts 290
291 - def __get_background(self):
292 return self._background
293
294 - def __set_background(self, value):
295 if isinstance(value, dict): 296 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 297 elif value is None: 298 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 299 else: 300 if sorted(self.alphabet.letters)!=["A", "C", "G", "T"]: 301 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 302 self._background['A'] = (1.0-value)/2.0 303 self._background['C'] = value/2.0 304 self._background['G'] = value/2.0 305 self._background['T'] = (1.0-value)/2.0 306 total = sum(self._background.values()) 307 for letter in self.alphabet.letters: 308 self._background[letter] /= total
309 310 background = property(__get_background, __set_background) 311 del __get_background 312 del __set_background 313 314 @property
315 - def pwm(self):
316 return self.counts.normalize(self._pseudocounts)
317 318 @property
319 - def pssm(self):
320 return self.pwm.log_odds(self._background)
321
322 - def __str__(self,masked=False):
323 """ string representation of a motif. 324 """ 325 text = "" 326 if self.instances is not None: 327 text += str(self.instances) 328 329 if masked: 330 for i in range(self.length): 331 if self.__mask[i]: 332 text += "*" 333 else: 334 text += " " 335 text += "\n" 336 return text
337
338 - def __len__(self):
339 """return the length of a motif 340 341 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 342 """ 343 if self.length is None: 344 return 0 345 else: 346 return self.length
347
348 - def reverse_complement(self):
349 """ 350 Gives the reverse complement of the motif 351 """ 352 alphabet = self.alphabet 353 if self.instances is not None: 354 instances = self.instances.reverse_complement() 355 res = Motif(instances=instances, alphabet=alphabet) 356 else: # has counts 357 res = Motif(alphabet) 358 res.counts={} 359 res.counts["A"]=self.counts["T"][::-1] 360 res.counts["T"]=self.counts["A"][::-1] 361 res.counts["G"]=self.counts["C"][::-1] 362 res.counts["C"]=self.counts["G"][::-1] 363 res.length=self.length 364 res.__mask = self.__mask[::-1] 365 return res
366 367 @property
368 - def consensus(self):
369 """Returns the consensus sequence. 370 """ 371 return self.counts.consensus
372 373 @property
374 - def anticonsensus(self):
375 """returns the least probable pattern to be generated from this motif. 376 """ 377 return self.counts.anticonsensus
378 379 @property
380 - def degenerate_consensus(self):
381 """Following the rules adapted from 382 D. R. Cavener: "Comparison of the consensus sequence flanking 383 translational start sites in Drosophila and vertebrates." 384 Nucleic Acids Research 15(4): 1353-1361. (1987). 385 The same rules are used by TRANSFAC.""" 386 return self.counts.degenerate_consensus
387
388 - def weblogo(self,fname,format="PNG",version="2.8.2", **kwds):
389 """ 390 uses the Berkeley weblogo service to download and save a weblogo of 391 itself 392 393 requires an internet connection. 394 The parameters from **kwds are passed directly to the weblogo server. 395 396 Currently, this method uses WebLogo version 3.3. 397 These are the arguments and their default values passed to 398 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 399 for more information: 400 401 'stack_width' : 'medium', 402 'stack_per_line' : '40', 403 'alphabet' : 'alphabet_dna', 404 'ignore_lower_case' : True, 405 'unit_name' : "bits", 406 'first_index' : '1', 407 'logo_start' : '1', 408 'logo_end': str(self.length), 409 'composition' : "comp_auto", 410 'percentCG' : '', 411 'scale_width' : True, 412 'show_errorbars' : True, 413 'logo_title' : '', 414 'logo_label' : '', 415 'show_xaxis': True, 416 'xaxis_label': '', 417 'show_yaxis': True, 418 'yaxis_label': '', 419 'yaxis_scale': 'auto', 420 'yaxis_tic_interval' : '1.0', 421 'show_ends' : True, 422 'show_fineprint' : True, 423 'color_scheme': 'color_auto', 424 'symbols0': '', 425 'symbols1': '', 426 'symbols2': '', 427 'symbols3': '', 428 'symbols4': '', 429 'color0': '', 430 'color1': '', 431 'color2': '', 432 'color3': '', 433 'color4': '', 434 435 """ 436 from Bio._py3k import urlopen, urlencode, Request 437 438 frequencies = self.format('transfac') 439 url = 'http://weblogo.threeplusone.com/create.cgi' 440 values = {'sequences': frequencies, 441 'format': format.lower(), 442 'stack_width': 'medium', 443 'stack_per_line': '40', 444 'alphabet': 'alphabet_dna', 445 'ignore_lower_case': True, 446 'unit_name': "bits", 447 'first_index': '1', 448 'logo_start': '1', 449 'logo_end': str(self.length), 450 'composition': "comp_auto", 451 'percentCG': '', 452 'scale_width': True, 453 'show_errorbars': True, 454 'logo_title': '', 455 'logo_label': '', 456 'show_xaxis': True, 457 'xaxis_label': '', 458 'show_yaxis': True, 459 'yaxis_label': '', 460 'yaxis_scale': 'auto', 461 'yaxis_tic_interval': '1.0', 462 'show_ends': True, 463 'show_fineprint': True, 464 'color_scheme': 'color_auto', 465 'symbols0': '', 466 'symbols1': '', 467 'symbols2': '', 468 'symbols3': '', 469 'symbols4': '', 470 'color0': '', 471 'color1': '', 472 'color2': '', 473 'color3': '', 474 'color4': '', 475 } 476 for k, v in kwds.items(): 477 if isinstance(values[k], bool): 478 if not v: 479 v = "" 480 values[k]=str(v) 481 482 data = urlencode(values) 483 req = Request(url, data) 484 response = urlopen(req) 485 with open(fname,"w") as f: 486 im = response.read() 487 f.write(im)
488
489 - def format(self, format):
490 """Returns a string representation of the Motif in a given format 491 492 Currently supported fromats: 493 - pfm : JASPAR single Position Frequency Matrix 494 - jaspar : JASPAR multiple Position Frequency Matrix 495 - transfac : TRANSFAC like files 496 """ 497 498 if format in ('pfm', 'jaspar'): 499 from Bio.motifs import jaspar 500 motifs = [self] 501 return jaspar.write(motifs, format) 502 elif format=="transfac": 503 from Bio.motifs import transfac 504 motifs = [self] 505 return transfac.write(motifs) 506 else: 507 raise ValueError("Unknown format type %s" % format)
508
509 510 -def write(motifs, format):
511 """Returns a string representation of motifs in a given format 512 513 Currently supported formats (case is ignored): 514 - pfm : JASPAR simple single Position Frequency Matrix 515 - jaspar : JASPAR multiple PFM format 516 - transfac : TRANSFAC like files 517 """ 518 519 format = format.lower() 520 if format in ("pfm", "jaspar"): 521 from Bio.motifs import jaspar 522 return jaspar.write(motifs, format) 523 elif format=="transfac": 524 from Bio.motifs import transfac 525 return transfac.write(motifs) 526 else: 527 raise ValueError("Unknown format type %s" % format)
528 529 530 if __name__ == "__main__": 531 from Bio._utils import run_doctest 532 run_doctest(verbose=0) 533