1
2
3
4
5
6 """Tools for sequence motif analysis.
7
8 Bio.motifs contains the core Motif class containing various I/O methods
9 as well as methods for motif comparisons and motif searching in sequences.
10 It also includes functionality for parsing output from the AlignACE, MEME,
11 and MAST programs, as well as files in the TRANSFAC format.
12
13 Bio.motifs is replacing the older and now obsolete Bio.Motif module.
14 """
15
16
17 -def create(instances, alphabet=None):
20
21
22 -def parse(handle, format):
23 """Parses an output file of motif finding programs.
24
25 Currently supported formats (case is ignored):
26 - AlignAce: AlignAce output file format
27 - MEME: MEME output file motif
28 - MAST: MAST output file motif
29 - TRANSFAC: TRANSFAC database file format
30 - pfm: JASPAR-style position-frequency matrix
31 - sites: JASPAR-style sites file
32 As files in the pfm and sites formats contain only a single motif,
33 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse()
34 for those.
35
36 For example:
37
38 >>> from Bio import motifs
39 >>> for m in motifs.parse(open("Motif/alignace.out"),"AlignAce"):
40 ... print m.consensus
41 TCTACGATTGAG
42 CTGCAGCTAGCTACGAGTGAG
43 GTGCTCTAAGCATAGTAGGCG
44 GCCACTAGCAGAGCAGGGGGC
45 CGACTCAGAGGTT
46 CCACGCTAAGAGAGGTGCCGGAG
47 GCGCGTCGCTGAGCA
48 GTCCATCGCAAAGCGTGGGGC
49 GGGATCAGAGGGCCG
50 TGGAGGCGGGG
51 GACCAGAGCTTCGCATGGGGG
52 GGCGTGCGTG
53 GCTGGTTGCTGTTCATTAGG
54 GCCGGCGGCAGCTAAAAGGG
55 GAGGCCGGGGAT
56 CGACTCGTGCTTAGAAGG
57 """
58 format = format.lower()
59 if format=="alignace":
60 from Bio.motifs import alignace
61 record = alignace.read(handle)
62 return record
63 elif format=="meme":
64 from Bio.motifs import meme
65 record = meme.read(handle)
66 return record
67 elif format=="mast":
68 from Bio.motifs import mast
69 record = mast.read(handle)
70 return record
71 elif format=="transfac":
72 from Bio.motifs import transfac
73 record = transfac.read(handle)
74 return record
75 elif format in ('pfm', 'sites'):
76 from Bio.motifs import jaspar
77 motif = jaspar.read(handle, format)
78 motifs = [motif]
79 return motifs
80 else:
81 raise ValueError("Unknown format %s" % format)
82
83
84 -def read(handle, format):
85 """Reads a motif from a handle using a specified file-format.
86
87 This supports the same formats as Bio.motifs.parse(), but
88 only for files containing exactly one motif. For example,
89 reading a JASPAR-style pfm file:
90
91 >>> from Bio import motifs
92 >>> m = motifs.read(open("motifs/SRF.pfm"), "pfm")
93 >>> m.consensus
94 Seq('GCCCATATATGG', IUPACUnambiguousDNA())
95
96 Or a single-motif MEME file,
97
98 >>> from Bio import motifs
99 >>> m = motifs.read(open("motifs/meme.out"),"meme")
100 >>> m.consensus
101 Seq('CTCAATCGTA', IUPACUnambiguousDNA())
102
103 If the handle contains no records, or more than one record,
104 an exception is raised:
105
106 >>> from Bio import motifs
107 >>> motif = motifs.read(open("motifs/alignace.out"),"AlignAce")
108 Traceback (most recent call last):
109 ...
110 ValueError: More than one motif found in handle
111
112 If however you want the first motif from a file containing
113 multiple motifs this function would raise an exception (as
114 shown in the example above). Instead use:
115
116 >>> from Bio import motifs
117 >>> record = motifs.parse(open("motifs/alignace.out"),"alignace")
118 >>> motif = record[0]
119 >>> motif.consensus
120 Seq('TCTACGATTGAG', IUPACUnambiguousDNA())
121
122 Use the Bio.motifs.parse(handle, format) function if you want
123 to read multiple records from the handle.
124 """
125 format = format.lower()
126 motifs = parse(handle, format)
127 if len(motifs)==0:
128 raise ValueError("No motifs found in handle")
129 if len(motifs) > 1:
130 raise ValueError("More than one motif found in handle")
131 motif = motifs[0]
132 return motif
133
136 """
137 A class representing instances of sequence motifs.
138 """
139 - def __init__(self, instances=[], alphabet=None):
168
170 text = ""
171 for instance in self:
172 text += str(instance) + "\n"
173 return text
174
183
185 """
186 a generator function, returning found positions of motif instances in a given sequence
187 """
188 for pos in xrange(0,len(sequence)-self.length+1):
189 for instance in self:
190 if str(instance) == str(sequence[pos:pos+self.length]):
191 yield(pos,instance)
192 break
200
203 """
204 A class representing sequence motifs.
205 """
206 - def __init__(self, alphabet=None, instances=None, counts=None):
235
238
240 if self.length is None:
241 self.__mask = ()
242 elif mask is None:
243 self.__mask = (1,) * self.length
244 elif len(mask)!=self.length:
245 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length))
246 elif isinstance(mask, str):
247 self.__mask=[]
248 for char in mask:
249 if char=="*":
250 self.__mask.append(1)
251 elif char==" ":
252 self.__mask.append(0)
253 else:
254 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
255 self.__mask = tuple(self.__mask)
256 else:
257 self.__mask = tuple(int(bool(c)) for c in mask)
258
259 mask = property(__get_mask, __set_mask)
260 del __get_mask
261 del __set_mask
262
264 return self._pseudocounts
265
274
275 pseudocounts = property(__get_pseudocounts, __set_pseudocounts)
276 del __get_pseudocounts
277 del __set_pseudocounts
278
280 return self._background
281
283 if isinstance(value, dict):
284 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters)
285 elif value is None:
286 self._background = dict.fromkeys(self.alphabet.letters, 1.0)
287 else:
288 if sorted(self.alphabet.letters)!=["A", "C", "G", "T"]:
289 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content")
290 self._background['A'] = (1.0-value)/2.0
291 self._background['C'] = value/2.0
292 self._background['G'] = value/2.0
293 self._background['T'] = (1.0-value)/2.0
294 total = sum(self._background.values())
295 for letter in self.alphabet.letters:
296 self._background[letter] /= total
297
298 background = property(__get_background, __set_background)
299 del __get_background
300 del __set_background
301
302 @property
304 return self.counts.normalize(self._pseudocounts)
305
306 @property
309
311 """ string representation of a motif.
312 """
313 text = ""
314 if self.instances is not None:
315 text += str(self.instances)
316
317 if masked:
318 for i in xrange(self.length):
319 if self.__mask[i]:
320 text += "*"
321 else:
322 text += " "
323 text += "\n"
324 return text
325
327 """return the length of a motif
328
329 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly.
330 """
331 if self.length is None:
332 return 0
333 else:
334 return self.length
335
337 """
338 Gives the reverse complement of the motif
339 """
340 alphabet = self.alphabet
341 if self.instances is not None:
342 instances = self.instances.reverse_complement()
343 res = Motif(instances=instances, alphabet=alphabet)
344 else:
345 res = Motif(alphabet)
346 res.counts={}
347 res.counts["A"]=self.counts["T"][::-1]
348 res.counts["T"]=self.counts["A"][::-1]
349 res.counts["G"]=self.counts["C"][::-1]
350 res.counts["C"]=self.counts["G"][::-1]
351 res.length=self.length
352 res.__mask = self.__mask[::-1]
353 return res
354
355 @property
357 """Returns the consensus sequence.
358 """
359 return self.counts.consensus
360
361 @property
363 """returns the least probable pattern to be generated from this motif.
364 """
365 return self.counts.anticonsensus
366
367 @property
369 """Following the rules adapted from
370 D. R. Cavener: "Comparison of the consensus sequence flanking
371 translational start sites in Drosophila and vertebrates."
372 Nucleic Acids Research 15(4): 1353-1361. (1987).
373 The same rules are used by TRANSFAC."""
374 return self.counts.degenerate_consensus
375
376 - def weblogo(self,fname,format="PNG",version="2.8.2", **kwds):
377 """
378 uses the Berkeley weblogo service to download and save a weblogo of
379 itself
380
381 requires an internet connection.
382 The parameters from **kwds are passed directly to the weblogo server.
383
384 Currently, this method uses WebLogo version 3.3.
385 These are the arguments and their default values passed to
386 WebLogo 3.3; see their website at http://weblogo.threeplusone.com
387 for more information:
388
389 'stack_width' : 'medium',
390 'stack_per_line' : '40',
391 'alphabet' : 'alphabet_dna',
392 'ignore_lower_case' : True,
393 'unit_name' : "bits",
394 'first_index' : '1',
395 'logo_start' : '1',
396 'logo_end': str(self.length),
397 'composition' : "comp_auto",
398 'percentCG' : '',
399 'scale_width' : True,
400 'show_errorbars' : True,
401 'logo_title' : '',
402 'logo_label' : '',
403 'show_xaxis': True,
404 'xaxis_label': '',
405 'show_yaxis': True,
406 'yaxis_label': '',
407 'yaxis_scale': 'auto',
408 'yaxis_tic_interval' : '1.0',
409 'show_ends' : True,
410 'show_fineprint' : True,
411 'symbols0': '',
412 'symbols1': '',
413 'symbols2': '',
414 'symbols3': '',
415 'symbols4': '',
416 'color0': '',
417 'color1': '',
418 'color2': '',
419 'color3': '',
420 'color4': '',
421
422 """
423 import urllib
424 import urllib2
425 frequencies = self.format('transfac')
426 url = 'http://weblogo.threeplusone.com/create.cgi'
427 values = {'sequences' : frequencies,
428 'format' : format.lower(),
429 'stack_width' : 'medium',
430 'stack_per_line' : '40',
431 'alphabet' : 'alphabet_dna',
432 'ignore_lower_case' : True,
433 'unit_name' : "bits",
434 'first_index' : '1',
435 'logo_start' : '1',
436 'logo_end': str(self.length),
437 'composition' : "comp_auto",
438 'percentCG' : '',
439 'scale_width' : True,
440 'show_errorbars' : True,
441 'logo_title' : '',
442 'logo_label' : '',
443 'show_xaxis': True,
444 'xaxis_label': '',
445 'show_yaxis': True,
446 'yaxis_label': '',
447 'yaxis_scale': 'auto',
448 'yaxis_tic_interval' : '1.0',
449 'show_ends' : True,
450 'show_fineprint' : True,
451 'symbols0': '',
452 'symbols1': '',
453 'symbols2': '',
454 'symbols3': '',
455 'symbols4': '',
456 'color0': '',
457 'color1': '',
458 'color2': '',
459 'color3': '',
460 'color4': '',
461 }
462 for k,v in kwds.iteritems():
463 if type(values[k])==bool:
464 if not v:
465 v = ""
466 values[k]=str(v)
467
468 data = urllib.urlencode(values)
469 req = urllib2.Request(url, data)
470 response = urllib2.urlopen(req)
471 f = open(fname,"w")
472 im = response.read()
473
474 f.write(im)
475 f.close()
476
494
495
496 -def write(motifs, format):
497 """Returns a string representation of motifs in a given format
498
499 Currently supported fromats (case is ignored):
500 - pfm : JASPAR Position Frequency Matrix
501 [only if len(motifs)==1]
502 - transfac : TRANSFAC like files
503 """
504
505 format = format.lower()
506 if format=="pfm":
507 from Bio.motifs import jaspar
508 if len(motifs)!=1:
509 raise Exception("Only a single motif can be written in the JASPAR Position Frequency Matrix (pfm) format")
510 motif = motifs[0]
511 return jaspar.write(motif)
512 elif format=="transfac":
513 from Bio.motifs import transfac
514 return transfac.write(motifs)
515 else:
516 raise ValueError("Unknown format type %s" % format)
517