Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames | no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from __future__ import print_function 
  8   
  9  from Bio.Alphabet import IUPAC 
 10  from Bio import Seq 
 11  import re 
 12  from math import sqrt 
 13  import sys 
 14  from Bio.Motif import Motif 
 15   
 16   
def read(handle):
    """Parse the text output of the MEME program into a MEMERecord object.

    The header sections (version, data file, alphabet, sequence names and
    command line) are read first, followed by every motif block up to the
    line starting with "SUMMARY OF MOTIFS".

    Example:

    >>> from Bio.Motif.Parsers import MEME
    >>> with open("meme.output.txt") as f:
    ...     record = MEME.read(f)
    >>> for motif in record.motifs:
    ...     for instance in motif.instances:
    ...         print(instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue)

    Arguments:
     - handle - an iterator over the lines of the MEME output; it is
       consumed up to and including the "SUMMARY OF MOTIFS" line.

    Returns the populated MEMERecord.

    Raises ValueError if the stream ends early or an expected line is
    missing.
    """
    record = MEMERecord()
    header_parsers = (
        __read_version,
        __read_datafile,
        __read_alphabet,
        __read_sequence_names,
        __read_command,
    )
    # The header sections appear in this fixed order in the output.
    for parse_section in header_parsers:
        parse_section(record, handle)

    motif_line = next((ln for ln in handle if ln.startswith('MOTIF 1')), None)
    if motif_line is None:
        raise ValueError('Unexpected end of stream')

    # The site table is parsed with a strand column only when the recorded
    # command line mentions 'revcomp'.
    has_strand_column = 'revcomp' in record.command

    while True:
        motif = __create_motif(motif_line)
        motif.alphabet = record.alphabet
        record.motifs.append(motif)
        __read_motif_name(motif, handle)
        __read_motif_sequences(motif, handle, has_strand_column)
        __skip_unused_lines(handle)
        try:
            motif_line = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if motif_line.startswith("SUMMARY OF MOTIFS"):
            return record
        if not motif_line.startswith('MOTIF'):
            raise ValueError("Line does not start with 'MOTIF':\n%s" % motif_line)
57 58
class MEMEMotif(Motif):
    """A subclass of Motif used in parsing MEME (and MAST) output.

    This subclass adds the data specific to MEME motifs: the E-value of
    the motif (``evalue``) and its number of occurrences
    (``num_occurrences``).

    Methods:
     - get_instance_by_name(name): return the first instance found in the
       sequence called name, or None.
     - add_instance_from_values(name='default', pvalue=1, sequence='ATA',
       start=0, strand='+'): create a new instance of the motif with the
       specified values and add it to the motif.
    """

    def __init__(self):
        Motif.__init__(self)
        self.evalue = 0.0
        # Initialised here so the attribute exists even before
        # _numoccurrences() has been called.
        self.num_occurrences = 0

    def _numoccurrences(self, number):
        """Store the number of occurrences, converting it with int()."""
        self.num_occurrences = int(number)

    def get_instance_by_name(self, name):
        """Return the first instance whose sequence_name equals name, else None."""
        for instance in self.instances:
            if instance.sequence_name == name:
                return instance
        return None

    def add_instance_from_values(self, name='default', pvalue=1, sequence='ATA', start=0, strand='+'):
        """Build a MEMEInstance from the given values and add it to this motif.

        Arguments:
         - name - name of the sequence the instance was found in.
         - pvalue - p-value of the instance (converted with float()).
         - sequence - the instance's sequence string.
         - start - start position of the instance (converted with int()).
         - strand - strand of the instance, stored as given.
        """
        inst = MEMEInstance(sequence, self.alphabet)
        inst._pvalue(pvalue)
        inst._seqname(name)
        inst._start(start)
        inst._strand(strand)
        # Prefer the motif's own length; fall back to the sequence length
        # while the motif length is still unset (falsy).
        inst._length(self.length if self.length else len(sequence))
        if self.name:
            inst._motifname(self.name)
        # add_instance is not defined in this class; presumably inherited
        # from Motif.
        self.add_instance(inst)

    def _evalue(self, evalue):
        """Store the motif's E-value, converting it with float()."""
        self.evalue = float(evalue)
108 109
class MEMEInstance(Seq.Seq):
    """A single instance (occurrence) of a MEME motif and its metadata.

    The instance is a Seq carrying extra attributes: sequence_name, start,
    pvalue, strand, length and motif_name. A ``score`` attribute exists
    only after _score() has been called.
    """

    def __init__(self, *args, **kwds):
        """Create the Seq from the given arguments and set default metadata."""
        Seq.Seq.__init__(self, *args, **kwds)
        self.motif_name = ""
        self.sequence_name = ""
        self.strand = 0
        self.start = 0
        self.length = 0
        self.pvalue = 1.0

    def _seqname(self, name):
        """Record the name of the sequence this instance belongs to."""
        self.sequence_name = name

    def _motifname(self, name):
        """Record the name of the motif this instance belongs to."""
        self.motif_name = name

    def _start(self, start):
        """Record the start position, converted with int()."""
        self.start = int(start)

    def _pvalue(self, pval):
        """Record the p-value, converted with float()."""
        self.pvalue = float(pval)

    def _score(self, score):
        """Record the score, converted with float()."""
        self.score = float(score)

    def _strand(self, strand):
        """Record the strand exactly as given."""
        self.strand = strand

    def _length(self, length):
        """Record the length exactly as given."""
        self.length = length
145 146
class MEMERecord(object):
    """A class for holding the results of a MEME run.

    Attributes (all start empty and are filled in by the parser):
     - motifs - list of the motifs found in the run.
     - version - version string from the output.
     - datafile - data file named in the output.
     - command - command line recorded in the output.
     - alphabet - alphabet of the run, None until parsed.
     - sequence_names - list of the sequence names in the training set.
    """

    def __init__(self):
        """Create an empty record."""
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []
        self.motifs = []

    def get_motif_by_name(self, name):
        """Return the first motif whose name equals name, or None if absent."""
        return next((motif for motif in self.motifs if motif.name == name), None)
167 168 169 # Everything below is private 170 171
def __read_version(record, handle):
    """Skip to the 'MEME version' line and store the version on record.

    Arguments:
     - record - object whose ``version`` attribute is set.
     - handle - line iterator; consumed up to and including the version line.

    Raises ValueError if no line starts with 'MEME version'.
    """
    banner = next((ln for ln in handle if ln.startswith('MEME version')), None)
    if banner is None:
        raise ValueError("Improper input file. File should contain a line starting MEME version.")
    # The version is the third whitespace-separated token of the line.
    record.version = banner.split()[2]
181 182
def __read_datafile(record, handle):
    """Read the data file name from the 'TRAINING SET' section.

    Skips to the line starting with 'TRAINING SET', then expects a line
    starting with '****' followed by a line starting with 'DATAFILE'.

    Arguments:
     - record - object whose ``datafile`` attribute is set.
     - handle - line iterator; consumed through the 'DATAFILE' line.

    Raises ValueError if the stream ends early or a line is not as expected.
    """
    if not any(ln.startswith('TRAINING SET') for ln in handle):
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")

    # The two lines after the section title must appear in this order.
    for prefix in ('****', 'DATAFILE'):
        try:
            current = next(handle)
        except StopIteration:
            raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)
        if not current.startswith(prefix):
            raise ValueError("Line does not start with '%s':\n%s" % (prefix, current))

    # 'current' is now the DATAFILE line; keep only the file name.
    record.datafile = current.strip().replace('DATAFILE= ', '')
204 205
def __read_alphabet(record, handle):
    """Read the 'ALPHABET' line and set the matching alphabet on record.

    Arguments:
     - record - object whose ``alphabet`` attribute is set.
     - handle - line iterator; exactly one line is consumed.

    Raises ValueError if the stream is exhausted or the next line does not
    start with 'ALPHABET'.
    """
    try:
        header = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not header.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % header)

    letters = header.strip().replace('ALPHABET= ', '')
    # Anything other than exactly 'ACGT' is treated as a protein alphabet.
    record.alphabet = IUPAC.unambiguous_dna if letters == 'ACGT' else IUPAC.protein
220 221
def __read_sequence_names(record, handle):
    """Read the table of training-set sequence names into record.

    Expects a line starting with 'Sequence name', then one starting with
    '----', then table rows up to a line starting with '***'.

    Arguments:
     - record - object whose ``sequence_names`` list is appended to.
     - handle - line iterator; consumed through the closing '***' line.

    Raises ValueError if the stream ends early or a header line is not as
    expected.
    """
    for prefix in ('Sequence name', '----'):
        try:
            heading = next(handle)
        except StopIteration:
            raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)
        if not heading.startswith(prefix):
            raise ValueError("Line does not start with '%s':\n%s" % (prefix, heading))

    closed = False
    for row in handle:
        if row.startswith('***'):
            closed = True
            break
        fields = row.split()
        record.sequence_names.append(fields[0])
        # A six-field row holds two entries side by side; the second name
        # is the fourth field.
        if len(fields) == 6:
            record.sequence_names.append(fields[3])
    if not closed:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
245 246
def __read_command(record, handle):
    """Skip to the 'command:' line and store the command line on record.

    Arguments:
     - record - object whose ``command`` attribute is set.
     - handle - line iterator; consumed up to and including the command line.

    Raises ValueError if no line starts with 'command:'.
    """
    found = next((ln for ln in handle if ln.startswith('command:')), None)
    if found is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
    record.command = found.strip().replace('command: ', '')
256 257
def __create_motif(line):
    """Create a MEMEMotif from a line starting with 'MOTIF'.

    The first five characters of the line are dropped and the remainder is
    split on whitespace; the motif length, number of occurrences and
    E-value are taken from fixed token positions.

    Returns the new MEMEMotif.
    """
    # Token positions after removing the 5-character line prefix.
    length_at, occurrences_at, evalue_at = 3, 6, 12

    fields = line[5:].split()
    motif = MEMEMotif()
    motif.length = int(fields[length_at])
    motif._numoccurrences(fields[occurrences_at])
    motif._evalue(fields[evalue_at])
    return motif
266 267
def __read_motif_name(motif, handle):
    """Skip to the 'sorted by position p-value' line and name the motif.

    The motif name is the first two whitespace-separated tokens of that
    line, joined by a single space.

    Arguments:
     - motif - object whose ``name`` attribute is set.
     - handle - line iterator; consumed up to and including the title line.

    Raises ValueError if no such line is found.
    """
    title = next((ln for ln in handle if 'sorted by position p-value' in ln), None)
    if title is None:
        raise ValueError('Unexpected end of stream: Failed to find motif name')
    motif.name = " ".join(title.split()[:2])
278 279
def __read_motif_sequences(motif, handle, rv):
    """Read the table of motif sites and add each row to motif as an instance.

    Expects three header lines (starting with '---', 'Sequence name' and
    '---'), then one row per site up to a line starting with '---'.

    Arguments:
     - motif - object providing add_instance_from_values().
     - handle - line iterator; consumed through the closing '---' line.
     - rv - if true, rows are parsed with a strand column as the second
       field, which shifts the remaining columns by one.

    Raises ValueError if the stream ends early or a header line is not as
    expected.
    """
    header_layout = (
        ('---', 'Unexpected end of stream: Failed to find motif sequences'),
        ('Sequence name', "Unexpected end of stream: Expected to find line starting with 'Sequence name'"),
        ('---', 'Unexpected end of stream: Failed to find motif sequences'),
    )
    for prefix, eof_message in header_layout:
        try:
            heading = next(handle)
        except StopIteration:
            raise ValueError(eof_message)
        if not heading.startswith(prefix):
            raise ValueError("Line does not start with '%s':\n%s" % (prefix, heading))

    for row in handle:
        if row.startswith('---'):
            return
        cols = row.split()
        if rv:
            motif.add_instance_from_values(
                name=cols[0],
                strand=cols[1],
                start=cols[2],
                pvalue=cols[3],
                sequence=cols[5],
            )
        else:
            motif.add_instance_from_values(
                name=cols[0],
                start=cols[1],
                pvalue=cols[2],
                sequence=cols[4],
            )
    # The table was never closed by a '---' line.
    raise ValueError('Unexpected end of stream')
312 313
def __skip_unused_lines(handle):
    """Advance handle past the parts of a motif block that are not parsed.

    Skips, in order, to lines starting with 'log-odds matrix', '---',
    'letter-probability matrix', '---' and 'Time'. It then requires a blank
    line, a line starting with '***', and (after any further blank lines)
    another line starting with '***'.

    Arguments:
     - handle - line iterator; consumed through the final '***' line.

    Raises ValueError if the stream ends early or a line is not as expected.
    """
    markers = ('log-odds matrix', '---', 'letter-probability matrix', '---', 'Time')
    for marker in markers:
        # any() stops on the first matching line, leaving the handle just
        # after it.
        if not any(ln.startswith(marker) for ln in handle):
            raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % marker)

    try:
        spacer = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if spacer.strip():
        raise ValueError("Expected blank line, but got:\n%s" % spacer)

    try:
        rule = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not rule.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % rule)

    # Blank lines may precede the second '***' line.
    closing = next((ln for ln in handle if ln.strip()), None)
    if closing is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not closing.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % closing)
359