Package Bio :: Package motifs :: Package jaspar
[hide private]
[frames | no frames]

Source Code for Package Bio.motifs.jaspar

  1  # Copyright 2013 by Anthony Mathelier and David Arenillas. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ JASPAR2014 module. """ 
  7   
  8  from Bio.Seq import Seq 
  9  from Bio.Alphabet.IUPAC import unambiguous_dna as dna 
 10  import re 
 11  import math 
 12   
 13  from Bio._py3k import range 
 14   
 15  from Bio import motifs 
class Motif(motifs.Motif):
    """
    A subclass of Bio.motifs.Motif used to represent a JASPAR profile.

    Additional metadata information are stored if available. The metadata
    availability depends on the source of the JASPAR motif (a 'pfm' format
    file, a 'jaspar' format file or a JASPAR database).

    Two JASPAR motifs are considered equal when their matrix IDs are equal,
    and the hash of a motif is the hash of its matrix ID.

    """

    def __init__(self, matrix_id, name, alphabet=dna, instances=None,
                 counts=None, collection=None, tf_class=None, tf_family=None,
                 species=None, tax_group=None, acc=None, data_type=None,
                 medline=None, pazar_id=None, comment=None):
        """
        Construct a JASPAR Motif instance.

        alphabet, instances and counts are forwarded to
        Bio.motifs.Motif.__init__; every other argument is stored unchanged
        as an attribute of the same name.

        """
        motifs.Motif.__init__(self, alphabet, instances, counts)
        self.name = name
        self.matrix_id = matrix_id
        self.collection = collection
        self.tf_class = tf_class
        self.tf_family = tf_family
        # May have multiple so species is a list. The species are actually
        # specified as taxonomy IDs.
        self.species = species
        self.tax_group = tax_group
        # May have multiple so acc is a list.
        self.acc = acc
        self.data_type = data_type
        self.medline = medline
        self.pazar_id = pazar_id
        self.comment = comment

    @property
    def base_id(self):
        """ Return the JASPAR base matrix ID. """
        (base_id, __) = split_jaspar_id(self.matrix_id)
        return base_id

    @property
    def version(self):
        """ Return the JASPAR matrix version. """
        (__, version) = split_jaspar_id(self.matrix_id)
        return version

    def __str__(self):
        """
        Return a string representation of the JASPAR profile.

        The TF name and matrix ID are always included; each other metadata
        field is included only when it is filled (truthy). The counts matrix
        is appended last.

        """
        pieces = [
            "TF name\t{0}\n".format(self.name),
            "Matrix ID\t{0}\n".format(self.matrix_id),
        ]
        optional_fields = [
            ("Collection", self.collection),
            ("TF class", self.tf_class),
            ("TF family", self.tf_family),
            ("Species", self.species),
            ("Taxonomic group", self.tax_group),
            ("Accession", self.acc),
            ("Data type used", self.data_type),
            ("Medline", self.medline),
            ("PAZAR ID", self.pazar_id),
            ("Comments", self.comment),
        ]
        for label, value in optional_fields:
            if not value:
                continue
            if label == "Species":
                # species is a list and is shown comma-separated
                value = ",".join(value)
            pieces.append("{0}\t{1}\n".format(label, value))
        pieces.append("Matrix:\n{0}\n\n".format(self.counts))
        return "".join(pieces)

    def __hash__(self):
        """
        Return the hash key corresponding to the JASPAR profile.

        :note: We assume the unicity of matrix IDs

        """
        return hash(self.matrix_id)

    def __eq__(self, other):
        """
        Compare two profiles by matrix ID.

        Any object exposing a matrix_id attribute can be compared; for other
        objects NotImplemented is returned so that Python falls back to its
        default comparison instead of raising AttributeError.

        """
        if not hasattr(other, "matrix_id"):
            return NotImplemented
        return self.matrix_id == other.matrix_id

    def __ne__(self, other):
        """
        Return the negation of __eq__.

        Python 2 does not derive != from __eq__, so it is defined explicitly
        to keep the two operators consistent.

        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
120
class Record(list):
    """
    Represent a list of JASPAR motifs.

    Attribute:
     o version: The JASPAR version used (None until assigned)

    """

    def __init__(self):
        """ Create an empty record with no JASPAR version set. """
        # Run the list initialiser explicitly rather than relying on the
        # implicit empty state left by list.__new__.
        list.__init__(self)
        self.version = None

    def __str__(self):
        """ Return the string form of every motif, separated by newlines. """
        return "\n".join(str(the_motif) for the_motif in self)

    def to_dict(self):
        """
        Return the list of matrices as a dictionary of matrices.

        Keys are the matrix_id of each motif; if several motifs share a
        matrix ID the last one in the record is kept.

        """
        return dict((motif.matrix_id, motif) for motif in self)
145
def read(handle, format):
    """
    Read motif(s) from a file in one of several different JASPAR formats.

    The format name is matched case-insensitively and the handle is passed
    to the matching parser:
     o "pfm"    -> _read_pfm
     o "sites"  -> _read_sites
     o "jaspar" -> _read_jaspar

    Return the record produced by that parser. Raise ValueError for any
    other format name.

    """
    format = format.lower()

    if format == "pfm":
        return _read_pfm(handle)
    if format == "sites":
        return _read_sites(handle)
    if format == "jaspar":
        return _read_jaspar(handle)

    raise ValueError("Unknown JASPAR format %s" % format)
168
def write(motifs, format):
    """
    Return the representation of motifs in "pfm" or "jaspar" format.

    Arguments:
     o motifs: a sequence of motif objects exposing counts (indexable by
       the letters A, C, G, T) and, for the "jaspar" format, matrix_id
       and name.
     o format: "pfm" or "jaspar", matched case-insensitively, as read()
       does.

    In "pfm" format only the first motif is written, as four rows of counts
    in A, C, G, T order; an empty motifs sequence raises IndexError. In
    "jaspar" format every motif is written as a ">matrix_id name" header
    followed by four bracketed rows.

    Raise ValueError for any other format name.

    """
    letters = "ACGT"
    lines = []

    # Lower-case textual format names so that write() accepts the same
    # spellings as read(); anything that is not a str is compared as given.
    fmt = format.lower() if isinstance(format, str) else format

    if fmt == 'pfm':
        counts = motifs[0].counts
        for letter in letters:
            terms = ["{0:6.2f}".format(value) for value in counts[letter]]
            lines.append("{0}\n".format(" ".join(terms)))
    elif fmt == 'jaspar':
        for m in motifs:
            counts = m.counts
            lines.append(">{0} {1}\n".format(m.matrix_id, m.name))
            for letter in letters:
                terms = ["{0:6.2f}".format(value) for value in counts[letter]]
                lines.append("{0} [{1}]\n".format(letter, " ".join(terms)))
    else:
        # Report the format exactly as the caller passed it.
        raise ValueError("Unknown JASPAR format %s" % format)

    # Finished; glue the lines together
    return "".join(lines)
197
def _read_pfm(handle):
    """
    Read the motif from a JASPAR .pfm file.

    The first four lines of handle are taken as the count rows for A, C, G
    and T, in that order. Return a Record holding the single resulting
    Motif, which has no matrix ID or name.

    """
    counts = {}

    # zip() stops after four lines, one per nucleotide
    for base, row in zip("ACGT", handle):
        fields = row.split()
        # a row may start with its own nucleotide letter; drop that label
        values = fields[1:] if fields[0] == base else fields
        counts[base] = [float(field) for field in values]

    motif = Motif(matrix_id=None, name=None, alphabet=dna, counts=counts)
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
218
def _read_sites(handle):
    """
    Read the motif from JASPAR .sites file.

    The handle must be an iterator over lines (e.g. an open file), because
    lines are consumed in pairs: a header line starting with ">" followed
    by one sequence line. Reading stops at the first line that is not a
    header.

    Return a Record holding the single Motif built from all the sites; the
    motif has no matrix ID or name.

    Raise ValueError if a header is not followed by a sequence line.

    """
    alphabet = dna
    instances = []

    for line in handle:
        if not line.startswith(">"):
            break
        # line contains the header ">...."
        # now read the actual sequence
        try:
            line = next(handle)
        except StopIteration:
            # Do not let a bare StopIteration escape on a truncated file
            raise ValueError("Unexpected end of JASPAR sites file: "
                             "header line without a sequence line")
        # Keep only the characters that are unchanged by upper-casing;
        # presumably the lower-case letters are flanking sequence (verify
        # against the JASPAR sites format).
        instance = "".join(c for c in line.strip() if c == c.upper())
        instances.append(Seq(instance, alphabet))

    instances = motifs.Instances(instances, alphabet)
    motif = Motif(
        matrix_id=None, name=None, alphabet=alphabet, instances=instances
    )
    motif.mask = "*" * motif.length
    record = Record()
    record.append(motif)

    return record
248
def _read_jaspar(handle):
    """
    Read motifs from a JASPAR formatted file.

    Format is one or more records of the form, e.g.:
    >MA0001.1 AGL3
    A  [ 0  3 79 40 66 48 65 11 65  0 ]
    C  [94 75  4  3  1  2  5  2  3  3 ]
    G  [ 1  0  3  4  1  0  5  3 28 88 ]
    T  [ 2 19 11 50 29 47 22 81  1  6 ]

    or
    >MA0001.1 AGL3
     0  3 79 40 66 48 65 11 65  0
    94 75  4  3  1  2  5  2  3  3
     1  0  3  4  1  0  5  3 28 88
     2 19 11 50 29 47 22 81  1  6

    In the second form the rows are assigned to A, C, G and T in the order
    they appear. Blank lines are ignored. When the header has no name, the
    identifier is used as the name.

    Return a Record with one Motif per complete group of four rows.

    """
    alphabet = dna
    counts = {}

    record = Record()

    head_pat = re.compile(r"^>\s*(\S+)(\s+(\S+))?")
    row_pat_long = re.compile(r"\s*([ACGT])\s*\[\s*(.*)\s*\]")

    identifier = None
    name = None
    row_count = 0
    nucleotides = ['A', 'C', 'G', 'T']
    for line in handle:
        # str.rstrip returns a new string, so the result must be kept
        line = line.rstrip('\r\n')

        # A blank line carries no counts; treating it as a row would shift
        # every following row onto the wrong nucleotide.
        if not line.strip():
            continue

        head_match = head_pat.match(line)
        if head_match:
            identifier = head_match.group(1)
            # Group 3 is the bare name; group 2 also contains the
            # whitespace that separates it from the identifier.
            name = head_match.group(3) or identifier
            continue

        row_match_long = row_pat_long.match(line)
        if row_match_long:
            (letter, counts_str) = row_match_long.group(1, 2)
        else:
            # Unlabelled row: nucleotides are implied by the row order
            letter = nucleotides[row_count]
            counts_str = line

        counts[letter] = [float(x) for x in counts_str.split()]
        row_count += 1

        if row_count == 4:
            record.append(Motif(identifier, name, alphabet=alphabet,
                                counts=counts))
            # Reset the state for the next record
            identifier = None
            name = None
            counts = {}
            row_count = 0

    return record
321
def calculate_pseudocounts(motif):
    """
    Return a dictionary of pseudocounts, one per letter of the alphabet.

    The pseudocount of a letter is sqrt(N) * b, where N is the average
    column total of motif.counts and b is the letter's background
    frequency after normalising the background to sum to one. A uniform
    background is used when motif.background is empty or None. The
    motif's own background is not modified.

    """
    letters = motif.alphabet.letters
    given_background = motif.background
    length = motif.length

    # It is possible to have unequal column sums so use the average
    # number of instances.
    grand_total = sum(
        sum(float(motif.counts[letter][column]) for letter in letters)
        for column in range(length)
    )
    root_nb_instances = math.sqrt(grand_total / length)

    # Work on a copy so the caller's background is left untouched
    if given_background:
        frequencies = dict(given_background)
    else:
        frequencies = dict.fromkeys(sorted(letters), 1.0)
    norm = sum(frequencies.values())

    pseudocounts = {}
    for letter in letters:
        frequencies[letter] /= norm
        pseudocounts[letter] = root_nb_instances * frequencies[letter]
    return pseudocounts
350
def split_jaspar_id(id):
    """
    Utility function to split a JASPAR matrix ID into its component.

    Components are base ID and version number; both are returned as
    strings, e.g. 'MA0047.2' is returned as ('MA0047', '2').

    An ID that does not consist of exactly two dot-separated parts is
    returned whole with a version of None, e.g. 'MA0047' gives
    ('MA0047', None). An ID of None gives (None, None).

    """
    # Motifs read from 'pfm' or 'sites' files are created with a matrix ID
    # of None, which has no split() method.
    if id is None:
        return (None, None)

    id_split = id.split('.')
    if len(id_split) == 2:
        return (id_split[0], id_split[1])
    return (id, None)
372