Package Bio :: Package motifs :: Package jaspar
[hide private]
[frames | no frames]

Source Code for Package Bio.motifs.jaspar

  1  # Copyright 2013 by Anthony Mathelier and David Arenillas. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  from Bio.Seq import Seq 
  6  from Bio.Alphabet.IUPAC import unambiguous_dna as dna 
  7  import re 
  8  import math 
  9   
 10  from Bio._py3k import range 
 11   
 12  from Bio import motifs 
class Motif(motifs.Motif):
    """A Bio.motifs.Motif subclass representing a JASPAR profile.

    On top of what the base class holds, an instance stores whatever JASPAR
    metadata was supplied.  How much metadata is available depends on the
    source of the motif (a 'pfm' format file, a 'jaspar' format file or a
    JASPAR database).
    """

    def __init__(self, matrix_id, name, alphabet=dna, instances=None,
                 counts=None, collection=None, tf_class=None, tf_family=None,
                 species=None, tax_group=None, acc=None, data_type=None,
                 medline=None, pazar_id=None, comment=None):
        """Construct a JASPAR Motif instance.

        ``alphabet``, ``instances`` and ``counts`` are passed on to
        ``motifs.Motif.__init__``; every other argument is stored unchanged
        on the instance under the attribute of the same name.
        """
        motifs.Motif.__init__(self, alphabet, instances, counts)
        self.name = name
        self.matrix_id = matrix_id
        self.collection = collection
        self.tf_class = tf_class
        self.tf_family = tf_family
        # May have multiple, so species is a list.  The species are actually
        # specified as taxonomy IDs.
        self.species = species
        self.tax_group = tax_group
        # May have multiple, so acc is a list.
        self.acc = acc
        self.data_type = data_type
        self.medline = medline
        self.pazar_id = pazar_id
        self.comment = comment

    @property
    def base_id(self):
        """Return the JASPAR base matrix ID (first item of split_jaspar_id)."""
        (base_id, version) = split_jaspar_id(self.matrix_id)
        return base_id

    @property
    def version(self):
        """Return the JASPAR matrix version (second item of split_jaspar_id)."""
        (base_id, version) = split_jaspar_id(self.matrix_id)
        return version

    def __str__(self):
        """Return a string representation of the JASPAR profile.

        The TF name, the matrix ID and the count matrix are always written;
        a metadata field is written only when its value is truthy.
        """
        parts = [
            "TF name\t{0}\n".format(self.name),
            "Matrix ID\t{0}\n".format(self.matrix_id),
        ]
        if self.collection:
            parts.append("Collection\t{0}\n".format(self.collection))
        if self.tf_class:
            parts.append("TF class\t{0}\n".format(self.tf_class))
        if self.tf_family:
            parts.append("TF family\t{0}\n".format(self.tf_family))
        if self.species:
            # The only field whose items are joined into one string.
            parts.append("Species\t{0}\n".format(",".join(self.species)))
        if self.tax_group:
            parts.append("Taxonomic group\t{0}\n".format(self.tax_group))
        if self.acc:
            parts.append("Accession\t{0}\n".format(self.acc))
        if self.data_type:
            parts.append("Data type used\t{0}\n".format(self.data_type))
        if self.medline:
            parts.append("Medline\t{0}\n".format(self.medline))
        if self.pazar_id:
            parts.append("PAZAR ID\t{0}\n".format(self.pazar_id))
        if self.comment:
            parts.append("Comments\t{0}\n".format(self.comment))
        parts.append("Matrix:\n{0}\n\n".format(self.counts))
        return "".join(parts)

    def __hash__(self):
        """Return the hash key corresponding to the JASPAR profile.

        :note: We assume the unicity of matrix IDs
        """
        return hash(self.matrix_id)

    def __eq__(self, other):
        """Return True when both objects carry the same matrix ID.

        Objects without a ``matrix_id`` attribute are not comparable, so
        NotImplemented is returned and Python falls back to its default
        handling instead of raising AttributeError.
        """
        try:
            return self.matrix_id == other.matrix_id
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``, so it is spelled
        out to keep both operators consistent there.
        """
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome
117
class Record(list):
    """A list of JASPAR motifs.

    Attribute:
     o version: The JASPAR version used (None on a new record)
    """

    def __init__(self):
        """Create an empty record with no JASPAR version set."""
        self.version = None

    def __str__(self):
        """Return the string form of each motif, joined by newlines."""
        return "\n".join([str(entry) for entry in self])

    def to_dict(self):
        """Return the motifs as a dictionary keyed by their matrix ID."""
        return dict((entry.matrix_id, entry) for entry in self)
144
def read(handle, format):
    """Read motif(s) from a handle in one of the JASPAR formats.

    The format name is matched case-insensitively against "pfm", "sites"
    and "jaspar" and the matching private reader is called with the handle;
    its record is returned.  Any other name raises ValueError.
    """
    format = format.lower()
    if format == "pfm":
        return _read_pfm(handle)
    if format == "sites":
        return _read_sites(handle)
    if format == "jaspar":
        return _read_jaspar(handle)
    raise ValueError("Unknown JASPAR format %s" % format)
164
def write(motifs, format):
    """Return the representation of the motifs in "pfm" or "jaspar" format.

    The format name is matched case-insensitively, as read() does.

    - "pfm": only the first motif is written, as four rows of counts in
      A, C, G, T order.
    - "jaspar": every motif is written as a ">matrix_id name" header
      followed by four "LETTER [counts]" rows.

    Counts are formatted with "{0:6.2f}".  Raises ValueError for any other
    format name.
    """
    letters = "ACGT"
    # Normalise the name so that "PFM" / "JASPAR" are accepted here just as
    # they are when reading.
    format = format.lower()
    lines = []
    if format == "pfm":
        # A pfm file holds a single matrix, so only motifs[0] is used.
        counts = motifs[0].counts
        for letter in letters:
            terms = ["{0:6.2f}".format(value) for value in counts[letter]]
            lines.append("{0}\n".format(" ".join(terms)))
    elif format == "jaspar":
        for m in motifs:
            counts = m.counts
            lines.append(">{0} {1}\n".format(m.matrix_id, m.name))
            for letter in letters:
                terms = ["{0:6.2f}".format(value) for value in counts[letter]]
                lines.append("{0} [{1}]\n".format(letter, " ".join(terms)))
    else:
        raise ValueError("Unknown JASPAR format %s" % format)

    # Finished; glue the lines together
    return "".join(lines)
194
def _read_pfm(handle):
    """Read the motif from a JASPAR .pfm file.

    The first four lines of the handle are taken as the count rows for
    A, C, G and T, in that order.  Returns a Record holding one Motif whose
    matrix_id and name are both None.
    """
    counts = {}
    for letter, row in zip("ACGT", handle):
        fields = row.split()
        # A row may start with its own letter as a label; drop it.
        if fields[0] == letter:
            fields = fields[1:]
        counts[letter] = [float(field) for field in fields]

    motif = Motif(matrix_id=None, name=None, alphabet=dna, counts=counts)
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
217
def _read_sites(handle):
    """Read the motif from a JASPAR .sites file.

    The handle is consumed as pairs of lines: a header starting with ">"
    followed by a sequence line.  Reading stops at the first line in header
    position that does not start with ">".  Returns a Record holding one
    Motif built from the collected instances.
    """
    alphabet = dna
    sites = []

    for header in handle:
        if not header.startswith(">"):
            break
        # The header itself is not used; the sequence is on the next line.
        sequence_line = next(handle)
        # Keep only the characters that upper() leaves unchanged, which
        # drops the lower-case letters.
        kept = "".join(
            [char for char in sequence_line.strip() if char == char.upper()]
        )
        sites.append(Seq(kept, alphabet))

    instances = motifs.Instances(sites, alphabet)
    motif = Motif(
        matrix_id=None, name=None, alphabet=alphabet, instances=instances
    )
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
249
def _read_jaspar(handle):
    """Read motifs from a JASPAR formatted file.

    Format is one or more records of the form, e.g.:

        >MA0001.1 AGL3
        A  [ 0  3 79 40 66 48 65 11 65  0 ]
        C  [94 75  4  3  1  2  5  2  3  3 ]
        G  [ 1  0  3  4  1  0  5  3 28 88 ]
        T  [ 2 19 11 50 29 47 22 81  1  6 ]

    A header line supplies the matrix ID and, optionally, the name; when no
    name is given the matrix ID is used as the name.  A Motif is appended to
    the returned Record each time four count rows have been read.  Lines
    matching neither the header nor the row pattern are ignored.
    """
    alphabet = dna
    counts = {}

    record = Record()

    # group(1) is the matrix ID; group(2) is the optional "<space>name"
    # part and group(3) the name on its own.
    head_pat = re.compile(r"^>\s*(\S+)(\s+(\S+))?")
    row_pat = re.compile(r"\s*([ACGT])\s*\[\s*(.*)\s*\]")

    identifier = None
    name = None
    row_count = 0
    for line in handle:
        line = line.rstrip("\r\n")

        head_match = head_pat.match(line)
        if head_match:
            identifier = head_match.group(1)
            # Use group(3), not group(2): group(2) still carries the
            # whitespace that separates the ID from the name.
            name = head_match.group(3) or identifier
            continue

        row_match = row_pat.match(line)
        if not row_match:
            continue

        (letter, counts_str) = row_match.group(1, 2)
        counts[letter] = [float(x) for x in counts_str.split()]
        row_count += 1

        if row_count == 4:
            record.append(Motif(identifier, name, alphabet=alphabet,
                                counts=counts))

            # Reset the per-motif state for the next record.
            identifier = None
            name = None
            counts = {}
            row_count = 0

    return record
306
def calculate_pseudocounts(motif):
    """Return a dict of per-letter pseudocounts for the motif.

    Each pseudocount is the square root of the average column total of
    motif.counts multiplied by the letter's normalised background
    frequency.  A falsy motif.background is treated as uniform.
    """
    letters = motif.alphabet.letters
    background = motif.background

    # It is possible to have unequal column sums, so the average number of
    # instances over all columns is used.
    column_totals = [
        sum(float(motif.counts[letter][i]) for letter in letters)
        for i in range(motif.length)
    ]
    mean_instances = sum(column_totals) / motif.length
    scale = math.sqrt(mean_instances)

    if background:
        weights = dict(background)
    else:
        weights = dict.fromkeys(sorted(letters), 1.0)
    weight_sum = sum(weights.values())

    pseudocounts = {}
    for letter in letters:
        pseudocounts[letter] = scale * (weights[letter] / weight_sum)
    return pseudocounts
333
def split_jaspar_id(id):
    """Split a JASPAR matrix ID into its base ID and version.

    For example 'MA0047.2' is returned as ('MA0047', '2'); the version is
    kept as a string.  An ID that does not consist of exactly two
    dot-separated parts is returned whole as the base ID, with a version
    of None.  An ID of None gives (None, None) instead of raising
    AttributeError.
    """
    if id is None:
        return (None, None)

    parts = id.split(".")
    if len(parts) == 2:
        return (parts[0], parts[1])
    return (id, None)
351