Package Bio :: Package motifs :: Package jaspar
[hide private]
[frames | no frames]

Source Code for Package Bio.motifs.jaspar

  1  # Copyright 2013 by Anthony Mathelier and David Arenillas. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """JASPAR2014 module.""" 
  7   
  8  from Bio.Seq import Seq 
  9  from Bio.Alphabet.IUPAC import unambiguous_dna as dna 
 10  import re 
 11  import math 
 12   
 13  from Bio._py3k import range 
 14   
 15  from Bio import motifs 
class Motif(motifs.Motif):
    """A subclass of Bio.motifs.Motif used to represent a JASPAR profile.

    Additional metadata information are stored if available. The metadata
    availability depends on the source of the JASPAR motif (a 'pfm' format
    file, a 'jaspar' format file or a JASPAR database).
    """

    def __init__(self, matrix_id, name, alphabet=dna, instances=None,
                 counts=None, collection=None, tf_class=None, tf_family=None,
                 species=None, tax_group=None, acc=None, data_type=None,
                 medline=None, pazar_id=None, comment=None):
        """Construct a JASPAR Motif instance.

        ``alphabet``, ``instances`` and ``counts`` are forwarded to
        ``Bio.motifs.Motif.__init__``; every other argument is stored
        unchanged as an attribute of the same name.
        """
        motifs.Motif.__init__(self, alphabet, instances, counts)
        self.name = name
        self.matrix_id = matrix_id
        self.collection = collection
        self.tf_class = tf_class
        self.tf_family = tf_family
        # May have multiple so species is a list.
        # The species are actually specified as
        # taxonomy IDs.
        self.species = species
        self.tax_group = tax_group
        self.acc = acc  # May have multiple so acc is a list.
        self.data_type = data_type
        self.medline = medline
        self.pazar_id = pazar_id
        self.comment = comment

    @property
    def base_id(self):
        """Return the JASPAR base matrix ID."""
        (base_id, __) = split_jaspar_id(self.matrix_id)
        return base_id

    @property
    def version(self):
        """Return the JASPAR matrix version."""
        (__, version) = split_jaspar_id(self.matrix_id)
        return version

    def __str__(self):
        """Return a string represention of the JASPAR profile.

        We choose to provide only the filled metadata information: the TF
        name, the matrix ID and the count matrix are always written, every
        other field only when its value is truthy.
        """
        lines = [
            "TF name\t{0}\n".format(self.name),
            "Matrix ID\t{0}\n".format(self.matrix_id),
        ]
        # (label, value) pairs in output order.
        optional_fields = [
            ("Collection", self.collection),
            ("TF class", self.tf_class),
            ("TF family", self.tf_family),
            ("Species", self.species),
            ("Taxonomic group", self.tax_group),
            ("Accession", self.acc),
            ("Data type used", self.data_type),
            ("Medline", self.medline),
            ("PAZAR ID", self.pazar_id),
            ("Comments", self.comment),
        ]
        for label, value in optional_fields:
            if not value:
                continue
            if label == "Species":
                # Species is a list; it is written comma-separated.
                value = ",".join(value)
            lines.append("{0}\t{1}\n".format(label, value))
        lines.append("Matrix:\n{0}\n\n".format(self.counts))
        return "".join(lines)

    def __hash__(self):
        """Return the hash key corresponding to the JASPAR profile.

        :note: We assume the unicity of matrix IDs
        """
        return hash(self.matrix_id)

    def __eq__(self, other):
        """Return True if both objects carry the same matrix ID.

        Returns NotImplemented when ``other`` has no ``matrix_id``
        attribute, so that comparing a motif with an unrelated object
        evaluates to False instead of raising AttributeError.
        """
        try:
            other_id = other.matrix_id
        except AttributeError:
            return NotImplemented
        return self.matrix_id == other_id

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``, so it is spelled
        out here to keep both operators consistent.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
112
class Record(list):
    """A list of JASPAR motifs.

    Attributes:

     - version: The JASPAR version used

    """

    def __init__(self):
        """Create the record with its ``version`` attribute set to None."""
        self.version = None

    def __str__(self):
        """Return the string form of every motif, joined by newlines."""
        return "\n".join(map(str, self))

    def to_dict(self):
        """Return the motifs as a dictionary keyed by their matrix ID."""
        return dict((entry.matrix_id, entry) for entry in self)
135
def read(handle, format):
    """Read motif(s) from a file in one of several different JASPAR formats.

    The format name is compared case-insensitively and selects the parser:
    "pfm", "sites" or "jaspar". The record built by that parser is
    returned. Any other format name raises ValueError.
    """
    fmt = format.lower()
    if fmt == "pfm":
        return _read_pfm(handle)
    if fmt == "sites":
        return _read_sites(handle)
    if fmt == "jaspar":
        return _read_jaspar(handle)
    raise ValueError("Unknown JASPAR format %s" % fmt)
155
def write(motifs, format):
    """Return the representation of motifs in "pfm" or "jaspar" format.

    For "pfm" only the first motif is written, as four rows of counts in
    A, C, G, T order. For "jaspar" every motif is written as a
    ">matrix_id name" header followed by four "LETTER [counts]" rows.
    Counts are formatted with ``{0:6.2f}`` and separated by one space.

    The format name is matched case-insensitively, like in read(). Any
    other format raises ValueError.
    """
    letters = "ACGT"
    # read() lower-cases its format argument; do the same here so both
    # functions accept the same spellings. str() keeps a non-string format
    # on the ValueError path below instead of failing on .lower().
    fmt = str(format).lower()

    def format_row(counts, letter):
        # One row of the matrix: the fixed-width counts of a single letter.
        return " ".join("{0:6.2f}".format(value) for value in counts[letter])

    if fmt == "pfm":
        counts = motifs[0].counts
        lines = ["{0}\n".format(format_row(counts, letter))
                 for letter in letters]
    elif fmt == "jaspar":
        lines = []
        for m in motifs:
            lines.append(">{0} {1}\n".format(m.matrix_id, m.name))
            for letter in letters:
                lines.append(
                    "{0} [{1}]\n".format(letter, format_row(m.counts, letter))
                )
    else:
        raise ValueError("Unknown JASPAR format %s" % format)

    # Finished; glue the lines together
    return "".join(lines)
184
def _read_pfm(handle):
    """Read the motif from a JASPAR .pfm file (PRIVATE).

    Up to four lines are taken from ``handle`` and used, in order, as the
    A, C, G and T count rows. A row may start with its own letter; that
    label is dropped and the remaining fields are converted to floats.

    Returns a Record holding a single Motif whose matrix ID and name are
    both None and whose mask is one "*" per motif position.
    """
    counts = {}
    for base, row in zip("ACGT", handle):
        fields = row.split()
        # An optional leading row label (the letter itself) is not a count.
        start = 1 if fields[0] == base else 0
        counts[base] = [float(field) for field in fields[start:]]

    motif = Motif(matrix_id=None, name=None, alphabet=dna, counts=counts)
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
205
def _read_sites(handle):
    """Read the motif from JASPAR .sites file (PRIVATE).

    The file is consumed as pairs of lines: a header starting with ">"
    followed by a sequence line. From each sequence line only the
    characters equal to their own upper-case form are kept (so lower-case
    letters are dropped). Reading stops at the first line, in header
    position, that does not start with ">".

    Returns a Record holding a single Motif built from those instances,
    with matrix ID and name both None.
    """
    sequences = []
    for header in handle:
        if not header.startswith(">"):
            break
        # The sequence belonging to a ">" header is on the following line.
        sequence_line = next(handle)
        kept = "".join(
            char for char in sequence_line.strip() if char == char.upper()
        )
        sequences.append(Seq(kept, dna))

    instances = motifs.Instances(sequences, dna)
    motif = Motif(matrix_id=None, name=None, alphabet=dna,
                  instances=instances)
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
234
def _read_jaspar(handle):
    """Read motifs from a JASPAR formatted file (PRIVATE).

    Format is one or more records of the form, e.g.::

      >MA0001.1 AGL3
      A  [ 0  3 79 40 66 48 65 11 65  0 ]
      C  [94 75  4  3  1  2  5  2  3  3 ]
      G  [ 1  0  3  4  1  0  5  3 28 88 ]
      T  [ 2 19 11 50 29 47 22 81  1  6 ]

    or::

      >MA0001.1 AGL3
       0  3 79 40 66 48 65 11 65  0
      94 75  4  3  1  2  5  2  3  3
       1  0  3  4  1  0  5  3 28 88
       2 19 11 50 29 47 22 81  1  6

    In the second form the rows are assigned to A, C, G and T in that
    order. A motif is appended to the returned Record each time four count
    rows have been read. When the header has no name, the identifier is
    used as the name. Blank lines are skipped.
    """
    alphabet = dna
    record = Record()

    head_pat = re.compile(r"^>\s*(\S+)(\s+(\S+))?")
    row_pat_long = re.compile(r"\s*([ACGT])\s*\[\s*(.*)\s*\]")

    nucleotides = ["A", "C", "G", "T"]
    identifier = None
    name = None
    counts = {}
    row_count = 0

    for line in handle:
        # str.rstrip returns a new string; the result has to be kept.
        line = line.rstrip("\r\n")
        if not line.strip():
            # A blank line is not an (empty) row of counts.
            continue

        head_match = head_pat.match(line)
        if head_match:
            identifier = head_match.group(1)
            # Group 2 is the optional "<whitespace><name>" part as a whole;
            # the name without its leading whitespace is group 3.
            name = head_match.group(3) or identifier
            continue

        row_match_long = row_pat_long.match(line)
        if row_match_long:
            (letter, counts_str) = row_match_long.group(1, 2)
        else:
            # Bare row of numbers: the letter is implied by the row position.
            letter = nucleotides[row_count]
            counts_str = line
        counts[letter] = [float(x) for x in counts_str.split()]
        row_count += 1

        if row_count == 4:
            record.append(Motif(identifier, name, alphabet=alphabet,
                                counts=counts))
            # Start collecting the next motif from a clean state.
            identifier = None
            name = None
            counts = {}
            row_count = 0

    return record
308
def calculate_pseudocounts(motif):
    """Return a dictionary of pseudocounts, one per alphabet letter.

    Each pseudocount is the square root of the average column sum of
    ``motif.counts`` multiplied by the letter's share of the background.
    ``motif.background`` is normalised to sum to one; when it is empty or
    None every letter gets the same share.
    """
    letters = motif.alphabet.letters
    weights = motif.background

    # It is possible to have unequal column sums so use the average
    # number of instances.
    column_totals = (
        sum(float(motif.counts[letter][position]) for letter in letters)
        for position in range(motif.length)
    )
    mean_instances = sum(column_totals) / motif.length
    scale = math.sqrt(mean_instances)

    # Work on a private dictionary so the motif's background is untouched.
    if weights:
        weights = dict(weights)
    else:
        weights = dict.fromkeys(sorted(letters), 1.0)
    weight_sum = sum(weights.values())

    pseudocounts = {}
    for letter in letters:
        weights[letter] /= weight_sum
        pseudocounts[letter] = scale * weights[letter]
    return pseudocounts
337
def split_jaspar_id(id):
    """Utility function to split a JASPAR matrix ID into its component.

    Components are base ID and version, both as strings, e.g. 'MA0047.2'
    is returned as ('MA0047', '2'). An ID that does not consist of exactly
    two dot-separated parts is returned whole with a version of None.
    """
    parts = id.split(".")
    if len(parts) != 2:
        return (id, None)
    base_id, version = parts
    return (base_id, version)
357