Package Bio :: Package motifs :: Package jaspar :: Module db
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.jaspar.db

  1  # Copyright 2013 by David Arenillas and Anthony Mathelier. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Provides read access to a JASPAR5 formatted database. 
  6   
  7  This module requires MySQLdb to be installed. 
  8   
  9  Example, substitute your database credentials as 
 10  appropriate: 
 11   
 12      >>> from Bio.motifs.jaspar.db import JASPAR5 
 13      >>>  
 14      >>> JASPAR_DB_HOST = "hostname.example.org" 
 15      >>> JASPAR_DB_NAME = "JASPAR_2013" 
 16      >>> JASPAR_DB_USER = "guest" 
 17      >>> JASPAR_DB_PASS = "guest" 
 18      >>>  
 19      >>> DFLT_COLLECTION = 'CORE' 
 20      >>> jdb = JASPAR5( 
 21      ...     host=JASPAR_DB_HOST, 
 22      ...     name=JASPAR_DB_NAME, 
 23      ...     user=JASPAR_DB_USER, 
 24      ...     password=JASPAR_DB_PASS 
 25      ... ) 
 26      >>>  
 27      >>>  
 28      >>> ets1 = jdb.fetch_motif_by_id('MA0098') 
 29      >>> print(ets1) 
 30      TF name ETS1 
 31      Matrix ID   MA0098.1 
 32      Collection  CORE 
 33      TF class    Winged Helix-Turn-Helix 
 34      TF family   Ets 
 35      Species 9606 
 36      Taxonomic group vertebrates 
 37      Accession   ['CAG47050'] 
 38      Data type used  SELEX 
 39      Medline 1542566 
 40      PAZAR ID    TF0000070 
 41      Comments    - 
 42      Matrix: 
 43              0      1      2      3      4      5 
 44      A:   4.00  17.00   0.00   0.00   0.00   5.00 
 45      C:  16.00   0.00   1.00  39.00  39.00   3.00 
 46      G:   4.00   0.00   0.00   1.00   0.00  17.00 
 47      T:  16.00  23.00  39.00   0.00   1.00  15.00 
 48   
 49   
 50      >>>  
 51      >>> motifs = jdb.fetch_motifs( 
 52      ...     collection = 'CORE', 
 53      ...     tax_group = ['vertebrates', 'insects'], 
 54      ...     tf_class = 'Winged Helix-Turn-Helix', 
 55      ...     tf_family = ['Forkhead', 'Ets'], 
 56      ...     min_ic = 12 
 57      ... ) 
 58      >>>  
 59      >>> for motif in motifs: 
 60      ...     pass # do something with the motif 
 61   
 62  """ 
 63   
 64  from __future__ import print_function 
 65   
 66  from Bio import MissingPythonDependencyError 
 67   
 68  try: 
 69      import MySQLdb as mdb 
 70  except: 
 71      raise MissingPythonDependencyError("Install MySQLdb if you want to use " 
 72                                         "Bio.motifs.jaspar.db") 
 73   
 74   
 75  from Bio.Alphabet.IUPAC import unambiguous_dna as dna 
 76   
 77  from Bio.motifs import jaspar, matrix 
 78  from warnings import warn 
 79   
 80  JASPAR_DFLT_COLLECTION = 'CORE' 
 81   
class JASPAR5(object):
    """
    Class representing a JASPAR5 DB. The methods within are loosely based
    on the perl TFBS::DB::JASPAR5 module.

    Note: We will only implement reading of JASPAR motifs from the DB.
    Unlike the perl module, we will not attempt to implement any methods to
    store JASPAR motifs or create a new DB at this time.

    All values supplied by the caller are passed to the database driver as
    query parameters; they are never spliced into the SQL text.

    """

    # MATRIX_ANNOTATION tags that are copied onto a motif, mapped to the name
    # of the motif attribute receiving the value. Other tags are ignored.
    _ANNOTATION_ATTRS = {
        'class': 'tf_class',
        'family': 'tf_family',
        'tax_group': 'tax_group',
        'type': 'data_type',
        'pazar_tf_id': 'pazar_id',
        'medline': 'medline',
        'comment': 'comment',
    }

    def __init__(self, host=None, name=None, user=None, password=None):
        """
        Construct a JASPAR5 instance and connect to specified DB

        Arguments:
        host - host name of the JASPAR DB server
        name - name of the JASPAR database
        user - user name to connect to the JASPAR DB
        password - JASPAR DB password

        """

        self.name = name
        self.host = host
        self.user = user
        self.password = password

        # MySQLdb.connect() positional order is (host, user, passwd, db).
        self.dbh = mdb.connect(host, user, password, name)

    def __str__(self):
        """
        Return a string representation of the JASPAR5 DB connection.

        The format is user\\@host:name. The backslash is part of the
        historical output and is kept for backward compatibility; a raw
        string is used so that it is not an invalid escape sequence.

        """

        return r"%s\@%s:%s" % (self.user, self.host, self.name)

    def fetch_motif_by_id(self, id):
        """
        Fetch a single JASPAR motif from the DB by its JASPAR matrix ID
        (e.g. 'MA0001.1').

        Arguments:
        id - JASPAR matrix ID. This may be a fully specified ID including the
             version number (e.g. MA0049.2) or just the base ID (e.g. MA0049).
             If only a base ID is provided, the latest version is returned.
        Returns:
        A Bio.motifs.jaspar.Motif object, or None (after a warning has been
        issued) if no motif with the given ID exists in the database.

        NOTE: The perl TFBS module allows you to specify the type of matrix to
        return (PFM, PWM, ICM) but matrices are always stored in JASPAR as
        PFMs so this does not really belong here. Once a PFM is fetched the
        pwm() and pssm() methods can be called to return the normalized and
        log-odds matrices.

        """

        # separate stable ID and version number
        (base_id, version) = jaspar.split_jaspar_id(id)
        if not version:
            # if ID contains no version portion, fetch latest version
            version = self._fetch_latest_version(base_id)

        # fetch internal JASPAR matrix ID - also a check for validity
        int_id = None
        if version:
            int_id = self._fetch_internal_id(base_id, version)

        if not int_id:
            return None

        # fetch JASPAR motif using internal ID
        return self._fetch_motif_by_internal_id(int_id)

    def fetch_motifs_by_name(self, name):
        """
        Fetch a list of JASPAR motifs from a JASPAR DB by the given TF name(s).

        Arguments:
        name - a single name or list of names
        Returns:
        A list of Bio.motifs.jaspar.Motif objects

        Notes:
        Names are not guaranteed to be unique. There may be more than one
        motif with the same name. Therefore even if name specifies a single
        name, a list of motifs is returned. This just calls
        self.fetch_motifs(collection = None, tf_name = name).

        This behaviour is different from the TFBS perl module's
        get_Matrix_by_name() method which always returns a single matrix,
        issuing a warning message and returning the first matrix retrieved
        in the case where multiple matrices have the same name.

        """

        return self.fetch_motifs(collection=None, tf_name=name)

    def fetch_motifs(
        self, collection=JASPAR_DFLT_COLLECTION, tf_name=None, tf_class=None,
        tf_family=None, matrix_id=None, tax_group=None, species=None,
        pazar_id=None, data_type=None, medline=None, min_ic=0, min_length=0,
        min_sites=0, all=False, all_versions=False
    ):
        """
        Fetch a jaspar.Record (list) of motifs based on the provided selection
        criteria.

        Arguments:
        Except where obvious, all selection criteria arguments may be specified
        as a single value or a list of values. Motifs must meet ALL the
        specified selection criteria to be returned with the precedence
        exceptions noted below.

        all         - Takes precedence over all other selection criteria.
                      Every motif is returned.
        matrix_id   - Takes precedence over all other selection criteria except
                      'all'. Only motifs with the given JASPAR matrix ID(s)
                      are returned. A matrix ID may be specified as just a base
                      ID or full JASPAR IDs including version number. If only a
                      base ID is provided for specific motif(s), then just the
                      latest version of those motif(s) are returned unless
                      'all_versions' is also specified.
        collection  - Only motifs from the specified JASPAR collection(s)
                      are returned. NOTE - if not specified, the collection
                      defaults to CORE for all other selection criteria except
                      'all' and 'matrix_id'. To apply the other selection
                      criteria across all JASPAR collections, explicitly set
                      collection=None.
        tf_name     - Only motifs with the given name(s) are returned.
        tf_class    - Only motifs of the given TF class(es) are returned.
        tf_family   - Only motifs from the given TF families are returned.
        tax_group   - Only motifs belonging to the given taxonomic supergroups
                      are returned (e.g. 'vertebrates', 'insects', 'nematodes'
                      etc.)
        species     - Only motifs derived from the given species are returned.
                      Species are specified as taxonomy IDs.
        data_type   - Only motifs generated with the given data type (e.g.
                      'ChIP-seq', 'PBM', 'SELEX' etc.) are returned. NOTE -
                      must match exactly as stored in the database.
        pazar_id    - Only motifs with the given PAZAR TF ID are returned.
        medline     - Only motifs with the given medline (PubMed IDs) are
                      returned.
        min_ic      - Only motifs whose profile matrices have at least this
                      information content (specificity) are returned.
        min_length  - Only motifs whose profiles are of at least this length
                      are returned.
        min_sites   - Only motifs compiled from at least these many binding
                      sites are returned.
        all_versions- Unless specified, just the latest version of motifs
                      determined by the other selection criteria are returned
                      otherwise all versions of the selected motifs are
                      returned.

        Returns:
        A Bio.motifs.jaspar.Record (list) of motifs.

        """

        # Fetch the internal IDs of the motifs using the criteria provided
        int_ids = self._fetch_internal_id_list(
            collection=collection,
            tf_name=tf_name,
            tf_class=tf_class,
            tf_family=tf_family,
            matrix_id=matrix_id,
            tax_group=tax_group,
            species=species,
            pazar_id=pazar_id,
            data_type=data_type,
            medline=medline,
            all=all,
            all_versions=all_versions
        )

        record = jaspar.Record()

        # Now further filter the motifs returned above based on any specified
        # matrix specific criteria, which cannot be evaluated in SQL.
        for int_id in int_ids:
            motif = self._fetch_motif_by_internal_id(int_id)

            # Filter motifs to those with matrix IC of at least min_ic
            if min_ic and motif.pssm.mean() < min_ic:
                continue

            # Filter motifs to those with minimum length of min_length
            if min_length and motif.length < min_length:
                continue

            # XXX We could also supply a max_length filter.

            # Filter motifs to those composed of at least this many sites.
            # The perl TFBS module assumes column sums may be different; here
            # only the first column sum is used as the number of sites.
            if min_sites:
                num_sites = sum(
                    motif.counts[nt][0] for nt in motif.alphabet.letters
                )
                if num_sites < min_sites:
                    continue

            record.append(motif)

        return record

    def _fetch_latest_version(self, base_id):
        """
        Get the latest version number for the given base_id.

        Returns the highest VERSION stored for base_id, or None (after
        issuing a warning) if the base ID is not present in the MATRIX table.

        """

        cur = self.dbh.cursor()
        cur.execute(
            "select VERSION from MATRIX where BASE_id = %s "
            "order by VERSION desc limit 1",
            (base_id,)
        )

        row = cur.fetchone()
        if not row:
            # No such base ID: report it instead of failing on row[0].
            warn(
                "Failed to fetch latest version number for JASPAR motif "
                "with base ID '%s'. No JASPAR motif with this base ID "
                "appears to exist in the database." % (base_id,)
            )
            return None

        return row[0]

    def _fetch_internal_id(self, base_id, version):
        """
        Fetch the internal id for a base id + version. Also checks if this
        combo exists or not.

        Returns the internal (MATRIX.id) identifier, or None (after issuing
        a warning) if no matrix with this base ID and version exists.

        """

        cur = self.dbh.cursor()
        cur.execute(
            "select id from MATRIX where BASE_id = %s and VERSION = %s",
            (base_id, version)
        )

        row = cur.fetchone()
        if not row:
            warn(
                "Failed to fetch internal database ID for JASPAR motif "
                "with matrix ID '%s.%s'. No JASPAR motif with this matrix "
                "ID appears to exist." % (base_id, version)
            )
            return None

        return row[0]

    def _fetch_motif_by_internal_id(self, int_id):
        """
        Build a jaspar.Motif for the matrix with the given internal ID.

        Reads the basic information from MATRIX, the counts from MATRIX_DATA
        (via _fetch_counts_matrix), the species from MATRIX_SPECIES, the
        protein accessions from MATRIX_PROTEIN and the remaining annotation
        from MATRIX_ANNOTATION.

        Arguments:
        int_id - internal (MATRIX.id) identifier, as returned by the
                 _fetch_internal_id* methods
        Returns:
        A Bio.motifs.jaspar.Motif object

        """

        cur = self.dbh.cursor()

        # fetch basic motif information
        cur.execute(
            "select BASE_ID, VERSION, COLLECTION, NAME from MATRIX "
            "where id = %s",
            (int_id,)
        )
        (base_id, version, collection, name) = cur.fetchone()[:4]

        matrix_id = "".join([base_id, '.', str(version)])

        # fetch the counts matrix
        counts = self._fetch_counts_matrix(int_id)

        # Create new JASPAR motif
        motif = jaspar.Motif(
            matrix_id, name, collection=collection, counts=counts
        )

        # fetch species
        cur.execute(
            "select TAX_ID from MATRIX_SPECIES where id = %s", (int_id,)
        )
        motif.species = [row[0] for row in cur.fetchall()]

        # fetch protein accession numbers
        cur.execute(
            "select ACC FROM MATRIX_PROTEIN where id = %s", (int_id,)
        )
        motif.acc = [row[0] for row in cur.fetchall()]

        # fetch remaining annotation as tags from the ANNOTATION table
        cur.execute(
            "select TAG, VAL from MATRIX_ANNOTATION where id = %s", (int_id,)
        )
        for (tag, val) in cur.fetchall():
            attr = self._ANNOTATION_ATTRS.get(tag)
            if attr is not None:
                setattr(motif, attr, val)
            # TODO If we were to implement additional arbitrary tags:
            # motif.tag(tag, val)

        return motif

    def _fetch_counts_matrix(self, int_id):
        """
        Fetch the counts matrix from the JASPAR DB by the internal ID

        Returns a Bio.motifs.matrix.GenericPositionMatrix

        """
        counts = {}
        cur = self.dbh.cursor()

        # One query per nucleotide; the values come back ordered by column.
        for base in dna.letters:
            cur.execute(
                "select val from MATRIX_DATA where ID = %s and row = %s "
                "order by col",
                (int_id, base)
            )
            counts[base] = [float(row[0]) for row in cur.fetchall()]

        return matrix.GenericPositionMatrix(dna, counts)

    @staticmethod
    def _sql_text(value):
        """
        Return value unchanged if it is already a string, else str(value).

        The filtered columns are compared as text (the original queries
        quoted every value), so numeric IDs are sent as strings.

        """
        if isinstance(value, (bytes, type(u""))):
            return value
        return str(value)

    @staticmethod
    def _value_filter(column, values):
        """
        Build a parameterized SQL condition restricting column to values.

        Arguments:
        column - SQL column expression (trusted, never user supplied)
        values - a single value, or a non-empty list/tuple of values
        Returns:
        A (clause, params) tuple where clause contains one %s placeholder
        per value and params is the matching list of parameter values.

        """
        if isinstance(values, (list, tuple)):
            params = [JASPAR5._sql_text(value) for value in values]
            placeholders = ", ".join(["%s"] * len(params))
            return "%s in (%s)" % (column, placeholders), params

        return column + " = %s", [JASPAR5._sql_text(values)]

    def _fetch_internal_id_list(
        self, collection=JASPAR_DFLT_COLLECTION, tf_name=None, tf_class=None,
        tf_family=None, matrix_id=None, tax_group=None, species=None,
        pazar_id=None, data_type=None, medline=None, all=False,
        all_versions=False
    ):
        """
        Fetch a list of internal JASPAR motif IDs based on various passed
        parameters which may then be used to fetch the rest of the motif data.

        Caller:
            fetch_motifs()

        Arguments:
            See arguments sections of fetch_motifs()

        Returns:
            A list of internal JASPAR motif IDs which match the given
            selection criteria arguments.


        Build an SQL query based on the selection arguments provided.

        1: First add table joins and sub-clauses for criteria corresponding to
           named fields from the MATRIX and MATRIX_SPECIES tables such as
           collection, matrix ID, name, species etc.

        2: Then add joins/sub-clauses for tag/value parameters from the
           MATRIX_ANNOTATION table.

        For the surviving matrices, the responsibility to do matrix-based
        feature filtering such as ic, number of sites etc, fall on the
        calling fetch_motifs() method.

        """

        int_ids = []

        cur = self.dbh.cursor()

        # Special case 1: fetch ALL motifs. Highest priority.
        # Ignore all other selection arguments.
        if all:
            cur.execute("select ID from MATRIX")
            return [row[0] for row in cur.fetchall()]

        # Special case 2: fetch specific motifs by their JASPAR IDs. This
        # has higher priority than any other except the above 'all' case.
        # Ignore all other selection arguments.
        if matrix_id:
            # Accept a single ID as well as a list; iterating over a bare
            # string would otherwise process it character by character.
            if not isinstance(matrix_id, (list, tuple)):
                matrix_id = [matrix_id]

            # These might be either stable IDs or stable_ID.version.
            for jaspar_id in matrix_id:
                (base_id, version) = jaspar.split_jaspar_id(jaspar_id)

                if all_versions:
                    # the version part, if any, is deliberately ignored
                    cur.execute(
                        "select ID from MATRIX where BASE_ID = %s",
                        (base_id,)
                    )
                    int_ids.extend(row[0] for row in cur.fetchall())
                    continue

                # only the latest version, or the requested version
                if not version:
                    version = self._fetch_latest_version(base_id)

                int_id = None
                if version:
                    int_id = self._fetch_internal_id(base_id, version)

                if int_id:
                    int_ids.append(int_id)

            return int_ids

        tables = ["MATRIX m"]
        where_clauses = []
        # Query parameters, in the same order as the %s placeholders that
        # are appended to where_clauses.
        params = []

        # Select by MATRIX.COLLECTION
        if collection:
            (clause, values) = self._value_filter("m.COLLECTION", collection)
            where_clauses.append(clause)
            params.extend(values)

        # Select by MATRIX.NAME
        if tf_name:
            (clause, values) = self._value_filter("m.NAME", tf_name)
            where_clauses.append(clause)
            params.extend(values)

        # Select by MATRIX_SPECIES.TAX_ID
        # NOTE: species are numeric taxonomy IDs but stored as varchars
        # in the DB.
        if species:
            tables.append("MATRIX_SPECIES ms")
            where_clauses.append("m.ID = ms.ID")
            (clause, values) = self._value_filter("ms.TAX_ID", species)
            where_clauses.append(clause)
            params.extend(values)

        # Tag based selection from MATRIX_ANNOTATION
        # Differs from perl TFBS module in that the matrix class explicitly
        # has a tag attribute corresponding to the tags in the database. This
        # provides tremendous flexibility in adding new tags to the DB and
        # being able to select based on those tags without adding new code.
        # In the JASPAR Motif class we have elected to use specific attributes
        # for the most commonly used tags and here correspondingly only allow
        # selection on these attributes.
        #
        # Each requested tag joins its own alias (ma1..ma6) of
        # MATRIX_ANNOTATION, restricted to that TAG and the given VAL(s).
        tag_filters = (
            ('class', tf_class),
            ('family', tf_family),
            ('pazar_tf_id', pazar_id),
            ('medline', medline),
            ('type', data_type),
            ('tax_group', tax_group),
        )
        for (index, (tag, tag_values)) in enumerate(tag_filters, 1):
            if not tag_values:
                continue

            alias = "ma%d" % index
            tables.append("MATRIX_ANNOTATION %s" % alias)
            where_clauses.append("m.ID = %s.ID" % alias)
            # tag is one of the fixed literals above, never user input
            where_clauses.append("%s.TAG = '%s'" % (alias, tag))

            (clause, values) = self._value_filter(
                "%s.VAL" % alias, tag_values
            )
            where_clauses.append(clause)
            params.extend(values)

        sql = "".join(["select distinct(m.ID) from ", ", ".join(tables)])

        if where_clauses:
            sql = "".join([sql, " where ", " and ".join(where_clauses)])

        # Pass None rather than an empty list when there is nothing to bind.
        cur.execute(sql, params or None)

        for row in cur.fetchall():
            int_id = row[0]
            # Keep every version if asked, otherwise only the latest one.
            if all_versions or self._is_latest_version(int_id):
                int_ids.append(int_id)

        if len(int_ids) < 1:
            warn("Warning: Zero motifs returned with current select critera")

        return int_ids

    def _is_latest_version(self, int_id):
        """
        Does this internal ID represent the latest version of the JASPAR
        matrix (collapse on base ids)

        Returns True if no matrix with the same BASE_ID has a higher VERSION.

        """
        cur = self.dbh.cursor()

        cur.execute(
            "select count(*) from MATRIX where "
            "BASE_ID = (select BASE_ID from MATRIX where ID = %s) and "
            "VERSION > (select VERSION from MATRIX where ID = %s)",
            (int_id, int_id)
        )

        # zero rows with a higher version and the same base id => latest
        return cur.fetchone()[0] == 0
734