Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33  __docformat__ = "restructuredtext en" 
  34   
  35   
36 -class DatabaseLoader(object):
37 """Object used to load SeqRecord objects into a BioSQL database.""" 38
39 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
40 """Initialize with connection information for the database. 41 42 Creating a DatabaseLoader object is normally handled via the 43 BioSeqDatabase DBServer object, for example:: 44 45 from BioSQL import BioSeqDatabase 46 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 47 passwd = "biosql", host = "localhost", db="test_biosql") 48 try: 49 db = server["test"] 50 except KeyError: 51 db = server.new_database("test", description="For testing GBrowse") 52 """ 53 self.adaptor = adaptor 54 self.dbid = dbid 55 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
56
57 - def load_seqrecord(self, record):
58 """Load a Biopython SeqRecord into the database.""" 59 bioentry_id = self._load_bioentry_table(record) 60 self._load_bioentry_date(record, bioentry_id) 61 self._load_biosequence(record, bioentry_id) 62 self._load_comment(record, bioentry_id) 63 self._load_dbxrefs(record, bioentry_id) 64 references = record.annotations.get('references', ()) 65 for reference, rank in zip(references, list(range(len(references)))): 66 self._load_reference(reference, rank, bioentry_id) 67 self._load_annotations(record, bioentry_id) 68 for seq_feature_num in range(len(record.features)): 69 seq_feature = record.features[seq_feature_num] 70 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
71
72 - def _get_ontology_id(self, name, definition=None):
73 """Returns the identifier for the named ontology (PRIVATE). 74 75 This looks through the onotology table for a the given entry name. 76 If it is not found, a row is added for this ontology (using the 77 definition if supplied). In either case, the id corresponding to 78 the provided name is returned, so that you can reference it in 79 another table. 80 """ 81 oids = self.adaptor.execute_and_fetch_col0( 82 "SELECT ontology_id FROM ontology WHERE name = %s", 83 (name,)) 84 if oids: 85 return oids[0] 86 self.adaptor.execute( 87 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 88 (name, definition)) 89 return self.adaptor.last_id("ontology")
90
91 - def _get_term_id(self, 92 name, 93 ontology_id=None, 94 definition=None, 95 identifier=None):
96 """Get the id that corresponds to a term (PRIVATE). 97 98 This looks through the term table for a the given term. If it 99 is not found, a new id corresponding to this term is created. 100 In either case, the id corresponding to that term is returned, so 101 that you can reference it in another table. 102 103 The ontology_id should be used to disambiguate the term. 104 """ 105 106 # try to get the term id 107 sql = r"SELECT term_id FROM term " \ 108 r"WHERE name = %s" 109 fields = [name] 110 if ontology_id: 111 sql += ' AND ontology_id = %s' 112 fields.append(ontology_id) 113 id_results = self.adaptor.execute_and_fetchall(sql, fields) 114 # something is wrong 115 if len(id_results) > 1: 116 raise ValueError("Multiple term ids for %s: %r" % 117 (name, id_results)) 118 elif len(id_results) == 1: 119 return id_results[0][0] 120 else: 121 sql = r"INSERT INTO term (name, definition," \ 122 r" identifier, ontology_id)" \ 123 r" VALUES (%s, %s, %s, %s)" 124 self.adaptor.execute(sql, (name, definition, 125 identifier, ontology_id)) 126 return self.adaptor.last_id("term")
127
128 - def _add_dbxref(self, dbname, accession, version):
129 """Insert a dbxref and return its id.""" 130 self.adaptor.execute( 131 "INSERT INTO dbxref(dbname, accession, version)" 132 " VALUES (%s, %s, %s)", (dbname, accession, version)) 133 return self.adaptor.last_id("dbxref")
134
135 - def _get_taxon_id(self, record):
136 """Get the taxon id for this record (PRIVATE). 137 138 record - a SeqRecord object 139 140 This searches the taxon/taxon_name tables using the 141 NCBI taxon ID, scientific name and common name to find 142 the matching taxon table entry's id. 143 144 If the species isn't in the taxon table, and we have at 145 least the NCBI taxon ID, scientific name or common name, 146 at least a minimal stub entry is created in the table. 147 148 Returns the taxon id (database key for the taxon table, 149 not an NCBI taxon ID), or None if the taxonomy information 150 is missing. 151 152 See also the BioSQL script load_ncbi_taxonomy.pl which 153 will populate and update the taxon/taxon_name tables 154 with the latest information from the NCBI. 155 """ 156 157 # To find the NCBI taxid, first check for a top level annotation 158 ncbi_taxon_id = None 159 if "ncbi_taxid" in record.annotations: 160 # Could be a list of IDs. 161 if isinstance(record.annotations["ncbi_taxid"], list): 162 if len(record.annotations["ncbi_taxid"]) == 1: 163 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 164 else: 165 ncbi_taxon_id = record.annotations["ncbi_taxid"] 166 if not ncbi_taxon_id: 167 # Secondly, look for a source feature 168 for f in record.features: 169 if f.type == 'source': 170 quals = getattr(f, 'qualifiers', {}) 171 if "db_xref" in quals: 172 for db_xref in f.qualifiers["db_xref"]: 173 if db_xref.startswith("taxon:"): 174 ncbi_taxon_id = int(db_xref[6:]) 175 break 176 if ncbi_taxon_id: 177 break 178 179 try: 180 scientific_name = record.annotations["organism"][:255] 181 except KeyError: 182 scientific_name = None 183 try: 184 common_name = record.annotations["source"][:255] 185 except KeyError: 186 common_name = None 187 # Note: The maximum length for taxon names in the schema is 255. 188 # Cropping it now should help in getting a match when searching, 189 # and avoids an error if we try and add these to the database. 190 191 if ncbi_taxon_id: 192 # Good, we have the NCBI taxon to go on - this is unambiguous :) 193 # Note that the scientific name and common name will only be 194 # used if we have to record a stub entry. 195 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 196 scientific_name, 197 common_name) 198 199 if not common_name and not scientific_name: 200 # Nothing to go on... and there is no point adding 201 # a new entry to the database. We'll just leave this 202 # sequence's taxon as a NULL in the database. 203 return None 204 205 # Next, we'll try to find a match based on the species name 206 # (stored in GenBank files as the organism and/or the source). 207 if scientific_name: 208 taxa = self.adaptor.execute_and_fetch_col0( 209 "SELECT taxon_id FROM taxon_name" 210 " WHERE name_class = 'scientific name' AND name = %s", 211 (scientific_name,)) 212 if taxa: 213 # Good, mapped the scientific name to a taxon table entry 214 return taxa[0] 215 216 # Last chance... 217 if common_name: 218 taxa = self.adaptor.execute_and_fetch_col0( 219 "SELECT DISTINCT taxon_id FROM taxon_name" 220 " WHERE name = %s", 221 (common_name,)) 222 # Its natural that several distinct taxa will have the same common 223 # name - in which case we can't resolve the taxon uniquely. 224 if len(taxa) > 1: 225 raise ValueError("Taxa: %d species have name %r" % ( 226 len(taxa), 227 common_name)) 228 if taxa: 229 # Good, mapped the common name to a taxon table entry 230 return taxa[0] 231 232 # At this point, as far as we can tell, this species isn't 233 # in the taxon table already. So we'll have to add it. 234 # We don't have an NCBI taxonomy ID, so if we do record just 235 # a stub entry, there is no simple way to fix this later. 236 # 237 # TODO - Should we try searching the NCBI taxonomy using the 238 # species name? 239 # 240 # OK, let's try inserting the species. 241 # Chances are we don't have enough information ... 242 # Furthermore, it won't be in the hierarchy. 243 244 lineage = [] 245 for c in record.annotations.get("taxonomy", []): 246 lineage.append([None, None, c]) 247 if lineage: 248 lineage[-1][1] = "genus" 249 lineage.append([None, "species", record.annotations["organism"]]) 250 # XXX do we have them? 251 if "subspecies" in record.annotations: 252 lineage.append([None, "subspecies", 253 record.annotations["subspecies"]]) 254 if "variant" in record.annotations: 255 lineage.append([None, "varietas", 256 record.annotations["variant"]]) 257 lineage[-1][0] = ncbi_taxon_id 258 259 left_value = self.adaptor.execute_one( 260 "SELECT MAX(left_value) FROM taxon")[0] 261 if not left_value: 262 left_value = 0 263 left_value += 1 264 265 # XXX -- Brad: Fixing this for now in an ugly way because 266 # I am getting overlaps for right_values. I need to dig into this 267 # more to actually understand how it works. I'm not sure it is 268 # actually working right anyhow. 269 right_start_value = self.adaptor.execute_one( 270 "SELECT MAX(right_value) FROM taxon")[0] 271 if not right_start_value: 272 right_start_value = 0 273 right_value = right_start_value + 2 * len(lineage) - 1 274 275 parent_taxon_id = None 276 for taxon in lineage: 277 self.adaptor.execute( 278 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 279 " left_value, right_value)" 280 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 281 taxon[0], 282 taxon[1], 283 left_value, 284 right_value)) 285 taxon_id = self.adaptor.last_id("taxon") 286 self.adaptor.execute( 287 "INSERT INTO taxon_name(taxon_id, name, name_class)" 288 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 289 # Note the name field is limited to 255, some SwissProt files 290 # have a multi-species name which can be longer. So truncate this. 291 left_value += 1 292 right_value -= 1 293 parent_taxon_id = taxon_id 294 if common_name: 295 self.adaptor.execute( 296 "INSERT INTO taxon_name(taxon_id, name, name_class)" 297 "VALUES (%s, %s, 'common name')", ( 298 taxon_id, common_name)) 299 300 return taxon_id
301
302 - def _fix_name_class(self, entrez_name):
303 """Map Entrez name terms to those used in taxdump (PRIVATE). 304 305 We need to make this conversion to match the taxon_name.name_class 306 values used by the BioSQL load_ncbi_taxonomy.pl script. 307 308 e.g. 309 "ScientificName" -> "scientific name", 310 "EquivalentName" -> "equivalent name", 311 "Synonym" -> "synonym", 312 """ 313 # Add any special cases here: 314 # 315 # known = {} 316 # try: 317 # return known[entrez_name] 318 # except KeyError: 319 # pass 320 321 # Try automatically by adding spaces before each capital 322 def add_space(letter): 323 """Adds a space before a capital letter.""" 324 if letter.isupper(): 325 return " " + letter.lower() 326 else: 327 return letter
328 answer = "".join(add_space(letter) for letter in entrez_name).strip() 329 assert answer == answer.lower() 330 return answer
331
332 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 333 scientific_name=None, 334 common_name=None):
335 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 336 337 ncbi_taxon_id - string containing an NCBI taxon id 338 scientific_name - string, used if a stub entry is recorded 339 common_name - string, used if a stub entry is recorded 340 341 This searches the taxon table using ONLY the NCBI taxon ID 342 to find the matching taxon table entry's ID (database key). 343 344 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 345 flag is true, Biopython will attempt to go online using Bio.Entrez 346 to fetch the official NCBI lineage, recursing up the tree until an 347 existing entry is found in the database or the full lineage has been 348 fetched. 349 350 Otherwise the NCBI taxon ID, scientific name and common name are 351 recorded as a minimal stub entry in the taxon and taxon_name tables. 352 Any partial information about the lineage from the SeqRecord is NOT 353 recorded. This should mean that (re)running the BioSQL script 354 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 355 356 Returns the taxon id (database key for the taxon table, not 357 an NCBI taxon ID). 358 """ 359 assert ncbi_taxon_id 360 361 taxon_id = self.adaptor.execute_and_fetch_col0( 362 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 363 (int(ncbi_taxon_id),)) 364 if taxon_id: 365 # Good, we have mapped the NCBI taxid to a taxon table entry 366 return taxon_id[0] 367 368 # At this point, as far as we can tell, this species isn't 369 # in the taxon table already. So we'll have to add it. 370 371 parent_taxon_id = None 372 rank = "species" 373 genetic_code = None 374 mito_genetic_code = None 375 species_names = [] 376 if scientific_name: 377 species_names.append(("scientific name", scientific_name)) 378 if common_name: 379 species_names.append(("common name", common_name)) 380 381 if self.fetch_NCBI_taxonomy: 382 # Go online to get the parent taxon ID! 383 handle = Entrez.efetch( 384 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 385 taxonomic_record = Entrez.read(handle) 386 if len(taxonomic_record) == 1: 387 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 388 "%s versus %s" % (taxonomic_record[0]["TaxId"], 389 ncbi_taxon_id) 390 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( 391 taxonomic_record[0]["LineageEx"]) 392 rank = taxonomic_record[0]["Rank"] 393 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"] 394 mito_genetic_code = taxonomic_record[ 395 0]["MitoGeneticCode"]["MGCId"] 396 species_names = [("scientific name", 397 taxonomic_record[0]["ScientificName"])] 398 try: 399 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 400 name_class = self._fix_name_class(name_class) 401 if not isinstance(names, list): 402 # The Entrez parser seems to return single entry 403 # lists as just a string which is annoying. 404 names = [names] 405 for name in names: 406 # Want to ignore complex things like ClassCDE 407 # entries 408 if isinstance(name, basestring): 409 species_names.append((name_class, name)) 410 except KeyError: 411 # OtherNames isn't always present, 412 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 413 pass 414 else: 415 pass 416 # If we are not allowed to go online, we will record the bare minimum; 417 # as long as the NCBI taxon id is present, then (re)running 418 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 419 # (and update the species names). 420 # 421 # I am NOT going to try and record the lineage, even if it 422 # is in the record annotation as a list of names, as we won't 423 # know the NCBI taxon IDs for these parent nodes. 424 425 self.adaptor.execute( 426 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 427 " genetic_code, mito_genetic_code, left_value, right_value)" 428 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 429 ncbi_taxon_id, 430 rank, 431 genetic_code, 432 mito_genetic_code, 433 None, 434 None)) 435 taxon_id = self.adaptor.last_id("taxon") 436 437 # Record the scientific name, common name, etc 438 for name_class, name in species_names: 439 self.adaptor.execute( 440 "INSERT INTO taxon_name(taxon_id, name, name_class)" 441 " VALUES (%s, %s, %s)", (taxon_id, 442 name[:255], 443 name_class)) 444 return taxon_id
445
446 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
447 """This is recursive! (PRIVATE). 448 449 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 450 451 First dictionary in list is the taxonomy root, highest would be the species. 452 Each dictionary includes: 453 - TaxID (string, NCBI taxon id) 454 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 455 - ScientificName (string) 456 (and that is all at the time of writing) 457 458 This method will record all the lineage given, returning the taxon id 459 (database key, not NCBI taxon id) of the final entry (the species). 460 """ 461 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"] 462 463 # Is this in the database already? Check the taxon table... 464 taxon_id = self.adaptor.execute_and_fetch_col0( 465 "SELECT taxon_id FROM taxon" 466 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 467 if taxon_id: 468 # we could verify that the Scientific Name etc in the database 469 # is the same and update it or print a warning if not... 470 if isinstance(taxon_id, list): 471 assert len(taxon_id) == 1 472 return taxon_id[0] 473 else: 474 return taxon_id 475 476 # We have to record this. 477 if len(taxonomic_lineage) > 1: 478 # Use recursion to find out the taxon id (database key) of the 479 # parent. 480 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( 481 taxonomic_lineage[:-1]) 482 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 483 else: 484 parent_taxon_id = None 485 486 # INSERT new taxon 487 rank = taxonomic_lineage[-1].get("Rank", None) 488 self.adaptor.execute( 489 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)" 490 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank)) 491 taxon_id = self.adaptor.last_id("taxon") 492 assert isinstance(taxon_id, (int, long)), repr(taxon_id) 493 # ... and its name in taxon_name 494 scientific_name = taxonomic_lineage[-1].get("ScientificName", None) 495 if scientific_name: 496 self.adaptor.execute( 497 "INSERT INTO taxon_name(taxon_id, name, name_class)" 498 " VALUES (%s, %s, 'scientific name')", (taxon_id, 499 scientific_name[:255])) 500 return taxon_id
501
502 - def _load_bioentry_table(self, record):
503 """Fill the bioentry table with sequence information (PRIVATE). 504 505 record - SeqRecord object to add to the database. 506 """ 507 # get the pertinent info and insert it 508 509 if record.id.count(".") == 1: # try to get a version from the id 510 # This assumes the string is something like "XXXXXXXX.123" 511 accession, version = record.id.split('.') 512 try: 513 version = int(version) 514 except ValueError: 515 accession = record.id 516 version = 0 517 else: # otherwise just use a version of 0 518 accession = record.id 519 version = 0 520 521 if "accessions" in record.annotations \ 522 and isinstance(record.annotations["accessions"], list) \ 523 and record.annotations["accessions"]: 524 # Take the first accession (one if there is more than one) 525 accession = record.annotations["accessions"][0] 526 527 # Find the taxon id (this is not just the NCBI Taxon ID) 528 # NOTE - If the species isn't defined in the taxon table, 529 # a new minimal entry is created. 530 taxon_id = self._get_taxon_id(record) 531 532 if "gi" in record.annotations: 533 identifier = record.annotations["gi"] 534 else: 535 identifier = record.id 536 537 # Allow description and division to default to NULL as in BioPerl. 538 description = getattr(record, 'description', None) 539 division = record.annotations.get("data_file_division", None) 540 541 sql = """ 542 INSERT INTO bioentry ( 543 biodatabase_id, 544 taxon_id, 545 name, 546 accession, 547 identifier, 548 division, 549 description, 550 version) 551 VALUES ( 552 %s, 553 %s, 554 %s, 555 %s, 556 %s, 557 %s, 558 %s, 559 %s)""" 560 # print self.dbid, taxon_id, record.name, accession, identifier, \ 561 # division, description, version 562 self.adaptor.execute(sql, (self.dbid, 563 taxon_id, 564 record.name, 565 accession, 566 identifier, 567 division, 568 description, 569 version)) 570 # now retrieve the id for the bioentry 571 bioentry_id = self.adaptor.last_id('bioentry') 572 573 return bioentry_id
574
575 - def _load_bioentry_date(self, record, bioentry_id):
576 """Add the effective date of the entry into the database. 577 578 record - a SeqRecord object with an annotated date 579 bioentry_id - corresponding database identifier 580 """ 581 # dates are GenBank style, like: 582 # 14-SEP-2000 583 date = record.annotations.get("date", 584 strftime("%d-%b-%Y", gmtime()).upper()) 585 if isinstance(date, list): 586 date = date[0] 587 annotation_tags_id = self._get_ontology_id("Annotation Tags") 588 date_id = self._get_term_id("date_changed", annotation_tags_id) 589 sql = r"INSERT INTO bioentry_qualifier_value" \ 590 r" (bioentry_id, term_id, value, rank)" \ 591 r" VALUES (%s, %s, %s, 1)" 592 self.adaptor.execute(sql, (bioentry_id, date_id, date))
593
594 - def _load_biosequence(self, record, bioentry_id):
595 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 596 597 record - a SeqRecord object with a seq property 598 bioentry_id - corresponding database identifier 599 """ 600 if record.seq is None: 601 # The biosequence table entry is optional, so if we haven't 602 # got a sequence, we don't need to write to the table. 603 return 604 605 # determine the string representation of the alphabet 606 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 607 alphabet = "dna" 608 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 609 alphabet = "rna" 610 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 611 alphabet = "protein" 612 else: 613 alphabet = "unknown" 614 615 if isinstance(record.seq, UnknownSeq): 616 seq_str = None 617 else: 618 seq_str = str(record.seq) 619 620 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 621 r"length, seq, alphabet) " \ 622 r"VALUES (%s, 0, %s, %s, %s)" 623 self.adaptor.execute(sql, (bioentry_id, 624 len(record.seq), 625 seq_str, 626 alphabet))
627
628 - def _load_comment(self, record, bioentry_id):
629 """Record a SeqRecord's annotated comment in the database (PRIVATE). 630 631 record - a SeqRecord object with an annotated comment 632 bioentry_id - corresponding database identifier 633 """ 634 comments = record.annotations.get('comment') 635 if not comments: 636 return 637 if not isinstance(comments, list): 638 # It should be a string then... 639 comments = [comments] 640 641 for index, comment in enumerate(comments): 642 comment = comment.replace('\n', ' ') 643 # TODO - Store each line as a separate entry? This would preserve 644 # the newlines, but we should check BioPerl etc to be consistent. 645 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 646 " VALUES (%s, %s, %s)" 647 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
648
649 - def _load_annotations(self, record, bioentry_id):
650 """Record a SeqRecord's misc annotations in the database (PRIVATE). 651 652 The annotation strings are recorded in the bioentry_qualifier_value 653 table, except for special cases like the reference, comment and 654 taxonomy which are handled with their own tables. 655 656 record - a SeqRecord object with an annotations dictionary 657 bioentry_id - corresponding database identifier 658 """ 659 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 660 "(bioentry_id, term_id, value)" \ 661 " VALUES (%s, %s, %s)" 662 many_sql = "INSERT INTO bioentry_qualifier_value" \ 663 "(bioentry_id, term_id, value, rank)" \ 664 " VALUES (%s, %s, %s, %s)" 665 tag_ontology_id = self._get_ontology_id('Annotation Tags') 666 for key, value in record.annotations.items(): 667 if key in ["references", "comment", "ncbi_taxid", "date"]: 668 # Handled separately 669 continue 670 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 671 if isinstance(value, list) or isinstance(value, tuple): 672 rank = 0 673 for entry in value: 674 if isinstance(entry, str) or isinstance(entry, int): 675 # Easy case 676 rank += 1 677 self.adaptor.execute(many_sql, 678 (bioentry_id, term_id, str(entry), rank)) 679 else: 680 pass 681 # print "Ignoring annotation '%s' sub-entry of type '%s'" \ 682 # % (key, str(type(entry))) 683 elif isinstance(value, str) or isinstance(value, int): 684 # Have a simple single entry, leave rank as the DB default 685 self.adaptor.execute(mono_sql, 686 (bioentry_id, term_id, str(value))) 687 else: 688 pass
689 # print "Ignoring annotation '%s' entry of type '%s'" \ 690 # % (key, type(value)) 691
692 - def _load_reference(self, reference, rank, bioentry_id):
693 """Record a SeqRecord's annotated references in the database (PRIVATE). 694 695 record - a SeqRecord object with annotated references 696 bioentry_id - corresponding database identifier 697 """ 698 699 refs = None 700 if reference.medline_id: 701 refs = self.adaptor.execute_and_fetch_col0( 702 "SELECT reference_id" 703 " FROM reference JOIN dbxref USING (dbxref_id)" 704 " WHERE dbname = 'MEDLINE' AND accession = %s", 705 (reference.medline_id,)) 706 if not refs and reference.pubmed_id: 707 refs = self.adaptor.execute_and_fetch_col0( 708 "SELECT reference_id" 709 " FROM reference JOIN dbxref USING (dbxref_id)" 710 " WHERE dbname = 'PUBMED' AND accession = %s", 711 (reference.pubmed_id,)) 712 if not refs: 713 s = [] 714 for f in reference.authors, reference.title, reference.journal: 715 s.append(f or "<undef>") 716 crc = crc64("".join(s)) 717 refs = self.adaptor.execute_and_fetch_col0( 718 "SELECT reference_id FROM reference" 719 r" WHERE crc = %s", (crc,)) 720 if not refs: 721 if reference.medline_id: 722 dbxref_id = self._add_dbxref("MEDLINE", 723 reference.medline_id, 0) 724 elif reference.pubmed_id: 725 dbxref_id = self._add_dbxref("PUBMED", 726 reference.pubmed_id, 0) 727 else: 728 dbxref_id = None 729 authors = reference.authors or None 730 title = reference.title or None 731 # The location/journal field cannot be Null, so default 732 # to an empty string rather than None: 733 journal = reference.journal or "" 734 self.adaptor.execute( 735 "INSERT INTO reference (dbxref_id, location," 736 " title, authors, crc)" 737 " VALUES (%s, %s, %s, %s, %s)", 738 (dbxref_id, journal, title, 739 authors, crc)) 740 reference_id = self.adaptor.last_id("reference") 741 else: 742 reference_id = refs[0] 743 744 if reference.location: 745 start = 1 + int(str(reference.location[0].start)) 746 end = int(str(reference.location[0].end)) 747 else: 748 start = None 749 end = None 750 751 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 752 " start_pos, end_pos, rank)" \ 753 " VALUES (%s, %s, %s, %s, %s)" 754 self.adaptor.execute(sql, (bioentry_id, reference_id, 755 start, end, rank + 1))
756
757 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
758 """Load a biopython SeqFeature into the database (PRIVATE). 759 """ 760 # records loaded from a gff file using BCBio.GFF will contain the value 761 # of the 2nd column of the gff as a feature qualifier. The BioSQL wiki 762 # suggests that the source should not go in with the other feature 763 # mappings but instead be put in the term table 764 # (http://www.biosql.org/wiki/Annotation_Mapping) 765 try: 766 source = feature.qualifiers['source'] 767 if isinstance(source, list): 768 source = source[0] 769 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 770 bioentry_id, source=source) 771 except KeyError: 772 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 773 bioentry_id) 774 775 self._load_seqfeature_locations(feature, seqfeature_id) 776 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
777
778 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id, source='EMBL/GenBank/SwissProt'):
779 """Load the first tables of a seqfeature and returns the id (PRIVATE). 780 781 This loads the "key" of the seqfeature (ie. CDS, gene) and 782 the basic seqfeature table itself. 783 """ 784 ontology_id = self._get_ontology_id('SeqFeature Keys') 785 seqfeature_key_id = self._get_term_id(feature_type, 786 ontology_id=ontology_id) 787 source_cat_id = self._get_ontology_id('SeqFeature Sources') 788 source_term_id = self._get_term_id(source, 789 ontology_id=source_cat_id) 790 791 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 792 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 793 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 794 source_term_id, feature_rank + 1)) 795 seqfeature_id = self.adaptor.last_id('seqfeature') 796 797 return seqfeature_id
798
799 - def _load_seqfeature_locations(self, feature, seqfeature_id):
800 """Load all of the locations for a SeqFeature into tables (PRIVATE). 801 802 This adds the locations related to the SeqFeature into the 803 seqfeature_location table. Fuzzies are not handled right now. 804 For a simple location, ie (1..2), we have a single table row 805 with seq_start = 1, seq_end = 2, location_rank = 1. 806 807 For split locations, ie (1..2, 3..4, 5..6) we would have three 808 row tables with:: 809 810 start = 1, end = 2, rank = 1 811 start = 3, end = 4, rank = 2 812 start = 5, end = 6, rank = 3 813 """ 814 # TODO - Record an ontology for the locations (using location.term_id) 815 # which for now as in BioPerl we leave defaulting to NULL. 816 if feature.location_operator and feature.location_operator != "join": 817 # e.g. order locations... we don't record "order" so it 818 # will become a "join" on reloading. What does BioPerl do? 819 import warnings 820 from Bio import BiopythonWarning 821 warnings.warn("%s location operators are not fully supported" 822 % feature.location_operator, BiopythonWarning) 823 # This will be a list of length one for simple FeatureLocation: 824 parts = feature.location.parts 825 if parts and set(loc.strand for loc in parts) == set([-1]): 826 # To mimic prior behaviour of Biopython+BioSQL, reverse order 827 parts = parts[::-1] 828 # TODO - Check what BioPerl does; see also BioSeq.py code 829 for rank, loc in enumerate(parts): 830 self._insert_location(loc, rank + 1, seqfeature_id)
831
832 - def _insert_location(self, location, rank, seqfeature_id):
833 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 834 835 TODO - Add location operator to location_qualifier_value? 836 """ 837 # convert biopython locations to the 1-based location system 838 # used in bioSQL 839 # XXX This could also handle fuzzies 840 start = int(location.start) + 1 841 end = int(location.end) 842 843 # Biopython uses None when we don't know strand information but 844 # BioSQL requires something (non null) and sets this as zero 845 # So we'll use the strand or 0 if Biopython spits out None 846 strand = location.strand or 0 847 848 # TODO - Record an ontology term for the location (location.term_id) 849 # which for now like BioPerl we'll leave as NULL. 850 # This might allow us to record "between" positions properly, but I 851 # doesn't really see how it could work for before/after fuzzy positions 852 loc_term_id = None 853 854 if location.ref: 855 # sub_feature remote locations when they are in the same db as the current 856 # record do not have a value for ref_db, which the SeqFeature object 857 # stores as None. BioSQL schema requires a varchar and is not NULL 858 dbxref_id = self._get_dbxref_id( 859 location.ref_db or "", location.ref) 860 else: 861 dbxref_id = None 862 863 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 864 r"start_pos, end_pos, strand, rank) " \ 865 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 866 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 867 start, end, strand, rank)) 868 869 """ 870 # See Bug 2677 871 # TODO - Record the location_operator (e.g. "join" or "order") 872 # using the location_qualifier_value table (which we and BioPerl 873 # have historically left empty). 874 # Note this will need an ontology term for the location qualifer 875 # (location_qualifier_value.term_id) for which oddly the schema 876 # does not allow NULL. 877 if feature.location_operator: 878 #e.g. "join" (common), 879 #or "order" (see Tests/GenBank/protein_refseq2.gb) 880 location_id = self.adaptor.last_id('location') 881 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 882 sql = r"INSERT INTO location_qualifier_value" \ 883 r"(location_id, term_id, value)" \ 884 r"VALUES (%s, %s, %s)" 885 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 886 feature.location_operator)) 887 """
888
889 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
890 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 891 892 Qualifiers should be a dictionary of the form: 893 {key : [value1, value2]} 894 """ 895 tag_ontology_id = self._get_ontology_id('Annotation Tags') 896 for qualifier_key in qualifiers: 897 # Treat db_xref qualifiers differently to sequence annotation 898 # qualifiers by populating the seqfeature_dbxref and dbxref 899 # tables. Other qualifiers go into the seqfeature_qualifier_value 900 # and (if new) term tables. 901 if qualifier_key != 'db_xref': 902 qualifier_key_id = self._get_term_id(qualifier_key, 903 ontology_id=tag_ontology_id) 904 # now add all of the values to their table 905 entries = qualifiers[qualifier_key] 906 if not isinstance(entries, list): 907 # Could be a plain string, or an int or a float. 908 # However, we exect a list of strings here. 909 entries = [entries] 910 for qual_value_rank in range(len(entries)): 911 qualifier_value = entries[qual_value_rank] 912 sql = r"INSERT INTO seqfeature_qualifier_value "\ 913 r" (seqfeature_id, term_id, rank, value) VALUES"\ 914 r" (%s, %s, %s, %s)" 915 self.adaptor.execute(sql, (seqfeature_id, 916 qualifier_key_id, 917 qual_value_rank + 1, 918 qualifier_value)) 919 else: 920 # The dbxref_id qualifier/value sets go into the dbxref table 921 # as dbname, accession, version tuples, with dbxref.dbxref_id 922 # being automatically assigned, and into the seqfeature_dbxref 923 # table as seqfeature_id, dbxref_id, and rank tuples 924 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 925 seqfeature_id)
926
927 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
928 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 929 930 o dbxrefs List, dbxref data from the source file in the 931 format <database>:<accession> 932 933 o seqfeature_id Int, the identifier for the seqfeature in the 934 seqfeature table 935 936 Insert dbxref qualifier data for a seqfeature into the 937 seqfeature_dbxref and, if required, dbxref tables. 938 The dbxref_id qualifier/value sets go into the dbxref table 939 as dbname, accession, version tuples, with dbxref.dbxref_id 940 being automatically assigned, and into the seqfeature_dbxref 941 table as seqfeature_id, dbxref_id, and rank tuples 942 """ 943 # NOTE - In older versions of Biopython, we would map the GenBank 944 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 945 # for any unknown terms. This was a long term maintainance problem, 946 # and differed from BioPerl and BioJava's implementation. See bug 2405 947 for rank, value in enumerate(dbxrefs): 948 # Split the DB:accession format string at colons. We have to 949 # account for multiple-line and multiple-accession entries 950 try: 951 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 952 db = dbxref_data[0] 953 accessions = dbxref_data[1:] 954 except: 955 raise ValueError("Parsing of db_xref failed: '%s'" % value) 956 # Loop over all the grabbed accessions, and attempt to fill the 957 # table 958 for accession in accessions: 959 # Get the dbxref_id value for the dbxref data 960 dbxref_id = self._get_dbxref_id(db, accession) 961 # Insert the seqfeature_dbxref data 962 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
963
964 - def _get_dbxref_id(self, db, accession):
965 """ _get_dbxref_id(self, db, accession) -> Int 966 967 o db String, the name of the external database containing 968 the accession number 969 970 o accession String, the accession of the dbxref data 971 972 Finds and returns the dbxref_id for the passed data. The method 973 attempts to find an existing record first, and inserts the data 974 if there is no record. 975 """ 976 # Check for an existing record 977 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 978 r'AND accession = %s' 979 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 980 # If there was a record, return the dbxref_id, else create the 981 # record and return the created dbxref_id 982 if dbxref_id: 983 return dbxref_id[0] 984 return self._add_dbxref(db, accession, 0)
985
986 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
987 """ Check for a pre-existing seqfeature_dbxref entry with the passed 988 seqfeature_id and dbxref_id. If one does not exist, insert new 989 data 990 991 """ 992 # Check for an existing record 993 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 994 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 995 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 996 dbxref_id)) 997 # If there was a record, return without executing anything, else create 998 # the record and return 999 if result: 1000 return result 1001 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
1002
1003 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1004 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 1005 dbxref_id 1006 """ 1007 sql = r'INSERT INTO seqfeature_dbxref ' \ 1008 '(seqfeature_id, dbxref_id, rank) VALUES' \ 1009 r'(%s, %s, %s)' 1010 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 1011 return (seqfeature_id, dbxref_id)
1012
1013 - def _load_dbxrefs(self, record, bioentry_id):
1014 """Load any sequence level cross references into the database (PRIVATE). 1015 1016 See table bioentry_dbxref.""" 1017 for rank, value in enumerate(record.dbxrefs): 1018 # Split the DB:accession string at first colon. 1019 # We have to cope with things like: 1020 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1021 # "GO:GO:123" (db="GO", accession="GO:123") 1022 # 1023 # Annoyingly I have seen the NCBI use both the style 1024 # "GO:GO:123" and "GO:123" in different vintages. 1025 assert value.count("\n") == 0 1026 try: 1027 db, accession = value.split(':', 1) 1028 db = db.strip() 1029 accession = accession.strip() 1030 except: 1031 raise ValueError( 1032 "Parsing of dbxrefs list failed: '%s'" % value) 1033 # Get the dbxref_id value for the dbxref data 1034 dbxref_id = self._get_dbxref_id(db, accession) 1035 # Insert the bioentry_dbxref data 1036 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1037
1038 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1039 """ Check for a pre-existing bioentry_dbxref entry with the passed 1040 seqfeature_id and dbxref_id. If one does not exist, insert new 1041 data 1042 1043 """ 1044 # Check for an existing record 1045 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1046 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1047 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1048 dbxref_id)) 1049 # If there was a record, return without executing anything, else create 1050 # the record and return 1051 if result: 1052 return result 1053 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1054
1055 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1056 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1057 dbxref_id 1058 """ 1059 sql = r'INSERT INTO bioentry_dbxref ' \ 1060 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1061 '(%s, %s, %s)' 1062 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1063 return (bioentry_id, dbxref_id)
1064 1065
1066 -class DatabaseRemover(object):
1067 """Complement the Loader functionality by fully removing a database. 1068 1069 This probably isn't really useful for normal purposes, since you 1070 can just do a:: 1071 1072 DROP DATABASE db_name 1073 1074 and then recreate the database. But, it's really useful for testing 1075 purposes. 1076 """ 1077
1078 - def __init__(self, adaptor, dbid):
1079 """Initialize with a database id and adaptor connection.""" 1080 self.adaptor = adaptor 1081 self.dbid = dbid
1082
1083 - def remove(self):
1084 """Remove everything related to the given database id.""" 1085 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1086 self.adaptor.execute(sql, (self.dbid,)) 1087 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1088 self.adaptor.execute(sql, (self.dbid,))
1089