Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio._py3k import unicode 
 20   
 21  from Bio import Alphabet 
 22  from Bio.Seq import Seq, UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 24  from Bio import SeqFeature 
 25   
 26   
27 -class DBSeq(Seq):
28 """BioSQL equivalent of the Biopython Seq object.""" 29
30 - def __init__(self, primary_id, adaptor, alphabet, start, length):
31 """Create a new DBSeq object referring to a BioSQL entry. 32 33 You wouldn't normally create a DBSeq object yourself, this is done 34 for you when retreiving a DBSeqRecord object from the database. 35 """ 36 self.primary_id = primary_id 37 self.adaptor = adaptor 38 self.alphabet = alphabet 39 self._length = length 40 self.start = start
41
42 - def __len__(self):
43 return self._length
44
45 - def __getitem__(self, index): # Seq API requirement
46 # Note since Python 2.0, __getslice__ is deprecated 47 # and __getitem__ is used instead. 48 # See http://docs.python.org/ref/sequence-methods.html 49 if isinstance(index, int): 50 # Return a single letter as a string 51 i = index 52 if i < 0: 53 if -i > self._length: 54 raise IndexError(i) 55 i = i + self._length 56 elif i >= self._length: 57 raise IndexError(i) 58 return self.adaptor.get_subseq_as_string(self.primary_id, 59 self.start + i, 60 self.start + i + 1) 61 if not isinstance(index, slice): 62 raise ValueError("Unexpected index type") 63 64 # Return the (sub)sequence as another DBSeq or Seq object 65 # (see the Seq obect's __getitem__ method) 66 if index.start is None: 67 i = 0 68 else: 69 i = index.start 70 if i < 0: 71 # Map to equavilent positive index 72 if -i > self._length: 73 raise IndexError(i) 74 i = i + self._length 75 elif i >= self._length: 76 # Trivial case, should return empty string! 77 i = self._length 78 79 if index.stop is None: 80 j = self._length 81 else: 82 j = index.stop 83 if j < 0: 84 # Map to equavilent positive index 85 if -j > self._length: 86 raise IndexError(j) 87 j = j + self._length 88 elif j >= self._length: 89 j = self._length 90 91 if i >= j: 92 # Trivial case, empty string. 93 return Seq("", self.alphabet) 94 elif index.step is None or index.step == 1: 95 # Easy case - can return a DBSeq with the start and end adjusted 96 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 97 self.start + i, j - i) 98 else: 99 # Tricky. Will have to create a Seq object because of the stride 100 full = self.adaptor.get_subseq_as_string(self.primary_id, 101 self.start + i, 102 self.start + j) 103 return Seq(full[::index.step], self.alphabet)
104
105 - def tostring(self):
106 """Returns the full sequence as a python string (DEPRECATED). 107 108 You are now encouraged to use str(my_seq) instead of 109 my_seq.tostring().""" 110 import warnings 111 warnings.warn("This method is obsolete; please use str(my_seq) " 112 "instead of my_seq.tostring().", 113 PendingDeprecationWarning) 114 return self.adaptor.get_subseq_as_string(self.primary_id, 115 self.start, 116 self.start + self._length)
117
118 - def __str__(self):
119 """Returns the full sequence as a python string.""" 120 return self.adaptor.get_subseq_as_string(self.primary_id, 121 self.start, 122 self.start + self._length)
123 124 data = property(tostring, doc="Sequence as string (DEPRECATED)") 125
126 - def toseq(self):
127 """Returns the full sequence as a Seq object.""" 128 # Note - the method name copies that of the MutableSeq object 129 return Seq(str(self), self.alphabet)
130
131 - def __add__(self, other):
132 # Let the Seq object deal with the alphabet issues etc 133 return self.toseq() + other
134
135 - def __radd__(self, other):
136 # Let the Seq object deal with the alphabet issues etc 137 return other + self.toseq()
138 139
140 -def _retrieve_seq(adaptor, primary_id):
141 # The database schema ensures there will be only one matching 142 # row in the table. 143 144 # If an UnknownSeq was recorded, seq will be NULL, 145 # but length will be populated. This means length(seq) 146 # will return None. 147 seqs = adaptor.execute_and_fetchall( 148 "SELECT alphabet, length, length(seq) FROM biosequence" 149 " WHERE bioentry_id = %s", (primary_id,)) 150 if not seqs: 151 return 152 assert len(seqs) == 1 153 moltype, given_length, length = seqs[0] 154 155 try: 156 length = int(length) 157 given_length = int(length) 158 assert length == given_length 159 have_seq = True 160 except TypeError: 161 assert length is None 162 seqs = adaptor.execute_and_fetchall( 163 "SELECT alphabet, length, seq FROM biosequence" 164 " WHERE bioentry_id = %s", (primary_id,)) 165 assert len(seqs) == 1 166 moltype, given_length, seq = seqs[0] 167 assert seq is None or seq == "" 168 length = int(given_length) 169 have_seq = False 170 del seq 171 del given_length 172 173 moltype = moltype.lower() # might be upper case in database 174 # We have no way of knowing if these sequences will use IUPAC 175 # alphabets, and we certainly can't assume they are unambiguous! 176 if moltype == "dna": 177 alphabet = Alphabet.generic_dna 178 elif moltype == "rna": 179 alphabet = Alphabet.generic_rna 180 elif moltype == "protein": 181 alphabet = Alphabet.generic_protein 182 elif moltype == "unknown": 183 # This is used in BioSQL/Loader.py and would happen 184 # for any generic or nucleotide alphabets. 185 alphabet = Alphabet.single_letter_alphabet 186 else: 187 raise AssertionError("Unknown moltype: %s" % moltype) 188 189 if have_seq: 190 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 191 else: 192 return UnknownSeq(length, alphabet)
193 194
195 -def _retrieve_dbxrefs(adaptor, primary_id):
196 """Retrieve the database cross references for the sequence.""" 197 _dbxrefs = [] 198 dbxrefs = adaptor.execute_and_fetchall( 199 "SELECT dbname, accession, version" 200 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 201 " WHERE bioentry_id = %s" 202 " ORDER BY rank", (primary_id,)) 203 for dbname, accession, version in dbxrefs: 204 if version and version != "0": 205 v = "%s.%s" % (accession, version) 206 else: 207 v = accession 208 _dbxrefs.append("%s:%s" % (dbname, v)) 209 return _dbxrefs
210 211
212 -def _retrieve_features(adaptor, primary_id):
213 sql = "SELECT seqfeature_id, type.name, rank" \ 214 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 215 " WHERE bioentry_id = %s" \ 216 " ORDER BY rank" 217 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 218 seq_feature_list = [] 219 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 220 # Get qualifiers [except for db_xref which is stored separately] 221 qvs = adaptor.execute_and_fetchall( 222 "SELECT name, value" 223 " FROM seqfeature_qualifier_value join term using (term_id)" 224 " WHERE seqfeature_id = %s" 225 " ORDER BY rank", (seqfeature_id,)) 226 qualifiers = {} 227 for qv_name, qv_value in qvs: 228 qualifiers.setdefault(qv_name, []).append(qv_value) 229 # Get db_xrefs [special case of qualifiers] 230 qvs = adaptor.execute_and_fetchall( 231 "SELECT dbxref.dbname, dbxref.accession" 232 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 233 " WHERE seqfeature_dbxref.seqfeature_id = %s" 234 " ORDER BY rank", (seqfeature_id,)) 235 for qv_name, qv_value in qvs: 236 value = "%s:%s" % (qv_name, qv_value) 237 qualifiers.setdefault("db_xref", []).append(value) 238 # Get locations 239 results = adaptor.execute_and_fetchall( 240 "SELECT location_id, start_pos, end_pos, strand" 241 " FROM location" 242 " WHERE seqfeature_id = %s" 243 " ORDER BY rank", (seqfeature_id,)) 244 locations = [] 245 # convert to Python standard form 246 # Convert strand = 0 to strand = None 247 # re: comment in Loader.py: 248 # Biopython uses None when we don't know strand information but 249 # BioSQL requires something (non null) and sets this as zero 250 # So we'll use the strand or 0 if Biopython spits out None 251 for location_id, start, end, strand in results: 252 if start: 253 start -= 1 254 if strand == 0: 255 strand = None 256 if strand not in (+1, -1, None): 257 raise ValueError("Invalid strand %s found in database for " 258 "seqfeature_id %s" % (strand, seqfeature_id)) 259 if end < start: 260 import warnings 261 from Bio import BiopythonWarning 262 warnings.warn("Inverted location start/end (%i and %i) for " 263 "seqfeature_id %s" % (start, end, seqfeature_id), 264 BiopythonWarning) 265 locations.append((location_id, start, end, strand)) 266 # Get possible remote reference information 267 remote_results = adaptor.execute_and_fetchall( 268 "SELECT location_id, dbname, accession, version" 269 " FROM location join dbxref using (dbxref_id)" 270 " WHERE seqfeature_id = %s", (seqfeature_id,)) 271 lookup = {} 272 for location_id, dbname, accession, version in remote_results: 273 if version and version != "0": 274 v = "%s.%s" % (accession, version) 275 else: 276 v = accession 277 # subfeature remote location db_ref are stored as a empty string when 278 # not present 279 if dbname == "": 280 dbname = None 281 lookup[location_id] = (dbname, v) 282 283 feature = SeqFeature.SeqFeature(type=seqfeature_type) 284 # Store the key as a private property 285 feature._seqfeature_id = seqfeature_id 286 feature.qualifiers = qualifiers 287 if len(locations) == 0: 288 pass 289 elif len(locations) == 1: 290 location_id, start, end, strand = locations[0] 291 # See Bug 2677, we currently don't record the location_operator 292 # For consistency with older versions Biopython, default to "". 293 feature.location_operator = \ 294 _retrieve_location_qualifier_value(adaptor, location_id) 295 dbname, version = lookup.get(location_id, (None, None)) 296 feature.location = SeqFeature.FeatureLocation(start, end) 297 feature.strand = strand 298 feature.ref_db = dbname 299 feature.ref = version 300 else: 301 sub_features = feature.sub_features 302 assert sub_features == [] 303 for location in locations: 304 location_id, start, end, strand = location 305 dbname, version = lookup.get(location_id, (None, None)) 306 subfeature = SeqFeature.SeqFeature() 307 subfeature.type = seqfeature_type 308 subfeature.location = SeqFeature.FeatureLocation(start, end) 309 # subfeature.location_operator = \ 310 # _retrieve_location_qualifier_value(adaptor, location_id) 311 subfeature.strand = strand 312 subfeature.ref_db = dbname 313 subfeature.ref = version 314 sub_features.append(subfeature) 315 # Locations are in order, but because of remote locations for 316 # sub-features they are not necessarily in numerical order: 317 strands = set(sf.strand for sf in sub_features) 318 if len(strands) == 1 and -1 in strands: 319 # Evil hack time for backwards compatibility 320 # TODO - Check if BioPerl and (old) Biopython did the same, 321 # we may have an existing incompatibility lurking here... 322 locs = [f.location for f in sub_features[::-1]] 323 else: 324 # All forward, or mixed strands 325 locs = [f.location for f in sub_features] 326 feature.location = SeqFeature.CompoundLocation( 327 locs, seqfeature_type) 328 # TODO - See Bug 2677 - we don't yet record location_operator, 329 # so for consistency with older versions of Biopython default 330 # to assuming its a join. 331 feature.location_operator = "join" 332 seq_feature_list.append(feature) 333 334 return seq_feature_list
335 336
337 -def _retrieve_location_qualifier_value(adaptor, location_id):
338 value = adaptor.execute_and_fetch_col0( 339 "SELECT value FROM location_qualifier_value" 340 " WHERE location_id = %s", (location_id,)) 341 try: 342 return value[0] 343 except IndexError: 344 return ""
345 346
347 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
348 annotations = {} 349 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 350 annotations.update(_retrieve_reference(adaptor, primary_id)) 351 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 352 annotations.update(_retrieve_comment(adaptor, primary_id)) 353 # Convert values into strings in cases of unicode from the database. 354 # BioSQL could eventually be expanded to be unicode aware. 355 str_anns = {} 356 for key, val in annotations.items(): 357 if isinstance(val, list): 358 val = [_make_unicode_into_string(x) for x in val] 359 elif isinstance(val, unicode): 360 val = str(val) 361 str_anns[key] = val 362 return str_anns
363 364
365 -def _make_unicode_into_string(text):
366 if isinstance(text, unicode): 367 return str(text) 368 else: 369 return text
370 371
372 -def _retrieve_qualifier_value(adaptor, primary_id):
373 qvs = adaptor.execute_and_fetchall( 374 "SELECT name, value" 375 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 376 " WHERE bioentry_id = %s" 377 " ORDER BY rank", (primary_id,)) 378 qualifiers = {} 379 for name, value in qvs: 380 if name == "keyword": 381 name = "keywords" 382 # See handling of "date" in Loader.py 383 elif name == "date_changed": 384 name = "date" 385 elif name == "secondary_accession": 386 name = "accessions" 387 qualifiers.setdefault(name, []).append(value) 388 return qualifiers
389 390
391 -def _retrieve_reference(adaptor, primary_id):
392 # XXX dbxref_qualifier_value 393 394 refs = adaptor.execute_and_fetchall( 395 "SELECT start_pos, end_pos, " 396 " location, title, authors," 397 " dbname, accession" 398 " FROM bioentry_reference" 399 " JOIN reference USING (reference_id)" 400 " LEFT JOIN dbxref USING (dbxref_id)" 401 " WHERE bioentry_id = %s" 402 " ORDER BY rank", (primary_id,)) 403 references = [] 404 for start, end, location, title, authors, dbname, accession in refs: 405 reference = SeqFeature.Reference() 406 # If the start/end are missing, reference.location is an empty list 407 if (start is not None) or (end is not None): 408 if start is not None: 409 start -= 1 # python counting 410 reference.location = [SeqFeature.FeatureLocation(start, end)] 411 # Don't replace the default "" with None. 412 if authors: 413 reference.authors = authors 414 if title: 415 reference.title = title 416 reference.journal = location 417 if dbname == 'PUBMED': 418 reference.pubmed_id = accession 419 elif dbname == 'MEDLINE': 420 reference.medline_id = accession 421 references.append(reference) 422 if references: 423 return {'references': references} 424 else: 425 return {}
426 427
428 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
429 a = {} 430 common_names = adaptor.execute_and_fetch_col0( 431 "SELECT name FROM taxon_name WHERE taxon_id = %s" 432 " AND name_class = 'genbank common name'", (taxon_id,)) 433 if common_names: 434 a['source'] = common_names[0] 435 scientific_names = adaptor.execute_and_fetch_col0( 436 "SELECT name FROM taxon_name WHERE taxon_id = %s" 437 " AND name_class = 'scientific name'", (taxon_id,)) 438 if scientific_names: 439 a['organism'] = scientific_names[0] 440 ncbi_taxids = adaptor.execute_and_fetch_col0( 441 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 442 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 443 a['ncbi_taxid'] = ncbi_taxids[0] 444 445 # Old code used the left/right values in the taxon table to get the 446 # taxonomy lineage in one SQL command. This was actually very slow, 447 # and would fail if the (optional) left/right values were missing. 448 # 449 # The following code is based on a contribution from Eric Gibert, and 450 # relies on the taxon table's parent_taxon_id field only (ignoring the 451 # optional left/right values). This means that it has to make a 452 # separate SQL query for each entry in the lineage, but it does still 453 # appear to be *much* faster. See Bug 2494. 454 taxonomy = [] 455 while taxon_id: 456 name, rank, parent_taxon_id = adaptor.execute_one( 457 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 458 " FROM taxon, taxon_name" 459 " WHERE taxon.taxon_id=taxon_name.taxon_id" 460 " AND taxon_name.name_class='scientific name'" 461 " AND taxon.taxon_id = %s", (taxon_id,)) 462 if taxon_id == parent_taxon_id: 463 # If the taxon table has been populated by the BioSQL script 464 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 465 # Personally, I would have used a NULL parent_taxon_id here. 466 break 467 if rank != "no rank": 468 # For consistency with older versions of Biopython, we are only 469 # interested in taxonomy entries with a stated rank. 470 # Add this to the start of the lineage list. 471 taxonomy.insert(0, name) 472 taxon_id = parent_taxon_id 473 474 if taxonomy: 475 a['taxonomy'] = taxonomy 476 return a
477 478
479 -def _retrieve_comment(adaptor, primary_id):
480 qvs = adaptor.execute_and_fetchall( 481 "SELECT comment_text FROM comment" 482 " WHERE bioentry_id=%s" 483 " ORDER BY rank", (primary_id,)) 484 comments = [comm[0] for comm in qvs] 485 # Don't want to add an empty list... 486 if comments: 487 return {"comment": comments} 488 else: 489 return {}
490 491
492 -class DBSeqRecord(SeqRecord):
493 """BioSQL equivalent of the Biopython SeqRecord object.""" 494
495 - def __init__(self, adaptor, primary_id):
496 self._adaptor = adaptor 497 self._primary_id = primary_id 498 499 (self._biodatabase_id, self._taxon_id, self.name, 500 accession, version, self._identifier, 501 self._division, self.description) = self._adaptor.execute_one( 502 "SELECT biodatabase_id, taxon_id, name, accession, version," 503 " identifier, division, description" 504 " FROM bioentry" 505 " WHERE bioentry_id = %s", (self._primary_id,)) 506 if version and version != "0": 507 self.id = "%s.%s" % (accession, version) 508 else: 509 self.id = accession 510 # We don't yet record any per-letter-annotations in the 511 # BioSQL database, but we should set this property up 512 # for completeness (and the __str__ method). 513 try: 514 length = len(self.seq) 515 except: 516 # Could be no sequence in the database! 517 length = 0 518 self._per_letter_annotations = _RestrictedDict(length=length)
519
520 - def __get_seq(self):
521 if not hasattr(self, "_seq"): 522 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 523 return self._seq
524
525 - def __set_seq(self, seq):
526 self._seq = seq
527
528 - def __del_seq(self):
529 del self._seq
530 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 531
532 - def __get_dbxrefs(self):
533 if not hasattr(self, "_dbxrefs"): 534 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 535 return self._dbxrefs
536
537 - def __set_dbxrefs(self, dbxrefs):
538 self._dbxrefs = dbxrefs
539
540 - def __del_dbxrefs(self):
541 del self._dbxrefs
542 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 543 "Database cross references") 544
545 - def __get_features(self):
546 if not hasattr(self, "_features"): 547 self._features = _retrieve_features(self._adaptor, 548 self._primary_id) 549 return self._features
550
551 - def __set_features(self, features):
552 self._features = features
553
554 - def __del_features(self):
555 del self._features
556 features = property(__get_features, __set_features, __del_features, 557 "Features") 558
559 - def __get_annotations(self):
560 if not hasattr(self, "_annotations"): 561 self._annotations = _retrieve_annotations(self._adaptor, 562 self._primary_id, 563 self._taxon_id) 564 if self._identifier: 565 self._annotations["gi"] = self._identifier 566 if self._division: 567 self._annotations["data_file_division"] = self._division 568 return self._annotations
569
570 - def __set_annotations(self, annotations):
571 self._annotations = annotations
572
573 - def __del_annotations(self):
574 del self._annotations
575 annotations = property(__get_annotations, __set_annotations, 576 __del_annotations, "Annotations")
577