Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio import Alphabet 
 20  from Bio.Seq import Seq, UnknownSeq 
 21  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 22  from Bio import SeqFeature 
 23   
 24   
25 -class DBSeq(Seq): # This implements the biopython Seq interface
26 - def __init__(self, primary_id, adaptor, alphabet, start, length):
27 """Create a new DBSeq object referring to a BioSQL entry. 28 29 You wouldn't normally create a DBSeq object yourself, this is done 30 for you when retreiving a DBSeqRecord object from the database. 31 """ 32 self.primary_id = primary_id 33 self.adaptor = adaptor 34 self.alphabet = alphabet 35 self._length = length 36 self.start = start
37
38 - def __len__(self):
39 return self._length
40
41 - def __getitem__(self, index): # Seq API requirement
42 #Note since Python 2.0, __getslice__ is deprecated 43 #and __getitem__ is used instead. 44 #See http://docs.python.org/ref/sequence-methods.html 45 if isinstance(index, int): 46 #Return a single letter as a string 47 i = index 48 if i < 0: 49 if -i > self._length: 50 raise IndexError(i) 51 i = i + self._length 52 elif i >= self._length: 53 raise IndexError(i) 54 return self.adaptor.get_subseq_as_string(self.primary_id, 55 self.start + i, 56 self.start + i + 1) 57 if not isinstance(index, slice): 58 raise ValueError("Unexpected index type") 59 60 #Return the (sub)sequence as another DBSeq or Seq object 61 #(see the Seq obect's __getitem__ method) 62 if index.start is None: 63 i = 0 64 else: 65 i = index.start 66 if i < 0: 67 #Map to equavilent positive index 68 if -i > self._length: 69 raise IndexError(i) 70 i = i + self._length 71 elif i >= self._length: 72 #Trivial case, should return empty string! 73 i = self._length 74 75 if index.stop is None: 76 j = self._length 77 else: 78 j = index.stop 79 if j < 0: 80 #Map to equavilent positive index 81 if -j > self._length: 82 raise IndexError(j) 83 j = j + self._length 84 elif j >= self._length: 85 j = self._length 86 87 if i >= j: 88 #Trivial case, empty string. 89 return Seq("", self.alphabet) 90 elif index.step is None or index.step == 1: 91 #Easy case - can return a DBSeq with the start and end adjusted 92 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 93 self.start + i, j - i) 94 else: 95 #Tricky. Will have to create a Seq object because of the stride 96 full = self.adaptor.get_subseq_as_string(self.primary_id, 97 self.start + i, 98 self.start + j) 99 return Seq(full[::index.step], self.alphabet) 100
101 - def tostring(self):
102 """Returns the full sequence as a python string. 103 104 Although not formally deprecated, you are now encouraged to use 105 str(my_seq) instead of my_seq.tostring().""" 106 return self.adaptor.get_subseq_as_string(self.primary_id, 107 self.start, 108 self.start + self._length)
109
110 - def __str__(self):
111 """Returns the full sequence as a python string.""" 112 return self.adaptor.get_subseq_as_string(self.primary_id, 113 self.start, 114 self.start + self._length)
115 116 data = property(tostring, doc="Sequence as string (DEPRECATED)") 117
118 - def toseq(self):
119 """Returns the full sequence as a Seq object.""" 120 #Note - the method name copies that of the MutableSeq object 121 return Seq(str(self), self.alphabet)
122
123 - def __add__(self, other):
124 #Let the Seq object deal with the alphabet issues etc 125 return self.toseq() + other
126
127 - def __radd__(self, other):
128 #Let the Seq object deal with the alphabet issues etc 129 return other + self.toseq()
130 131
132 -def _retrieve_seq(adaptor, primary_id):
133 #The database schema ensures there will be only one matching 134 #row in the table. 135 136 #If an UnknownSeq was recorded, seq will be NULL, 137 #but length will be populated. This means length(seq) 138 #will return None. 139 seqs = adaptor.execute_and_fetchall( 140 "SELECT alphabet, length, length(seq) FROM biosequence" 141 " WHERE bioentry_id = %s", (primary_id,)) 142 if not seqs: 143 return 144 assert len(seqs) == 1 145 moltype, given_length, length = seqs[0] 146 147 try: 148 length = int(length) 149 given_length = int(length) 150 assert length == given_length 151 have_seq = True 152 except TypeError: 153 assert length is None 154 seqs = adaptor.execute_and_fetchall( 155 "SELECT alphabet, length, seq FROM biosequence" 156 " WHERE bioentry_id = %s", (primary_id,)) 157 assert len(seqs) == 1 158 moltype, given_length, seq = seqs[0] 159 assert seq is None or seq == "" 160 length = int(given_length) 161 have_seq = False 162 del seq 163 del given_length 164 165 moltype = moltype.lower() # might be upper case in database 166 #We have no way of knowing if these sequences will use IUPAC 167 #alphabets, and we certainly can't assume they are unambiguous! 168 if moltype == "dna": 169 alphabet = Alphabet.generic_dna 170 elif moltype == "rna": 171 alphabet = Alphabet.generic_rna 172 elif moltype == "protein": 173 alphabet = Alphabet.generic_protein 174 elif moltype == "unknown": 175 #This is used in BioSQL/Loader.py and would happen 176 #for any generic or nucleotide alphabets. 177 alphabet = Alphabet.single_letter_alphabet 178 else: 179 raise AssertionError("Unknown moltype: %s" % moltype) 180 181 if have_seq: 182 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 183 else: 184 return UnknownSeq(length, alphabet)
185 186
187 -def _retrieve_dbxrefs(adaptor, primary_id):
188 """Retrieve the database cross references for the sequence.""" 189 _dbxrefs = [] 190 dbxrefs = adaptor.execute_and_fetchall( 191 "SELECT dbname, accession, version" 192 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 193 " WHERE bioentry_id = %s" 194 " ORDER BY rank", (primary_id,)) 195 for dbname, accession, version in dbxrefs: 196 if version and version != "0": 197 v = "%s.%s" % (accession, version) 198 else: 199 v = accession 200 _dbxrefs.append("%s:%s" % (dbname, v)) 201 return _dbxrefs
202 203
204 -def _retrieve_features(adaptor, primary_id):
205 sql = "SELECT seqfeature_id, type.name, rank" \ 206 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 207 " WHERE bioentry_id = %s" \ 208 " ORDER BY rank" 209 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 210 seq_feature_list = [] 211 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 212 # Get qualifiers [except for db_xref which is stored separately] 213 qvs = adaptor.execute_and_fetchall( 214 "SELECT name, value" 215 " FROM seqfeature_qualifier_value join term using (term_id)" 216 " WHERE seqfeature_id = %s" 217 " ORDER BY rank", (seqfeature_id,)) 218 qualifiers = {} 219 for qv_name, qv_value in qvs: 220 qualifiers.setdefault(qv_name, []).append(qv_value) 221 # Get db_xrefs [special case of qualifiers] 222 qvs = adaptor.execute_and_fetchall( 223 "SELECT dbxref.dbname, dbxref.accession" 224 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 225 " WHERE seqfeature_dbxref.seqfeature_id = %s" 226 " ORDER BY rank", (seqfeature_id,)) 227 for qv_name, qv_value in qvs: 228 value = "%s:%s" % (qv_name, qv_value) 229 qualifiers.setdefault("db_xref", []).append(value) 230 # Get locations 231 results = adaptor.execute_and_fetchall( 232 "SELECT location_id, start_pos, end_pos, strand" 233 " FROM location" 234 " WHERE seqfeature_id = %s" 235 " ORDER BY rank", (seqfeature_id,)) 236 locations = [] 237 # convert to Python standard form 238 # Convert strand = 0 to strand = None 239 # re: comment in Loader.py: 240 # Biopython uses None when we don't know strand information but 241 # BioSQL requires something (non null) and sets this as zero 242 # So we'll use the strand or 0 if Biopython spits out None 243 for location_id, start, end, strand in results: 244 if start: 245 start -= 1 246 if strand == 0: 247 strand = None 248 if strand not in (+1, -1, None): 249 raise ValueError("Invalid strand %s found in database for " 250 "seqfeature_id %s" % (strand, seqfeature_id)) 251 if end < start: 252 import warnings 253 from Bio import BiopythonWarning 254 warnings.warn("Inverted location start/end (%i and %i) for " 255 "seqfeature_id %s" % (start, end, seqfeature_id), 256 BiopythonWarning) 257 locations.append((location_id, start, end, strand)) 258 # Get possible remote reference information 259 remote_results = adaptor.execute_and_fetchall( 260 "SELECT location_id, dbname, accession, version" 261 " FROM location join dbxref using (dbxref_id)" 262 " WHERE seqfeature_id = %s", (seqfeature_id,)) 263 lookup = {} 264 for location_id, dbname, accession, version in remote_results: 265 if version and version != "0": 266 v = "%s.%s" % (accession, version) 267 else: 268 v = accession 269 # subfeature remote location db_ref are stored as a empty string when 270 # not present 271 if dbname == "": 272 dbname = None 273 lookup[location_id] = (dbname, v) 274 275 feature = SeqFeature.SeqFeature(type=seqfeature_type) 276 feature._seqfeature_id = seqfeature_id # Store the key as a private property 277 feature.qualifiers = qualifiers 278 if len(locations) == 0: 279 pass 280 elif len(locations) == 1: 281 location_id, start, end, strand = locations[0] 282 #See Bug 2677, we currently don't record the location_operator 283 #For consistency with older versions Biopython, default to "". 284 feature.location_operator = \ 285 _retrieve_location_qualifier_value(adaptor, location_id) 286 dbname, version = lookup.get(location_id, (None, None)) 287 feature.location = SeqFeature.FeatureLocation(start, end) 288 feature.strand = strand 289 feature.ref_db = dbname 290 feature.ref = version 291 else: 292 assert feature.sub_features == [] 293 for location in locations: 294 location_id, start, end, strand = location 295 dbname, version = lookup.get(location_id, (None, None)) 296 subfeature = SeqFeature.SeqFeature() 297 subfeature.type = seqfeature_type 298 subfeature.location_operator = \ 299 _retrieve_location_qualifier_value(adaptor, location_id) 300 #TODO - See Bug 2677 - we don't yet record location_operator, 301 #so for consistency with older versions of Biopython default 302 #to assuming its a join. 303 if not subfeature.location_operator: 304 subfeature.location_operator = "join" 305 subfeature.location = SeqFeature.FeatureLocation(start, end) 306 subfeature.strand = strand 307 subfeature.ref_db = dbname 308 subfeature.ref = version 309 feature.sub_features.append(subfeature) 310 # Assuming that the feature loc.op is the same as the sub_feature 311 # loc.op: 312 feature.location_operator = \ 313 feature.sub_features[0].location_operator 314 # Locations are in order, but because of remote locations for 315 # sub-features they are not necessarily in numerical order: 316 start = locations[0][1] 317 end = locations[-1][2] 318 feature.location = SeqFeature.FeatureLocation(start, end) 319 # To get the parent strand (as done when parsing GenBank files), 320 # need to consider evil mixed strand examples like this, 321 # join(complement(69611..69724),139856..140087,140625..140650) 322 strands = set(sf.strand for sf in feature.sub_features) 323 if len(strands) == 1: 324 feature.strand = feature.sub_features[0].strand 325 else: 326 feature.strand = None # i.e. mixed strands 327 328 seq_feature_list.append(feature) 329 330 return seq_feature_list
331 332
333 -def _retrieve_location_qualifier_value(adaptor, location_id):
334 value = adaptor.execute_and_fetch_col0( 335 "SELECT value FROM location_qualifier_value" 336 " WHERE location_id = %s", (location_id,)) 337 try: 338 return value[0] 339 except IndexError: 340 return ""
341 342
343 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
344 annotations = {} 345 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 346 annotations.update(_retrieve_reference(adaptor, primary_id)) 347 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 348 annotations.update(_retrieve_comment(adaptor, primary_id)) 349 # Convert values into strings in cases of unicode from the database. 350 # BioSQL could eventually be expanded to be unicode aware. 351 str_anns = {} 352 for key, val in annotations.items(): 353 if isinstance(val, list): 354 val = [_make_unicode_into_string(x) for x in val] 355 elif isinstance(val, unicode): 356 val = str(val) 357 str_anns[key] = val 358 return str_anns
359 360
361 -def _make_unicode_into_string(text):
362 if isinstance(text, unicode): 363 return str(text) 364 else: 365 return text
366 367
368 -def _retrieve_qualifier_value(adaptor, primary_id):
369 qvs = adaptor.execute_and_fetchall( 370 "SELECT name, value" 371 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 372 " WHERE bioentry_id = %s" 373 " ORDER BY rank", (primary_id,)) 374 qualifiers = {} 375 for name, value in qvs: 376 if name == "keyword": 377 name = "keywords" 378 #See handling of "date" in Loader.py 379 elif name == "date_changed": 380 name = "date" 381 elif name == "secondary_accession": 382 name = "accessions" 383 qualifiers.setdefault(name, []).append(value) 384 return qualifiers
385 386
387 -def _retrieve_reference(adaptor, primary_id):
388 # XXX dbxref_qualifier_value 389 390 refs = adaptor.execute_and_fetchall( 391 "SELECT start_pos, end_pos, " 392 " location, title, authors," 393 " dbname, accession" 394 " FROM bioentry_reference" 395 " JOIN reference USING (reference_id)" 396 " LEFT JOIN dbxref USING (dbxref_id)" 397 " WHERE bioentry_id = %s" 398 " ORDER BY rank", (primary_id,)) 399 references = [] 400 for start, end, location, title, authors, dbname, accession in refs: 401 reference = SeqFeature.Reference() 402 #If the start/end are missing, reference.location is an empty list 403 if (start is not None) or (end is not None): 404 if start is not None: 405 start -= 1 # python counting 406 reference.location = [SeqFeature.FeatureLocation(start, end)] 407 #Don't replace the default "" with None. 408 if authors: 409 reference.authors = authors 410 if title: 411 reference.title = title 412 reference.journal = location 413 if dbname == 'PUBMED': 414 reference.pubmed_id = accession 415 elif dbname == 'MEDLINE': 416 reference.medline_id = accession 417 references.append(reference) 418 if references: 419 return {'references': references} 420 else: 421 return {}
422 423
424 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
425 a = {} 426 common_names = adaptor.execute_and_fetch_col0( 427 "SELECT name FROM taxon_name WHERE taxon_id = %s" 428 " AND name_class = 'genbank common name'", (taxon_id,)) 429 if common_names: 430 a['source'] = common_names[0] 431 scientific_names = adaptor.execute_and_fetch_col0( 432 "SELECT name FROM taxon_name WHERE taxon_id = %s" 433 " AND name_class = 'scientific name'", (taxon_id,)) 434 if scientific_names: 435 a['organism'] = scientific_names[0] 436 ncbi_taxids = adaptor.execute_and_fetch_col0( 437 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 438 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 439 a['ncbi_taxid'] = ncbi_taxids[0] 440 441 #Old code used the left/right values in the taxon table to get the 442 #taxonomy lineage in one SQL command. This was actually very slow, 443 #and would fail if the (optional) left/right values were missing. 444 # 445 #The following code is based on a contribution from Eric Gibert, and 446 #relies on the taxon table's parent_taxon_id field only (ignoring the 447 #optional left/right values). This means that it has to make a 448 #separate SQL query for each entry in the lineage, but it does still 449 #appear to be *much* faster. See Bug 2494. 450 taxonomy = [] 451 while taxon_id: 452 name, rank, parent_taxon_id = adaptor.execute_one( 453 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 454 " FROM taxon, taxon_name" 455 " WHERE taxon.taxon_id=taxon_name.taxon_id" 456 " AND taxon_name.name_class='scientific name'" 457 " AND taxon.taxon_id = %s", (taxon_id,)) 458 if taxon_id == parent_taxon_id: 459 # If the taxon table has been populated by the BioSQL script 460 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 461 # Personally, I would have used a NULL parent_taxon_id here. 462 break 463 if rank != "no rank": 464 #For consistency with older versions of Biopython, we are only 465 #interested in taxonomy entries with a stated rank. 466 #Add this to the start of the lineage list. 467 taxonomy.insert(0, name) 468 taxon_id = parent_taxon_id 469 470 if taxonomy: 471 a['taxonomy'] = taxonomy 472 return a
473 474
475 -def _retrieve_comment(adaptor, primary_id):
476 qvs = adaptor.execute_and_fetchall( 477 "SELECT comment_text FROM comment" 478 " WHERE bioentry_id=%s" 479 " ORDER BY rank", (primary_id,)) 480 comments = [comm[0] for comm in qvs] 481 #Don't want to add an empty list... 482 if comments: 483 return {"comment": comments} 484 else: 485 return {}
486 487
488 -class DBSeqRecord(SeqRecord):
489 """BioSQL equivalent of the biopython SeqRecord object. 490 """ 491
492 - def __init__(self, adaptor, primary_id):
493 self._adaptor = adaptor 494 self._primary_id = primary_id 495 496 (self._biodatabase_id, self._taxon_id, self.name, 497 accession, version, self._identifier, 498 self._division, self.description) = self._adaptor.execute_one( 499 "SELECT biodatabase_id, taxon_id, name, accession, version," 500 " identifier, division, description" 501 " FROM bioentry" 502 " WHERE bioentry_id = %s", (self._primary_id,)) 503 if version and version != "0": 504 self.id = "%s.%s" % (accession, version) 505 else: 506 self.id = accession 507 #We don't yet record any per-letter-annotations in the 508 #BioSQL database, but we should set this property up 509 #for completeness (and the __str__ method). 510 try: 511 length = len(self.seq) 512 except: 513 #Could be no sequence in the database! 514 length = 0 515 self._per_letter_annotations = _RestrictedDict(length=length)
516
517 - def __get_seq(self):
518 if not hasattr(self, "_seq"): 519 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 520 return self._seq
521
522 - def __set_seq(self, seq):
523 self._seq = seq
524
525 - def __del_seq(self):
526 del self._seq
527 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 528
529 - def __get_dbxrefs(self):
530 if not hasattr(self, "_dbxrefs"): 531 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 532 return self._dbxrefs
533
534 - def __set_dbxrefs(self, dbxrefs):
535 self._dbxrefs = dbxrefs
536
537 - def __del_dbxrefs(self):
538 del self._dbxrefs
539 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 540 "Database cross references") 541
542 - def __get_features(self):
543 if not hasattr(self, "_features"): 544 self._features = _retrieve_features(self._adaptor, 545 self._primary_id) 546 return self._features
547
548 - def __set_features(self, features):
549 self._features = features
550
551 - def __del_features(self):
552 del self._features
553 features = property(__get_features, __set_features, __del_features, 554 "Features") 555
556 - def __get_annotations(self):
557 if not hasattr(self, "_annotations"): 558 self._annotations = _retrieve_annotations(self._adaptor, 559 self._primary_id, 560 self._taxon_id) 561 if self._identifier: 562 self._annotations["gi"] = self._identifier 563 if self._division: 564 self._annotations["data_file_division"] = self._division 565 return self._annotations
566
567 - def __set_annotations(self, annotations):
568 self._annotations = annotations
569
570 - def __del_annotations(self):
571 del self._annotations
572 annotations = property(__get_annotations, __set_annotations, 573 __del_annotations, "Annotations")
574