Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio._py3k import unicode 
 20   
 21  from Bio import Alphabet 
 22  from Bio.Seq import Seq, UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 24  from Bio import SeqFeature 
 25   
 26   
27 -class DBSeq(Seq): # This implements the biopython Seq interface
28 - def __init__(self, primary_id, adaptor, alphabet, start, length):
29 """Create a new DBSeq object referring to a BioSQL entry. 30 31 You wouldn't normally create a DBSeq object yourself, this is done 32 for you when retreiving a DBSeqRecord object from the database. 33 """ 34 self.primary_id = primary_id 35 self.adaptor = adaptor 36 self.alphabet = alphabet 37 self._length = length 38 self.start = start
39
40 - def __len__(self):
41 return self._length
42
43 - def __getitem__(self, index): # Seq API requirement
44 #Note since Python 2.0, __getslice__ is deprecated 45 #and __getitem__ is used instead. 46 #See http://docs.python.org/ref/sequence-methods.html 47 if isinstance(index, int): 48 #Return a single letter as a string 49 i = index 50 if i < 0: 51 if -i > self._length: 52 raise IndexError(i) 53 i = i + self._length 54 elif i >= self._length: 55 raise IndexError(i) 56 return self.adaptor.get_subseq_as_string(self.primary_id, 57 self.start + i, 58 self.start + i + 1) 59 if not isinstance(index, slice): 60 raise ValueError("Unexpected index type") 61 62 #Return the (sub)sequence as another DBSeq or Seq object 63 #(see the Seq obect's __getitem__ method) 64 if index.start is None: 65 i = 0 66 else: 67 i = index.start 68 if i < 0: 69 #Map to equavilent positive index 70 if -i > self._length: 71 raise IndexError(i) 72 i = i + self._length 73 elif i >= self._length: 74 #Trivial case, should return empty string! 75 i = self._length 76 77 if index.stop is None: 78 j = self._length 79 else: 80 j = index.stop 81 if j < 0: 82 #Map to equavilent positive index 83 if -j > self._length: 84 raise IndexError(j) 85 j = j + self._length 86 elif j >= self._length: 87 j = self._length 88 89 if i >= j: 90 #Trivial case, empty string. 91 return Seq("", self.alphabet) 92 elif index.step is None or index.step == 1: 93 #Easy case - can return a DBSeq with the start and end adjusted 94 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 95 self.start + i, j - i) 96 else: 97 #Tricky. Will have to create a Seq object because of the stride 98 full = self.adaptor.get_subseq_as_string(self.primary_id, 99 self.start + i, 100 self.start + j) 101 return Seq(full[::index.step], self.alphabet) 102
103 - def tostring(self):
104 """Returns the full sequence as a python string (DEPRECATED). 105 106 You are now encouraged to use str(my_seq) instead of 107 my_seq.tostring().""" 108 import warnings 109 warnings.warn("This method is obsolete; please use str(my_seq) " 110 "instead of my_seq.tostring().", 111 PendingDeprecationWarning) 112 return self.adaptor.get_subseq_as_string(self.primary_id, 113 self.start, 114 self.start + self._length)
115
116 - def __str__(self):
117 """Returns the full sequence as a python string.""" 118 return self.adaptor.get_subseq_as_string(self.primary_id, 119 self.start, 120 self.start + self._length)
121 122 data = property(tostring, doc="Sequence as string (DEPRECATED)") 123
124 - def toseq(self):
125 """Returns the full sequence as a Seq object.""" 126 #Note - the method name copies that of the MutableSeq object 127 return Seq(str(self), self.alphabet)
128
129 - def __add__(self, other):
130 #Let the Seq object deal with the alphabet issues etc 131 return self.toseq() + other
132
133 - def __radd__(self, other):
134 #Let the Seq object deal with the alphabet issues etc 135 return other + self.toseq()
136 137
138 -def _retrieve_seq(adaptor, primary_id):
139 #The database schema ensures there will be only one matching 140 #row in the table. 141 142 #If an UnknownSeq was recorded, seq will be NULL, 143 #but length will be populated. This means length(seq) 144 #will return None. 145 seqs = adaptor.execute_and_fetchall( 146 "SELECT alphabet, length, length(seq) FROM biosequence" 147 " WHERE bioentry_id = %s", (primary_id,)) 148 if not seqs: 149 return 150 assert len(seqs) == 1 151 moltype, given_length, length = seqs[0] 152 153 try: 154 length = int(length) 155 given_length = int(length) 156 assert length == given_length 157 have_seq = True 158 except TypeError: 159 assert length is None 160 seqs = adaptor.execute_and_fetchall( 161 "SELECT alphabet, length, seq FROM biosequence" 162 " WHERE bioentry_id = %s", (primary_id,)) 163 assert len(seqs) == 1 164 moltype, given_length, seq = seqs[0] 165 assert seq is None or seq == "" 166 length = int(given_length) 167 have_seq = False 168 del seq 169 del given_length 170 171 moltype = moltype.lower() # might be upper case in database 172 #We have no way of knowing if these sequences will use IUPAC 173 #alphabets, and we certainly can't assume they are unambiguous! 174 if moltype == "dna": 175 alphabet = Alphabet.generic_dna 176 elif moltype == "rna": 177 alphabet = Alphabet.generic_rna 178 elif moltype == "protein": 179 alphabet = Alphabet.generic_protein 180 elif moltype == "unknown": 181 #This is used in BioSQL/Loader.py and would happen 182 #for any generic or nucleotide alphabets. 183 alphabet = Alphabet.single_letter_alphabet 184 else: 185 raise AssertionError("Unknown moltype: %s" % moltype) 186 187 if have_seq: 188 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 189 else: 190 return UnknownSeq(length, alphabet)
191 192
193 -def _retrieve_dbxrefs(adaptor, primary_id):
194 """Retrieve the database cross references for the sequence.""" 195 _dbxrefs = [] 196 dbxrefs = adaptor.execute_and_fetchall( 197 "SELECT dbname, accession, version" 198 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 199 " WHERE bioentry_id = %s" 200 " ORDER BY rank", (primary_id,)) 201 for dbname, accession, version in dbxrefs: 202 if version and version != "0": 203 v = "%s.%s" % (accession, version) 204 else: 205 v = accession 206 _dbxrefs.append("%s:%s" % (dbname, v)) 207 return _dbxrefs
208 209
210 -def _retrieve_features(adaptor, primary_id):
211 sql = "SELECT seqfeature_id, type.name, rank" \ 212 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 213 " WHERE bioentry_id = %s" \ 214 " ORDER BY rank" 215 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 216 seq_feature_list = [] 217 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 218 # Get qualifiers [except for db_xref which is stored separately] 219 qvs = adaptor.execute_and_fetchall( 220 "SELECT name, value" 221 " FROM seqfeature_qualifier_value join term using (term_id)" 222 " WHERE seqfeature_id = %s" 223 " ORDER BY rank", (seqfeature_id,)) 224 qualifiers = {} 225 for qv_name, qv_value in qvs: 226 qualifiers.setdefault(qv_name, []).append(qv_value) 227 # Get db_xrefs [special case of qualifiers] 228 qvs = adaptor.execute_and_fetchall( 229 "SELECT dbxref.dbname, dbxref.accession" 230 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 231 " WHERE seqfeature_dbxref.seqfeature_id = %s" 232 " ORDER BY rank", (seqfeature_id,)) 233 for qv_name, qv_value in qvs: 234 value = "%s:%s" % (qv_name, qv_value) 235 qualifiers.setdefault("db_xref", []).append(value) 236 # Get locations 237 results = adaptor.execute_and_fetchall( 238 "SELECT location_id, start_pos, end_pos, strand" 239 " FROM location" 240 " WHERE seqfeature_id = %s" 241 " ORDER BY rank", (seqfeature_id,)) 242 locations = [] 243 # convert to Python standard form 244 # Convert strand = 0 to strand = None 245 # re: comment in Loader.py: 246 # Biopython uses None when we don't know strand information but 247 # BioSQL requires something (non null) and sets this as zero 248 # So we'll use the strand or 0 if Biopython spits out None 249 for location_id, start, end, strand in results: 250 if start: 251 start -= 1 252 if strand == 0: 253 strand = None 254 if strand not in (+1, -1, None): 255 raise ValueError("Invalid strand %s found in database for " 256 "seqfeature_id %s" % (strand, seqfeature_id)) 257 if end < start: 258 import warnings 259 from Bio import BiopythonWarning 260 warnings.warn("Inverted location start/end (%i and %i) for " 261 "seqfeature_id %s" % (start, end, seqfeature_id), 262 BiopythonWarning) 263 locations.append((location_id, start, end, strand)) 264 # Get possible remote reference information 265 remote_results = adaptor.execute_and_fetchall( 266 "SELECT location_id, dbname, accession, version" 267 " FROM location join dbxref using (dbxref_id)" 268 " WHERE seqfeature_id = %s", (seqfeature_id,)) 269 lookup = {} 270 for location_id, dbname, accession, version in remote_results: 271 if version and version != "0": 272 v = "%s.%s" % (accession, version) 273 else: 274 v = accession 275 # subfeature remote location db_ref are stored as a empty string when 276 # not present 277 if dbname == "": 278 dbname = None 279 lookup[location_id] = (dbname, v) 280 281 feature = SeqFeature.SeqFeature(type=seqfeature_type) 282 feature._seqfeature_id = seqfeature_id # Store the key as a private property 283 feature.qualifiers = qualifiers 284 if len(locations) == 0: 285 pass 286 elif len(locations) == 1: 287 location_id, start, end, strand = locations[0] 288 #See Bug 2677, we currently don't record the location_operator 289 #For consistency with older versions Biopython, default to "". 290 feature.location_operator = \ 291 _retrieve_location_qualifier_value(adaptor, location_id) 292 dbname, version = lookup.get(location_id, (None, None)) 293 feature.location = SeqFeature.FeatureLocation(start, end) 294 feature.strand = strand 295 feature.ref_db = dbname 296 feature.ref = version 297 else: 298 sub_features = feature.sub_features 299 assert sub_features == [] 300 for location in locations: 301 location_id, start, end, strand = location 302 dbname, version = lookup.get(location_id, (None, None)) 303 subfeature = SeqFeature.SeqFeature() 304 subfeature.type = seqfeature_type 305 subfeature.location = SeqFeature.FeatureLocation(start, end) 306 #subfeature.location_operator = \ 307 # _retrieve_location_qualifier_value(adaptor, location_id) 308 subfeature.strand = strand 309 subfeature.ref_db = dbname 310 subfeature.ref = version 311 sub_features.append(subfeature) 312 # Locations are in order, but because of remote locations for 313 # sub-features they are not necessarily in numerical order: 314 strands = set(sf.strand for sf in sub_features) 315 if len(strands)==1 and -1 in strands: 316 #Evil hack time for backwards compatibility 317 #TODO - Check if BioPerl and (old) Biopython did the same, 318 #we may have an existing incompatibility lurking here... 319 locs = [f.location for f in sub_features[::-1]] 320 else: 321 #All forward, or mixed strands 322 locs = [f.location for f in sub_features] 323 feature.location = SeqFeature.CompoundLocation(locs, seqfeature_type) 324 #TODO - See Bug 2677 - we don't yet record location_operator, 325 #so for consistency with older versions of Biopython default 326 #to assuming its a join. 327 feature.location_operator = "join" 328 seq_feature_list.append(feature) 329 330 return seq_feature_list
331 332
333 -def _retrieve_location_qualifier_value(adaptor, location_id):
334 value = adaptor.execute_and_fetch_col0( 335 "SELECT value FROM location_qualifier_value" 336 " WHERE location_id = %s", (location_id,)) 337 try: 338 return value[0] 339 except IndexError: 340 return ""
341 342
343 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
344 annotations = {} 345 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 346 annotations.update(_retrieve_reference(adaptor, primary_id)) 347 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 348 annotations.update(_retrieve_comment(adaptor, primary_id)) 349 # Convert values into strings in cases of unicode from the database. 350 # BioSQL could eventually be expanded to be unicode aware. 351 str_anns = {} 352 for key, val in annotations.items(): 353 if isinstance(val, list): 354 val = [_make_unicode_into_string(x) for x in val] 355 elif isinstance(val, unicode): 356 val = str(val) 357 str_anns[key] = val 358 return str_anns
359 360
361 -def _make_unicode_into_string(text):
362 if isinstance(text, unicode): 363 return str(text) 364 else: 365 return text
366 367
368 -def _retrieve_qualifier_value(adaptor, primary_id):
369 qvs = adaptor.execute_and_fetchall( 370 "SELECT name, value" 371 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 372 " WHERE bioentry_id = %s" 373 " ORDER BY rank", (primary_id,)) 374 qualifiers = {} 375 for name, value in qvs: 376 if name == "keyword": 377 name = "keywords" 378 #See handling of "date" in Loader.py 379 elif name == "date_changed": 380 name = "date" 381 elif name == "secondary_accession": 382 name = "accessions" 383 qualifiers.setdefault(name, []).append(value) 384 return qualifiers
385 386
387 -def _retrieve_reference(adaptor, primary_id):
388 # XXX dbxref_qualifier_value 389 390 refs = adaptor.execute_and_fetchall( 391 "SELECT start_pos, end_pos, " 392 " location, title, authors," 393 " dbname, accession" 394 " FROM bioentry_reference" 395 " JOIN reference USING (reference_id)" 396 " LEFT JOIN dbxref USING (dbxref_id)" 397 " WHERE bioentry_id = %s" 398 " ORDER BY rank", (primary_id,)) 399 references = [] 400 for start, end, location, title, authors, dbname, accession in refs: 401 reference = SeqFeature.Reference() 402 #If the start/end are missing, reference.location is an empty list 403 if (start is not None) or (end is not None): 404 if start is not None: 405 start -= 1 # python counting 406 reference.location = [SeqFeature.FeatureLocation(start, end)] 407 #Don't replace the default "" with None. 408 if authors: 409 reference.authors = authors 410 if title: 411 reference.title = title 412 reference.journal = location 413 if dbname == 'PUBMED': 414 reference.pubmed_id = accession 415 elif dbname == 'MEDLINE': 416 reference.medline_id = accession 417 references.append(reference) 418 if references: 419 return {'references': references} 420 else: 421 return {}
422 423
424 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
425 a = {} 426 common_names = adaptor.execute_and_fetch_col0( 427 "SELECT name FROM taxon_name WHERE taxon_id = %s" 428 " AND name_class = 'genbank common name'", (taxon_id,)) 429 if common_names: 430 a['source'] = common_names[0] 431 scientific_names = adaptor.execute_and_fetch_col0( 432 "SELECT name FROM taxon_name WHERE taxon_id = %s" 433 " AND name_class = 'scientific name'", (taxon_id,)) 434 if scientific_names: 435 a['organism'] = scientific_names[0] 436 ncbi_taxids = adaptor.execute_and_fetch_col0( 437 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 438 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 439 a['ncbi_taxid'] = ncbi_taxids[0] 440 441 #Old code used the left/right values in the taxon table to get the 442 #taxonomy lineage in one SQL command. This was actually very slow, 443 #and would fail if the (optional) left/right values were missing. 444 # 445 #The following code is based on a contribution from Eric Gibert, and 446 #relies on the taxon table's parent_taxon_id field only (ignoring the 447 #optional left/right values). This means that it has to make a 448 #separate SQL query for each entry in the lineage, but it does still 449 #appear to be *much* faster. See Bug 2494. 450 taxonomy = [] 451 while taxon_id: 452 name, rank, parent_taxon_id = adaptor.execute_one( 453 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 454 " FROM taxon, taxon_name" 455 " WHERE taxon.taxon_id=taxon_name.taxon_id" 456 " AND taxon_name.name_class='scientific name'" 457 " AND taxon.taxon_id = %s", (taxon_id,)) 458 if taxon_id == parent_taxon_id: 459 # If the taxon table has been populated by the BioSQL script 460 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 461 # Personally, I would have used a NULL parent_taxon_id here. 462 break 463 if rank != "no rank": 464 #For consistency with older versions of Biopython, we are only 465 #interested in taxonomy entries with a stated rank. 466 #Add this to the start of the lineage list. 467 taxonomy.insert(0, name) 468 taxon_id = parent_taxon_id 469 470 if taxonomy: 471 a['taxonomy'] = taxonomy 472 return a
473 474
475 -def _retrieve_comment(adaptor, primary_id):
476 qvs = adaptor.execute_and_fetchall( 477 "SELECT comment_text FROM comment" 478 " WHERE bioentry_id=%s" 479 " ORDER BY rank", (primary_id,)) 480 comments = [comm[0] for comm in qvs] 481 #Don't want to add an empty list... 482 if comments: 483 return {"comment": comments} 484 else: 485 return {}
486 487
488 -class DBSeqRecord(SeqRecord):
489 """BioSQL equivalent of the biopython SeqRecord object. 490 """ 491
492 - def __init__(self, adaptor, primary_id):
493 self._adaptor = adaptor 494 self._primary_id = primary_id 495 496 (self._biodatabase_id, self._taxon_id, self.name, 497 accession, version, self._identifier, 498 self._division, self.description) = self._adaptor.execute_one( 499 "SELECT biodatabase_id, taxon_id, name, accession, version," 500 " identifier, division, description" 501 " FROM bioentry" 502 " WHERE bioentry_id = %s", (self._primary_id,)) 503 if version and version != "0": 504 self.id = "%s.%s" % (accession, version) 505 else: 506 self.id = accession 507 #We don't yet record any per-letter-annotations in the 508 #BioSQL database, but we should set this property up 509 #for completeness (and the __str__ method). 510 try: 511 length = len(self.seq) 512 except: 513 #Could be no sequence in the database! 514 length = 0 515 self._per_letter_annotations = _RestrictedDict(length=length)
516
517 - def __get_seq(self):
518 if not hasattr(self, "_seq"): 519 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 520 return self._seq
521
522 - def __set_seq(self, seq):
523 self._seq = seq
524
525 - def __del_seq(self):
526 del self._seq
527 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 528
529 - def __get_dbxrefs(self):
530 if not hasattr(self, "_dbxrefs"): 531 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 532 return self._dbxrefs
533
534 - def __set_dbxrefs(self, dbxrefs):
535 self._dbxrefs = dbxrefs
536
537 - def __del_dbxrefs(self):
538 del self._dbxrefs
539 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 540 "Database cross references") 541
542 - def __get_features(self):
543 if not hasattr(self, "_features"): 544 self._features = _retrieve_features(self._adaptor, 545 self._primary_id) 546 return self._features
547
548 - def __set_features(self, features):
549 self._features = features
550
551 - def __del_features(self):
552 del self._features
553 features = property(__get_features, __set_features, __del_features, 554 "Features") 555
556 - def __get_annotations(self):
557 if not hasattr(self, "_annotations"): 558 self._annotations = _retrieve_annotations(self._adaptor, 559 self._primary_id, 560 self._taxon_id) 561 if self._identifier: 562 self._annotations["gi"] = self._identifier 563 if self._division: 564 self._annotations["data_file_division"] = self._division 565 return self._annotations
566
567 - def __set_annotations(self, annotations):
568 self._annotations = annotations
569
570 - def __del_annotations(self):
571 del self._annotations
572 annotations = property(__get_annotations, __set_annotations, 573 __del_annotations, "Annotations")
574