1
2
3
4
5
6
7
8
9
10 """Implementations of Biopython-like Seq objects on top of BioSQL.
11
12 This allows retrival of items stored in a BioSQL database using
13 a biopython-like SeqRecord and Seq interface.
14
15 Note: Currently we do not support recording per-letter-annotations
16 (like quality scores) in BioSQL.
17 """
18
19 from Bio import Alphabet
20 from Bio.Seq import Seq, UnknownSeq
21 from Bio.SeqRecord import SeqRecord, _RestrictedDict
22 from Bio import SeqFeature
23
24
26 - def __init__(self, primary_id, adaptor, alphabet, start, length):
27 """Create a new DBSeq object referring to a BioSQL entry.
28
29 You wouldn't normally create a DBSeq object yourself, this is done
30 for you when retreiving a DBSeqRecord object from the database.
31 """
32 self.primary_id = primary_id
33 self.adaptor = adaptor
34 self.alphabet = alphabet
35 self._length = length
36 self.start = start
37
40
42
43
44
45 if isinstance(index, int):
46
47 i = index
48 if i < 0:
49 if -i > self._length:
50 raise IndexError(i)
51 i = i + self._length
52 elif i >= self._length:
53 raise IndexError(i)
54 return self.adaptor.get_subseq_as_string(self.primary_id,
55 self.start + i,
56 self.start + i + 1)
57 if not isinstance(index, slice):
58 raise ValueError("Unexpected index type")
59
60
61
62 if index.start is None:
63 i = 0
64 else:
65 i = index.start
66 if i < 0:
67
68 if -i > self._length:
69 raise IndexError(i)
70 i = i + self._length
71 elif i >= self._length:
72
73 i = self._length
74
75 if index.stop is None:
76 j = self._length
77 else:
78 j = index.stop
79 if j < 0:
80
81 if -j > self._length:
82 raise IndexError(j)
83 j = j + self._length
84 elif j >= self._length:
85 j = self._length
86
87 if i >= j:
88
89 return Seq("", self.alphabet)
90 elif index.step is None or index.step == 1:
91
92 return self.__class__(self.primary_id, self.adaptor, self.alphabet,
93 self.start + i, j - i)
94 else:
95
96 full = self.adaptor.get_subseq_as_string(self.primary_id,
97 self.start + i,
98 self.start + j)
99 return Seq(full[::index.step], self.alphabet)
100
102 """Returns the full sequence as a python string.
103
104 Although not formally deprecated, you are now encouraged to use
105 str(my_seq) instead of my_seq.tostring()."""
106 return self.adaptor.get_subseq_as_string(self.primary_id,
107 self.start,
108 self.start + self._length)
109
115
116 data = property(tostring, doc="Sequence as string (DEPRECATED)")
117
119 """Returns the full sequence as a Seq object."""
120
121 return Seq(str(self), self.alphabet)
122
126
130
131
133
134
135
136
137
138
139 seqs = adaptor.execute_and_fetchall(
140 "SELECT alphabet, length, length(seq) FROM biosequence"
141 " WHERE bioentry_id = %s", (primary_id,))
142 if not seqs:
143 return
144 assert len(seqs) == 1
145 moltype, given_length, length = seqs[0]
146
147 try:
148 length = int(length)
149 given_length = int(length)
150 assert length == given_length
151 have_seq = True
152 except TypeError:
153 assert length is None
154 seqs = adaptor.execute_and_fetchall(
155 "SELECT alphabet, length, seq FROM biosequence"
156 " WHERE bioentry_id = %s", (primary_id,))
157 assert len(seqs) == 1
158 moltype, given_length, seq = seqs[0]
159 assert seq is None or seq == ""
160 length = int(given_length)
161 have_seq = False
162 del seq
163 del given_length
164
165 moltype = moltype.lower()
166
167
168 if moltype == "dna":
169 alphabet = Alphabet.generic_dna
170 elif moltype == "rna":
171 alphabet = Alphabet.generic_rna
172 elif moltype == "protein":
173 alphabet = Alphabet.generic_protein
174 elif moltype == "unknown":
175
176
177 alphabet = Alphabet.single_letter_alphabet
178 else:
179 raise AssertionError("Unknown moltype: %s" % moltype)
180
181 if have_seq:
182 return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
183 else:
184 return UnknownSeq(length, alphabet)
185
186
188 """Retrieve the database cross references for the sequence."""
189 _dbxrefs = []
190 dbxrefs = adaptor.execute_and_fetchall(
191 "SELECT dbname, accession, version"
192 " FROM bioentry_dbxref join dbxref using (dbxref_id)"
193 " WHERE bioentry_id = %s"
194 " ORDER BY rank", (primary_id,))
195 for dbname, accession, version in dbxrefs:
196 if version and version != "0":
197 v = "%s.%s" % (accession, version)
198 else:
199 v = accession
200 _dbxrefs.append("%s:%s" % (dbname, v))
201 return _dbxrefs
202
203
205 sql = "SELECT seqfeature_id, type.name, rank" \
206 " FROM seqfeature join term type on (type_term_id = type.term_id)" \
207 " WHERE bioentry_id = %s" \
208 " ORDER BY rank"
209 results = adaptor.execute_and_fetchall(sql, (primary_id,))
210 seq_feature_list = []
211 for seqfeature_id, seqfeature_type, seqfeature_rank in results:
212
213 qvs = adaptor.execute_and_fetchall(
214 "SELECT name, value"
215 " FROM seqfeature_qualifier_value join term using (term_id)"
216 " WHERE seqfeature_id = %s"
217 " ORDER BY rank", (seqfeature_id,))
218 qualifiers = {}
219 for qv_name, qv_value in qvs:
220 qualifiers.setdefault(qv_name, []).append(qv_value)
221
222 qvs = adaptor.execute_and_fetchall(
223 "SELECT dbxref.dbname, dbxref.accession"
224 " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
225 " WHERE seqfeature_dbxref.seqfeature_id = %s"
226 " ORDER BY rank", (seqfeature_id,))
227 for qv_name, qv_value in qvs:
228 value = "%s:%s" % (qv_name, qv_value)
229 qualifiers.setdefault("db_xref", []).append(value)
230
231 results = adaptor.execute_and_fetchall(
232 "SELECT location_id, start_pos, end_pos, strand"
233 " FROM location"
234 " WHERE seqfeature_id = %s"
235 " ORDER BY rank", (seqfeature_id,))
236 locations = []
237
238
239
240
241
242
243 for location_id, start, end, strand in results:
244 if start:
245 start -= 1
246 if strand == 0:
247 strand = None
248 if strand not in (+1, -1, None):
249 raise ValueError("Invalid strand %s found in database for "
250 "seqfeature_id %s" % (strand, seqfeature_id))
251 if end < start:
252 import warnings
253 from Bio import BiopythonWarning
254 warnings.warn("Inverted location start/end (%i and %i) for "
255 "seqfeature_id %s" % (start, end, seqfeature_id),
256 BiopythonWarning)
257 locations.append((location_id, start, end, strand))
258
259 remote_results = adaptor.execute_and_fetchall(
260 "SELECT location_id, dbname, accession, version"
261 " FROM location join dbxref using (dbxref_id)"
262 " WHERE seqfeature_id = %s", (seqfeature_id,))
263 lookup = {}
264 for location_id, dbname, accession, version in remote_results:
265 if version and version != "0":
266 v = "%s.%s" % (accession, version)
267 else:
268 v = accession
269
270
271 if dbname == "":
272 dbname = None
273 lookup[location_id] = (dbname, v)
274
275 feature = SeqFeature.SeqFeature(type=seqfeature_type)
276 feature._seqfeature_id = seqfeature_id
277 feature.qualifiers = qualifiers
278 if len(locations) == 0:
279 pass
280 elif len(locations) == 1:
281 location_id, start, end, strand = locations[0]
282
283
284 feature.location_operator = \
285 _retrieve_location_qualifier_value(adaptor, location_id)
286 dbname, version = lookup.get(location_id, (None, None))
287 feature.location = SeqFeature.FeatureLocation(start, end)
288 feature.strand = strand
289 feature.ref_db = dbname
290 feature.ref = version
291 else:
292 assert feature.sub_features == []
293 for location in locations:
294 location_id, start, end, strand = location
295 dbname, version = lookup.get(location_id, (None, None))
296 subfeature = SeqFeature.SeqFeature()
297 subfeature.type = seqfeature_type
298 subfeature.location_operator = \
299 _retrieve_location_qualifier_value(adaptor, location_id)
300
301
302
303 if not subfeature.location_operator:
304 subfeature.location_operator = "join"
305 subfeature.location = SeqFeature.FeatureLocation(start, end)
306 subfeature.strand = strand
307 subfeature.ref_db = dbname
308 subfeature.ref = version
309 feature.sub_features.append(subfeature)
310
311
312 feature.location_operator = \
313 feature.sub_features[0].location_operator
314
315
316 start = locations[0][1]
317 end = locations[-1][2]
318 feature.location = SeqFeature.FeatureLocation(start, end)
319
320
321
322 strands = set(sf.strand for sf in feature.sub_features)
323 if len(strands) == 1:
324 feature.strand = feature.sub_features[0].strand
325 else:
326 feature.strand = None
327
328 seq_feature_list.append(feature)
329
330 return seq_feature_list
331
332
334 value = adaptor.execute_and_fetch_col0(
335 "SELECT value FROM location_qualifier_value"
336 " WHERE location_id = %s", (location_id,))
337 try:
338 return value[0]
339 except IndexError:
340 return ""
341
342
359
360
362 if isinstance(text, unicode):
363 return str(text)
364 else:
365 return text
366
367
369 qvs = adaptor.execute_and_fetchall(
370 "SELECT name, value"
371 " FROM bioentry_qualifier_value JOIN term USING (term_id)"
372 " WHERE bioentry_id = %s"
373 " ORDER BY rank", (primary_id,))
374 qualifiers = {}
375 for name, value in qvs:
376 if name == "keyword":
377 name = "keywords"
378
379 elif name == "date_changed":
380 name = "date"
381 elif name == "secondary_accession":
382 name = "accessions"
383 qualifiers.setdefault(name, []).append(value)
384 return qualifiers
385
386
388
389
390 refs = adaptor.execute_and_fetchall(
391 "SELECT start_pos, end_pos, "
392 " location, title, authors,"
393 " dbname, accession"
394 " FROM bioentry_reference"
395 " JOIN reference USING (reference_id)"
396 " LEFT JOIN dbxref USING (dbxref_id)"
397 " WHERE bioentry_id = %s"
398 " ORDER BY rank", (primary_id,))
399 references = []
400 for start, end, location, title, authors, dbname, accession in refs:
401 reference = SeqFeature.Reference()
402
403 if (start is not None) or (end is not None):
404 if start is not None:
405 start -= 1
406 reference.location = [SeqFeature.FeatureLocation(start, end)]
407
408 if authors:
409 reference.authors = authors
410 if title:
411 reference.title = title
412 reference.journal = location
413 if dbname == 'PUBMED':
414 reference.pubmed_id = accession
415 elif dbname == 'MEDLINE':
416 reference.medline_id = accession
417 references.append(reference)
418 if references:
419 return {'references': references}
420 else:
421 return {}
422
423
425 a = {}
426 common_names = adaptor.execute_and_fetch_col0(
427 "SELECT name FROM taxon_name WHERE taxon_id = %s"
428 " AND name_class = 'genbank common name'", (taxon_id,))
429 if common_names:
430 a['source'] = common_names[0]
431 scientific_names = adaptor.execute_and_fetch_col0(
432 "SELECT name FROM taxon_name WHERE taxon_id = %s"
433 " AND name_class = 'scientific name'", (taxon_id,))
434 if scientific_names:
435 a['organism'] = scientific_names[0]
436 ncbi_taxids = adaptor.execute_and_fetch_col0(
437 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
438 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
439 a['ncbi_taxid'] = ncbi_taxids[0]
440
441
442
443
444
445
446
447
448
449
450 taxonomy = []
451 while taxon_id:
452 name, rank, parent_taxon_id = adaptor.execute_one(
453 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
454 " FROM taxon, taxon_name"
455 " WHERE taxon.taxon_id=taxon_name.taxon_id"
456 " AND taxon_name.name_class='scientific name'"
457 " AND taxon.taxon_id = %s", (taxon_id,))
458 if taxon_id == parent_taxon_id:
459
460
461
462 break
463 if rank != "no rank":
464
465
466
467 taxonomy.insert(0, name)
468 taxon_id = parent_taxon_id
469
470 if taxonomy:
471 a['taxonomy'] = taxonomy
472 return a
473
474
486
487
489 """BioSQL equivalent of the biopython SeqRecord object.
490 """
491
492 - def __init__(self, adaptor, primary_id):
493 self._adaptor = adaptor
494 self._primary_id = primary_id
495
496 (self._biodatabase_id, self._taxon_id, self.name,
497 accession, version, self._identifier,
498 self._division, self.description) = self._adaptor.execute_one(
499 "SELECT biodatabase_id, taxon_id, name, accession, version,"
500 " identifier, division, description"
501 " FROM bioentry"
502 " WHERE bioentry_id = %s", (self._primary_id,))
503 if version and version != "0":
504 self.id = "%s.%s" % (accession, version)
505 else:
506 self.id = accession
507
508
509
510 try:
511 length = len(self.seq)
512 except:
513
514 length = 0
515 self._per_letter_annotations = _RestrictedDict(length=length)
516
518 if not hasattr(self, "_seq"):
519 self._seq = _retrieve_seq(self._adaptor, self._primary_id)
520 return self._seq
521
524
527 seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
528
530 if not hasattr(self, "_dbxrefs"):
531 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
532 return self._dbxrefs
533
536
539 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs,
540 "Database cross references")
541
543 if not hasattr(self, "_features"):
544 self._features = _retrieve_features(self._adaptor,
545 self._primary_id)
546 return self._features
547
550
553 features = property(__get_features, __set_features, __del_features,
554 "Features")
555
557 if not hasattr(self, "_annotations"):
558 self._annotations = _retrieve_annotations(self._adaptor,
559 self._primary_id,
560 self._taxon_id)
561 if self._identifier:
562 self._annotations["gi"] = self._identifier
563 if self._division:
564 self._annotations["data_file_division"] = self._division
565 return self._annotations
566
569
571 del self._annotations
572 annotations = property(__get_annotations, __set_annotations,
573 __del_annotations, "Annotations")
574