1
2
3
4
5 """
6 This module provides code to work with the sprotXX.dat file from
7 SwissProt.
8 http://www.expasy.ch/sprot/sprot-top.html
9
10 Tested with:
11 Release 56.9, 03-March-2009.
12
13
14 Classes:
15 Record Holds SwissProt data.
16 Reference Holds reference data from a SwissProt record.
17
18 Functions:
19 read Read one SwissProt record
20 parse Read multiple SwissProt records
21
22 """
23
24 from Bio._py3k import _as_string
25
26
28 """Holds information from a SwissProt record.
29
30 Members:
31 entry_name Name of this entry, e.g. RL1_ECOLI.
32 data_class Either 'STANDARD' or 'PRELIMINARY'.
33 molecule_type Type of molecule, 'PRT',
34 sequence_length Number of residues.
35
36 accessions List of the accession numbers, e.g. ['P00321']
37 created A tuple of (date, release).
38 sequence_update A tuple of (date, release).
39 annotation_update A tuple of (date, release).
40
41 description Free-format description.
42 gene_name Gene name. See userman.txt for description.
43 organism The source of the sequence.
44 organelle The origin of the sequence.
45 organism_classification The taxonomy classification. List of strings.
46 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
47 taxonomy_id A list of NCBI taxonomy id's.
48 host_organism A list of names of the hosts of a virus, if any.
49 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any.
50 references List of Reference objects.
51 comments List of strings.
52 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
53 keywords List of the keywords.
54 features List of tuples (key name, from, to, description).
55 from and to can be either integers for the residue
56 numbers, '<', '>', or '?'
57
58 seqinfo tuple of (length, molecular weight, CRC32 value)
59 sequence The sequence.
60
61 """
63 self.entry_name = None
64 self.data_class = None
65 self.molecule_type = None
66 self.sequence_length = None
67
68 self.accessions = []
69 self.created = None
70 self.sequence_update = None
71 self.annotation_update = None
72
73 self.description = []
74 self.gene_name = ''
75 self.organism = []
76 self.organelle = ''
77 self.organism_classification = []
78 self.taxonomy_id = []
79 self.host_organism = []
80 self.host_taxonomy_id = []
81 self.references = []
82 self.comments = []
83 self.cross_references = []
84 self.keywords = []
85 self.features = []
86
87 self.seqinfo = None
88 self.sequence = ''
89
90
92 """Holds information from one reference in a SwissProt entry.
93
94 Members:
95 number Number of reference in an entry.
96 positions Describes extent of work. List of strings.
97 comments Comments. List of (token, text).
98 references References. List of (dbname, identifier).
99 authors The authors of the work.
100 title Title of the work.
101 location A citation for the work.
102
103 """
105 self.number = None
106 self.positions = []
107 self.comments = []
108 self.references = []
109 self.authors = []
110 self.title = []
111 self.location = []
112
113
115 while True:
116 record = _read(handle)
117 if not record:
118 return
119 yield record
120
121
123 record = _read(handle)
124 if not record:
125 raise ValueError("No SwissProt record found")
126
127 remainder = handle.read()
128 if remainder:
129 raise ValueError("More than one SwissProt record found")
130 return record
131
132
133
134
135
137 record = None
138 unread = ""
139 for line in handle:
140
141
142 line = _as_string(line)
143 key, value = line[:2], line[5:].rstrip()
144 if unread:
145 value = unread + " " + value
146 unread = ""
147 if key == '**':
148
149
150
151
152
153
154 pass
155 elif key == 'ID':
156 record = Record()
157 _read_id(record, line)
158 _sequence_lines = []
159 elif key == 'AC':
160 accessions = [word for word in value.rstrip(";").split("; ")]
161 record.accessions.extend(accessions)
162 elif key == 'DT':
163 _read_dt(record, line)
164 elif key == 'DE':
165 record.description.append(value.strip())
166 elif key == 'GN':
167 if record.gene_name:
168 record.gene_name += " "
169 record.gene_name += value
170 elif key == 'OS':
171 record.organism.append(value)
172 elif key == 'OG':
173 record.organelle += line[5:]
174 elif key == 'OC':
175 cols = [col for col in value.rstrip(";.").split("; ")]
176 record.organism_classification.extend(cols)
177 elif key == 'OX':
178 _read_ox(record, line)
179 elif key == 'OH':
180 _read_oh(record, line)
181 elif key == 'RN':
182 reference = Reference()
183 _read_rn(reference, value)
184 record.references.append(reference)
185 elif key == 'RP':
186 assert record.references, "RP: missing RN"
187 record.references[-1].positions.append(value)
188 elif key == 'RC':
189 assert record.references, "RC: missing RN"
190 reference = record.references[-1]
191 unread = _read_rc(reference, value)
192 elif key == 'RX':
193 assert record.references, "RX: missing RN"
194 reference = record.references[-1]
195 _read_rx(reference, value)
196 elif key == 'RL':
197 assert record.references, "RL: missing RN"
198 reference = record.references[-1]
199 reference.location.append(value)
200
201
202
203 elif key == 'RA':
204 assert record.references, "RA: missing RN"
205 reference = record.references[-1]
206 reference.authors.append(value)
207 elif key == 'RG':
208 assert record.references, "RG: missing RN"
209 reference = record.references[-1]
210 reference.authors.append(value)
211 elif key == "RT":
212 assert record.references, "RT: missing RN"
213 reference = record.references[-1]
214 reference.title.append(value)
215 elif key == 'CC':
216 _read_cc(record, line)
217 elif key == 'DR':
218 _read_dr(record, value)
219 elif key == 'PE':
220
221 pass
222 elif key == 'KW':
223 cols = value.rstrip(";.").split('; ')
224 record.keywords.extend(cols)
225 elif key == 'FT':
226 _read_ft(record, line)
227 elif key == 'SQ':
228 cols = value.split()
229 assert len(cols) == 7, "I don't understand SQ line %s" % line
230
231 record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
232 elif key == ' ':
233 _sequence_lines.append(value.replace(" ", "").rstrip())
234 elif key == '//':
235
236 record.description = " ".join(record.description)
237 record.organism = " ".join(record.organism)
238 record.organelle = record.organelle.rstrip()
239 for reference in record.references:
240 reference.authors = " ".join(reference.authors).rstrip(";")
241 reference.title = " ".join(reference.title).rstrip(";")
242 if reference.title.startswith('"') and reference.title.endswith('"'):
243 reference.title = reference.title[1:-1]
244 reference.location = " ".join(reference.location)
245 record.sequence = "".join(_sequence_lines)
246 return record
247 else:
248 raise ValueError("Unknown keyword '%s' found" % key)
249 if record:
250 raise ValueError("Unexpected end of stream.")
251
252
254 cols = line[5:].split()
255
256
257
258
259
260 if len(cols) == 5:
261 record.entry_name = cols[0]
262 record.data_class = cols[1].rstrip(";")
263 record.molecule_type = cols[2].rstrip(";")
264 record.sequence_length = int(cols[3])
265 elif len(cols) == 4:
266 record.entry_name = cols[0]
267 record.data_class = cols[1].rstrip(";")
268 record.molecule_type = None
269 record.sequence_length = int(cols[2])
270 else:
271 raise ValueError("ID line has unrecognised format:\n" + line)
272
273 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
274 if record.data_class not in allowed:
275 raise ValueError("Unrecognized data class %s in line\n%s" %
276 (record.data_class, line))
277
278
279 if record.molecule_type not in (None, 'PRT'):
280 raise ValueError("Unrecognized molecule type %s in line\n%s" %
281 (record.molecule_type, line))
282
283
285 value = line[5:]
286 uprline = value.upper()
287 cols = value.rstrip().split()
288 if 'CREATED' in uprline \
289 or 'LAST SEQUENCE UPDATE' in uprline \
290 or 'LAST ANNOTATION UPDATE' in uprline:
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306 uprcols = uprline.split()
307 rel_index = -1
308 for index in range(len(uprcols)):
309 if 'REL.' in uprcols[index]:
310 rel_index = index
311 assert rel_index >= 0, \
312 "Could not find Rel. in DT line: %s" % line
313 version_index = rel_index + 1
314
315 str_version = cols[version_index].rstrip(",")
316
317 if str_version == '':
318 version = 0
319
320 elif '.' in str_version:
321 version = str_version
322
323 else:
324 version = int(str_version)
325 date = cols[0]
326
327 if 'CREATED' in uprline:
328 record.created = date, version
329 elif 'LAST SEQUENCE UPDATE' in uprline:
330 record.sequence_update = date, version
331 elif 'LAST ANNOTATION UPDATE' in uprline:
332 record.annotation_update = date, version
333 else:
334 assert False, "Shouldn't reach this line!"
335 elif 'INTEGRATED INTO' in uprline \
336 or 'SEQUENCE VERSION' in uprline \
337 or 'ENTRY VERSION' in uprline:
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358 try:
359 version = int(cols[-1])
360 except ValueError:
361 version = 0
362 date = cols[0].rstrip(",")
363
364
365
366 if "INTEGRATED" in uprline:
367 record.created = date, version
368 elif 'SEQUENCE VERSION' in uprline:
369 record.sequence_update = date, version
370 elif 'ENTRY VERSION' in uprline:
371 record.annotation_update = date, version
372 else:
373 assert False, "Shouldn't reach this line!"
374 else:
375 raise ValueError("I don't understand the date line %s" % line)
376
377
379
380
381
382
383
384
385
386
387
388
389 if record.taxonomy_id:
390 ids = line[5:].rstrip().rstrip(";")
391 else:
392 descr, ids = line[5:].rstrip().rstrip(";").split("=")
393 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
394 record.taxonomy_id.extend(ids.split(', '))
395
396
398
399 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line
400 line = line[16:].rstrip()
401 assert line[-1] == "." and line.count(";") == 1, line
402 taxid, name = line[:-1].split(";")
403 record.host_taxonomy_id.append(taxid.strip())
404 record.host_organism.append(name.strip())
405
406
408 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
409 reference.number = int(rn[1:-1])
410
411
432
433
435
436
437
438
439
440
441
442
443 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '')
444
445
446
447
448
449
450
451
452 warn = False
453 if "=" in value:
454 cols = value.split("; ")
455 cols = [x.strip() for x in cols]
456 cols = [x for x in cols if x]
457 for col in cols:
458 x = col.split("=")
459 if len(x) != 2 or x == ("DOI", "DOI"):
460 warn = True
461 break
462 assert len(x) == 2, "I don't understand RX line %s" % value
463 reference.references.append((x[0], x[1].rstrip(";")))
464
465 else:
466 cols = value.split("; ")
467
468 if len(cols) != 2:
469 warn = True
470 else:
471 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
472 if warn:
473 import warnings
474 from Bio import BiopythonParserWarning
475 warnings.warn("Possibly corrupt RX line %r" % value,
476 BiopythonParserWarning)
477
478
480 key, value = line[5:8], line[9:].rstrip()
481 if key == '-!-':
482 record.comments.append(value)
483 elif key == ' ':
484 if not record.comments:
485
486 record.comments.append(value)
487 else:
488 record.comments[-1] += " " + value
489
490
498
499
501 line = line[5:]
502 name = line[0:8].rstrip()
503 try:
504 from_res = int(line[9:15])
505 except ValueError:
506 from_res = line[9:15].lstrip()
507 try:
508 to_res = int(line[16:22])
509 except ValueError:
510 to_res = line[16:22].lstrip()
511
512 if line[29:35] == r"/FTId=":
513 ft_id = line[35:70].rstrip()[:-1]
514 description = ""
515 else:
516 ft_id = ""
517 description = line[29:70].rstrip()
518 if not name:
519 assert not from_res and not to_res
520 name, from_res, to_res, old_description, old_ft_id = record.features[-1]
521 del record.features[-1]
522 description = ("%s %s" % (old_description, description)).strip()
523
524
525 if name == "VARSPLIC":
526
527
528
529
530
531 descr_cols = description.split(" -> ")
532 if len(descr_cols) == 2:
533 first_seq, second_seq = descr_cols
534 extra_info = ''
535
536
537 extra_info_pos = second_seq.find(" (")
538 if extra_info_pos != -1:
539 extra_info = second_seq[extra_info_pos:]
540 second_seq = second_seq[:extra_info_pos]
541
542 first_seq = first_seq.replace(" ", "")
543 second_seq = second_seq.replace(" ", "")
544
545 description = first_seq + " -> " + second_seq + extra_info
546 record.features.append((name, from_res, to_res, description, ft_id))
547
548
549 if __name__ == "__main__":
550 print "Quick self test..."
551
552 example_filename = "../../Tests/SwissProt/sp008"
553
554 import os
555 if not os.path.isfile(example_filename):
556 print "Missing test file %s" % example_filename
557 else:
558
559
560 handle = open(example_filename)
561 records = parse(handle)
562 for record in records:
563 print record.entry_name
564 print ",".join(record.accessions)
565 print record.keywords
566 print repr(record.organism)
567 print record.sequence[:20] + "..."
568 handle.close()
569