1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
9
10 You are expected to use this module via the Bio.SeqIO functions.
11 See also the Bio.SwissProt module which offers more than just accessing
12 the sequences as SeqRecord objects.
13
14 See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
15 """
16
17 from Bio import Seq
18 from Bio import SeqRecord
19 from Bio import Alphabet
20 from Bio import SeqFeature
21 from Bio import SwissProt
22
23
25 """Turn a Swiss location position into a SeqFeature position object (PRIVATE).
26
27 An offset of -1 is used with a start location to make it pythonic.
28 """
29 if location_string == "?":
30 return SeqFeature.UnknownPosition()
31
32 try:
33 return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
34 except ValueError:
35 pass
36 if location_string.startswith("<"):
37 try:
38 return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:])))
39 except ValueError:
40 pass
41 elif location_string.startswith(">"):
42 try:
43 return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:])))
44 except ValueError:
45 pass
46 elif location_string.startswith("?"):
47 try:
48 return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:])))
49 except ValueError:
50 pass
51 raise NotImplementedError("Cannot parse location '%s'" % location_string)
52
53
62
63
65 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
66
67 Every section from the ID line to the terminating // becomes
68 a single SeqRecord with associated annotation and features.
69
70 This parser is for the flat file "swiss" format as used by:
71 * Swiss-Prot aka SwissProt
72 * TrEMBL
73 * UniProtKB aka UniProt Knowledgebase
74
75 For consistency with BioPerl and EMBOSS we call this the "swiss"
76 format. See also the SeqIO support for "uniprot-xml" format.
77 """
78 swiss_records = SwissProt.parse(handle)
79 for swiss_record in swiss_records:
80
81 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
82 record = SeqRecord.SeqRecord(seq,
83 id=swiss_record.accessions[0],
84 name=swiss_record.entry_name,
85 description=swiss_record.description,
86 features=[_make_seqfeature(*f) for f
87 in swiss_record.features],
88 )
89 record.description = swiss_record.description
90 for cross_reference in swiss_record.cross_references:
91 if len(cross_reference) < 2:
92 continue
93 database, accession = cross_reference[:2]
94 dbxref = "%s:%s" % (database, accession)
95 if not dbxref in record.dbxrefs:
96 record.dbxrefs.append(dbxref)
97 annotations = record.annotations
98 annotations['accessions'] = swiss_record.accessions
99 annotations['date'] = swiss_record.created[0]
100 annotations[
101 'date_last_sequence_update'] = swiss_record.sequence_update[0]
102 if swiss_record.annotation_update:
103 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
104 if swiss_record.gene_name:
105 annotations['gene_name'] = swiss_record.gene_name
106 annotations['organism'] = swiss_record.organism.rstrip(".")
107 annotations['taxonomy'] = swiss_record.organism_classification
108 annotations['ncbi_taxid'] = swiss_record.taxonomy_id
109 if swiss_record.host_organism:
110 annotations['organism_host'] = swiss_record.host_organism
111 if swiss_record.host_taxonomy_id:
112 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
113 if swiss_record.comments:
114 annotations['comment'] = "\n".join(swiss_record.comments)
115 if swiss_record.references:
116 annotations['references'] = []
117 for reference in swiss_record.references:
118 feature = SeqFeature.Reference()
119 feature.comment = " ".join(["%s=%s;" % (key, value)
120 for key, value
121 in reference.comments])
122 for key, value in reference.references:
123 if key == 'PubMed':
124 feature.pubmed_id = value
125 elif key == 'MEDLINE':
126 feature.medline_id = value
127 elif key == 'DOI':
128 pass
129 elif key == 'AGRICOLA':
130 pass
131 else:
132 raise ValueError(
133 "Unknown key %s found in references" % key)
134 feature.authors = reference.authors
135 feature.title = reference.title
136 feature.journal = reference.location
137 annotations['references'].append(feature)
138 if swiss_record.keywords:
139 record.annotations['keywords'] = swiss_record.keywords
140 yield record
141
142 if __name__ == "__main__":
143 print "Quick self test..."
144
145 example_filename = "../../Tests/SwissProt/sp008"
146
147 import os
148 if not os.path.isfile(example_filename):
149 print "Missing test file %s" % example_filename
150 else:
151
152 handle = open(example_filename)
153 records = SwissIterator(handle)
154 for record in records:
155 print record.name
156 print record.id
157 print record.annotations['keywords']
158 print repr(record.annotations['organism'])
159 print str(record.seq)[:20] + "..."
160 for f in record.features:
161 print f
162 handle.close()
163