1
2
3
4
5
6
7
"""
This module provides code to work with the prosite dat file from
Prosite.
http://www.expasy.ch/prosite/

Tested with:
Release 20.43, 10-Feb-2009


Functions:
read                  Reads a Prosite file containing one Prosite record
parse                 Iterates over records in a Prosite file.

Classes:
Record                Holds Prosite data.
"""
24
25
def parse(handle):
    """Parse Prosite records.

    This function is for parsing Prosite files containing multiple
    records.

    handle   - handle to the file.

    This is a generator: it yields one record at a time, in file order,
    and stops as soon as the low-level reader returns a false value
    (which is how it signals that the handle has been used up).
    Errors raised by the low-level reader while parsing a record are
    not caught here.
    """
    while True:
        record = __read(handle)
        if not record:
            # Nothing more to read from the handle.
            break
        yield record
38
39
def read(handle):
    """Read one Prosite record.

    This function is for parsing Prosite files containing
    exactly one record.

    handle   - handle to the file.

    Returns whatever the low-level reader produced for the first record
    in the handle.

    Raises ValueError if any data at all is left in the handle after
    that first record has been read.
    """
    record = __read(handle)
    # We should have reached the end of the input by now; anything that
    # is still unread means the file held more than a single record.
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one Prosite record found")
    return record
54
55
class Record(object):
    """Holds information from a Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines). (strings)
    postprocessing List of post-processing entries (from PP lines). (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments. (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    Every member is given an empty default ('' for text, [] for lists,
    None / (None, None) for the numerical results) so that any of them
    can be read on a record whose entry did not supply it.
    """
    def __init__(self):
        """Create an empty record with every documented member set."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These six are documented members too; without a default here,
        # reading one of them on a record that lacks the corresponding
        # qualifier would raise AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
145
146
147
148
def __read(handle):
    """Read the next Prosite record from handle.

    Lines are taken from handle one at a time.  The first two characters
    of a line are its line code and everything from the sixth character
    on (right-stripped) is its value.  An ``ID`` line starts a new
    Record; the following lines fill it in; the ``//`` line that comes
    after it ends the read.  A ``//`` line seen before any ``ID`` line
    is skipped (presumably the header block that precedes the first
    entry).

    handle   - handle to the file.

    Returns the populated Record, or None if handle runs out before a
    record has been closed by ``//`` (a record still in progress at that
    point is discarded).

    Raises ValueError for an unknown line code, a malformed ID or DT
    line, an unknown NR or CC qualifier, a malformed hit count, or an
    unknown DR flag.
    """
    import re

    # Suffix tags expected, in this order, on the three DT fields.
    date_tags = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    # NR qualifiers whose data looks like "hits(seqs)".
    nr_hit_attrs = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers whose data is stored verbatim on the record.
    cc_text_attrs = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    # DR flag -> name of the Record list the reference is appended to.
    dr_flag_attrs = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }

    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want the final '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            # Too few fields is reported the same way as a wrong tag,
            # rather than escaping as an IndexError.
            if len(dates) < 3 or not all(
                    date.endswith(tag)
                    for date, tag in zip(dates, date_tags)):
                raise ValueError("I don't understand date line\n%s" % line)
            # Cut the tag off as a suffix.  str.rstrip(chars) treats its
            # argument as a set of characters, so it must not be used to
            # remove a literal suffix.
            stripped = [date[:-len(tag)].rstrip()
                        for date, tag in zip(dates, date_tags)]
            record.created = stripped[0]
            record.data_update = stripped[1]
            record.info_update = stripped[2]
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may be spread over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in nr_hit_attrs:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_attrs[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines made of "/QUALIFIER=data;" items: split on
            # ";" first and then on "=".
            for col in value.split(";"):
                # Skip empty items, the free-text "Automatic scaling ..."
                # remark, and any other item that has no qualifier.
                if (not col or col.startswith('Automatic scaling')
                        or "=" not in col):
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_text_attrs:
                    setattr(record, cc_text_attrs[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in dr_flag_attrs:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, dr_flag_attrs[flag]).append((acc, name))
        elif keyword == '3D':
            record.pdb_structs.extend(
                pdb_id.rstrip(';') for pdb_id in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # No entry has been started yet, so this terminator does
                # not close a record; keep reading.
                continue
            break
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # handle ran out without a closing '//'.
        return None
    # Only reachable through the break above, i.e. with a record in hand.
    return record
296