| Trees | Indices | Help |
|
|---|
|
|
1 """Parser for FSSP files, used in a database of protein fold classifications.
2
3 This is a module to handle FSSP files. For now it parses only the header,
4 summary and alignment sections.
5
6 See: Holm and Sander (1996) The FSSP database: fold classification based on
7 structure-structure alignment of proteins.
8
9 functions: read_fssp(file_handle): reads an fssp file into the records. Returns a
10 tuple of three instances (header, summary dictionary, alignment dictionary).
11 mult_align: returns a Biopython alignment object
12 """
13 import re
14 import fssp_rec
15 from Bio.Align import Generic
16 from Bio import Alphabet
17 fff_rec = fssp_rec.fff_rec
# Patterns recognising the keyword that opens each FSSP header line.
# Each key is also the name of the attribute that the matching line
# populates on the header object (see the setattr calls in fill_header).
header_records = dict(
    database=re.compile('^DATABASE'),
    pdbid=re.compile('^PDBID'),
    header=re.compile('^HEADER'),
    compnd=re.compile('^COMPND'),
    author=re.compile('^AUTHOR'),
    source=re.compile('^SOURCE'),
    seqlength=re.compile('^SEQLENGTH'),
    nalign=re.compile('^NALIGN'),
)

# Title lines that open the three sections of the file body.
summary_title = re.compile('## +SUMMARY')
alignments_title = re.compile('## +ALIGNMENTS')
equiv_title = re.compile('## +EQUIVALENCES')

# Data lines inside the SUMMARY and ALIGNMENTS sections.
summary_rec = re.compile(' *[0-9]+: +[1-9][0-9a-z]{3,3}')
alignments_rec = re.compile(' *[0-9]+ +-{0,1}[0-9]+')

34
35
class FSSPHeader(object):
    """Holds the values read from the header lines of an FSSP file."""

    def __init__(self):
        """Start with every header field at its empty default."""
        self.database = None
        self.pdbid = ''
        self.header = ''
        self.compnd = ''
        self.source = ''
        self.author = []
        self.seqlength = 0
        self.nalign = 0

    def fill_header(self, inline):
        """Store the value carried by one header line, if it is recognised.

        The line is tested against every pattern in ``header_records``;
        for each pattern that matches, the attribute of the same name is
        set.  Lines that match no pattern are ignored.

        - database, seqlength, nalign: second whitespace-separated token,
          converted to int.
        - compnd, author: list of all tokens after the first.
        - source, header: everything after the first space, stripped.
        - anything else (pdbid): second whitespace-separated token.
        """
        for field, pattern in header_records.items():
            if not pattern.match(inline):
                continue
            if field in ('database', 'seqlength', 'nalign'):
                value = int(inline.split()[1])
            elif field in ('compnd', 'author'):
                value = inline.split()[1:]
            elif field in ('source', 'header'):
                value = inline[inline.find(' ') + 1:].strip()
            else:
                value = inline.split()[1]
            setattr(self, field, value)

59
60
class PosAlign(object):
    """A single aligned position: residue letter plus secondary structure.

    Attributes set by the constructor:
        aa:  residue letter; '-' for a gap.
        gap: 1 when the position is a gap ('..'), otherwise 0.
        ss:  upper-cased second character of the token, or '0' when the
             token has only one character (not set for gaps).
    """

    def __init__(self, inStr):
        """Parse a one- or two-character alignment token.

        Raises:
            ValueError: if the stripped token is not 1 or 2 characters long.
        """
        token = inStr.strip()
        if len(token) not in (1, 2):
            raise ValueError('PosAlign: length not 2 chars' + token)

        if token == '..':
            self.aa, self.gap = '-', 1
            return

        self.gap = 0
        residue = token[0]
        # A first character that equals its own lower-case form is
        # recorded as 'C'.
        self.aa = 'C' if residue == residue.lower() else residue
        self.ss = token[1].upper() if len(token) == 2 else '0'

    def __repr__(self):
        """Return '..' for a gap, else residue + lower-cased structure code."""
        return '..' if self.gap else self.aa + self.ss.lower()

    __str__ = __repr__

87
88
class FSSPSumRec(object):
    """Contains info from an FSSP summary record.

    The record is one whitespace-separated line.  Field 0 is the record
    number followed by one trailing character, fields 1 and 2 are PDB
    identifiers (4 characters, optionally followed by a 1-character
    chain id), fields 3-11 are the numeric/topology columns, and any
    remaining fields form the free-text description.
    """

    # (attribute name, field index, converter) for the fixed columns that
    # follow the two PDB identifiers.
    _COLUMNS = (
        ('zscore', 3, float),
        ('rmsd', 4, float),
        ('lali', 5, float),
        ('lseq2', 6, float),
        ('pID', 7, float),
        ('revers', 8, int),
        ('permut', 9, int),
        ('nfrag', 10, int),
    )

    @staticmethod
    def _split_id(token, error_message):
        """Return (pdb code, chain id) for a 4- or 5-character identifier.

        A 4-character identifier has no chain and gets chain id '0'.
        Raises ValueError(error_message) for any other length.
        """
        if len(token) == 4:
            return token[:4], '0'
        if len(token) == 5:
            return token[:4], token[4]
        raise ValueError(error_message)

    def __init__(self, in_str):
        """Parse one summary line; the unmodified line is kept in ``raw``.

        Raises:
            ValueError: if a PDB identifier is not 4 or 5 characters long,
                or a numeric column cannot be converted.
            IndexError: if the line has fewer than 12 fields.
        """
        self.raw = in_str
        fields = in_str.strip().split()
        # Drop the single character that trails the record number.
        self.nr = int(fields[0][:-1])
        self.pdb1, self.chain1 = self._split_id(fields[1], 'Bad PDB ID 1')
        self.pdb2, self.chain2 = self._split_id(fields[2], 'Bad PDB ID 2')
        for attr, index, convert in self._COLUMNS:
            setattr(self, attr, convert(fields[index]))
        self.topo = fields[11]
        # Remaining tokens, single-space separated, newline terminated.
        self.doc = ' '.join(fields[12:]) + '\n'

    def __repr__(self):
        # NOTE(review): the body of __repr__ is missing from the listing
        # this class was restored from; returning the stored raw line is
        # assumed -- confirm against the original module.
        return self.raw

    __str__ = __repr__

127
128
def __init__(self, in_fff_rec):
    """Build an alignment record from one fixed-format alignment line.

    ``in_fff_rec`` is indexed with the column descriptors found on
    ``fssp_rec.align``.  # assumes each lookup yields a string slice of
    the line -- TODO confirm against fssp_rec.fff_rec
    """
    columns = fssp_rec.align
    self.abs_res_num = int(in_fff_rec[columns.abs_res_num])
    self.pdb_res_num = in_fff_rec[columns.pdb_res_num].strip()

    # A blank chain column is stored as chain '0'.
    chain = in_fff_rec[columns.chain_id]
    self.chain_id = '0' if chain == ' ' else chain

    # A residue name that equals its own lower-case form is stored as 'C'.
    residue = in_fff_rec[columns.res_name]
    self.res_name = 'C' if residue == residue.lower() else residue

    self.ss1 = in_fff_rec[columns.ss1]
    self.turn3 = in_fff_rec[columns.turn3]
    self.turn4 = in_fff_rec[columns.turn4]
    self.turn5 = in_fff_rec[columns.turn5]

    # Filled in later, once the aligned positions have been read.
    self.pos_align_dict = {}
    self.PosAlignList = []

146
150
156
157
def __init__(self):
    """Create an empty alignment dictionary with its two lookup tables.

    ``pdb_res_dict`` maps a PDB residue number to a key of this
    dictionary and ``abs_res_dict`` maps an absolute residue number to a
    key of this dictionary; both start empty.
    """
    self.data = {}
    # Index tables pointing at records held in self.
    self.abs_res_dict = {}   # absolute_residue_number: self_key
    self.pdb_res_dict = {}   # pdb_residue_number: self_key

166
def build_resnum_list(self):
    """Fill both residue-number lookup tables from the stored records.

    For every key/record pair held in self, the record's ``abs_res_num``
    and ``pdb_res_num`` are each mapped back to that key.
    """
    for key, record in self.items():
        self.abs_res_dict[record.abs_res_num] = key
        self.pdb_res_dict[record.pdb_res_num] = key

171
172 # Given an absolute residue number & chain, returns the relevant fssp
173 # record
176
177 # Given a PDB residue number & chain, returns the relevant fssp
178 # record
181
def sequence(self, num):
    """Return the residues of aligned sequence ``num`` as one string.

    The records are visited in ascending absolute-residue-number order
    (the keys of ``self.abs_res_dict``); for each one the ``aa`` of entry
    ``num`` in its ``pos_align_dict`` is appended.

    Raises:
        KeyError: if some record has no entry ``num``.
    """
    # sorted() rather than keys() + list.sort(): a Python 3 dict view has
    # no sort() method, while sorted() behaves the same on 2 and 3.
    return ''.join(self.abs(pos).pos_align_dict[num].aa
                   for pos in sorted(self.abs_res_dict))

190
def fasta_mult_align(self):
    """Return the multiple alignment as FASTA-style text.

    One entry is written per aligned sequence number, in ascending
    order, as a ``'> <number>'`` title line followed by that sequence's
    residues.  The set of sequence numbers is taken from the record at
    absolute residue number 1.

    Records are visited in ascending absolute-residue-number order (the
    same order the single-sequence accessor uses) so the residues come
    out in chain order instead of depending on dictionary iteration
    order.

    A newline is inserted before every 72nd residue, so the first line
    of a long sequence holds 71 residues and the following ones 72; this
    layout is kept as-is for output compatibility.

    Raises:
        KeyError: if there is no record for absolute residue number 1, or
            a record refers to a sequence number that the first record
            does not have.
    """
    columns = {}
    for seq_num in self.abs(1).pos_align_dict:
        columns[seq_num] = []

    for res_num in sorted(self.abs_res_dict):
        aligned = self.abs(res_num).pos_align_dict
        for seq_num in aligned:
            columns[seq_num].append(aligned[seq_num].aa)

    pieces = []
    for seq_num in sorted(columns):
        pieces.append('> %d\n' % seq_num)
        for count, residue in enumerate(columns[seq_num], 1):
            if count % 72 == 0:
                pieces.append('\n')
            pieces.append(residue)
        pieces.append('\n')
    return ''.join(pieces)

211
212
215
216
#
# Process a fssp file into its constituents. Return a 3-tuple containing
# the header, a dictionary of FSSPSumRecs and a dictionary of alignment
# records.
#
def read_fssp(fssp_handle):
    """Parse an FSSP file into header, summary and alignment records.

    Args:
        fssp_handle: object whose ``readline()`` returns the next line of
            the file, and an empty string once the end is reached.

    Returns:
        A tuple ``(header, sum_dict, align_dict)``:
        ``header`` is an FSSPHeader filled from the lines preceding the
        SUMMARY title; ``sum_dict`` is an FSSPSumDict mapping each
        summary record's ``nr`` to its FSSPSumRec; ``align_dict`` is an
        FSSPAlignDict keyed by chain id + residue name + PDB residue
        number.

    Raises:
        ValueError: if the file ends before a SUMMARY title is found, or
            ends while looking for the next ALIGNMENTS / EQUIVALENCES
            title.
        EOFError: if the file ends inside a run of alignment records.
    """
    header = FSSPHeader()
    sum_dict = FSSPSumDict()
    align_dict = FSSPAlignDict()

    # Header: every line up to the SUMMARY title.
    curline = fssp_handle.readline()
    while not summary_title.match(curline):
        if not curline:
            # readline() keeps returning '' at end of file, so without
            # this check a file lacking the title would loop forever.
            raise ValueError('Bad FSSP file: no summary record found')
        header.fill_header(curline)
        curline = fssp_handle.readline()

    fssp_handle.readline()            # Read the title line, discard
    curline = fssp_handle.readline()  # Read the next line
    # Process the summary records into a dictionary keyed by record number
    while summary_rec.match(curline):
        cur_sum_rec = FSSPSumRec(curline)
        sum_dict[cur_sum_rec.nr] = cur_sum_rec
        curline = fssp_handle.readline()

    # Process every ALIGNMENTS block up to the EQUIVALENCES title record
    while True:
        # Skip ahead to the next section title of either kind.
        while not (alignments_title.match(curline) or
                   equiv_title.match(curline)):
            if not curline:
                # Same end-of-file guard as above.
                raise ValueError(
                    'Bad FSSP file: no alignments title record found')
            curline = fssp_handle.readline()
        if equiv_title.match(curline):
            break

        # An alignments title was matched: parse its records.
        fssp_handle.readline()            # Read the title line, discard
        curline = fssp_handle.readline()  # Read the next line
        while alignments_rec.match(curline):
            align_rec = FSSPAlignRec(fff_rec(curline))
            key = (align_rec.chain_id + align_rec.res_name +
                   str(align_rec.pdb_res_num))
            align_list = curline[fssp_rec.align.start_aa_list:].strip().split()
            # The same residue shows up once per ALIGNMENTS block; only
            # the first occurrence creates the record, later ones extend it.
            if key not in align_dict:
                align_dict[key] = align_rec
            align_dict[key].add_align_list(align_list)
            curline = fssp_handle.readline()
            if not curline:
                raise EOFError(
                    'Bad FSSP file: end of file inside alignment records')

    # Turn each record's position list into its dictionary form; the list
    # is not needed afterwards.
    for record in align_dict.values():
        record.pos_align_list2dict()
        del record.PosAlignList
    align_dict.build_resnum_list()
    return (header, sum_dict, align_dict)

276
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Tue Feb 5 18:03:24 2013 | http://epydoc.sourceforge.net |