1
2
3
4
5 from __future__ import with_statement
6
7 import collections
8 import warnings
9
10 from Bio.Alphabet import generic_protein
11 from Bio.Seq import Seq
12 from Bio.SeqRecord import SeqRecord
13
14
16 """Returns SeqRecord objects for each chain in a PDB file.
17
18 The sequences are derived from the SEQRES lines in the
19 PDB file header, not the atoms of the 3D structure.
20
21 Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES
22
23 See: http://www.wwpdb.org/documentation/format23/sect3.html
24 """
25
26
27 from Bio.SCOP.three_to_one_dict import to_one_letter_code
28
29 chains = collections.defaultdict(list)
30 metadata = collections.defaultdict(list)
31 for line in handle:
32 rec_name = line[0:6].strip()
33 if rec_name == 'SEQRES':
34
35
36
37
38
39
40
41
42 chn_id = line[11]
43
44
45 residues = [to_one_letter_code.get(res, 'X')
46 for res in line[19:].split()]
47 chains[chn_id].extend(residues)
48 elif rec_name == 'DBREF':
49
50 pdb_id = line[7:11]
51
52 chn_id = line[12]
53
54
55
56
57
58
59
60
61
62 database = line[26:32].strip()
63
64 db_acc = line[33:41].strip()
65
66 db_id_code = line[42:54].strip()
67
68
69
70
71
72
73
74
75
76
77 metadata[chn_id].append({'pdb_id': pdb_id, 'database': database,
78 'db_acc': db_acc, 'db_id_code': db_id_code})
79
80
81 for chn_id, residues in sorted(chains.iteritems()):
82 record = SeqRecord(Seq(''.join(residues), generic_protein))
83 record.annotations = {"chain": chn_id}
84 if chn_id in metadata:
85 m = metadata[chn_id][0]
86 record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
87 record.description = ("%s:%s %s" % (m['database'],
88 m['db_acc'],
89 m['db_id_code']))
90 for melem in metadata[chn_id]:
91 record.dbxrefs.extend([
92 "%s:%s" % (melem['database'], melem['db_acc']),
93 "%s:%s" % (melem['database'], melem['db_id_code'])])
94 else:
95 record.id = chn_id
96 yield record
97
98
100 """Returns SeqRecord objects for each chain in a PDB file
101
102 The sequences are derived from the 3D structure (ATOM records), not the
103 SEQRES lines in the PDB file header.
104
105 Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
106 are converted to "X" in the sequence.
107
108 In addition to information from the PDB header (which is the same for all
109 records), the following chain specific information is placed in the
110 annotation:
111
112 record.annotations["residues"] = List of residue ID strings
113 record.annotations["chain"] = Chain ID (typically A, B ,...)
114 record.annotations["model"] = Model ID (typically zero)
115
116 Where amino acids are missing from the structure, as indicated by residue
117 numbering, the sequence is filled in with 'X' characters to match the size
118 of the missing region, and None is included as the corresponding entry in
119 the list record.annotations["residues"].
120
121 This function uses the Bio.PDB module to do most of the hard work. The
122 annotation information could be improved but this extra parsing should be
123 done in parse_pdb_header, not this module.
124 """
125
126 from Bio.PDB import PDBParser
127 from Bio.SCOP.three_to_one_dict import to_one_letter_code
128
129 def restype(residue):
130 """Return a residue's type as a one-letter code.
131
132 Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
133 """
134 return to_one_letter_code.get(residue.resname, 'X')
135
136
137
138 from Bio.File import UndoHandle
139 undo_handle = UndoHandle(handle)
140 firstline = undo_handle.peekline()
141 if firstline.startswith("HEADER"):
142 pdb_id = firstline[62:66]
143 else:
144 warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
145 pdb_id = '????'
146
147 struct = PDBParser().get_structure(pdb_id, undo_handle)
148 model = struct[0]
149 for chn_id, chain in sorted(model.child_dict.iteritems()):
150
151 residues = [res for res in chain.get_unpacked_list()
152 if res.get_resname().upper() in to_one_letter_code]
153 if not residues:
154 continue
155
156
157 gaps = []
158 rnumbers = [r.id[1] for r in residues]
159 for i, rnum in enumerate(rnumbers[:-1]):
160 if rnumbers[i+1] != rnum + 1:
161
162 gaps.append((i+1, rnum, rnumbers[i+1]))
163 if gaps:
164 res_out = []
165 prev_idx = 0
166 for i, pregap, postgap in gaps:
167 if postgap > pregap:
168 gapsize = postgap - pregap - 1
169 res_out.extend(map(restype, residues[prev_idx:i]))
170 prev_idx = i
171 res_out.append('X'*gapsize)
172
173 res_out.extend(map(restype, residues[prev_idx:]))
174 else:
175 warnings.warn("Ignoring out-of-order residues after a gap",
176 UserWarning)
177
178
179 res_out.extend(map(restype, residues[prev_idx:i]))
180 else:
181
182 res_out = map(restype, residues)
183 record_id = "%s:%s" % (pdb_id, chn_id)
184
185
186
187
188
189 record = SeqRecord(Seq(''.join(res_out), generic_protein),
190 id=record_id,
191 description=record_id,
192 )
193
194
195 record.annotations = struct.header.copy()
196
197 record.annotations["model"] = model.id
198 record.annotations["chain"] = chain.id
199
200
201 record.annotations["start"] = int(rnumbers[0])
202 record.annotations["end"] = int(rnumbers[-1])
203
204
205
206 yield record
207
208
209 if __name__ == '__main__':
210
211 import sys
212 from Bio import SeqIO
213 for fname in sys.argv[1:]:
214 for parser in (PdbSeqresIterator, PdbAtomIterator):
215 with open(fname) as handle:
216 records = parser(handle)
217 SeqIO.write(records, sys.stdout, 'fasta')
218