Package Bio :: Package PDB :: Module PDBParser'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBParser'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for PDB files.""" 
  7   
  8  from __future__ import print_function 
  9   
 10  import warnings 
 11   
 12  try: 
 13      import numpy 
 14  except: 
 15      from Bio import MissingPythonDependencyError 
 16      raise MissingPythonDependencyError( 
 17          "Install NumPy if you want to use the PDB parser.") 
 18   
 19  from Bio.File import as_handle 
 20   
 21  from Bio.PDB.PDBExceptions import PDBConstructionException 
 22  from Bio.PDB.PDBExceptions import PDBConstructionWarning 
 23   
 24  from Bio.PDB.StructureBuilder import StructureBuilder 
 25  from Bio.PDB.parse_pdb_header import _parse_pdb_header_list 
 26   
 27   
 28  # If PDB spec says "COLUMNS 18-20" this means line[17:20] 
 29   
 30   
31 -class PDBParser(object):
32 """ 33 Parse a PDB file and return a Structure object. 34 """ 35
36 - def __init__(self, PERMISSIVE=True, get_header=False, 37 structure_builder=None, QUIET=False):
38 """ 39 The PDB parser call a number of standard methods in an aggregated 40 StructureBuilder object. Normally this object is instanciated by the 41 PDBParser object itself, but if the user provides his/her own 42 StructureBuilder object, the latter is used instead. 43 44 Arguments: 45 46 o PERMISSIVE - Evaluated as a Boolean. If false, exceptions in 47 constructing the SMCRA data structure are fatal. If true (DEFAULT), 48 the exceptions are caught, but some residues or atoms will be missing. 49 THESE EXCEPTIONS ARE DUE TO PROBLEMS IN THE PDB FILE!. 50 51 o structure_builder - an optional user implemented StructureBuilder class. 52 53 o QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 54 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 55 These warnings might be indicative of problems in the PDB file! 56 """ 57 if structure_builder is not None: 58 self.structure_builder = structure_builder 59 else: 60 self.structure_builder = StructureBuilder() 61 self.header = None 62 self.trailer = None 63 self.line_counter = 0 64 self.PERMISSIVE = bool(PERMISSIVE) 65 self.QUIET = bool(QUIET)
66 67 # Public methods 68
69 - def get_structure(self, id, file):
70 """Return the structure. 71 72 Arguments: 73 o id - string, the id that will be used for the structure 74 o file - name of the PDB file OR an open filehandle 75 """ 76 with warnings.catch_warnings(): 77 if self.QUIET: 78 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 79 80 self.header = None 81 self.trailer = None 82 # Make a StructureBuilder instance (pass id of structure as parameter) 83 self.structure_builder.init_structure(id) 84 85 with as_handle(file) as handle: 86 self._parse(handle.readlines()) 87 88 self.structure_builder.set_header(self.header) 89 # Return the Structure instance 90 structure = self.structure_builder.get_structure() 91 92 return structure
93
94 - def get_header(self):
95 "Return the header." 96 return self.header
97
98 - def get_trailer(self):
99 "Return the trailer." 100 return self.trailer
101 102 # Private methods 103
104 - def _parse(self, header_coords_trailer):
105 "Parse the PDB file." 106 # Extract the header; return the rest of the file 107 self.header, coords_trailer = self._get_header(header_coords_trailer) 108 # Parse the atomic data; return the PDB file trailer 109 self.trailer = self._parse_coordinates(coords_trailer)
110
111 - def _get_header(self, header_coords_trailer):
112 "Get the header of the PDB file, return the rest." 113 structure_builder = self.structure_builder 114 i = 0 115 for i in range(0, len(header_coords_trailer)): 116 structure_builder.set_line_counter(i + 1) 117 line = header_coords_trailer[i] 118 record_type = line[0:6] 119 if record_type == "ATOM " or record_type == "HETATM" or record_type == "MODEL ": 120 break 121 header = header_coords_trailer[0:i] 122 # Return the rest of the coords+trailer for further processing 123 self.line_counter = i 124 coords_trailer = header_coords_trailer[i:] 125 header_dict = _parse_pdb_header_list(header) 126 return header_dict, coords_trailer
127
128 - def _parse_coordinates(self, coords_trailer):
129 "Parse the atomic data in the PDB file." 130 local_line_counter = 0 131 structure_builder = self.structure_builder 132 current_model_id = 0 133 # Flag we have an open model 134 model_open = 0 135 current_chain_id = None 136 current_segid = None 137 current_residue_id = None 138 current_resname = None 139 for i in range(0, len(coords_trailer)): 140 line = coords_trailer[i] 141 record_type = line[0:6] 142 global_line_counter = self.line_counter + local_line_counter + 1 143 structure_builder.set_line_counter(global_line_counter) 144 if record_type == "ATOM " or record_type == "HETATM": 145 # Initialize the Model - there was no explicit MODEL record 146 if not model_open: 147 structure_builder.init_model(current_model_id) 148 current_model_id += 1 149 model_open = 1 150 fullname = line[12:16] 151 # get rid of whitespace in atom names 152 split_list = fullname.split() 153 if len(split_list) != 1: 154 # atom name has internal spaces, e.g. " N B ", so 155 # we do not strip spaces 156 name = fullname 157 else: 158 # atom name is like " CA ", so we can strip spaces 159 name = split_list[0] 160 altloc = line[16] 161 resname = line[17:20] 162 chainid = line[21] 163 try: 164 serial_number = int(line[6:11]) 165 except: 166 serial_number = 0 167 resseq = int(line[22:26].split()[0]) # sequence identifier 168 icode = line[26] # insertion code 169 if record_type == "HETATM": # hetero atom flag 170 if resname == "HOH" or resname == "WAT": 171 hetero_flag = "W" 172 else: 173 hetero_flag = "H" 174 else: 175 hetero_flag = " " 176 residue_id = (hetero_flag, resseq, icode) 177 # atomic coordinates 178 try: 179 x = float(line[30:38]) 180 y = float(line[38:46]) 181 z = float(line[46:54]) 182 except: 183 # Should we allow parsing to continue in permissive mode? 184 # If so, what coordinates should we default to? Easier to abort! 185 raise PDBConstructionException("Invalid or missing coordinate(s) at line %i." 186 % global_line_counter) 187 coord = numpy.array((x, y, z), "f") 188 # occupancy & B factor 189 try: 190 occupancy = float(line[54:60]) 191 except: 192 self._handle_PDB_exception("Invalid or missing occupancy", 193 global_line_counter) 194 occupancy = None # Rather than arbitrary zero or one 195 try: 196 bfactor = float(line[60:66]) 197 except: 198 self._handle_PDB_exception("Invalid or missing B factor", 199 global_line_counter) 200 bfactor = 0.0 # The PDB use a default of zero if the data is missing 201 segid = line[72:76] 202 element = line[76:78].strip() 203 if current_segid != segid: 204 current_segid = segid 205 structure_builder.init_seg(current_segid) 206 if current_chain_id != chainid: 207 current_chain_id = chainid 208 structure_builder.init_chain(current_chain_id) 209 current_residue_id = residue_id 210 current_resname = resname 211 try: 212 structure_builder.init_residue(resname, hetero_flag, resseq, icode) 213 except PDBConstructionException as message: 214 self._handle_PDB_exception(message, global_line_counter) 215 elif current_residue_id != residue_id or current_resname != resname: 216 current_residue_id = residue_id 217 current_resname = resname 218 try: 219 structure_builder.init_residue(resname, hetero_flag, resseq, icode) 220 except PDBConstructionException as message: 221 self._handle_PDB_exception(message, global_line_counter) 222 # init atom 223 try: 224 structure_builder.init_atom(name, coord, bfactor, occupancy, altloc, 225 fullname, serial_number, element) 226 except PDBConstructionException as message: 227 self._handle_PDB_exception(message, global_line_counter) 228 elif record_type == "ANISOU": 229 anisou = [float(x) for x in (line[28:35], line[35:42], line[43:49], 230 line[49:56], line[56:63], line[63:70])] 231 # U's are scaled by 10^4 232 anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f") 233 structure_builder.set_anisou(anisou_array) 234 elif record_type == "MODEL ": 235 try: 236 serial_num = int(line[10:14]) 237 except: 238 self._handle_PDB_exception("Invalid or missing model serial number", 239 global_line_counter) 240 serial_num = 0 241 structure_builder.init_model(current_model_id, serial_num) 242 current_model_id += 1 243 model_open = 1 244 current_chain_id = None 245 current_residue_id = None 246 elif record_type == "END " or record_type == "CONECT": 247 # End of atomic data, return the trailer 248 self.line_counter += local_line_counter 249 return coords_trailer[local_line_counter:] 250 elif record_type == "ENDMDL": 251 model_open = 0 252 current_chain_id = None 253 current_residue_id = None 254 elif record_type == "SIGUIJ": 255 # standard deviation of anisotropic B factor 256 siguij = [float(x) for x in (line[28:35], line[35:42], line[42:49], 257 line[49:56], line[56:63], line[63:70])] 258 # U sigma's are scaled by 10^4 259 siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f") 260 structure_builder.set_siguij(siguij_array) 261 elif record_type == "SIGATM": 262 # standard deviation of atomic positions 263 sigatm = [float(x) for x in (line[30:38], line[38:45], line[46:54], 264 line[54:60], line[60:66])] 265 sigatm_array = numpy.array(sigatm, "f") 266 structure_builder.set_sigatm(sigatm_array) 267 local_line_counter += 1 268 # EOF (does not end in END or CONECT) 269 self.line_counter = self.line_counter + local_line_counter 270 return []
271
272 - def _handle_PDB_exception(self, message, line_counter):
273 """ 274 This method catches an exception that occurs in the StructureBuilder 275 object (if PERMISSIVE), or raises it again, this time adding the 276 PDB line number to the error message. 277 """ 278 message = "%s at line %i." % (message, line_counter) 279 if self.PERMISSIVE: 280 # just print a warning - some residues/atoms may be missing 281 warnings.warn("PDBConstructionException: %s\n" 282 "Exception ignored.\n" 283 "Some atoms or residues may be missing in the data structure." 284 % message, PDBConstructionWarning) 285 else: 286 # exceptions are fatal - raise again with new message (including line nr) 287 raise PDBConstructionException(message)
288 289 290 if __name__ == "__main__": 291 292 import sys 293 294 p = PDBParser(PERMISSIVE=True) 295 296 filename = sys.argv[1] 297 s = p.get_structure("scr", filename) 298 299 for m in s: 300 p = m.get_parent() 301 assert(p is s) 302 for c in m: 303 p = c.get_parent() 304 assert(p is m) 305 for r in c: 306 print(r) 307 p = r.get_parent() 308 assert(p is c) 309 for a in r: 310 p = a.get_parent() 311 if not p is r: 312 print("%s %s" % (p, r)) 313