Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. 
 10   
 11  The DSSP codes for secondary structure used here are: 
 12   
 13      - H        Alpha helix (4-12) 
 14      - B        Isolated beta-bridge residue 
 15      - E        Strand 
 16      - G        3-10 helix 
 17      - I        pi helix 
 18      - T        Turn 
 19      - S        Bend 
 20      - -        None 
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  __docformat__ = "restructuredtext en" 
 26   
 27  import re 
 28  from Bio._py3k import StringIO 
 29  import subprocess 
 30   
 31  from Bio.Data import SCOPData 
 32   
 33  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 34  from Bio.PDB.PDBExceptions import PDBException 
 35  from Bio.PDB.PDBParser import PDBParser 
 36   
 37   
 38  # Match C in DSSP 
 39  _dssp_cys=re.compile('[a-z]') 
 40   
 41  # Maximal ASA of amino acids 
 42  # Values from Sander & Rost, (1994), Proteins, 20:216-226 
 43  # Used for relative accessibility 
 44  MAX_ACC={} 
 45  MAX_ACC["ALA"]=106.0 
 46  MAX_ACC["CYS"]=135.0 
 47  MAX_ACC["ASP"]=163.0 
 48  MAX_ACC["GLU"]=194.0 
 49  MAX_ACC["PHE"]=197.0 
 50  MAX_ACC["GLY"]=84.0 
 51  MAX_ACC["HIS"]=184.0 
 52  MAX_ACC["ILE"]=169.0 
 53  MAX_ACC["LYS"]=205.0 
 54  MAX_ACC["LEU"]=164.0 
 55  MAX_ACC["MET"]=188.0 
 56  MAX_ACC["ASN"]=157.0 
 57  MAX_ACC["PRO"]=136.0 
 58  MAX_ACC["GLN"]=198.0 
 59  MAX_ACC["ARG"]=248.0 
 60  MAX_ACC["SER"]=130.0 
 61  MAX_ACC["THR"]=142.0 
 62  MAX_ACC["VAL"]=142.0 
 63  MAX_ACC["TRP"]=227.0 
 64  MAX_ACC["TYR"]=222.0 
 65   
 66   
67 -def ss_to_index(ss):
68 """ 69 Secondary structure symbol to index. 70 H=0 71 E=1 72 C=2 73 """ 74 if ss=='H': 75 return 0 76 if ss=='E': 77 return 1 78 if ss=='C': 79 return 2 80 assert 0
81 82
83 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
84 """ 85 Create a DSSP dictionary from a PDB file. 86 87 Example: 88 -------- 89 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 90 >>> aa, ss, acc=dssp_dict[('A', 1)] 91 92 :: 93 94 @param in_file: pdb file 95 @type in_file: string :: 96 97 @param DSSP: DSSP executable (argument to os.system) 98 @type DSSP: string :: 99 100 @return: a dictionary that maps (chainid, resid) to 101 amino acid type, secondary structure code and 102 accessibility. 103 @rtype: {} 104 """ 105 # Using universal newlines is important on Python 3, this 106 # gives unicode handles rather than bytes handles. 107 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 108 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 109 out, err = p.communicate() 110 out_dict, keys = _make_dssp_dict(StringIO(out)) 111 return out_dict, keys
112 113
114 -def make_dssp_dict(filename):
115 """ 116 Return a DSSP dictionary that maps (chainid, resid) to 117 aa, ss and accessibility, from a DSSP file. :: 118 119 @param filename: the DSSP output file 120 @type filename: string 121 """ 122 with open(filename, "r") as handle: 123 return _make_dssp_dict(handle)
124 125
126 -def _make_dssp_dict(handle):
127 """ 128 Return a DSSP dictionary that maps (chainid, resid) to 129 aa, ss and accessibility, from an open DSSP file object. :: 130 131 @param handle: the open DSSP output file handle 132 @type handle: file 133 """ 134 dssp = {} 135 start = 0 136 keys = [] 137 for l in handle.readlines(): 138 sl = l.split() 139 if len(sl) < 2: 140 continue 141 if sl[1] == "RESIDUE": 142 # Start parsing from here 143 start = 1 144 continue 145 if not start: 146 continue 147 if l[9] == " ": 148 # Skip -- missing residue 149 continue 150 resseq = int(l[5:10]) 151 icode = l[10] 152 chainid = l[11] 153 aa = l[13] 154 ss = l[16] 155 if ss == " ": 156 ss = "-" 157 try: 158 acc = int(l[34:38]) 159 phi = float(l[103:109]) 160 psi = float(l[109:115]) 161 except ValueError as exc: 162 # DSSP output breaks its own format when there are >9999 163 # residues, since only 4 digits are allocated to the seq num 164 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 165 # Here, look for whitespace to figure out the number of extra 166 # digits, and shift parsing the rest of the line by that amount. 167 if l[34] != ' ': 168 shift = l[34:].find(' ') 169 acc = int((l[34+shift:38+shift])) 170 phi = float(l[103+shift:109+shift]) 171 psi = float(l[109+shift:115+shift]) 172 else: 173 raise ValueError(exc) 174 res_id = (" ", resseq, icode) 175 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi) 176 keys.append((chainid, res_id)) 177 return dssp, keys
178 179
180 -class DSSP(AbstractResiduePropertyMap):
181 """ 182 Run DSSP on a pdb file, and provide a handle to the 183 DSSP secondary structure and accessibility. 184 185 **Note** that DSSP can only handle one model. 186 187 Example: 188 -------- 189 190 >>> p = PDBParser() 191 >>> structure = p.get_structure("1MOT", "1MOT.pdb") 192 >>> model = structure[0] 193 >>> dssp = DSSP(model, "1MOT.pdb") 194 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 195 >>> a_key = list(dssp)[2] 196 >>> # residue object, secondary structure, solvent accessibility, 197 >>> # relative accessiblity, phi, psi 198 >>> dssp[a_key] 199 (<Residue ALA het= resseq=251 icode= >, 200 'H', 201 72, 202 0.67924528301886788, 203 -61.200000000000003, 204 -42.399999999999999) 205 """ 206
207 - def __init__(self, model, pdb_file, dssp="dssp"):
208 """ 209 :: 210 211 @param model: the first model of the structure 212 @type model: L{Model} :: 213 214 @param pdb_file: a PDB file 215 @type pdb_file: string :: 216 217 @param dssp: the dssp executable (ie. the argument to os.system) 218 @type dssp: string 219 """ 220 # create DSSP dictionary 221 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp) 222 dssp_map = {} 223 dssp_list = [] 224 225 def resid2code(res_id): 226 """Serialize a residue's resseq and icode for easy comparison.""" 227 return '%s%s' % (res_id[1], res_id[2])
228 229 # Now create a dictionary that maps Residue objects to 230 # secondary structure and accessibility, and a list of 231 # (residue, (secondary structure, accessibility)) tuples 232 for key in dssp_keys: 233 chain_id, res_id = key 234 chain = model[chain_id] 235 try: 236 res = chain[res_id] 237 except KeyError: 238 # In DSSP, HET field is not considered in residue identifier. 239 # Thus HETATM records may cause unnecessary exceptions. 240 # (See 3jui chain A res 593.) 241 # Try the lookup again with all HETATM other than water 242 res_seq_icode = resid2code(res_id) 243 for r in chain: 244 if r.id[0] not in (' ', 'W'): 245 # Compare resseq + icode 246 if resid2code(r.id) == res_seq_icode: 247 # Found a matching residue 248 res = r 249 break 250 else: 251 raise KeyError(res_id) 252 253 # For disordered residues of point mutations, BioPython uses the 254 # last one as default, But DSSP takes the first one (alternative 255 # location is blank, A or 1). See 1h9h chain E resi 22. 256 # Here we select the res in which all atoms have altloc blank, A or 257 # 1. If no such residues are found, simply use the first one appears 258 # (as DSSP does). 259 if res.is_disordered() == 2: 260 for rk in res.disordered_get_id_list(): 261 # All atoms in the disordered residue should have the same 262 # altloc, so it suffices to check the altloc of the first 263 # atom. 264 altloc = res.child_dict[rk].get_list()[0].get_altloc() 265 if altloc in tuple('A1 '): 266 res.disordered_select(rk) 267 break 268 else: 269 # Simply select the first one 270 res.disordered_select(res.disordered_get_id_list()[0]) 271 272 # Sometimes point mutations are put into HETATM and ATOM with altloc 273 # 'A' and 'B'. 274 # See 3piu chain A residue 273: 275 # <Residue LLP het=H_LLP resseq=273 icode= > 276 # <Residue LYS het= resseq=273 icode= > 277 # DSSP uses the HETATM LLP as it has altloc 'A' 278 # We check the altloc code here. 279 elif res.is_disordered() == 1: 280 # Check altloc of all atoms in the DisorderedResidue. If it 281 # contains blank, A or 1, then use it. Otherwise, look for HET 282 # residues of the same seq+icode. If not such HET residues are 283 # found, just accept the current one. 284 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 285 if altlocs.isdisjoint('A1 '): 286 # Try again with all HETATM other than water 287 res_seq_icode = resid2code(res_id) 288 for r in chain: 289 if r.id[0] not in (' ', 'W'): 290 if resid2code(r.id) == res_seq_icode and \ 291 r.get_list()[0].get_altloc() in tuple('A1 '): 292 res = r 293 break 294 295 aa, ss, acc, phi, psi = dssp_dict[key] 296 res.xtra["SS_DSSP"] = ss 297 res.xtra["EXP_DSSP_ASA"] = acc 298 res.xtra["PHI_DSSP"] = phi 299 res.xtra["PSI_DSSP"] = psi 300 # Relative accessibility 301 resname = res.get_resname() 302 try: 303 rel_acc = acc/MAX_ACC[resname] 304 except KeyError: 305 # Invalid value for resname 306 rel_acc = 'NA' 307 else: 308 if rel_acc > 1.0: 309 rel_acc = 1.0 310 res.xtra["EXP_DSSP_RASA"] = rel_acc 311 # Verify if AA in DSSP == AA in Structure 312 # Something went wrong if this is not true! 313 # NB: DSSP uses X often 314 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 315 if resname == "C": 316 # DSSP renames C in C-bridges to a,b,c,d,... 317 # - we rename it back to 'C' 318 if _dssp_cys.match(aa): 319 aa = 'C' 320 # Take care of HETATM again 321 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 322 raise PDBException("Structure/DSSP mismatch at %s" % res) 323 dssp_map[key] = ((res, ss, acc, rel_acc, phi, psi)) 324 dssp_list.append((res, ss, acc, rel_acc, phi, psi)) 325 326 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 327 dssp_list)
328 329 330 if __name__ == "__main__": 331 import sys 332 333 p = PDBParser() 334 s = p.get_structure('X', sys.argv[1]) 335 model = s[0] 336 d = DSSP(model, sys.argv[1]) 337 338 for r in d: 339 print(r) 340 print("Handled %i residues" % len(d)) 341 print(sorted(d)) 342 if ('A', 1) in d: 343 print(d[('A', 1)]) 344 print(s[0]['A'][1].xtra) 345 # Secondary structure 346 print(''.join(item[1] for item in d)) 347