Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to 
 12  the ATOM records (representing the atoms experimentally observed). 
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies. 
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (up to 4 digits) and an optional PDB insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  protein_letters_3to1 -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from __future__ import print_function 
 30  from Bio._py3k import basestring 
 31  from Bio._py3k import _universal_read_mode 
 32   
 33  from copy import copy 
 34   
 35  from Bio.Data.SCOPData import protein_letters_3to1 
 36   
 37  from Bio.SCOP.Residues import Residues 
 38   
 39  __docformat__ = "restructuredtext en" 
 40   
41 -def normalize_letters(one_letter_code):
42 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 43 44 Letters are uppercased, and "." ("Unknown") is converted to "X". 45 """ 46 if one_letter_code == '.': 47 return 'X' 48 else: 49 return one_letter_code.upper()
50 51
52 -class SeqMapIndex(dict):
53 """An RAF file index. 54 55 The RAF file itself is about 50 MB. This index provides rapid, random 56 access of RAF records without having to load the entire file into memory. 57 58 The index key is a concatenation of the PDB ID and chain ID. e.g 59 "2drcA", ``"155c_"``. RAF uses an underscore to indicate blank 60 chain IDs. 61 """ 62
63 - def __init__(self, filename):
64 """ 65 Arguments: 66 67 filename -- The file to index 68 """ 69 dict.__init__(self) 70 self.filename = filename 71 72 with open(self.filename, _universal_read_mode) as f: 73 position = 0 74 while True: 75 line = f.readline() 76 if not line: 77 break 78 key = line[0:5] 79 if key is not None: 80 self[key]=position 81 position = f.tell()
82
83 - def __getitem__(self, key):
84 """ Return an item from the indexed file. """ 85 position = dict.__getitem__(self, key) 86 87 with open(self.filename, _universal_read_mode) as f: 88 f.seek(position) 89 line = f.readline() 90 record = SeqMap(line) 91 return record
92
93 - def getSeqMap(self, residues):
94 """Get the sequence map for a collection of residues. 95 96 residues -- A Residues instance, or a string that can be converted into 97 a Residues instance. 98 """ 99 if isinstance(residues, basestring): 100 residues = Residues(residues) 101 102 pdbid = residues.pdbid 103 frags = residues.fragments 104 if not frags: 105 frags = (('_', '', ''),) # All residues of unnamed chain 106 107 seqMap = None 108 for frag in frags: 109 chainid = frag[0] 110 if chainid in ['', '-', ' ', '_']: 111 chainid = '_' 112 id = pdbid + chainid 113 114 sm = self[id] 115 116 # Cut out fragment of interest 117 start = 0 118 end = len(sm.res) 119 if frag[1]: 120 start = int(sm.index(frag[1], chainid)) 121 if frag[2]: 122 end = int(sm.index(frag[2], chainid)) + 1 123 124 sm = sm[start:end] 125 126 if seqMap is None: 127 seqMap = sm 128 else: 129 seqMap += sm 130 131 return seqMap
132 133
134 -class SeqMap(object):
135 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 136 137 This is a list like object; You can find the location of particular residues 138 with index(), slice this SeqMap into fragments, and glue fragments back 139 together with extend(). 140 141 pdbid -- The PDB 4 character ID 142 143 pdb_datestamp -- From the PDB file 144 145 version -- The RAF format version. e.g. 0.01 146 147 flags -- RAF flags. (See release notes for more information.) 148 149 res -- A list of Res objects, one for each residue in this sequence map 150 """ 151
152 - def __init__(self, line=None):
153 self.pdbid = '' 154 self.pdb_datestamp = '' 155 self.version = '' 156 self.flags = '' 157 self.res = [] 158 if line: 159 self._process(line)
160
161 - def _process(self, line):
162 """Parses a RAF record into a SeqMap object. 163 """ 164 header_len = 38 165 166 line = line.rstrip() # no trailing whitespace 167 168 if len(line) < header_len: 169 raise ValueError("Incomplete header: "+line) 170 171 self.pdbid = line[0:4] 172 chainid = line[4:5] 173 174 self.version = line[6:10] 175 176 # Raf format versions 0.01 and 0.02 are identical for practical purposes 177 if(self.version != "0.01" and self.version != "0.02"): 178 raise ValueError("Incompatible RAF version: " + self.version) 179 180 self.pdb_datestamp = line[14:20] 181 self.flags = line[21:27] 182 183 for i in range(header_len, len(line), 7): 184 f = line[i:i+7] 185 if len(f)!=7: 186 raise ValueError("Corrupt Field: ("+f+")") 187 r = Res() 188 r.chainid = chainid 189 r.resid = f[0:5].strip() 190 r.atom = normalize_letters(f[5:6]) 191 r.seqres = normalize_letters(f[6:7]) 192 193 self.res.append(r)
194
195 - def index(self, resid, chainid="_"):
196 for i in range(0, len(self.res)): 197 if self.res[i].resid == resid and self.res[i].chainid == chainid: 198 return i 199 raise KeyError("No such residue " + chainid + resid)
200
201 - def __getitem__(self, index):
202 if not isinstance(index, slice): 203 raise NotImplementedError 204 s = copy(self) 205 s.res = s.res[index] 206 return s
207
208 - def append(self, res):
209 """Append another Res object onto the list of residue mappings.""" 210 self.res.append(res)
211
212 - def extend(self, other):
213 """Append another SeqMap onto the end of self. 214 215 Both SeqMaps must have the same PDB ID, PDB datestamp and 216 RAF version. The RAF flags are erased if they are inconsistent. This 217 may happen when fragments are taken from different chains. 218 """ 219 if not isinstance(other, SeqMap): 220 raise TypeError("Can only extend a SeqMap with a SeqMap.") 221 if self.pdbid != other.pdbid: 222 raise TypeError("Cannot add fragments from different proteins") 223 if self.version != other.version: 224 raise TypeError("Incompatible rafs") 225 if self.pdb_datestamp != other.pdb_datestamp: 226 raise TypeError("Different pdb dates!") 227 if self.flags != other.flags: 228 self.flags = '' 229 self.res += other.res
230
231 - def __iadd__(self, other):
232 self.extend(other) 233 return self
234
235 - def __add__(self, other):
236 s = copy(self) 237 s.extend(other) 238 return s
239
240 - def getAtoms(self, pdb_handle, out_handle):
241 """Extract all relevant ATOM and HETATOM records from a PDB file. 242 243 The PDB file is scanned for ATOM and HETATOM records. If the 244 chain ID, residue ID (seqNum and iCode), and residue type match 245 a residue in this sequence map, then the record is echoed to the 246 output handle. 247 248 This is typically used to find the coordinates of a domain, or other 249 residue subset. 250 251 pdb_handle -- A handle to the relevant PDB file. 252 253 out_handle -- All output is written to this file like object. 254 """ 255 # This code should be refactored when (if?) biopython gets a PDB parser 256 257 # The set of residues that I have to find records for. 258 resSet = {} 259 for r in self.res: 260 if r.atom == 'X': # Unknown residue type 261 continue 262 chainid = r.chainid 263 if chainid == '_': 264 chainid = ' ' 265 resid = r.resid 266 resSet[(chainid, resid)] = r 267 268 resFound = {} 269 for line in pdb_handle: 270 if line.startswith("ATOM ") or line.startswith("HETATM"): 271 chainid = line[21:22] 272 resid = line[22:27].strip() 273 key = (chainid, resid) 274 if key in resSet: 275 res = resSet[key] 276 atom_aa = res.atom 277 resName = line[17:20] 278 if resName in protein_letters_3to1: 279 if protein_letters_3to1[resName] == atom_aa: 280 out_handle.write(line) 281 resFound[key] = res 282 283 if len(resSet) != len(resFound): 284 # for k in resFound: 285 # del resSet[k] 286 # print(resSet) 287 288 raise RuntimeError('I could not find at least one ATOM or HETATM' 289 +' record for each and every residue in this sequence map.')
290 291
292 -class Res(object):
293 """ A single residue mapping from a RAF record. 294 295 chainid -- A single character chain ID. 296 297 resid -- The residue ID. 298 299 atom -- amino acid one-letter code from ATOM records. 300 301 seqres -- amino acid one-letter code from SEQRES records. 302 """
303 - def __init__(self):
304 self.chainid = '' 305 self.resid = '' 306 self.atom = '' 307 self.seqres = ''
308 309
310 -def parse(handle):
311 """Iterates over a RAF file, returning a SeqMap object for each line 312 in the file. 313 314 Arguments: 315 316 handle -- file-like object. 317 """ 318 for line in handle: 319 yield SeqMap(line)
320