Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # This code is released under the conditions of the Biopython license. 
 17  # It may be distributed freely with respect to the original author. 
 18  # Any maintainer of the Biopython code may change this notice 
 19  # when appropriate. 
 20   
 21  """ Access the PDB over the internet (e.g. to download structures). """ 
 22   
 23  from __future__ import print_function 
 24   
 25  import contextlib 
 26  import gzip 
 27  import os 
 28  import shutil 
 29   
 30  # Importing these functions with leading underscore as not intended for reuse 
 31  from Bio._py3k import urlopen as _urlopen 
 32  from Bio._py3k import urlretrieve as _urlretrieve 
 33   
 34  __docformat__ = "restructuredtext en" 
 35   
 36   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides methods to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like, where PDB
    files are stored.

    If you want to use this module from inside a proxy, add the proxy
    variable to your environment, e.g. in Unix::

        export HTTP_PROXY='http://realproxy.charite.de:888'

    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        :param server: base URL of the remote PDB server.
        :param pdb: root of the local PDB file tree. Defaults to the
            working directory at the time of the call (not at import time).
        :param obsolete_pdb: root of the local tree for obsolete entries.
            Defaults to ``<pdb>/obsolete``, which is created if missing.
        """
        self.pdb_server = server  # remote pdb server
        # Resolve the default lazily: a default of os.getcwd() in the
        # signature would be frozen when the module is first imported.
        self.local_pdb = os.getcwd() if pdb is None else pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    @staticmethod
    def _as_text(line):
        """Return ``line`` as a native string, decoding bytes if needed."""
        # On Python 2 ``bytes is str``, so only Python 3 bytes get decoded.
        if isinstance(line, bytes) and not isinstance(line, str):
            return line.decode('utf-8', 'replace')
        return line

    @staticmethod
    def _parse_status_list(lines):
        """Extract one four-letter PDB code per non-blank line."""
        codes = []
        for raw in lines:
            code = PDBList._as_text(raw).strip()
            if not code:
                continue
            if len(code) != 4:
                raise ValueError("Unexpected entry in PDB status list: %r"
                                 % code)
            codes.append(code)
        return codes

    @staticmethod
    def _latest_status_dir(lines):
        """Return the largest all-digit name found in a directory listing."""
        dated = []
        for raw in lines:
            fields = PDBList._as_text(raw).split()
            # The entry name is the last whitespace-separated field.
            if fields and fields[-1].isdigit():
                dated.append(fields[-1])
        if not dated:
            raise ValueError("No dated status directory in server listing")
        return max(dated, key=int)

    @staticmethod
    def _parse_entries_index(lines):
        """Return the PDB codes of an entries.idx file (2 header lines)."""
        codes = []
        for index, raw in enumerate(lines):
            if index < 2:
                continue
            line = PDBList._as_text(raw)
            if len(line) > 4:
                codes.append(line[:4])
        return codes

    @staticmethod
    def _parse_obsolete_list(lines):
        """Return the obsoleted PDB code of every ``OBSLTE`` record."""
        codes = []
        for raw in lines:
            line = PDBList._as_text(raw)
            if not line.startswith("OBSLTE "):
                continue
            fields = line.split()
            # Record layout: OBSLTE <date> <obsolete code> [successors...]
            if len(fields) < 3 or len(fields[2]) != 4:
                raise ValueError("Malformed OBSLTE record: %r" % line)
            codes.append(fields[2])
        return codes

    @staticmethod
    def _gunzip(archive, target):
        """Decompress ``archive`` into ``target``.

        A partially written ``target`` is removed on failure so that it is
        not later mistaken for a complete structure file.
        """
        try:
            with contextlib.closing(gzip.open(archive, 'rb')) as gz:
                with open(target, 'wb') as out:
                    shutil.copyfileobj(gz, out)
        except Exception:
            if os.path.exists(target):
                os.remove(target)
            raise

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The list files parsed by this method
        are expected to hold one PDB name per line.

        :param url: URL of the status file.
        :return: list of four-letter PDB codes as native strings.
        :raises ValueError: if a non-blank line is not four characters long.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_status_list(handle)

    def get_recent_changes(self):
        """Return three lists of the newest weekly changes.

        Reads the directory with changed entries from the PDB server,
        selects the sub-directory with the largest numerical name and
        returns the PDB codes listed in its added, modified and obsolete
        files.

        Contents of the data/status dir (20031013 would be used)::

            drwxrwxr-x 2 1002 sysadmin 512 Oct 6 18:28 20031006
            drwxrwxr-x 2 1002 sysadmin 512 Oct 14 02:14 20031013
            -rw-r--r-- 1 1002 sysadmin 1327 Mar 12 2001 README

        :return: ``[added, modified, obsolete]``, each a list of PDB codes.
        :raises ValueError: if the listing has no all-digit directory name
            or a status file is malformed.
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(status_url)) as handle:
            recent = self._latest_status_dir(handle)

        path = status_url + '%s/' % recent

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve the index file of all PDB entries.

        :return: list of the PDB codes found in the index file, taken from
            the first four characters of each line after the two header
            lines.
        """
        print("retrieving index file. Takes about 5 MB.")
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_entries_index(handle)

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            ...

        :return: list of obsolete PDB codes.
        :raises ValueError: if an ``OBSLTE`` record is malformed.
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_obsolete_list(handle)

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
        """Retrieve a PDB structure file and store it in a local file tree.

        If ``self.overwrite`` is false and the decompressed file already
        exists, nothing is downloaded.

        :param pdb_code: four-letter PDB code (case-insensitive).
        :param obsolete: if true, fetch from the server's obsolete tree
            and save below ``self.obsolete_pdb``.
        :param pdir: put the file in this directory (default: create a
            PDB-style directory tree).
        :return: file name of the decompressed structure file.
        """
        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = "pdb%s.ent.gz" % code
        pdb_dir = "obsolete" if obsolete else "divided"
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s' %
               (pdb_dir, code[1:3], archive_fn))

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.obsolete_pdb if obsolete else self.local_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, "pdb%s.ent" % code)  # (decompressed)

        # Skip download if the file already exists
        if not self.overwrite and os.path.exists(final_file):
            print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            _urlretrieve(url, filename)
            self._gunzip(filename, final_file)
        finally:
            # The archive is only a temporary artefact; never leave it (or
            # a truncated download) behind.
            if os.path.exists(filename):
                os.remove(filename)

        return final_file

    def update_pdb(self):
        """Download the weekly changes and move obsolete entries aside.

        Gets the weekly lists of new and modified pdb entries and
        downloads the corresponding PDB files; files of entries that
        became obsolete are moved below ``self.obsolete_pdb``.
        You can call this module as a weekly cronjob.

        :raises ValueError: if the local or the obsolete PDB directory
            does not exist.
        """
        for directory in (self.local_pdb, self.obsolete_pdb):
            if not os.path.isdir(directory):
                raise ValueError("%s is not a directory" % directory)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # Best effort: one failed download must not abort the batch.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # retrieve_pdb_file stores files under the lower-case code.
            code = pdb_code.lower()
            entry_fn = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry_fn)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], entry_fn)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, entry_fn)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        :param listfile: optional file name; if given, all PDB codes are
            written to it, one per line.
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines(x + '\n' for x in entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present locally.

        :param listfile: optional file name; if given, all obsolete PDB
            codes are written to it, one per line.
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines(x + '\n' for x in entries)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve the (big) file with all PDB sequences and save it.

        :param savefile: name of the local file to write.
        """
        print("Retrieving sequence file (takes about 15 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)
if __name__ == '__main__':
    # Command-line entry point: update, mirror or query a local PDB copy.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        # Second argument is the local PDB root, the rest are options.
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work flat in the current directory.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries
            # The only positional parameter of download_obsolete_entries is
            # the optional list file, so pdb_path must not be passed here:
            # the target tree is already configured on ``pl``.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)