1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """ Access the PDB over the internet (e.g. to download structures). """
22
23
24 from __future__ import with_statement
25
26 import contextlib
27 import gzip
28 import os
29 import shutil
30 import urllib
31 from urllib2 import urlopen as _urlopen
32
33
"""
This class provides quick access to the structure lists on the
PDB server or its mirrors. The structure lists contain
four-letter PDB codes, indicating that structures are
new, have been modified or are obsolete. The lists are released
on a weekly basis.

It also provides a function to retrieve PDB files from the server.
To use it properly, prepare a directory /pdb or the like,
where PDB files are stored.

If you want to use this module from behind a proxy, add
the proxy variable to your environment, e.g. in Unix:
export HTTP_PROXY='http://realproxy.charite.de:888'
(This can also be added to ~/.bashrc)
"""
51
# Literature citation for the Protein Data Bank, kept as a plain string.
PDB_REF = """
The Protein Data Bank: a computer-based archival file for macromolecular structures.
F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
J. Mol. Biol. 112 pp. 535-542 (1977)
http://www.pdb.org/.
"""

# Alternative HTTP download location; not referenced by the code visible here.
alternative_download_url = "http://www.rcsb.org/pdb/files/"
60
61
62 - def __init__(self, server='ftp://ftp.wwpdb.org', pdb=os.getcwd(),
63 obsolete_pdb=None):
64 """Initialize the class with the default server or a custom one."""
65 self.pdb_server = server
66 self.local_pdb = pdb
67
68
69 if obsolete_pdb:
70 self.obsolete_pdb = obsolete_pdb
71 else:
72 self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
73 if not os.access(self.obsolete_pdb, os.F_OK):
74 os.makedirs(self.obsolete_pdb)
75
76
77 self.overwrite = 0
78 self.flat_tree = 0
79
def get_status_list(self, url):
    """Retrieve the PDB codes listed in a weekly PDB status file.

    Used by get_recent_changes. The list files parsed by this method
    simply contain one PDB code per line; blank lines are skipped.

    @param url: location of the status file
    @type url: string

    @return: the PDB codes in file order
    @rtype: list of strings

    @raise ValueError: if a non-blank line does not consist of exactly
        four characters once surrounding whitespace is stripped
    """
    codes = []
    with contextlib.closing(_urlopen(url)) as handle:
        for line in handle:
            pdb = line.strip()
            if not pdb:
                # Tolerate empty or trailing blank lines in the list file.
                continue
            if len(pdb) != 4:
                # Explicit check: an assert would vanish under python -O.
                raise ValueError("Malformed PDB code %r in %s" % (pdb, url))
            codes.append(pdb)
    return codes
94
def get_recent_changes(self):
    """Return three lists of the newest weekly changes (added, mod, obsolete).

    Reads the listing of the status directory on the PDB server, selects
    the entry with the largest numerical name and downloads the lists of
    added, modified and obsolete PDB codes stored below it.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x 2 1002 sysadmin 512 Oct 6 18:28 20031006
    drwxrwxr-x 2 1002 sysadmin 512 Oct 14 02:14 20031013
    -rw-r--r-- 1 1002 sysadmin 1327 Mar 12 2001 README

    @return: [added, modified, obsolete], each a list of PDB codes
    @rtype: list of three lists

    @raise ValueError: if the listing holds no entry with an all-digit name
    """
    status_url = self.pdb_server + '/pub/pdb/data/status/'
    with contextlib.closing(_urlopen(status_url)) as handle:
        # The entry name is the last whitespace-separated field of a
        # listing line; blank lines have no fields and are ignored.
        rows = [line.split() for line in handle.readlines()]
    dated = [fields[-1] for fields in rows
             if fields and fields[-1].isdigit()]
    if not dated:
        raise ValueError("No dated status directory listed at %s"
                         % status_url)
    # Pick the largest number rather than trusting the listing order.
    recent = max(dated, key=int)

    path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
122
def get_all_entries(self):
    """Download the index of all PDB entries and return their codes.

    Fetches entries.idx (all entries plus some annotation) from the
    server, discards its first two lines and keeps the leading four
    characters of every remaining line longer than four characters.

    @return: PDB codes found in the index file
    @rtype: list of strings
    """
    print("retrieving index file. Takes about 5 MB.")
    index_url = (self.pdb_server +
                 '/pub/pdb/derived_data/index/entries.idx')
    codes = []
    with contextlib.closing(_urlopen(index_url)) as handle:
        records = handle.readlines()
        # The first two lines are skipped; they carry no entries.
        for record in records[2:]:
            if len(record) > 4:
                codes.append(record[:4])
    return codes
134
def get_all_obsolete(self):
    """Return the codes of all entries ever made obsolete in the PDB.

    Downloads obsolete.dat from the status directory of the server and
    collects the first PDB code of every OBSLTE record (the third
    whitespace-separated field). The file looks like this:

    LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE 31-JUL-94 116L 216L
    ...
    OBSLTE 29-JAN-96 1HFT 2HFT
    OBSLTE 21-SEP-06 1HFV 2J5X
    OBSLTE 21-NOV-03 1HG6
    OBSLTE 18-JUL-84 1HHB 2HHB 3HHB
    OBSLTE 08-NOV-96 1HID 2HID
    OBSLTE 01-APR-97 1HIU 2HIU
    OBSLTE 14-JAN-04 1HKE 1UUZ
    ...

    @return: obsolete PDB codes in file order
    @rtype: list of strings
    """
    source = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
    collected = []
    with contextlib.closing(_urlopen(source)) as handle:
        for record in handle:
            # Only OBSLTE records carry codes; the title line is skipped.
            if record.startswith("OBSLTE "):
                code = record.split()[2]
                assert len(code) == 4
                collected.append(code)
    return collected
170
# NOTE(review): the def line was absent from the reviewed listing; the
# parameter order and defaults are reconstructed from the call sites
# (obsolete=1, pdir=...) and the "pdir is None" test below -- confirm.
def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
    """Retrieve a PDB structure file from the PDB server and
    store it in a local file tree.

    The PDB structure's file name is returned as a single string.
    If obsolete is true, the file is fetched from the server's obsolete
    area and saved in the obsolete file tree.

    @param pdb_code: four-character PDB code (case is ignored)
    @type pdb_code: string

    @param obsolete: treat the entry as an obsolete one
    @type obsolete: boolean

    @param pdir: put the file in this directory (default: create a PDB-style directory tree)
    @type pdir: string

    @return: filename
    @rtype: string
    """
    code = pdb_code.lower()
    archive_fn = "pdb%s.ent.gz" % code
    pdb_dir = "obsolete" if obsolete else "divided"
    url = (self.pdb_server +
           '/pub/pdb/data/structures/%s/pdb/%s/%s' %
           (pdb_dir, code[1:3], archive_fn))

    # Decide where the final file is stored.
    if pdir is None:
        path = self.obsolete_pdb if obsolete else self.local_pdb
        if not self.flat_tree:
            # Tree layout: one sub-directory per middle two characters.
            path = os.path.join(path, code[1:3])
    else:
        path = pdir
    if not os.access(path, os.F_OK):
        os.makedirs(path)

    filename = os.path.join(path, archive_fn)
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # Skip the download if the file is already there.
    if not self.overwrite and os.path.exists(final_file):
        print("Structure exists: '%s' " % final_file)
        return final_file

    print("Downloading PDB structure '%s'..." % pdb_code)

    # Decompress into a temporary name first: a truncated .ent left behind
    # by a failure would otherwise be reported as "Structure exists" on
    # the next call. The archive and the temporary file never survive.
    partial = final_file + ".part"
    try:
        urllib.urlretrieve(url, filename)
        with gzip.open(filename, 'rb') as gz:
            with open(partial, 'wb') as out:
                shutil.copyfileobj(gz, out)
        shutil.move(partial, final_file)
    finally:
        for leftover in (filename, partial):
            if os.path.exists(leftover):
                os.remove(leftover)

    return final_file
222
def update_pdb(self):
    """
    I guess this is the 'most wanted' function from this module.
    It gets the weekly lists of new and modified pdb entries and
    automatically downloads the according PDB files.
    Entries listed as obsolete are moved from the local tree into the
    obsolete tree.
    You can call this module as a weekly cronjob.

    A failed download or move is reported on standard output and does
    not stop the update.
    """
    assert os.path.isdir(self.local_pdb)
    assert os.path.isdir(self.obsolete_pdb)

    new, modified, obsolete = self.get_recent_changes()

    for pdb_code in new + modified:
        try:
            self.retrieve_pdb_file(pdb_code)
        except Exception:
            # Best effort: report the entry and carry on with the rest.
            print('error %s\n' % pdb_code)

    # Move the obsolete files to a special folder.
    for pdb_code in obsolete:
        # retrieve_pdb_file stores files under the lower-cased code, so
        # the same spelling has to be used to find them again.
        code = pdb_code.lower()
        entry_name = 'pdb%s.ent' % code
        if self.flat_tree:
            old_dir = self.local_pdb
            new_dir = self.obsolete_pdb
        else:
            old_dir = os.path.join(self.local_pdb, code[1:3])
            new_dir = os.path.join(self.obsolete_pdb, code[1:3])
        old_file = os.path.join(old_dir, entry_name)
        new_file = os.path.join(new_dir, entry_name)

        if os.path.isfile(old_file):
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            try:
                shutil.move(old_file, new_file)
            except Exception:
                print("Could not move %s to obsolete folder" % old_file)
        elif os.path.isfile(new_file):
            print("Obsolete file %s already moved" % old_file)
        else:
            print("Obsolete file %s is missing" % old_file)
265
def download_entire_pdb(self, listfile=None):
    """Retrieve all PDB entries not present in the local PDB copy.

    Every code returned by get_all_entries is passed to
    retrieve_pdb_file. If listfile is given, the codes are afterwards
    written to that file, one per line.

    @param listfile: name of the list file to write (optional)
    @type listfile: string
    """
    all_codes = self.get_all_entries()
    for code in all_codes:
        self.retrieve_pdb_file(code)

    if not listfile:
        return
    # Write the list only after the downloads have been attempted.
    with open(listfile, 'w') as outfile:
        outfile.writelines([code + '\n' for code in all_codes])
279
def download_obsolete_entries(self, listfile=None):
    """Retrieve all obsolete PDB entries not present in the local obsolete
    PDB copy.

    Every code returned by get_all_obsolete is passed to
    retrieve_pdb_file with obsolete=1. If listfile is given, the codes
    are afterwards written to that file, one per line.

    @param listfile: name of the list file to write (optional)
    @type listfile: string
    """
    obsolete_codes = self.get_all_obsolete()
    for code in obsolete_codes:
        self.retrieve_pdb_file(code, obsolete=1)

    if not listfile:
        return
    # Write the list only after the downloads have been attempted.
    with open(listfile, 'w') as outfile:
        outfile.writelines([code + '\n' for code in obsolete_codes])
295
# NOTE(review): the def line was absent from the reviewed listing; the
# default for savefile is reconstructed from the remote file name -- confirm.
def get_seqres_file(self, savefile='pdb_seqres.txt'):
    """Download the (big) file holding the sequences of all PDB entries.

    @param savefile: name of the local file the download is written to
    @type savefile: string
    """
    print("Retrieving sequence file (takes about 15 MB).")
    urllib.urlretrieve(
        self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt',
        savefile)
303
304
if __name__ == '__main__':
    # Command-line entry point: print the usage text, build a PDBList from
    # the arguments, then run the requested command.

    import sys

    doc = """PDBList.py
(c) Kristian Rother 2003, Contributed to BioPython

Usage:
PDBList.py update <pdb_path> [options] - write weekly PDB updates to
local pdb tree.
PDBList.py all <pdb_path> [options] - write all PDB entries to
local pdb tree.
PDBList.py obsol <pdb_path> [options] - write all obsolete PDB
entries to local pdb tree.
PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

Options:
-d A single directory will be used as <pdb_path>, not a tree.
-o Overwrite existing structure files.
"""
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # argv[3:] is simply empty when no options were given.
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work in the current directory without a tree.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # Update the local PDB tree from the weekly lists.
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # Get the entire PDB.
            pl.download_entire_pdb()

        elif command == 'obsol':
            # download_obsolete_entries only takes an optional list-file
            # name; handing it the tree directory made it try to open a
            # directory for writing once all downloads were done.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # Get a single PDB structure.
            pl.retrieve_pdb_file(command, pdir=pdb_path)
359