Package Bio :: Package PDB :: Module parse_pdb_header'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.parse_pdb_header'

  1  #!/usr/bin/env python 
  2  # 
  3  # parse_pdb_header.py 
  4  # parses header of PDB files into a python dictionary. 
  5  # emerged from the Columba database project www.columba-db.de. 
  6  # 
  7  # author: Kristian Rother 
  8  # 
  9  # license: same as BioPython, read LICENSE.TXT from current BioPython release. 
 10  # 
 11  # last modified: 9.2.2004 
 12  # 
 13  # Added some small changes: the whole PDB file is not read in anymore, but just 
 14  # until the first ATOM record (faster). I also split parse_pdb_header into 
 15  # parse_pdb_header and parse_pdb_header_list, because parse_pdb_header_list 
 16  # can be more easily reused in PDBParser. 
 17  # 
 18  # Thomas, 19/03/04 
 19  # 
 20  # Renamed some clearly private functions to _something (ie. parse_pdb_header_list 
 21  # is now _parse_pdb_header_list) 
 22  # Thomas 9/05/04 
 23   
 24  """Parse the header of a PDB file.""" 
 25   
 26  from __future__ import print_function 
 27   
 28  import re 
 29   
 30  from Bio import File 
 31   
 32   
33 -def _get_journal(inl):
34 # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7 35 journal="" 36 for l in inl: 37 if re.search("\AJRNL", l): 38 journal+=l[19:72].lower() 39 journal=re.sub("\s\s+", " ", journal) 40 return journal
41 42
43 -def _get_references(inl):
44 # REMARK 1 REFERENCE 1 1CSE 11 45 # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12 46 references=[] 47 actref="" 48 for l in inl: 49 if re.search("\AREMARK 1", l): 50 if re.search("\AREMARK 1 REFERENCE", l): 51 if actref!="": 52 actref=re.sub("\s\s+", " ", actref) 53 if actref!=" ": 54 references.append(actref) 55 actref="" 56 else: 57 actref+=l[19:72].lower() 58 59 if actref!="": 60 actref=re.sub("\s\s+", " ", actref) 61 if actref!=" ": 62 references.append(actref) 63 return references
64 65 66 # bring dates to format: 1909-01-08
67 -def _format_date(pdb_date):
68 """Converts dates from DD-Mon-YY to YYYY-MM-DD format.""" 69 date="" 70 year=int(pdb_date[7:]) 71 if year<50: 72 century=2000 73 else: 74 century=1900 75 date=str(century+year)+"-" 76 all_months=['xxx', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 77 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 78 month=str(all_months.index(pdb_date[3:6])) 79 if len(month)==1: 80 month = '0'+month 81 date = date+month+'-'+pdb_date[:2] 82 return date
83 84
85 -def _chop_end_codes(line):
86 """Chops lines ending with ' 1CSA 14' and the like.""" 87 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)
88 89
90 -def _chop_end_misc(line):
91 """Chops lines ending with ' 14-JUL-97 1CSA' and the like.""" 92 return re.sub("\s\s\s\s+.*\Z", "", line)
93 94
95 -def _nice_case(line):
96 """Makes A Lowercase String With Capitals.""" 97 l=line.lower() 98 s="" 99 i=0 100 nextCap=1 101 while i<len(l): 102 c=l[i] 103 if c>='a' and c<='z' and nextCap: 104 c=c.upper() 105 nextCap=0 106 elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\ 107 c=='-' or c=='_': 108 nextCap=1 109 s+=c 110 i+=1 111 return s
112 113
def parse_pdb_header(infile):
    """Return the header lines of a PDB file as a dictionary.

    ``infile`` is handed to ``File.as_handle`` and read in text mode.  Lines
    are collected up to, but not including, the first ATOM, HETATM or MODEL
    record, so the coordinate section is never read.  The collected lines
    are then parsed by ``_parse_pdb_header_list``.

    Dictionary keys are: name, head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author, compound and source; keywords and journal are added when the
    corresponding records are present.
    """
    coordinate_records = ("ATOM", "HETATM", "MODEL")
    header = []
    with File.as_handle(infile, 'r') as f:
        for line in f:
            # The record name is taken from the first six characters.
            # Shorter names are blank padded there, so the padding is
            # removed before comparing instead of relying on literals with
            # an exact number of trailing blanks.
            if line[0:6].rstrip() in coordinate_records:
                break
            header.append(line)
    return _parse_pdb_header_list(header)
132 133
def _parse_pdb_header_list(header):
    """Parse a list of PDB header lines into a dictionary.

    ``header`` is a list of lines (strings).  The record name is read from
    the first six characters of each line and the record text from index 10
    onwards.

    The returned dictionary always contains:

    - ``name``: lowercased TITLE text, each TITLE line preceded by a blank
    - ``head``: lowercased HEADER text
    - ``deposition_date``: date from HEADER as YYYY-MM-DD (default
      "1909-01-08")
    - ``release_date``: date from the last REVDAT line seen that carries one
      (default "1909-01-08")
    - ``structure_method``: lowercased EXPDTA text (default "unknown")
    - ``resolution``: float from REMARK 2, None if the value is not a
      number, 0.0 if there is no such remark
    - ``structure_reference``: result of ``_get_references``
    - ``journal_reference``: result of ``_get_journal``
    - ``author``: AUTHOR text in nice case
    - ``compound`` / ``source``: dictionaries keyed by mol_id, each holding
      the ``key: value`` fields of the COMPND / SOURCE records plus a
      ``misc`` entry; ``compound`` entries may also hold ``ec_number``

    ``keywords`` (KEYWDS) and ``journal`` (JRNL) are only present when the
    file contains these records.
    """
    # database fields, pre-filled with the values used when a record is
    # missing
    info = {
        'name': "",
        'head': '',
        'deposition_date': "1909-01-08",
        'release_date': "1909-01-08",
        'structure_method': "unknown",
        'resolution': 0.0,
        'structure_reference': "unknown",
        'journal_reference': "unknown",
        'author': "",
        'compound': {'1': {'misc': ''}},
        'source': {'1': {'misc': ''}},
    }

    info['structure_reference'] = _get_references(header)
    info['journal_reference'] = _get_journal(header)
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    date_pattern = r"\d\d-\w\w\w-\d\d"
    # Any amount of whitespace between the fields; the final "." is the
    # regex wildcard.
    resolution_tag = r"REMARK\s+2\s+RESOLUTION."

    for hh in header:
        h = hh.rstrip()  # chop linebreaks and trailing blanks off
        key = h[:6].strip()
        tail = h[10:].strip()

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            # 'name' is pre-seeded with "", so every TITLE line, including
            # the first, is appended after a separating blank.
            info['name'] += " " + _chop_end_codes(tail).lower()
        elif key == "HEADER":
            rr = re.search(date_pattern, tail)
            if rr is not None:
                info['deposition_date'] = _format_date(_nice_case(rr.group()))
            info['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            # look for E.C. numbers in COMPND lines
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                info['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _store_molecule_field(
                info['compound'], comp_molid, last_comp_key, tt)
        elif key == "SOURCE":
            tt = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            # SOURCE keeps its own mol_id / last-key state so that it can
            # neither disturb nor depend on the COMPND bookkeeping.
            src_molid, last_src_key = _store_molecule_field(
                info['source'], src_molid, last_src_key, tt)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in info:
                info['keywords'] += " " + kwd
            else:
                info['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            info['structure_method'] = expd.lower()
        elif key == "CAVEAT":
            # TODO: make Annotation entries out of these
            pass
        elif key == "REVDAT":
            rr = re.search(date_pattern, tail)
            if rr is not None:
                info['release_date'] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if 'journal' in info:
                info['journal'] += tail
            else:
                info['journal'] = tail
        elif key == "AUTHOR":
            # 'author' is pre-seeded with "", so the text is always appended
            info['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if re.search(resolution_tag, hh):
                r = _chop_end_codes(re.sub(resolution_tag, '', hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    info['resolution'] = float(r)
                except ValueError:
                    # nonstandard resolution, e.g. text instead of a number
                    info['resolution'] = None

    if info['structure_method'] == 'unknown':
        resolution = info['resolution']
        # resolution is None when REMARK 2 held no number; comparing None
        # with a float raises TypeError on Python 3.
        if resolution is not None and resolution > 0.0:
            info['structure_method'] = 'x-ray diffraction'
    return info


def _store_molecule_field(table, molid, last_key, text):
    """Store one COMPND/SOURCE line in ``table``; return (molid, last_key).

    ``text`` is split at ":".  With a key and a value, a ``mol_id`` key
    opens a new entry ``{'misc': ''}`` under the given id; any other key is
    stored in the entry of the current ``molid``.  Text without ":" is a
    continuation and is appended, followed by a blank, to the field that was
    written last.
    """
    tok = text.split(":")
    if len(tok) < 2:
        table[molid][last_key] += tok[0] + " "
        return molid, last_key
    ckey = tok[0]
    cval = tok[1].lstrip()
    if ckey == 'mol_id':
        table[cval] = {'misc': ''}
        return cval, "misc"
    table[molid][ckey] = cval
    return molid, ckey
if __name__ == '__main__':
    # Script use: parse the header of the PDB file named by the first
    # command line argument and print every entry of the resulting
    # dictionary, each preceded by a separator line.
    import sys

    pdb_path = sys.argv[1]
    with open(pdb_path, 'r') as pdb_handle:
        parsed = parse_pdb_header(pdb_handle)

    # print the dictionary
    for field, content in parsed.items():
        print("-" * 40)
        print(field)
        print(content)