Package Bio :: Package SeqIO :: Module PdbIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PdbIO

  1  # Copyright 2012 by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  from __future__ import with_statement 
  6   
  7  import collections 
  8  import warnings 
  9   
 10  from Bio.Alphabet import generic_protein 
 11  from Bio.Seq import Seq 
 12  from Bio.SeqRecord import SeqRecord 
 13   
 14   
15 -def PdbSeqresIterator(handle):
16 """Returns SeqRecord objects for each chain in a PDB file. 17 18 The sequences are derived from the SEQRES lines in the 19 PDB file header, not the atoms of the 3D structure. 20 21 Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES 22 23 See: http://www.wwpdb.org/documentation/format23/sect3.html 24 """ 25 # Late-binding import to avoid circular dependency on SeqIO in Bio.SCOP 26 # TODO - swap in Bow's SeqUtils.seq1 once that's merged 27 from Bio.SCOP.three_to_one_dict import to_one_letter_code 28 29 chains = collections.defaultdict(list) 30 metadata = collections.defaultdict(list) 31 for line in handle: 32 rec_name = line[0:6].strip() 33 if rec_name == 'SEQRES': 34 # NB: We only actually need chain ID and the residues here; 35 # commented bits are placeholders from the wwPDB spec. 36 # Serial number of the SEQRES record for the current chain. 37 # Starts at 1 and increments by one each line. 38 # Reset to 1 for each chain. 39 # ser_num = int(line[8:10]) 40 # Chain identifier. This may be any single legal character, 41 # including a blank which is used if there is only one chain. 42 chn_id = line[11] 43 # Number of residues in the chain (repeated on every record) 44 # num_res = int(line[13:17]) 45 residues = [to_one_letter_code.get(res, 'X') 46 for res in line[19:].split()] 47 chains[chn_id].extend(residues) 48 elif rec_name == 'DBREF': 49 # ID code of this entry (PDB ID) 50 pdb_id = line[7:11] 51 # Chain identifier. 52 chn_id = line[12] 53 # Initial sequence number of the PDB sequence segment. 54 # seq_begin = int(line[14:18]) 55 # Initial insertion code of the PDB sequence segment. 56 # icode_begin = line[18] 57 # Ending sequence number of the PDB sequence segment. 58 # seq_end = int(line[20:24]) 59 # Ending insertion code of the PDB sequence segment. 60 # icode_end = line[24] 61 # Sequence database name. 62 database = line[26:32].strip() 63 # Sequence database accession code. 64 db_acc = line[33:41].strip() 65 # Sequence database identification code. 66 db_id_code = line[42:54].strip() 67 # Initial sequence number of the database seqment. 68 # db_seq_begin = int(line[55:60]) 69 # Insertion code of initial residue of the segment, if PDB is the 70 # reference. 71 # db_icode_begin = line[60] 72 # Ending sequence number of the database segment. 73 # db_seq_end = int(line[62:67]) 74 # Insertion code of the ending residue of the segment, if PDB is the 75 # reference. 76 # db_icode_end = line[67] 77 metadata[chn_id].append({'pdb_id': pdb_id, 'database': database, 78 'db_acc': db_acc, 'db_id_code': db_id_code}) 79 # ENH: 'SEQADV' 'MODRES' 80 81 for chn_id, residues in sorted(chains.iteritems()): 82 record = SeqRecord(Seq(''.join(residues), generic_protein)) 83 record.annotations = {"chain": chn_id} 84 if chn_id in metadata: 85 m = metadata[chn_id][0] 86 record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id) 87 record.description = ("%s:%s %s" % (m['database'], 88 m['db_acc'], 89 m['db_id_code'])) 90 for melem in metadata[chn_id]: 91 record.dbxrefs.extend([ 92 "%s:%s" % (melem['database'], melem['db_acc']), 93 "%s:%s" % (melem['database'], melem['db_id_code'])]) 94 else: 95 record.id = chn_id 96 yield record
97 98
99 -def PdbAtomIterator(handle):
100 """Returns SeqRecord objects for each chain in a PDB file 101 102 The sequences are derived from the 3D structure (ATOM records), not the 103 SEQRES lines in the PDB file header. 104 105 Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries 106 are converted to "X" in the sequence. 107 108 In addition to information from the PDB header (which is the same for all 109 records), the following chain specific information is placed in the 110 annotation: 111 112 record.annotations["residues"] = List of residue ID strings 113 record.annotations["chain"] = Chain ID (typically A, B ,...) 114 record.annotations["model"] = Model ID (typically zero) 115 116 Where amino acids are missing from the structure, as indicated by residue 117 numbering, the sequence is filled in with 'X' characters to match the size 118 of the missing region, and None is included as the corresponding entry in 119 the list record.annotations["residues"]. 120 121 This function uses the Bio.PDB module to do most of the hard work. The 122 annotation information could be improved but this extra parsing should be 123 done in parse_pdb_header, not this module. 124 """ 125 # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO 126 from Bio.PDB import PDBParser 127 from Bio.SCOP.three_to_one_dict import to_one_letter_code 128 129 def restype(residue): 130 """Return a residue's type as a one-letter code. 131 132 Non-standard residues (e.g. CSD, ANP) are returned as 'X'. 133 """ 134 return to_one_letter_code.get(residue.resname, 'X')
135 136 # Deduce the PDB ID from the PDB header 137 # ENH: or filename? 138 from Bio.File import UndoHandle 139 undo_handle = UndoHandle(handle) 140 firstline = undo_handle.peekline() 141 if firstline.startswith("HEADER"): 142 pdb_id = firstline[62:66] 143 else: 144 warnings.warn("First line is not a 'HEADER'; can't determine PDB ID") 145 pdb_id = '????' 146 147 struct = PDBParser().get_structure(pdb_id, undo_handle) 148 model = struct[0] 149 for chn_id, chain in sorted(model.child_dict.iteritems()): 150 # HETATM mod. res. policy: remove mod if in sequence, else discard 151 residues = [res for res in chain.get_unpacked_list() 152 if res.get_resname().upper() in to_one_letter_code] 153 if not residues: 154 continue 155 # Identify missing residues in the structure 156 # (fill the sequence with 'X' residues in these regions) 157 gaps = [] 158 rnumbers = [r.id[1] for r in residues] 159 for i, rnum in enumerate(rnumbers[:-1]): 160 if rnumbers[i+1] != rnum + 1: 161 # It's a gap! 162 gaps.append((i+1, rnum, rnumbers[i+1])) 163 if gaps: 164 res_out = [] 165 prev_idx = 0 166 for i, pregap, postgap in gaps: 167 if postgap > pregap: 168 gapsize = postgap - pregap - 1 169 res_out.extend(map(restype, residues[prev_idx:i])) 170 prev_idx = i 171 res_out.append('X'*gapsize) 172 # Last segment 173 res_out.extend(map(restype, residues[prev_idx:])) 174 else: 175 warnings.warn("Ignoring out-of-order residues after a gap", 176 UserWarning) 177 # Keep the normal part, drop the out-of-order segment 178 # (presumably modified or hetatm residues, e.g. 3BEG) 179 res_out.extend(map(restype, residues[prev_idx:i])) 180 else: 181 # No gaps 182 res_out = map(restype, residues) 183 record_id = "%s:%s" % (pdb_id, chn_id) 184 # ENH - model number in SeqRecord id if multiple models? 185 # id = "Chain%s" % str(chain.id) 186 # if len(structure) > 1 : 187 # id = ("Model%s|" % str(model.id)) + id 188 189 record = SeqRecord(Seq(''.join(res_out), generic_protein), 190 id=record_id, 191 description=record_id, 192 ) 193 194 # The PDB header was loaded as a dictionary, so let's reuse it all 195 record.annotations = struct.header.copy() 196 # Plus some chain specifics: 197 record.annotations["model"] = model.id 198 record.annotations["chain"] = chain.id 199 200 # Start & end 201 record.annotations["start"] = int(rnumbers[0]) 202 record.annotations["end"] = int(rnumbers[-1]) 203 204 # ENH - add letter annotations -- per-residue info, e.g. numbers 205 206 yield record 207 208 209 if __name__ == '__main__': 210 # Test 211 import sys 212 from Bio import SeqIO 213 for fname in sys.argv[1:]: 214 for parser in (PdbSeqresIterator, PdbAtomIterator): 215 with open(fname) as handle: 216 records = parser(handle) 217 SeqIO.write(records, sys.stdout, 'fasta') 218