Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames | no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  This module provides code to work with the prosite dat file from 
 10  Prosite. 
 11  http://www.expasy.ch/prosite/ 
 12   
 13  Tested with: 
 14  Release 20.43, 10-Feb-2009 
 15   
 16   
 17  Functions: 
 18   
 19      - read                  Reads a Prosite file containing one Prosite record 
 20      - parse                 Iterates over records in a Prosite file. 
 21   
 22  Classes: 
 23   
 24      - Record                Holds Prosite data. 
 25  """ 
 26   
 27  __docformat__ = "restructuredtext en" 
 28   
 29   
def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    handle - handle to the file.

    Records are read from the handle one at a time and yielded as they
    are produced; iteration ends as soon as no further record can be
    read from the handle.
    """
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)
42 43
def read(handle):
    """Return the single record held in a Prosite file.

    handle - handle to the file.

    Use this for files that contain exactly one record.  A ValueError is
    raised if any data is left on the handle after the first record has
    been read.
    """
    record = __read(handle)
    # Anything still unread at this point belongs to a further record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
58 59
class Record(object):
    """Holds information from a Prosite record.

    Every member listed here exists on a freshly created Record, so it
    can be read safely even when the corresponding line or qualifier was
    absent from the parsed entry.

    Members:

        - name           ID of the record.  e.g. ADH_ZINC
        - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
        - accession      e.g. PS00387
        - created        Date the entry was created.  (MMM-YYYY)
        - data_update    Date the 'primary' data was last updated.
        - info_update    Date data other than 'primary' data was last updated.
        - pdoc           ID of the PROSITE DOCumentation.

        - description    Free-format description.
        - pattern        The PROSITE pattern.  See docs.
        - matrix         List of strings that describes a matrix entry.
        - rules          List of rule definitions (from RU lines).  (strings)
        - prorules       List of prorules (from PR lines).  (strings)
        - postprocessing List of strings taken from PP lines.

    NUMERICAL RESULTS

        - nr_sp_release  SwissProt release.
        - nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int;
                         an empty string until it has been set)
        - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
        - nr_positive    True positives.  tuple of (hits, seqs)
        - nr_unknown     Could be positives.  tuple of (hits, seqs)
        - nr_false_pos   False positives.  tuple of (hits, seqs)
        - nr_false_neg   False negatives.  (int)
        - nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS

        - cc_taxo_range  Taxonomic range.  See docs for format
        - cc_max_repeat  Maximum number of repetitions in a protein
        - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
        - cc_skip_flag   Can this entry be ignored?
        - cc_matrix_type Value of the /MATRIX_TYPE qualifier.
        - cc_scaling_db  Value of the /SCALING_DB qualifier.
        - cc_author      Value of the /AUTHOR qualifier.
        - cc_ft_key      Value of the /FT_KEY qualifier.
        - cc_ft_desc     Value of the /FT_DESC qualifier.
        - cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES

    The following are all lists of tuples (swiss-prot accession,
    swiss-prot name).

        - dr_positive
        - dr_false_neg
        - dr_false_pos
        - dr_potential   Potential hits, but fingerprint region not yet available.
        - dr_unknown     Could possibly belong

    STRUCTURES

        - pdb_structs    List of PDB entries.  (strings)

    """

    def __init__(self):
        """Create an empty record with every documented member preset."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These six are documented members as well; give them the same
        # empty-string default so that reading them never raises
        # AttributeError on an entry lacking the qualifier.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
# Everything below is a private function.
# NR qualifiers whose value has the form "hits(seqs)", mapped to the
# Record attribute that receives the resulting tuple.
_NR_PAIR_ATTRIBUTES = {
    '/TOTAL': 'nr_total',
    '/POSITIVE': 'nr_positive',
    '/UNKNOWN': 'nr_unknown',
    '/FALSE_POS': 'nr_false_pos',
}

# CC qualifiers whose value is stored unchanged, mapped to the Record
# attribute that receives it.
_CC_ATTRIBUTES = {
    '/TAXO-RANGE': 'cc_taxo_range',
    '/MAX-REPEAT': 'cc_max_repeat',
    '/SKIP-FLAG': 'cc_skip_flag',
    '/MATRIX_TYPE': 'cc_matrix_type',
    '/SCALING_DB': 'cc_scaling_db',
    '/AUTHOR': 'cc_author',
    '/FT_KEY': 'cc_ft_key',
    '/FT_DESC': 'cc_ft_desc',
    '/VERSION': 'cc_version',
}

# DR type flags, mapped to the Record list that collects the reference.
_DR_ATTRIBUTES = {
    'T': 'dr_positive',
    'F': 'dr_false_pos',
    'N': 'dr_false_neg',
    'P': 'dr_potential',
    '?': 'dr_unknown',
}


def _parse_date_line(record, value, line):
    """Store the three dates of a DT line on record.

    value is the text following the line code; line is the complete line
    and is used only in the error message.  Raises ValueError unless the
    first three "; "-separated fields end in (CREATED), (DATA UPDATE)
    and (INFO UPDATE) respectively.
    """
    labels = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    dates = value.rstrip('.').split("; ")
    if len(dates) < len(labels):
        raise ValueError("I don't understand date line\n%s" % line)
    parsed = []
    for date, label in zip(dates, labels):
        if not date.endswith(label):
            raise ValueError("I don't understand date line\n%s" % line)
        # Slice the label off.  str.rstrip(label) would treat the label
        # as a set of characters and could also eat the end of the date.
        parsed.append(date[:-len(label)].rstrip(' '))
    record.created, record.data_update, record.info_update = parsed


def _parse_numerical_results(record, value, line):
    """Store the qualifiers of an NR line on record.

    value is the text following the line code; line is the complete line
    and is used only in error messages.  Raises ValueError for an unknown
    qualifier or a value that cannot be interpreted.
    """
    import re
    for col in value.split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            record.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            record.nr_partial = int(data)
        elif qual in _NR_PAIR_ATTRIBUTES:
            m = re.match(r'(\d+)\((\d+)\)', data)
            if not m:
                raise ValueError("Broken data %s in comment line\n%s"
                                 % (repr(data), line))
            setattr(record, _NR_PAIR_ATTRIBUTES[qual],
                    tuple(map(int, m.groups())))
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_comment_line(record, value, line):
    """Store the qualifiers of a CC line on record.

    Expect CC lines like this:
    CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
    so the text is split on ";" and each piece on its first "=".

    value is the text following the line code; line is the complete line
    and is used only in the error message.  Raises ValueError for an
    unknown qualifier.
    """
    for col in value.split(";"):
        if not col or col[:17] == 'Automatic scaling':
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.
            continue
        if "=" not in col:
            # Missing qualifier, so there is nothing to assign.
            # For example, from Bug 2403, in PS50293 have:
            # CC /AUTHOR=K_Hofmann; N_Hulo
            continue
        # Split once only, so that a value containing "=" is kept whole.
        qual, data = [word.lstrip() for word in col.split("=", 1)]
        if qual == '/SITE':
            # Split once only, so that a description containing "," is
            # kept whole.
            pos, desc = data.split(",", 1)
            record.cc_site.append((int(pos), desc))
        elif qual in _CC_ATTRIBUTES:
            setattr(record, _CC_ATTRIBUTES[qual], data)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_database_references(record, value):
    """Append the (accession, name) pairs of a DR line to record.

    Each ";"-separated reference is "accession, name, flag"; the flag
    selects which dr_* list receives the pair.  Raises ValueError for an
    unknown flag.
    """
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        if flag not in _DR_ATTRIBUTES:
            raise ValueError("I don't understand type flag %s" % flag)
        getattr(record, _DR_ATTRIBUTES[flag]).append((acc, name))


def __read(handle):
    """Read the next Prosite record from handle.

    Lines are consumed up to and including the "//" line that closes the
    record.  The first two characters of each line are the line code and
    the text from the sixth character onwards is its value.

    Returns the Record, or None if the handle is exhausted before any ID
    line is seen.

    Raises ValueError for a line that cannot be interpreted, for an
    unknown line code, and when the handle ends in the middle of a
    record (after an ID line but before the closing "//").
    """
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'CC':
            if record is None:
                # CC lines ahead of the first ID line are not part of
                # any record (presumably the file's copyright header),
                # so there is nothing to store them on.
                continue
            _parse_comment_line(record, value, line)
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            break
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            _parse_date_line(record, value, line)
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may continue over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            _parse_numerical_results(record, value, line)
        elif keyword == 'DR':
            _parse_database_references(record, value)
        elif keyword == '3D':
            record.pdb_structs.extend(entry.rstrip(';')
                                      for entry in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # The handle ran out without a closing "//".  That is the normal
        # end of input when no record was started, but a started record
        # is truncated and must not be dropped silently.
        if record is not None:
            raise ValueError("Unexpected end of stream.")
        return None
    return record
303