Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames] | no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
This module provides code to work with the prosite.dat file from
Prosite.
http://www.expasy.ch/prosite/

Tested with:
Release 20.43, 10-Feb-2009


Functions:

    - read                  Reads a Prosite file containing exactly one Prosite record.
    - parse                 Iterates over the records in a Prosite file.

Classes:

    - Record                Holds Prosite data.
 25  """ 
 26   
 27  __docformat__ = "restructuredtext en" 
 28   
def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    handle   - handle to the file.

    Yields one Record at a time and stops as soon as the underlying
    reader has no further record to return.
    """
    current = __read(handle)
    while current:
        yield current
        current = __read(handle)
41 42
def read(handle):
    """Return the single record held in a Prosite file.

    handle   - handle to the file.

    Raises ValueError if anything is left in the handle once the first
    record has been read, since the file must hold exactly one record.
    """
    only_record = __read(handle)
    # Whatever is still unread after the first record counts as extra data.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return only_record
57 58
class Record(object):
    """Holds information from a Prosite record.

    Members:

        - name           ID of the record.  e.g. ADH_ZINC
        - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
        - accession      e.g. PS00387
        - created        Date the entry was created.  (MMM-YYYY)
        - data_update    Date the 'primary' data was last updated.
        - info_update    Date data other than 'primary' data was last updated.
        - pdoc           ID of the PROSITE DOCumentation.

        - description    Free-format description.
        - pattern        The PROSITE pattern.  See docs.
        - matrix         List of strings that describes a matrix entry.
        - rules          List of rule definitions (from RU lines).  (strings)
        - prorules       List of prorules (from PR lines).  (strings)
        - postprocessing List of post-processing entries (from PP lines).
                         (strings)

    NUMERICAL RESULTS

        - nr_sp_release  SwissProt release.
        - nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
        - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
        - nr_positive    True positives.  tuple of (hits, seqs)
        - nr_unknown     Could be positives.  tuple of (hits, seqs)
        - nr_false_pos   False positives.  tuple of (hits, seqs)
        - nr_false_neg   False negatives.  (int)
        - nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS

        - cc_taxo_range  Taxonomic range.  See docs for format
        - cc_max_repeat  Maximum number of repetitions in a protein
        - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
        - cc_skip_flag   Can this entry be ignored?
        - cc_matrix_type
        - cc_scaling_db
        - cc_author
        - cc_ft_key
        - cc_ft_desc
        - cc_version     version number (introduced in release 19.0)

    All string-valued cc_* members are the empty string until a value is
    assigned.

    DATA BANK REFERENCES

    The following are all lists of tuples (swiss-prot accession,
    swiss-prot name).

        - dr_positive
        - dr_false_neg
        - dr_false_pos
        - dr_potential   Potential hits, but fingerprint region not yet
                         available.
        - dr_unknown     Could possibly belong

    The following is a plain list of strings.

        - pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Create an empty record with every documented member defined."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These members are documented above, so give them a default as
        # well; otherwise reading them on a record that never had them
        # assigned raises AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below this point is private to the module.

def __read(handle):
    """Read the next Prosite record from handle.

    Lines are consumed up to and including the '//' line that terminates
    the first record found.  A '//' line seen before any ID line (the end
    of the file header) is skipped.

    handle   - handle to the file, iterated line by line.

    Returns the populated Record, or None if the handle is exhausted
    before a record terminator is reached (a partially read record at the
    end of the input is discarded).

    Raises ValueError for an unknown line keyword, a malformed ID or DT
    line, an unknown or broken NR qualifier, an unknown CC qualifier, or
    an unknown DR type flag.
    """
    import re

    # Suffix that tags each of the three dates on a DT line, in order.
    date_tags = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    # NR qualifiers whose value looks like "hits(seqs)".
    nr_hit_attributes = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers whose value is stored on the record unchanged.
    cc_attributes = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    # DR type flag -> name of the record list the reference is added to.
    dr_attributes = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }

    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            if len(dates) < 3:
                raise ValueError("I don't understand date line\n%s" % line)
            stripped = []
            for date, tag in zip(dates, date_tags):
                if not date.endswith(tag):
                    raise ValueError("I don't understand date line\n%s"
                                     % line)
                # Cut the tag off as a suffix.  str.rstrip(tag) would treat
                # the tag as a set of characters and could eat into the date.
                stripped.append(date[:-len(tag)].rstrip())
            record.created, record.data_update, record.info_update = stripped
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=", 1)]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in nr_hit_attributes:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_attributes[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == 'Automatic scaling':
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if "=" not in col:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                # Split on the first "=" only so that a value which itself
                # contains "=" is kept whole.
                qual, data = [word.lstrip() for word in col.split("=", 1)]
                if qual == '/SITE':
                    # Split on the first "," only: the description may
                    # contain commas of its own.
                    pos, desc = data.split(",", 1)
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_attributes:
                    setattr(record, cc_attributes[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in dr_attributes:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, dr_attributes[flag]).append((acc, name))
        elif keyword == '3D':
            for pdb_id in value.split():
                record.pdb_structs.append(pdb_id.rstrip(';'))
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            break
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # The handle ran out before a record terminator was seen.
        return None
    # The loop is only left through "break", which requires a record.
    return record
302