Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames | no frames]

Source Code for Module Bio.ExPASy.Prosite

# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""
This module provides code to work with the prosite.dat file from
Prosite.
http://www.expasy.ch/prosite/

Tested with:
Release 20.43, 10-Feb-2009


Functions:
read                  Reads a Prosite file containing exactly one Prosite record.
parse                 Iterates over the records in a Prosite file.

Classes:
Record                Holds Prosite data.
"""
 24   
 25   
def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    handle   - handle to the file.

    Records are produced one at a time by the module's private reader;
    iteration stops as soon as that reader returns a false value.
    """
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)
38 39
def read(handle):
    """Return the single record stored in a Prosite file.

    handle   - handle to the file.

    Use this for files that hold exactly one record.  After the record
    has been read the rest of the handle is consumed; a ValueError is
    raised if anything was left over.
    """
    entry = __read(handle)
    # Any unread content means the file held more than one record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return entry
54 55
class Record(object):
    """Holds information from a Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines).  (strings)
    postprocessing List of postprocessing entries (from PP lines).  (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Create an empty record with every documented member present."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These members are documented above but are only assigned when the
        # matching CC qualifier is parsed.  Give them the same empty-string
        # default as the other comment fields so that reading them on a
        # record without the qualifier does not raise AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
# Everything below this line is a private function.
def __read(handle):
    """Read the next Prosite record from handle.

    Lines are consumed one at a time.  The first two characters of a line
    are the keyword and everything from the sixth character on (with
    trailing whitespace removed) is the value.  An 'ID' line starts a new
    Record; a '//' line ends it.  A '//' line seen before any 'ID' line is
    taken to be the end of the copyright statement and is skipped.

    Returns the Record that was read, or None if the handle is exhausted
    before a terminating '//' line is reached.  Note that this includes the
    case of a partially read record at the end of the input: it is dropped
    and None is returned.

    Raises ValueError for an identification or date line that cannot be
    understood, for broken hit counts or an unknown qualifier on an NR
    line, for an unknown qualifier on a CC line, for an unknown type flag
    on a DR line, and for an unknown keyword.
    """
    import re

    # NR qualifiers whose data looks like "hits(seqs)" -> Record attribute.
    hit_fields = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers that are stored verbatim -> Record attribute.
    comment_fields = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    # DR type flags -> Record list attribute.
    reference_fields = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }
    date_labels = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')

    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            # Checking the length first turns a short date line into the
            # same ValueError as a mislabelled one (it used to surface as
            # an IndexError).
            if len(dates) < 3 or not all(
                    date.endswith(label)
                    for date, label in zip(dates, date_labels)):
                raise ValueError("I don't understand date line\n%s" % line)
            # Cut the label off as a suffix.  str.rstrip(chars) treats its
            # argument as a set of characters, so it must not be used to
            # remove a suffix.
            stamps = [date[:-len(label)].rstrip()
                      for date, label in zip(dates, date_labels)]
            record.created, record.data_update, record.info_update = stamps
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in hit_fields:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        # ValueError, like every other parse failure here.
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    setattr(record, hit_fields[qual],
                            tuple(map(int, m.groups())))
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col.startswith('Automatic scaling'):
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if "=" not in col:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC   /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in comment_fields:
                    setattr(record, comment_fields[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in reference_fields:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, reference_fields[flag]).append((acc, name))
        elif keyword == '3D':
            record.pdb_structs.extend(
                pdb_id.rstrip(';') for pdb_id in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record:
                return record
            # No record yet: this '//' closed the copyright statement.
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # The handle ran out before a record terminator was seen.
    return None
296