Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code to work with the sprotXX.dat file from 
  7  SwissProt. 
  8  http://www.expasy.ch/sprot/sprot-top.html 
  9   
 10  Tested with: 
 11  Release 56.9, 03-March-2009. 
 12   
 13   
 14  Classes: 
 15  Record             Holds SwissProt data. 
 16  Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19  read               Read one SwissProt record 
 20  parse              Read multiple SwissProt records 
 21   
 22  """ 
 23   
 24  from Bio._py3k import _as_string 
 25   
26 -class Record(object):
27 """Holds information from a SwissProt record. 28 29 Members: 30 entry_name Name of this entry, e.g. RL1_ECOLI. 31 data_class Either 'STANDARD' or 'PRELIMINARY'. 32 molecule_type Type of molecule, 'PRT', 33 sequence_length Number of residues. 34 35 accessions List of the accession numbers, e.g. ['P00321'] 36 created A tuple of (date, release). 37 sequence_update A tuple of (date, release). 38 annotation_update A tuple of (date, release). 39 40 description Free-format description. 41 gene_name Gene name. See userman.txt for description. 42 organism The source of the sequence. 43 organelle The origin of the sequence. 44 organism_classification The taxonomy classification. List of strings. 45 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 46 taxonomy_id A list of NCBI taxonomy id's. 47 host_organism A list of names of the hosts of a virus, if any. 48 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 49 references List of Reference objects. 50 comments List of strings. 51 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 52 keywords List of the keywords. 53 features List of tuples (key name, from, to, description). 54 from and to can be either integers for the residue 55 numbers, '<', '>', or '?' 56 57 seqinfo tuple of (length, molecular weight, CRC32 value) 58 sequence The sequence. 59 60 """
61 - def __init__(self):
62 self.entry_name = None 63 self.data_class = None 64 self.molecule_type = None 65 self.sequence_length = None 66 67 self.accessions = [] 68 self.created = None 69 self.sequence_update = None 70 self.annotation_update = None 71 72 self.description = [] 73 self.gene_name = '' 74 self.organism = [] 75 self.organelle = '' 76 self.organism_classification = [] 77 self.taxonomy_id = [] 78 self.host_organism = [] 79 self.host_taxonomy_id = [] 80 self.references = [] 81 self.comments = [] 82 self.cross_references = [] 83 self.keywords = [] 84 self.features = [] 85 86 self.seqinfo = None 87 self.sequence = ''
88 89
90 -class Reference(object):
91 """Holds information from one reference in a SwissProt entry. 92 93 Members: 94 number Number of reference in an entry. 95 positions Describes extent of work. list of strings. 96 comments Comments. List of (token, text). 97 references References. List of (dbname, identifier) 98 authors The authors of the work. 99 title Title of the work. 100 location A citation for the work. 101 102 """
103 - def __init__(self):
104 self.number = None 105 self.positions = [] 106 self.comments = [] 107 self.references = [] 108 self.authors = [] 109 self.title = [] 110 self.location = []
111 112
113 -def parse(handle):
114 while True: 115 record = _read(handle) 116 if not record: 117 return 118 yield record
119 120
121 -def read(handle):
122 record = _read(handle) 123 if not record: 124 raise ValueError("No SwissProt record found") 125 # We should have reached the end of the record by now 126 remainder = handle.read() 127 if remainder: 128 raise ValueError("More than one SwissProt record found") 129 return record
130 131 132 # Everything below is considered private 133 134
135 -def _read(handle):
136 record = None 137 unread = "" 138 for line in handle: 139 #This is for Python 3 to cope with a binary handle (byte strings), 140 #or a text handle (unicode strings): 141 line = _as_string(line) 142 key, value = line[:2], line[5:].rstrip() 143 if unread: 144 value = unread + " " + value 145 unread = "" 146 if key=='**': 147 #See Bug 2353, some files from the EBI have extra lines 148 #starting "**" (two asterisks/stars). They appear 149 #to be unofficial automated annotations. e.g. 150 #** 151 #** ################# INTERNAL SECTION ################## 152 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 153 pass 154 elif key=='ID': 155 record = Record() 156 _read_id(record, line) 157 _sequence_lines = [] 158 elif key=='AC': 159 accessions = [word for word in value.rstrip(";").split("; ")] 160 record.accessions.extend(accessions) 161 elif key=='DT': 162 _read_dt(record, line) 163 elif key=='DE': 164 record.description.append(value.strip()) 165 elif key=='GN': 166 if record.gene_name: 167 record.gene_name += " " 168 record.gene_name += value 169 elif key=='OS': 170 record.organism.append(value) 171 elif key=='OG': 172 record.organelle += line[5:] 173 elif key=='OC': 174 cols = [col for col in value.rstrip(";.").split("; ")] 175 record.organism_classification.extend(cols) 176 elif key=='OX': 177 _read_ox(record, line) 178 elif key=='OH': 179 _read_oh(record, line) 180 elif key=='RN': 181 reference = Reference() 182 _read_rn(reference, value) 183 record.references.append(reference) 184 elif key=='RP': 185 assert record.references, "RP: missing RN" 186 record.references[-1].positions.append(value) 187 elif key=='RC': 188 assert record.references, "RC: missing RN" 189 reference = record.references[-1] 190 unread = _read_rc(reference, value) 191 elif key=='RX': 192 assert record.references, "RX: missing RN" 193 reference = record.references[-1] 194 _read_rx(reference, value) 195 elif key=='RL': 196 assert record.references, "RL: missing RN" 197 reference = record.references[-1] 198 reference.location.append(value) 199 # In UniProt release 1.12 of 6/21/04, there is a new RG 200 # (Reference Group) line, which references a group instead of 201 # an author. Each block must have at least 1 RA or RG line. 202 elif key=='RA': 203 assert record.references, "RA: missing RN" 204 reference = record.references[-1] 205 reference.authors.append(value) 206 elif key=='RG': 207 assert record.references, "RG: missing RN" 208 reference = record.references[-1] 209 reference.authors.append(value) 210 elif key=="RT": 211 assert record.references, "RT: missing RN" 212 reference = record.references[-1] 213 reference.title.append(value) 214 elif key=='CC': 215 _read_cc(record, line) 216 elif key=='DR': 217 _read_dr(record, value) 218 elif key=='PE': 219 #TODO - Record this information? 220 pass 221 elif key=='KW': 222 cols = value.rstrip(";.").split('; ') 223 record.keywords.extend(cols) 224 elif key=='FT': 225 _read_ft(record, line) 226 elif key=='SQ': 227 cols = value.split() 228 assert len(cols) == 7, "I don't understand SQ line %s" % line 229 # Do more checking here? 230 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 231 elif key==' ': 232 _sequence_lines.append(value.replace(" ", "").rstrip()) 233 elif key=='//': 234 # Join multiline data into one string 235 record.description = " ".join(record.description) 236 record.organism = " ".join(record.organism) 237 record.organelle = record.organelle.rstrip() 238 for reference in record.references: 239 reference.authors = " ".join(reference.authors).rstrip(";") 240 reference.title = " ".join(reference.title).rstrip(";") 241 if reference.title.startswith('"') and reference.title.endswith('"'): 242 reference.title = reference.title[1:-1] #remove quotes 243 reference.location = " ".join(reference.location) 244 record.sequence = "".join(_sequence_lines) 245 return record 246 else: 247 raise ValueError("Unknown keyword '%s' found" % key) 248 if record: 249 raise ValueError("Unexpected end of stream.")
250 251
252 -def _read_id(record, line):
253 cols = line[5:].split() 254 #Prior to release 51, included with MoleculeType: 255 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 256 # 257 #Newer files lack the MoleculeType: 258 #ID EntryName DataClass; SequenceLength AA. 259 if len(cols) == 5: 260 record.entry_name = cols[0] 261 record.data_class = cols[1].rstrip(";") 262 record.molecule_type = cols[2].rstrip(";") 263 record.sequence_length = int(cols[3]) 264 elif len(cols) == 4: 265 record.entry_name = cols[0] 266 record.data_class = cols[1].rstrip(";") 267 record.molecule_type = None 268 record.sequence_length = int(cols[2]) 269 else: 270 raise ValueError("ID line has unrecognised format:\n"+line) 271 # check if the data class is one of the allowed values 272 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 273 if record.data_class not in allowed: 274 raise ValueError("Unrecognized data class %s in line\n%s" % \ 275 (record.data_class, line)) 276 # molecule_type should be 'PRT' for PRoTein 277 # Note that has been removed in recent releases (set to None) 278 if record.molecule_type not in (None, 'PRT'): 279 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 280 (record.molecule_type, line))
281 282
283 -def _read_dt(record, line):
284 value = line[5:] 285 uprline = value.upper() 286 cols = value.rstrip().split() 287 if 'CREATED' in uprline \ 288 or 'LAST SEQUENCE UPDATE' in uprline \ 289 or 'LAST ANNOTATION UPDATE' in uprline: 290 # Old style DT line 291 # ================= 292 # e.g. 293 # DT 01-FEB-1995 (Rel. 31, Created) 294 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 295 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 296 # 297 # or: 298 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 299 # ... 300 301 # find where the version information will be located 302 # This is needed for when you have cases like IPI where 303 # the release verison is in a different spot: 304 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 305 uprcols = uprline.split() 306 rel_index = -1 307 for index in range(len(uprcols)): 308 if uprcols[index].find("REL.") >= 0: 309 rel_index = index 310 assert rel_index >= 0, \ 311 "Could not find Rel. in DT line: %s" % line 312 version_index = rel_index + 1 313 # get the version information 314 str_version = cols[version_index].rstrip(",") 315 # no version number 316 if str_version == '': 317 version = 0 318 # dot versioned 319 elif str_version.find(".") >= 0: 320 version = str_version 321 # integer versioned 322 else: 323 version = int(str_version) 324 date = cols[0] 325 326 if 'CREATED' in uprline: 327 record.created = date, version 328 elif 'LAST SEQUENCE UPDATE' in uprline: 329 record.sequence_update = date, version 330 elif 'LAST ANNOTATION UPDATE' in uprline: 331 record.annotation_update = date, version 332 else: 333 assert False, "Shouldn't reach this line!" 334 elif 'INTEGRATED INTO' in uprline \ 335 or 'SEQUENCE VERSION' in uprline \ 336 or 'ENTRY VERSION' in uprline: 337 # New style DT line 338 # ================= 339 # As of UniProt Knowledgebase release 7.0 (including 340 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 341 # format of the DT lines and the version information 342 # in them was changed - the release number was dropped. 343 # 344 # For more information see bug 1948 and 345 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 346 # 347 # e.g. 348 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 349 # DT 15-OCT-2001, sequence version 3. 350 # DT 01-APR-2004, entry version 14. 351 # 352 #This is a new style DT line... 353 354 # The date should be in string cols[1] 355 # Get the version number if there is one. 356 # For the three DT lines above: 0, 3, 14 357 try: 358 version = int(cols[-1]) 359 except ValueError: 360 version = 0 361 date = cols[0].rstrip(",") 362 363 # Re-use the historical property names, even though 364 # the meaning has changed slighty: 365 if "INTEGRATED" in uprline: 366 record.created = date, version 367 elif 'SEQUENCE VERSION' in uprline: 368 record.sequence_update = date, version 369 elif 'ENTRY VERSION' in uprline: 370 record.annotation_update = date, version 371 else: 372 assert False, "Shouldn't reach this line!" 373 else: 374 raise ValueError("I don't understand the date line %s" % line)
375 376
377 -def _read_ox(record, line):
378 # The OX line is in the format: 379 # OX DESCRIPTION=ID[, ID]...; 380 # If there are too many id's to fit onto a line, then the ID's 381 # continue directly onto the next line, e.g. 382 # OX DESCRIPTION=ID[, ID]... 383 # OX ID[, ID]...; 384 # Currently, the description is always "NCBI_TaxID". 385 # To parse this, I need to check to see whether I'm at the 386 # first line. If I am, grab the description and make sure 387 # it's an NCBI ID. Then, grab all the id's. 388 if record.taxonomy_id: 389 ids = line[5:].rstrip().rstrip(";") 390 else: 391 descr, ids = line[5:].rstrip().rstrip(";").split("=") 392 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 393 record.taxonomy_id.extend(ids.split(', '))
394 395
396 -def _read_oh(record, line):
397 # Line type OH (Organism Host) for viral hosts 398 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 399 line = line[16:].rstrip() 400 assert line[-1]=="." and line.count(";")==1, line 401 taxid, name = line[:-1].split(";") 402 record.host_taxonomy_id.append(taxid.strip()) 403 record.host_organism.append(name.strip())
404 405
406 -def _read_rn(reference, rn):
407 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 408 reference.number = int(rn[1:-1])
409 410
411 -def _read_rc(reference, value):
412 cols = value.split(';') 413 if value[-1]==';': 414 unread = "" 415 else: 416 cols, unread = cols[:-1], cols[-1] 417 for col in cols: 418 if not col: # last column will be the empty string 419 return 420 # The token is everything before the first '=' character. 421 i = col.find("=") 422 if i>=0: 423 token, text = col[:i], col[i+1:] 424 comment = token.lstrip(), text 425 reference.comments.append(comment) 426 else: 427 comment = reference.comments[-1] 428 comment = "%s %s" % (comment, col) 429 reference.comments[-1] = comment 430 return unread
431 432
433 -def _read_rx(reference, value):
434 # The basic (older?) RX line is of the form: 435 # RX MEDLINE; 85132727. 436 # but there are variants of this that need to be dealt with (see below) 437 438 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 439 # have extraneous information in the RX line. Check for 440 # this and chop it out of the line. 441 # (noticed by katel@worldpath.net) 442 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','') 443 444 # RX lines can also be used of the form 445 # RX PubMed=9603189; 446 # reported by edvard@farmasi.uit.no 447 # and these can be more complicated like: 448 # RX MEDLINE=95385798; PubMed=7656980; 449 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 450 # We look for these cases first and deal with them 451 warn = False 452 if "=" in value: 453 cols = value.split("; ") 454 cols = [x.strip() for x in cols] 455 cols = [x for x in cols if x] 456 for col in cols: 457 x = col.split("=") 458 if len(x) != 2 or x == ("DOI", "DOI"): 459 warn = True 460 break 461 assert len(x) == 2, "I don't understand RX line %s" % value 462 reference.references.append((x[0], x[1].rstrip(";"))) 463 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 464 else: 465 cols = value.split("; ") 466 # normally we split into the three parts 467 if len(cols) != 2: 468 warn = True 469 else: 470 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 471 if warn: 472 import warnings 473 from Bio import BiopythonParserWarning 474 warnings.warn("Possibly corrupt RX line %r" % value, 475 BiopythonParserWarning)
476
477 -def _read_cc(record, line):
478 key, value = line[5:8], line[9:].rstrip() 479 if key=='-!-': # Make a new comment 480 record.comments.append(value) 481 elif key==' ': # add to the previous comment 482 if not record.comments: 483 # TCMO_STRGA in Release 37 has comment with no topic 484 record.comments.append(value) 485 else: 486 record.comments[-1] += " " + value
487 488
489 -def _read_dr(record, value):
490 # Remove the comments at the end of the line 491 i = value.find(' [') 492 if i >= 0: 493 value = value[:i] 494 cols = value.rstrip(".").split('; ') 495 record.cross_references.append(tuple(cols))
496 497
498 -def _read_ft(record, line):
499 line = line[5:] # get rid of junk in front 500 name = line[0:8].rstrip() 501 try: 502 from_res = int(line[9:15]) 503 except ValueError: 504 from_res = line[9:15].lstrip() 505 try: 506 to_res = int(line[16:22]) 507 except ValueError: 508 to_res = line[16:22].lstrip() 509 #if there is a feature_id (FTId), store it away 510 if line[29:35]==r"/FTId=": 511 ft_id = line[35:70].rstrip()[:-1] 512 description = "" 513 else: 514 ft_id ="" 515 description = line[29:70].rstrip() 516 if not name: # is continuation of last one 517 assert not from_res and not to_res 518 name, from_res, to_res, old_description,old_ft_id = record.features[-1] 519 del record.features[-1] 520 description = ("%s %s" % (old_description, description)).strip() 521 522 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 523 if name == "VARSPLIC": 524 # Remove unwanted spaces in sequences. 525 # During line carryover, the sequences in VARSPLIC can get mangled 526 # with unwanted spaces like: 527 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 528 # We want to check for this case and correct it as it happens. 529 descr_cols = description.split(" -> ") 530 if len(descr_cols) == 2: 531 first_seq, second_seq = descr_cols 532 extra_info = '' 533 # we might have more information at the end of the 534 # second sequence, which should be in parenthesis 535 extra_info_pos = second_seq.find(" (") 536 if extra_info_pos != -1: 537 extra_info = second_seq[extra_info_pos:] 538 second_seq = second_seq[:extra_info_pos] 539 # now clean spaces out of the first and second string 540 first_seq = first_seq.replace(" ", "") 541 second_seq = second_seq.replace(" ", "") 542 # reassemble the description 543 description = first_seq + " -> " + second_seq + extra_info 544 record.features.append((name, from_res, to_res, description,ft_id))
545 546 547 if __name__ == "__main__": 548 print "Quick self test..." 549 550 example_filename = "../../Tests/SwissProt/sp008" 551 552 import os 553 if not os.path.isfile(example_filename): 554 print "Missing test file %s" % example_filename 555 else: 556 #Try parsing it! 557 558 handle = open(example_filename) 559 records = parse(handle) 560 for record in records: 561 print record.entry_name 562 print ",".join(record.accessions) 563 print record.keywords 564 print repr(record.organism) 565 print record.sequence[:20] + "..." 566 handle.close() 567