Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14   
 15      - Record             Holds SwissProt data. 
 16      - Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19   
 20      - read               Read one SwissProt record 
 21      - parse              Read multiple SwissProt records 
 22   
 23  """ 
 24   
 25  from __future__ import print_function 
 26   
 27  from Bio._py3k import _as_string 
 28   
 29  __docformat__ = "restructuredtext en" 
 30   
31 -class Record(object):
32 """Holds information from a SwissProt record. 33 34 Members: 35 36 - entry_name Name of this entry, e.g. RL1_ECOLI. 37 - data_class Either 'STANDARD' or 'PRELIMINARY'. 38 - molecule_type Type of molecule, 'PRT', 39 - sequence_length Number of residues. 40 41 - accessions List of the accession numbers, e.g. ['P00321'] 42 - created A tuple of (date, release). 43 - sequence_update A tuple of (date, release). 44 - annotation_update A tuple of (date, release). 45 46 - description Free-format description. 47 - gene_name Gene name. See userman.txt for description. 48 - organism The source of the sequence. 49 - organelle The origin of the sequence. 50 - organism_classification The taxonomy classification. List of strings. 51 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 52 - taxonomy_id A list of NCBI taxonomy id's. 53 - host_organism A list of names of the hosts of a virus, if any. 54 - host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 55 - references List of Reference objects. 56 - comments List of strings. 57 - cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 58 - keywords List of the keywords. 59 - features List of tuples (key name, from, to, description). 60 from and to can be either integers for the residue 61 numbers, '<', '>', or '?' 62 63 - seqinfo tuple of (length, molecular weight, CRC32 value) 64 - sequence The sequence. 65 66 """
67 - def __init__(self):
68 self.entry_name = None 69 self.data_class = None 70 self.molecule_type = None 71 self.sequence_length = None 72 73 self.accessions = [] 74 self.created = None 75 self.sequence_update = None 76 self.annotation_update = None 77 78 self.description = [] 79 self.gene_name = '' 80 self.organism = [] 81 self.organelle = '' 82 self.organism_classification = [] 83 self.taxonomy_id = [] 84 self.host_organism = [] 85 self.host_taxonomy_id = [] 86 self.references = [] 87 self.comments = [] 88 self.cross_references = [] 89 self.keywords = [] 90 self.features = [] 91 92 self.seqinfo = None 93 self.sequence = ''
94 95
96 -class Reference(object):
97 """Holds information from one reference in a SwissProt entry. 98 99 Members: 100 number Number of reference in an entry. 101 positions Describes extent of work. List of strings. 102 comments Comments. List of (token, text). 103 references References. List of (dbname, identifier). 104 authors The authors of the work. 105 title Title of the work. 106 location A citation for the work. 107 108 """
109 - def __init__(self):
110 self.number = None 111 self.positions = [] 112 self.comments = [] 113 self.references = [] 114 self.authors = [] 115 self.title = [] 116 self.location = []
117 118
119 -def parse(handle):
120 while True: 121 record = _read(handle) 122 if not record: 123 return 124 yield record
125 126
127 -def read(handle):
128 record = _read(handle) 129 if not record: 130 raise ValueError("No SwissProt record found") 131 # We should have reached the end of the record by now 132 remainder = handle.read() 133 if remainder: 134 raise ValueError("More than one SwissProt record found") 135 return record
136 137 138 # Everything below is considered private 139 140
141 -def _read(handle):
142 record = None 143 unread = "" 144 for line in handle: 145 # This is for Python 3 to cope with a binary handle (byte strings), 146 # or a text handle (unicode strings): 147 line = _as_string(line) 148 key, value = line[:2], line[5:].rstrip() 149 if unread: 150 value = unread + " " + value 151 unread = "" 152 if key == '**': 153 # See Bug 2353, some files from the EBI have extra lines 154 # starting "**" (two asterisks/stars). They appear 155 # to be unofficial automated annotations. e.g. 156 # ** 157 # ** ################# INTERNAL SECTION ################## 158 # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 159 pass 160 elif key == 'ID': 161 record = Record() 162 _read_id(record, line) 163 _sequence_lines = [] 164 elif key == 'AC': 165 accessions = [word for word in value.rstrip(";").split("; ")] 166 record.accessions.extend(accessions) 167 elif key == 'DT': 168 _read_dt(record, line) 169 elif key == 'DE': 170 record.description.append(value.strip()) 171 elif key == 'GN': 172 if record.gene_name: 173 record.gene_name += " " 174 record.gene_name += value 175 elif key == 'OS': 176 record.organism.append(value) 177 elif key == 'OG': 178 record.organelle += line[5:] 179 elif key == 'OC': 180 cols = [col for col in value.rstrip(";.").split("; ")] 181 record.organism_classification.extend(cols) 182 elif key == 'OX': 183 _read_ox(record, line) 184 elif key == 'OH': 185 _read_oh(record, line) 186 elif key == 'RN': 187 reference = Reference() 188 _read_rn(reference, value) 189 record.references.append(reference) 190 elif key == 'RP': 191 assert record.references, "RP: missing RN" 192 record.references[-1].positions.append(value) 193 elif key == 'RC': 194 assert record.references, "RC: missing RN" 195 reference = record.references[-1] 196 unread = _read_rc(reference, value) 197 elif key == 'RX': 198 assert record.references, "RX: missing RN" 199 reference = record.references[-1] 200 _read_rx(reference, value) 201 elif key == 'RL': 202 assert record.references, "RL: missing RN" 203 reference = record.references[-1] 204 reference.location.append(value) 205 # In UniProt release 1.12 of 6/21/04, there is a new RG 206 # (Reference Group) line, which references a group instead of 207 # an author. Each block must have at least 1 RA or RG line. 208 elif key == 'RA': 209 assert record.references, "RA: missing RN" 210 reference = record.references[-1] 211 reference.authors.append(value) 212 elif key == 'RG': 213 assert record.references, "RG: missing RN" 214 reference = record.references[-1] 215 reference.authors.append(value) 216 elif key == "RT": 217 assert record.references, "RT: missing RN" 218 reference = record.references[-1] 219 reference.title.append(value) 220 elif key == 'CC': 221 _read_cc(record, line) 222 elif key == 'DR': 223 _read_dr(record, value) 224 elif key == 'PE': 225 # TODO - Record this information? 226 pass 227 elif key == 'KW': 228 _read_kw(record, value) 229 elif key == 'FT': 230 _read_ft(record, line) 231 elif key == 'SQ': 232 cols = value.split() 233 assert len(cols) == 7, "I don't understand SQ line %s" % line 234 # Do more checking here? 235 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 236 elif key == ' ': 237 _sequence_lines.append(value.replace(" ", "").rstrip()) 238 elif key == '//': 239 # Join multiline data into one string 240 record.description = " ".join(record.description) 241 record.organism = " ".join(record.organism) 242 record.organelle = record.organelle.rstrip() 243 for reference in record.references: 244 reference.authors = " ".join(reference.authors).rstrip(";") 245 reference.title = " ".join(reference.title).rstrip(";") 246 if reference.title.startswith('"') and reference.title.endswith('"'): 247 reference.title = reference.title[1:-1] # remove quotes 248 reference.location = " ".join(reference.location) 249 record.sequence = "".join(_sequence_lines) 250 return record 251 else: 252 raise ValueError("Unknown keyword '%s' found" % key) 253 if record: 254 raise ValueError("Unexpected end of stream.")
255 256
257 -def _read_id(record, line):
258 cols = line[5:].split() 259 # Prior to release 51, included with MoleculeType: 260 # ID EntryName DataClass; MoleculeType; SequenceLength AA. 261 # 262 # Newer files lack the MoleculeType: 263 # ID EntryName DataClass; SequenceLength AA. 264 if len(cols) == 5: 265 record.entry_name = cols[0] 266 record.data_class = cols[1].rstrip(";") 267 record.molecule_type = cols[2].rstrip(";") 268 record.sequence_length = int(cols[3]) 269 elif len(cols) == 4: 270 record.entry_name = cols[0] 271 record.data_class = cols[1].rstrip(";") 272 record.molecule_type = None 273 record.sequence_length = int(cols[2]) 274 else: 275 raise ValueError("ID line has unrecognised format:\n" + line) 276 # check if the data class is one of the allowed values 277 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 278 if record.data_class not in allowed: 279 raise ValueError("Unrecognized data class %s in line\n%s" % 280 (record.data_class, line)) 281 # molecule_type should be 'PRT' for PRoTein 282 # Note that has been removed in recent releases (set to None) 283 if record.molecule_type not in (None, 'PRT'): 284 raise ValueError("Unrecognized molecule type %s in line\n%s" % 285 (record.molecule_type, line))
286 287
288 -def _read_dt(record, line):
289 value = line[5:] 290 uprline = value.upper() 291 cols = value.rstrip().split() 292 if 'CREATED' in uprline \ 293 or 'LAST SEQUENCE UPDATE' in uprline \ 294 or 'LAST ANNOTATION UPDATE' in uprline: 295 # Old style DT line 296 # ================= 297 # e.g. 298 # DT 01-FEB-1995 (Rel. 31, Created) 299 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 300 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 301 # 302 # or: 303 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 304 # ... 305 306 # find where the version information will be located 307 # This is needed for when you have cases like IPI where 308 # the release verison is in a different spot: 309 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 310 uprcols = uprline.split() 311 rel_index = -1 312 for index in range(len(uprcols)): 313 if 'REL.' in uprcols[index]: 314 rel_index = index 315 assert rel_index >= 0, \ 316 "Could not find Rel. in DT line: %s" % line 317 version_index = rel_index + 1 318 # get the version information 319 str_version = cols[version_index].rstrip(",") 320 # no version number 321 if str_version == '': 322 version = 0 323 # dot versioned 324 elif '.' in str_version: 325 version = str_version 326 # integer versioned 327 else: 328 version = int(str_version) 329 date = cols[0] 330 331 if 'CREATED' in uprline: 332 record.created = date, version 333 elif 'LAST SEQUENCE UPDATE' in uprline: 334 record.sequence_update = date, version 335 elif 'LAST ANNOTATION UPDATE' in uprline: 336 record.annotation_update = date, version 337 else: 338 assert False, "Shouldn't reach this line!" 339 elif 'INTEGRATED INTO' in uprline \ 340 or 'SEQUENCE VERSION' in uprline \ 341 or 'ENTRY VERSION' in uprline: 342 # New style DT line 343 # ================= 344 # As of UniProt Knowledgebase release 7.0 (including 345 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 346 # format of the DT lines and the version information 347 # in them was changed - the release number was dropped. 348 # 349 # For more information see bug 1948 and 350 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 351 # 352 # e.g. 353 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 354 # DT 15-OCT-2001, sequence version 3. 355 # DT 01-APR-2004, entry version 14. 356 # 357 # This is a new style DT line... 358 359 # The date should be in string cols[1] 360 # Get the version number if there is one. 361 # For the three DT lines above: 0, 3, 14 362 try: 363 version = int(cols[-1]) 364 except ValueError: 365 version = 0 366 date = cols[0].rstrip(",") 367 368 # Re-use the historical property names, even though 369 # the meaning has changed slighty: 370 if "INTEGRATED" in uprline: 371 record.created = date, version 372 elif 'SEQUENCE VERSION' in uprline: 373 record.sequence_update = date, version 374 elif 'ENTRY VERSION' in uprline: 375 record.annotation_update = date, version 376 else: 377 assert False, "Shouldn't reach this line!" 378 else: 379 raise ValueError("I don't understand the date line %s" % line)
380 381
382 -def _read_ox(record, line):
383 # The OX line used to be in the simple format: 384 # OX DESCRIPTION=ID[, ID]...; 385 # If there are too many id's to fit onto a line, then the ID's 386 # continue directly onto the next line, e.g. 387 # OX DESCRIPTION=ID[, ID]... 388 # OX ID[, ID]...; 389 # Currently, the description is always "NCBI_TaxID". 390 # To parse this, I need to check to see whether I'm at the 391 # first line. If I am, grab the description and make sure 392 # it's an NCBI ID. Then, grab all the id's. 393 # 394 # As of the 2014-10-01 release, there may be an evidence code, e.g. 395 # OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1}; 396 # In the short term, we will ignore any evidence codes: 397 line = line.split('{')[0] 398 if record.taxonomy_id: 399 ids = line[5:].rstrip().rstrip(";") 400 else: 401 descr, ids = line[5:].rstrip().rstrip(";").split("=") 402 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 403 record.taxonomy_id.extend(ids.split(', '))
404 405
406 -def _read_oh(record, line):
407 # Line type OH (Organism Host) for viral hosts 408 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 409 line = line[16:].rstrip() 410 assert line[-1] == "." and line.count(";") == 1, line 411 taxid, name = line[:-1].split(";") 412 record.host_taxonomy_id.append(taxid.strip()) 413 record.host_organism.append(name.strip())
414 415
416 -def _read_rn(reference, rn):
417 # This used to be a very simple line with a reference number, e.g. 418 # RN [1] 419 # As of the 2014-10-01 release, there may be an evidence code, e.g. 420 # RN [1] {ECO:0000313|EMBL:AEX14553.1} 421 # We will for now ignore this 422 rn = rn.split()[0] 423 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 424 reference.number = int(rn[1:-1])
425 426
427 -def _read_rc(reference, value):
428 cols = value.split(';') 429 if value[-1] == ';': 430 unread = "" 431 else: 432 cols, unread = cols[:-1], cols[-1] 433 for col in cols: 434 if not col: # last column will be the empty string 435 return 436 # The token is everything before the first '=' character. 437 i = col.find("=") 438 if i >= 0: 439 token, text = col[:i], col[i + 1:] 440 comment = token.lstrip(), text 441 reference.comments.append(comment) 442 else: 443 comment = reference.comments[-1] 444 comment = "%s %s" % (comment, col) 445 reference.comments[-1] = comment 446 return unread
447 448
449 -def _read_rx(reference, value):
450 # The basic (older?) RX line is of the form: 451 # RX MEDLINE; 85132727. 452 # but there are variants of this that need to be dealt with (see below) 453 454 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 455 # have extraneous information in the RX line. Check for 456 # this and chop it out of the line. 457 # (noticed by katel@worldpath.net) 458 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 459 460 # RX lines can also be used of the form 461 # RX PubMed=9603189; 462 # reported by edvard@farmasi.uit.no 463 # and these can be more complicated like: 464 # RX MEDLINE=95385798; PubMed=7656980; 465 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 466 # We look for these cases first and deal with them 467 warn = False 468 if "=" in value: 469 cols = value.split("; ") 470 cols = [x.strip() for x in cols] 471 cols = [x for x in cols if x] 472 for col in cols: 473 x = col.split("=") 474 if len(x) != 2 or x == ("DOI", "DOI"): 475 warn = True 476 break 477 assert len(x) == 2, "I don't understand RX line %s" % value 478 reference.references.append((x[0], x[1].rstrip(";"))) 479 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 480 else: 481 cols = value.split("; ") 482 # normally we split into the three parts 483 if len(cols) != 2: 484 warn = True 485 else: 486 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 487 if warn: 488 import warnings 489 from Bio import BiopythonParserWarning 490 warnings.warn("Possibly corrupt RX line %r" % value, 491 BiopythonParserWarning)
492 493
494 -def _read_cc(record, line):
495 key, value = line[5:8], line[9:].rstrip() 496 if key == '-!-': # Make a new comment 497 record.comments.append(value) 498 elif key == ' ': # add to the previous comment 499 if not record.comments: 500 # TCMO_STRGA in Release 37 has comment with no topic 501 record.comments.append(value) 502 else: 503 record.comments[-1] += " " + value
504 505
506 -def _read_dr(record, value):
507 cols = value.rstrip(".").split('; ') 508 record.cross_references.append(tuple(cols))
509 510
511 -def _read_kw(record, value):
512 # Old style - semi-colon separated, multi-line. e.g. Q13639.txt 513 # KW Alternative splicing; Cell membrane; Complete proteome; 514 # KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein; 515 # KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer; 516 # KW Transmembrane. 517 # 518 # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt 519 # KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1}; 520 # KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}. 521 # For now to match the XML parser, drop the evidence codes. 522 for value in value.rstrip(";.").split('; '): 523 if value.endswith("}"): 524 # Discard the evidence code 525 value = value.rsplit("{", 1)[0] 526 record.keywords.append(value.strip())
527 528
529 -def _read_ft(record, line):
530 line = line[5:] # get rid of junk in front 531 name = line[0:8].rstrip() 532 try: 533 from_res = int(line[9:15]) 534 except ValueError: 535 from_res = line[9:15].lstrip() 536 try: 537 to_res = int(line[16:22]) 538 except ValueError: 539 to_res = line[16:22].lstrip() 540 # if there is a feature_id (FTId), store it away 541 if line[29:35] == r"/FTId=": 542 ft_id = line[35:70].rstrip()[:-1] 543 description = "" 544 else: 545 ft_id = "" 546 description = line[29:70].rstrip() 547 if not name: # is continuation of last one 548 assert not from_res and not to_res 549 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 550 del record.features[-1] 551 description = ("%s %s" % (old_description, description)).strip() 552 553 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 554 if name == "VARSPLIC": 555 # Remove unwanted spaces in sequences. 556 # During line carryover, the sequences in VARSPLIC can get mangled 557 # with unwanted spaces like: 558 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 559 # We want to check for this case and correct it as it happens. 560 descr_cols = description.split(" -> ") 561 if len(descr_cols) == 2: 562 first_seq, second_seq = descr_cols 563 extra_info = '' 564 # we might have more information at the end of the 565 # second sequence, which should be in parenthesis 566 extra_info_pos = second_seq.find(" (") 567 if extra_info_pos != -1: 568 extra_info = second_seq[extra_info_pos:] 569 second_seq = second_seq[:extra_info_pos] 570 # now clean spaces out of the first and second string 571 first_seq = first_seq.replace(" ", "") 572 second_seq = second_seq.replace(" ", "") 573 # reassemble the description 574 description = first_seq + " -> " + second_seq + extra_info 575 record.features.append((name, from_res, to_res, description, ft_id))
576 577 578 if __name__ == "__main__": 579 print("Quick self test...") 580 581 example_filename = "../../Tests/SwissProt/sp008" 582 583 import os 584 if not os.path.isfile(example_filename): 585 print("Missing test file %s" % example_filename) 586 else: 587 # Try parsing it! 588 589 with open(example_filename) as handle: 590 records = parse(handle) 591 for record in records: 592 print(record.entry_name) 593 print(",".join(record.accessions)) 594 print(record.keywords) 595 print(repr(record.organism)) 596 print(record.sequence[:20] + "...") 597