Package Bio :: Package SeqIO :: Module SwissIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SwissIO

  1  # Copyright 2006-2010 by Peter Cock and Michiel de Hoon. 
  2  # All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format. 
  9   
 10  You are expected to use this module via the Bio.SeqIO functions. 
 11  See also the Bio.SwissProt module which offers more than just accessing 
 12  the sequences as SeqRecord objects. 
 13   
 14  See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format. 
 15  """ 
 16   
 17  from __future__ import print_function 
 18   
 19  from Bio import Seq 
 20  from Bio import SeqRecord 
 21  from Bio import Alphabet 
 22  from Bio import SeqFeature 
 23  from Bio import SwissProt 
 24   
 25   
26 -def _make_position(location_string, offset=0):
27 """Turn a Swiss location position into a SeqFeature position object (PRIVATE). 28 29 An offset of -1 is used with a start location to make it pythonic. 30 """ 31 if location_string == "?": 32 return SeqFeature.UnknownPosition() 33 #Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0. 34 try: 35 return SeqFeature.ExactPosition(max(0, offset + int(location_string))) 36 except ValueError: 37 pass 38 if location_string.startswith("<"): 39 try: 40 return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:]))) 41 except ValueError: 42 pass 43 elif location_string.startswith(">"): # e.g. ">13" 44 try: 45 return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:]))) 46 except ValueError: 47 pass 48 elif location_string.startswith("?"): # e.g. "?22" 49 try: 50 return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:]))) 51 except ValueError: 52 pass 53 raise NotImplementedError("Cannot parse location '%s'" % location_string)
54 55
56 -def _make_seqfeature(name, from_res, to_res, description, ft_id):
57 """Construct SeqFeature from feature data from parser (PRIVATE).""" 58 loc = SeqFeature.FeatureLocation(_make_position(from_res, -1), 59 _make_position(to_res, 0)) 60 if not ft_id: 61 ft_id = "<unknown id>" # The default in SeqFeature object 62 return SeqFeature.SeqFeature(loc, type=name, id=ft_id, 63 qualifiers={"description": description})
64 65
66 -def SwissIterator(handle):
67 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects. 68 69 Every section from the ID line to the terminating // becomes 70 a single SeqRecord with associated annotation and features. 71 72 This parser is for the flat file "swiss" format as used by: 73 * Swiss-Prot aka SwissProt 74 * TrEMBL 75 * UniProtKB aka UniProt Knowledgebase 76 77 For consistency with BioPerl and EMBOSS we call this the "swiss" 78 format. See also the SeqIO support for "uniprot-xml" format. 79 """ 80 swiss_records = SwissProt.parse(handle) 81 for swiss_record in swiss_records: 82 # Convert the SwissProt record to a SeqRecord 83 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein) 84 record = SeqRecord.SeqRecord(seq, 85 id=swiss_record.accessions[0], 86 name=swiss_record.entry_name, 87 description=swiss_record.description, 88 features=[_make_seqfeature(*f) for f 89 in swiss_record.features], 90 ) 91 record.description = swiss_record.description 92 for cross_reference in swiss_record.cross_references: 93 if len(cross_reference) < 2: 94 continue 95 database, accession = cross_reference[:2] 96 dbxref = "%s:%s" % (database, accession) 97 if not dbxref in record.dbxrefs: 98 record.dbxrefs.append(dbxref) 99 annotations = record.annotations 100 annotations['accessions'] = swiss_record.accessions 101 if swiss_record.created: 102 annotations['date'] = swiss_record.created[0] 103 if swiss_record.sequence_update: 104 annotations[ 105 'date_last_sequence_update'] = swiss_record.sequence_update[0] 106 if swiss_record.annotation_update: 107 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0] 108 if swiss_record.gene_name: 109 annotations['gene_name'] = swiss_record.gene_name 110 annotations['organism'] = swiss_record.organism.rstrip(".") 111 annotations['taxonomy'] = swiss_record.organism_classification 112 annotations['ncbi_taxid'] = swiss_record.taxonomy_id 113 if swiss_record.host_organism: 114 annotations['organism_host'] = swiss_record.host_organism 115 if swiss_record.host_taxonomy_id: 116 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id 117 if swiss_record.comments: 118 annotations['comment'] = "\n".join(swiss_record.comments) 119 if swiss_record.references: 120 annotations['references'] = [] 121 for reference in swiss_record.references: 122 feature = SeqFeature.Reference() 123 feature.comment = " ".join("%s=%s;" % k_v for k_v in reference.comments) 124 for key, value in reference.references: 125 if key == 'PubMed': 126 feature.pubmed_id = value 127 elif key == 'MEDLINE': 128 feature.medline_id = value 129 elif key == 'DOI': 130 pass 131 elif key == 'AGRICOLA': 132 pass 133 else: 134 raise ValueError( 135 "Unknown key %s found in references" % key) 136 feature.authors = reference.authors 137 feature.title = reference.title 138 feature.journal = reference.location 139 annotations['references'].append(feature) 140 if swiss_record.keywords: 141 record.annotations['keywords'] = swiss_record.keywords 142 yield record
143 144 if __name__ == "__main__": 145 print("Quick self test...") 146 147 example_filename = "../../Tests/SwissProt/sp008" 148 149 import os 150 if not os.path.isfile(example_filename): 151 print("Missing test file %s" % example_filename) 152 else: 153 #Try parsing it! 154 with open(example_filename) as handle: 155 records = SwissIterator(handle) 156 for record in records: 157 print(record.name) 158 print(record.id) 159 print(record.annotations['keywords']) 160 print(repr(record.annotations['organism'])) 161 print(str(record.seq)[:20] + "...") 162 for f in record.features: 163 print(f) 164