Package Bio :: Package SeqIO :: Module SwissIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SwissIO

  1  # Copyright 2006-2010 by Peter Cock and Michiel de Hoon. 
  2  # All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format. 
  9   
 10  You are expected to use this module via the Bio.SeqIO functions. 
 11  See also the Bio.SwissProt module which offers more than just accessing 
 12  the sequences as SeqRecord objects. 
 13   
 14  See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format. 
 15  """ 
 16   
 17  from Bio import Seq 
 18  from Bio import SeqRecord 
 19  from Bio import Alphabet 
 20  from Bio import SeqFeature 
 21  from Bio import SwissProt 
 22   
 23   
24 -def _make_position(location_string, offset=0):
25 """Turn a Swiss location position into a SeqFeature position object (PRIVATE). 26 27 An offset of -1 is used with a start location to make it pythonic. 28 """ 29 if location_string == "?": 30 return SeqFeature.UnknownPosition() 31 #Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0. 32 try: 33 return SeqFeature.ExactPosition(max(0, offset + int(location_string))) 34 except ValueError: 35 pass 36 if location_string.startswith("<"): 37 try: 38 return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:]))) 39 except ValueError: 40 pass 41 elif location_string.startswith(">"): # e.g. ">13" 42 try: 43 return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:]))) 44 except ValueError: 45 pass 46 elif location_string.startswith("?"): # e.g. "?22" 47 try: 48 return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:]))) 49 except ValueError: 50 pass 51 raise NotImplementedError("Cannot parse location '%s'" % location_string)
52 53
54 -def _make_seqfeature(name, from_res, to_res, description, ft_id):
55 """Construct SeqFeature from feature data from parser (PRIVATE).""" 56 loc = SeqFeature.FeatureLocation(_make_position(from_res, -1), 57 _make_position(to_res, 0)) 58 if not ft_id: 59 ft_id = "<unknown id>" # The default in SeqFeature object 60 return SeqFeature.SeqFeature(loc, type=name, id=ft_id, 61 qualifiers={"description": description})
62 63
64 -def SwissIterator(handle):
65 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects. 66 67 Every section from the ID line to the terminating // becomes 68 a single SeqRecord with associated annotation and features. 69 70 This parser is for the flat file "swiss" format as used by: 71 * Swiss-Prot aka SwissProt 72 * TrEMBL 73 * UniProtKB aka UniProt Knowledgebase 74 75 For consistency with BioPerl and EMBOSS we call this the "swiss" 76 format. See also the SeqIO support for "uniprot-xml" format. 77 """ 78 swiss_records = SwissProt.parse(handle) 79 for swiss_record in swiss_records: 80 # Convert the SwissProt record to a SeqRecord 81 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein) 82 record = SeqRecord.SeqRecord(seq, 83 id=swiss_record.accessions[0], 84 name=swiss_record.entry_name, 85 description=swiss_record.description, 86 features=[_make_seqfeature(*f) for f 87 in swiss_record.features], 88 ) 89 record.description = swiss_record.description 90 for cross_reference in swiss_record.cross_references: 91 if len(cross_reference) < 2: 92 continue 93 database, accession = cross_reference[:2] 94 dbxref = "%s:%s" % (database, accession) 95 if not dbxref in record.dbxrefs: 96 record.dbxrefs.append(dbxref) 97 annotations = record.annotations 98 annotations['accessions'] = swiss_record.accessions 99 annotations['date'] = swiss_record.created[0] 100 annotations[ 101 'date_last_sequence_update'] = swiss_record.sequence_update[0] 102 if swiss_record.annotation_update: 103 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0] 104 if swiss_record.gene_name: 105 annotations['gene_name'] = swiss_record.gene_name 106 annotations['organism'] = swiss_record.organism.rstrip(".") 107 annotations['taxonomy'] = swiss_record.organism_classification 108 annotations['ncbi_taxid'] = swiss_record.taxonomy_id 109 if swiss_record.host_organism: 110 annotations['organism_host'] = swiss_record.host_organism 111 if swiss_record.host_taxonomy_id: 112 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id 113 if swiss_record.comments: 114 annotations['comment'] = "\n".join(swiss_record.comments) 115 if swiss_record.references: 116 annotations['references'] = [] 117 for reference in swiss_record.references: 118 feature = SeqFeature.Reference() 119 feature.comment = " ".join(["%s=%s;" % (key, value) 120 for key, value 121 in reference.comments]) 122 for key, value in reference.references: 123 if key == 'PubMed': 124 feature.pubmed_id = value 125 elif key == 'MEDLINE': 126 feature.medline_id = value 127 elif key == 'DOI': 128 pass 129 elif key == 'AGRICOLA': 130 pass 131 else: 132 raise ValueError( 133 "Unknown key %s found in references" % key) 134 feature.authors = reference.authors 135 feature.title = reference.title 136 feature.journal = reference.location 137 annotations['references'].append(feature) 138 if swiss_record.keywords: 139 record.annotations['keywords'] = swiss_record.keywords 140 yield record
141 142 if __name__ == "__main__": 143 print "Quick self test..." 144 145 example_filename = "../../Tests/SwissProt/sp008" 146 147 import os 148 if not os.path.isfile(example_filename): 149 print "Missing test file %s" % example_filename 150 else: 151 #Try parsing it! 152 handle = open(example_filename) 153 records = SwissIterator(handle) 154 for record in records: 155 print record.name 156 print record.id 157 print record.annotations['keywords'] 158 print repr(record.annotations['organism']) 159 print str(record.seq)[:20] + "..." 160 for f in record.features: 161 print f 162 handle.close() 163