Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See also: 
 12   
 13  http://www.uniprot.org 
 14   
 15  The UniProt XML format essentially replaces the old plain text file format 
 16  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Bio._py3k import StringIO 
 25   
 26  __docformat__ = "restructuredtext en" 
 27   
 28   
 29  # For speed try to use cElementTree rather than ElementTree 
 30  try: 
 31      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 32          # Workaround for bug in python 3.0 and 3.1, 
 33          # see http://bugs.python.org/issue9257 
 34          from xml.etree import ElementTree as ElementTree 
 35      else: 
 36          from xml.etree import cElementTree as ElementTree 
 37  except ImportError: 
 38      from xml.etree import ElementTree as ElementTree 
 39   
 40  NS = "{http://uniprot.org/uniprot}" 
 41  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 42   
 43   
44 -def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
45 """Generator function to parse UniProt XML as SeqRecord objects. 46 47 parses an XML entry at a time from any UniProt XML file 48 returns a SeqRecord for each iteration 49 50 This generator can be used in Bio.SeqIO 51 52 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 53 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 54 """ 55 if isinstance(alphabet, Alphabet.NucleotideAlphabet): 56 raise ValueError("Wrong alphabet %r" % alphabet) 57 if isinstance(alphabet, Alphabet.Gapped): 58 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet): 59 raise ValueError("Wrong alphabet %r" % alphabet) 60 61 if not hasattr(handle, "read"): 62 if isinstance(handle, str): 63 handle = StringIO(handle) 64 else: 65 raise Exception('An XML-containing handler or an XML string must be passed') 66 67 if ElementTree is None: 68 from Bio import MissingExternalDependencyError 69 raise MissingExternalDependencyError( 70 "No ElementTree module was found. " 71 "Use Python 2.5+, lxml or elementtree if you " 72 "want to use Bio.SeqIO.UniprotIO.") 73 74 for event, elem in ElementTree.iterparse(handle, events=("start", "end")): 75 if event == "end" and elem.tag == NS + "entry": 76 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse() 77 elem.clear()
78 79
80 -class Parser(object):
81 """Parse a UniProt XML entry to a SeqRecord. 82 83 return_raw_comments=True to get back the complete comment field in XML format 84 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet. 85 """
86 - def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
87 self.entry = elem 88 self.alphabet = alphabet 89 self.return_raw_comments = return_raw_comments
90
91 - def parse(self):
92 """Parse the input.""" 93 assert self.entry.tag == NS + 'entry' 94 95 def append_to_annotations(key, value): 96 if key not in self.ParsedSeqRecord.annotations: 97 self.ParsedSeqRecord.annotations[key] = [] 98 if value not in self.ParsedSeqRecord.annotations[key]: 99 self.ParsedSeqRecord.annotations[key].append(value)
100 101 def _parse_name(element): 102 self.ParsedSeqRecord.name = element.text 103 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
104 105 def _parse_accession(element): 106 append_to_annotations('accessions', element.text) # to cope with SwissProt plain text parser 107 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text) 108 109 def _parse_protein(element): 110 """Parse protein names (PRIVATE).""" 111 descr_set = False 112 for protein_element in element: 113 if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']: # recommendedName tag are parsed before 114 # use protein fields for name and description 115 for rec_name in protein_element: 116 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''), 117 rec_name.tag.replace(NS, '')) 118 append_to_annotations(ann_key, rec_name.text) 119 if (rec_name.tag == NS + 'fullName') and not descr_set: 120 self.ParsedSeqRecord.description = rec_name.text 121 descr_set = True 122 elif protein_element.tag == NS + 'component': 123 pass # not parsed 124 elif protein_element.tag == NS + 'domain': 125 pass # not parsed 126 127 def _parse_gene(element): 128 for genename_element in element: 129 if 'type' in genename_element.attrib: 130 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), 131 genename_element.attrib['type']) 132 if genename_element.attrib['type'] == 'primary': 133 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text 134 else: 135 append_to_annotations(ann_key, genename_element.text) 136 137 def _parse_geneLocation(element): 138 append_to_annotations('geneLocation', element.attrib['type']) 139 140 def _parse_organism(element): 141 organism_name = com_name = sci_name = '' 142 for organism_element in element: 143 if organism_element.tag == NS + 'name': 144 if organism_element.text: 145 if organism_element.attrib['type'] == 'scientific': 146 sci_name = organism_element.text 147 elif organism_element.attrib['type'] == 'common': 148 com_name = organism_element.text 149 else: 150 # e.g. synonym 151 append_to_annotations("organism_name", organism_element.text) 152 elif organism_element.tag == NS + 'dbReference': 153 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id']) 154 elif organism_element.tag == NS + 'lineage': 155 for taxon_element in organism_element: 156 if taxon_element.tag == NS + 'taxon': 157 append_to_annotations('taxonomy', taxon_element.text) 158 if sci_name and com_name: 159 organism_name = '%s (%s)' % (sci_name, com_name) 160 elif sci_name: 161 organism_name = sci_name 162 elif com_name: 163 organism_name = com_name 164 self.ParsedSeqRecord.annotations['organism'] = organism_name 165 166 def _parse_organismHost(element): 167 for organism_element in element: 168 if organism_element.tag == NS + 'name': 169 append_to_annotations("organism_host", organism_element.text) 170 171 def _parse_keyword(element): 172 append_to_annotations('keywords', element.text) 173 174 def _parse_comment(element): 175 """Parse comments (PRIVATE). 176 177 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 178 To store all the contained data, more complex data structures are needed, such as 179 annotated dictionaries. This is left to end user, by optionally setting: 180 181 return_raw_comments=True 182 183 The original XML is returned in the annotation fields. 184 185 Available comment types at december 2009: 186 "allergen" 187 "alternative products" 188 "biotechnology" 189 "biophysicochemical properties" 190 "catalytic activity" 191 "caution" 192 "cofactor" 193 "developmental stage" 194 "disease" 195 "domain" 196 "disruption phenotype" 197 "enzyme regulation" 198 "function" 199 "induction" 200 "miscellaneous" 201 "pathway" 202 "pharmaceutical" 203 "polymorphism" 204 "PTM" 205 "RNA editing" 206 "similarity" 207 "subcellular location" 208 "sequence caution" 209 "subunit" 210 "tissue specificity" 211 "toxic dose" 212 "online information" 213 "mass spectrometry" 214 "interaction" 215 """ 216 217 simple_comments = ["allergen", 218 "biotechnology", 219 "biophysicochemical properties", 220 "catalytic activity", 221 "caution", 222 "cofactor", 223 "developmental stage", 224 "disease", 225 "domain", 226 "disruption phenotype", 227 "enzyme regulation", 228 "function", 229 "induction", 230 "miscellaneous", 231 "pathway", 232 "pharmaceutical", 233 "polymorphism", 234 "PTM", 235 "RNA editing", # positions not parsed 236 "similarity", 237 "subunit", 238 "tissue specificity", 239 "toxic dose", 240 ] 241 242 if element.attrib['type'] in simple_comments: 243 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 244 for text_element in element.getiterator(NS + 'text'): 245 if text_element.text: 246 append_to_annotations(ann_key, text_element.text) 247 elif element.attrib['type'] == 'subcellular location': 248 for subloc_element in element.getiterator(NS + 'subcellularLocation'): 249 for el in subloc_element: 250 if el.text: 251 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, '')) 252 append_to_annotations(ann_key, el.text) 253 elif element.attrib['type'] == 'interaction': 254 for interact_element in element.getiterator(NS + 'interactant'): 255 ann_key = 'comment_%s_intactId' % element.attrib['type'] 256 append_to_annotations(ann_key, interact_element.attrib['intactId']) 257 elif element.attrib['type'] == 'alternative products': 258 for alt_element in element.getiterator(NS + 'isoform'): 259 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '') 260 for id_element in alt_element.getiterator(NS + 'id'): 261 append_to_annotations(ann_key, id_element.text) 262 elif element.attrib['type'] == 'mass spectrometry': 263 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 264 start = end = 0 265 for loc_element in element.getiterator(NS + 'location'): 266 pos_els = loc_element.getiterator(NS + 'position') 267 pos_els = list(pos_els) 268 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 269 try: 270 if pos_els: 271 end = int(pos_els[0].attrib['position']) 272 start = end - 1 273 else: 274 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1 275 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position']) 276 except: # undefined positions or erroneously mapped 277 pass 278 mass = element.attrib['mass'] 279 method = element.attrib['method'] 280 if start == end == 0: 281 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method)) 282 else: 283 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method)) 284 elif element.attrib['type'] == 'sequence caution': 285 pass # not parsed: few information, complex structure 286 elif element.attrib['type'] == 'online information': 287 for link_element in element.getiterator(NS + 'link'): 288 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 289 for id_element in link_element.getiterator(NS + 'link'): 290 append_to_annotations(ann_key, 291 '%s@%s' % (element.attrib['name'], link_element.attrib['uri'])) 292 293 # return raw XML comments if needed 294 if self.return_raw_comments: 295 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '') 296 append_to_annotations(ann_key, ElementTree.tostring(element)) 297 298 def _parse_dbReference(element): 299 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id']) 300 # e.g. 301 # <dbReference type="PDB" key="11" id="2GEZ"> 302 # <property value="X-ray" type="method"/> 303 # <property value="2.60 A" type="resolution"/> 304 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 305 # </dbReference> 306 if 'type' in element.attrib: 307 if element.attrib['type'] == 'PDB': 308 method = "" 309 resolution = "" 310 for ref_element in element: 311 if ref_element.tag == NS + 'property': 312 dat_type = ref_element.attrib['type'] 313 if dat_type == 'method': 314 method = ref_element.attrib['value'] 315 if dat_type == 'resolution': 316 resolution = ref_element.attrib['value'] 317 if dat_type == 'chains': 318 pairs = ref_element.attrib['value'].split(',') 319 for elem in pairs: 320 pair = elem.strip().split('=') 321 if pair[1] != '-': 322 # TODO - How best to store these, do SeqFeatures make sense? 323 feature = SeqFeature.SeqFeature() 324 feature.type = element.attrib['type'] 325 feature.qualifiers['name'] = element.attrib['id'] 326 feature.qualifiers['method'] = method 327 feature.qualifiers['resolution'] = resolution 328 feature.qualifiers['chains'] = pair[0].split('/') 329 start = int(pair[1].split('-')[0]) - 1 330 end = int(pair[1].split('-')[1]) 331 feature.location = SeqFeature.FeatureLocation(start, end) 332 # self.ParsedSeqRecord.features.append(feature) 333 334 for ref_element in element: 335 if ref_element.tag == NS + 'property': 336 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 337 338 def _parse_reference(element): 339 reference = SeqFeature.Reference() 340 authors = [] 341 scopes = [] 342 tissues = [] 343 journal_name = '' 344 pub_type = '' 345 pub_date = '' 346 for ref_element in element: 347 if ref_element.tag == NS + 'citation': 348 pub_type = ref_element.attrib['type'] 349 if pub_type == 'submission': 350 pub_type += ' to the ' + ref_element.attrib['db'] 351 if 'name' in ref_element.attrib: 352 journal_name = ref_element.attrib['name'] 353 pub_date = ref_element.attrib.get('date', '') 354 j_volume = ref_element.attrib.get('volume', '') 355 j_first = ref_element.attrib.get('first', '') 356 j_last = ref_element.attrib.get('last', '') 357 for cit_element in ref_element: 358 if cit_element.tag == NS + 'title': 359 reference.title = cit_element.text 360 elif cit_element.tag == NS + 'authorList': 361 for person_element in cit_element: 362 authors.append(person_element.attrib['name']) 363 elif cit_element.tag == NS + 'dbReference': 364 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type'] 365 + ':' + cit_element.attrib['id']) 366 if cit_element.attrib['type'] == 'PubMed': 367 reference.pubmed_id = cit_element.attrib['id'] 368 elif ref_element.attrib['type'] == 'MEDLINE': 369 reference.medline_id = cit_element.attrib['id'] 370 elif ref_element.tag == NS + 'scope': 371 scopes.append(ref_element.text) 372 elif ref_element.tag == NS + 'source': 373 for source_element in ref_element: 374 if source_element.tag == NS + 'tissue': 375 tissues.append(source_element.text) 376 if scopes: 377 scopes_str = 'Scope: ' + ', '.join(scopes) 378 else: 379 scopes_str = '' 380 if tissues: 381 tissues_str = 'Tissue: ' + ', '.join(tissues) 382 else: 383 tissues_str = '' 384 385 # locations cannot be parsed since they are actually written in 386 # free text inside scopes so all the references are put in the 387 # annotation. 388 reference.location = [] 389 reference.authors = ', '.join(authors) 390 if journal_name: 391 if pub_date and j_volume and j_first and j_last: 392 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name, 393 volume=j_volume, first=j_first, last=j_last, pub_date=pub_date) 394 else: 395 reference.journal = journal_name 396 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str)) 397 append_to_annotations('references', reference) 398 399 def _parse_position(element, offset=0): 400 try: 401 position = int(element.attrib['position']) + offset 402 except KeyError as err: 403 position = None 404 status = element.attrib.get('status', '') 405 if status == 'unknown': 406 assert position is None 407 return SeqFeature.UnknownPosition() 408 elif not status: 409 return SeqFeature.ExactPosition(position) 410 elif status == 'greater than': 411 return SeqFeature.AfterPosition(position) 412 elif status == 'less than': 413 return SeqFeature.BeforePosition(position) 414 elif status == 'uncertain': 415 return SeqFeature.UncertainPosition(position) 416 else: 417 raise NotImplementedError("Position status %r" % status) 418 419 def _parse_feature(element): 420 feature = SeqFeature.SeqFeature() 421 for k, v in element.attrib.items(): 422 feature.qualifiers[k] = v 423 feature.type = element.attrib.get('type', '') 424 if 'id' in element.attrib: 425 feature.id = element.attrib['id'] 426 for feature_element in element: 427 if feature_element.tag == NS + 'location': 428 position_elements = feature_element.findall(NS + 'position') 429 if position_elements: 430 element = position_elements[0] 431 start_position = _parse_position(element, -1) 432 end_position = _parse_position(element) 433 else: 434 element = feature_element.findall(NS + 'begin')[0] 435 start_position = _parse_position(element, -1) 436 element = feature_element.findall(NS + 'end')[0] 437 end_position = _parse_position(element) 438 feature.location = SeqFeature.FeatureLocation(start_position, end_position) 439 else: 440 try: 441 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text 442 except: 443 pass # skip unparsable tag 444 self.ParsedSeqRecord.features.append(feature) 445 446 def _parse_proteinExistence(element): 447 append_to_annotations('proteinExistence', element.attrib['type']) 448 449 def _parse_evidence(element): 450 for k, v in element.attrib.items(): 451 ann_key = k 452 append_to_annotations(ann_key, v) 453 454 def _parse_sequence(element): 455 for k, v in element.attrib.items(): 456 if k in ("length", "mass", "version"): 457 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v) 458 else: 459 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v 460 seq = ''.join((element.text.split())) 461 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet) 462 463 # ============================================# 464 # Initialize SeqRecord 465 self.ParsedSeqRecord = SeqRecord('', id='') 466 467 # Entry attribs parsing 468 # Unknown dataset should not happen! 469 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset') 470 # add attribs to annotations 471 for k, v in self.entry.attrib.items(): 472 if k in ("version"): 473 # original 474 # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 475 # To cope with swissProt plain text parser. this can cause errors 476 # if the attrib has the same name of an other annotation 477 self.ParsedSeqRecord.annotations[k] = int(v) 478 else: 479 # self.ParsedSeqRecord.annotations["entry_%s" % k] = v 480 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser 481 482 # Top-to-bottom entry children parsing 483 for element in self.entry: 484 if element.tag == NS + 'name': 485 _parse_name(element) 486 elif element.tag == NS + 'accession': 487 _parse_accession(element) 488 elif element.tag == NS + 'protein': 489 _parse_protein(element) 490 elif element.tag == NS + 'gene': 491 _parse_gene(element) 492 elif element.tag == NS + 'geneLocation': 493 _parse_geneLocation(element) 494 elif element.tag == NS + 'organism': 495 _parse_organism(element) 496 elif element.tag == NS + 'organismHost': 497 _parse_organismHost(element) 498 elif element.tag == NS + 'keyword': 499 _parse_keyword(element) 500 elif element.tag == NS + 'comment': 501 _parse_comment(element) 502 elif element.tag == NS + 'dbReference': 503 _parse_dbReference(element) 504 elif element.tag == NS + 'reference': 505 _parse_reference(element) 506 elif element.tag == NS + 'feature': 507 _parse_feature(element) 508 elif element.tag == NS + 'proteinExistence': 509 _parse_proteinExistence(element) 510 elif element.tag == NS + 'evidence': 511 _parse_evidence(element) 512 elif element.tag == NS + 'sequence': 513 _parse_sequence(element) 514 else: 515 pass 516 517 # remove duplicate dbxrefs 518 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs))) 519 520 # use first accession as id 521 if not self.ParsedSeqRecord.id: 522 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0] 523 524 return self.ParsedSeqRecord 525