Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
   9  (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  import warnings 
  30  import re 
  31  from Bio.Seq import Seq 
  32  from Bio.SeqRecord import SeqRecord 
  33  from Bio.Alphabet import generic_protein 
  34   
class InsdcScanner(object):
    """Shared machinery for splitting a GenBank/EMBL record into sections.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    partners - DDBJ, EMBL and GenBank - all lay out the "Feature Table" of
    their plain text flat files in the same way.

    The header and sequence sections of an EMBL file, however, are laid out
    very differently from those produced by GenBank/DDBJ.
    """

    # Placeholder values only - the sub classes redefine these constants
    # with sensible values:
    RECORD_START = "XXX"  # start of the LOCUS or ID line, padded to width
    HEADER_WIDTH = 3  # 12 or 5
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
def __init__(self, debug=0):
    """Sanity check the class constants and reset the scanner state.

    debug - stored as self.debug; the parsing methods print progress
            messages when it is non-zero.
    """
    # RECORD_START is compared against line[:HEADER_WIDTH], so the two
    # must be the same length.
    assert self.HEADER_WIDTH == len(self.RECORD_START)
    # Candidate sequence headers are right-stripped before being looked up.
    assert all(header == header.rstrip() for header in self.SEQUENCE_HEADERS)
    # The spacer is compared against line[:FEATURE_QUALIFIER_INDENT].
    assert self.FEATURE_QUALIFIER_INDENT == len(self.FEATURE_QUALIFIER_SPACER)
    self.line = None
    self.debug = debug
61
def set_handle(self, handle):
    """Attach the handle to read from and discard any buffered line."""
    self.line = ""
    self.handle = handle
65
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, and return it.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored.  A line buffered in self.line is examined first, before
    anything more is read from the handle.

    Returns the first line of the record (which is also left in self.line),
    or None if the end of the file is reached without finding one.
    """
    while True:
        if self.line:
            line, self.line = self.line, ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        # Anything else before the ID/LOCUS line is skipped; the stripped
        # text is only needed to describe what was skipped.
        if self.debug > 1:
            skipped = line.rstrip()
            if skipped == "//":
                print("Skipping // marking end of last record")
            elif skipped == "":
                print("Skipping blank line before record")
            else:
                print("Skipping header line before record:\n" + skipped)
    self.line = line
    return line
94
def parse_header(self):
    """Return list of strings making up the header

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line.

    Reading stops at the first feature table marker or sequence header,
    which is left (right-stripped) in self.line and is not included in the
    returned list.

    Raises ValueError if the file ends, or a '//' record terminator is
    found, before the end of the header.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
           "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            # readline() only returns an empty string at the end of the file
            raise ValueError("Premature end of file while reading the header")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found header table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    If skip is true the feature lines are read past but not parsed, and an
    empty list is returned.

    On return self.line holds the line which ended the table (a sequence
    header), or the line after a feature end marker.

    Raises ValueError if the file ends, or a '//' record terminator is
    found, inside the feature table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            # This is an empty feature line between qualifiers. Empty
            # feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
            continue

        # Build up a list of the lines making up this feature.
        # A one character slice is used (not an index) so that a feature
        # key with nothing after it cannot raise an IndexError.
        if line[indent:indent + 1] != " " and " " in line[indent:]:
            # The feature table design enforces a length limit on the
            # feature keys.  Some third party files (e.g. IGMT's EMBL like
            # files) solve this by over indenting the location and
            # qualifiers.
            feature_key, line = line[2:].strip().split(None, 1)
            feature_lines = [line]
            warnings.warn("Overindented %s feature?" % feature_key)
        else:
            feature_key = line[2:indent].strip()
            feature_lines = [line[indent:]]
        line = self.handle.readline()
        # Cope with blank lines in the midst of a feature.  At the end of
        # the file readline() returns "", which must NOT be mistaken for a
        # blank line or this loop would never finish.
        while line[:indent] == spacer or (line and line.rstrip() == ""):
            # Use strip to remove any harmless trailing white space AND any
            # leading white space (e.g. out of spec files with too much
            # indentation)
            feature_lines.append(line[indent:].strip())
            line = self.handle.readline()
        features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
191
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         ...
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.  A qualifier with no
    "=" (e.g. /pseudo) is given the value None.

    Note that no whitespace is removed.

    Raises ValueError if the lines run out part way through the location
    or a quoted qualifier value.
    """
    # Skip any blank lines
    iterator = iter([text for text in lines if text])
    try:
        feature_location = next(iterator).strip()
        while feature_location[-1:] == ",":
            # Multiline location, still more to come!
            feature_location += next(iterator).strip()

        qualifiers = []

        for index, line in enumerate(iterator):
            # check for extra wrapping of the location closing parentheses
            if index == 0 and line.startswith(")"):
                feature_location += line.strip()
            elif line[0] == "/":
                # New qualifier, split at the first "=" (if there is one)
                key, equals, value = line[1:].partition("=")
                if not equals:
                    # Qualifier with no value, e.g. /pseudo
                    qualifiers.append((key, None))
                elif not value:
                    # ApE can output /note=
                    qualifiers.append((key, ""))
                elif value[0] == '"':
                    # Quoted...
                    if value != '"':
                        # Gather continuation lines until one ends with the
                        # closing quote (joining once avoids repeatedly
                        # copying a long translation).
                        pieces = [value]
                        while pieces[-1][-1] != '"':
                            pieces.append(next(iterator))
                        value = "\n".join(pieces)
                    # else: a lone opening quote; any following lines are
                    # picked up by the continuation branch below.
                    if self.debug:
                        print("Quoted line %s:%s" % (key, value))
                    # DO NOT remove the quotes...
                    qualifiers.append((key, value))
                else:
                    # Unquoted
                    qualifiers.append((key, value))
            else:
                # Unquoted continuation
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Bummer
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
301 322
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer

    Does nothing here - this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
331
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer

    Does nothing here - this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
340 341
def _feed_feature_table(self, consumer, feature_tuples):
    """Handle the feature table (list of tuples), passing data to the consumer

    Each tuple is (key, location string, list of qualifier pairs).  New line
    characters inside a qualifier value are replaced with spaces before it
    is handed over; a value of None is passed on unchanged.

    Used by the parse_records() and parse() methods.
    """
    consumer.start_feature_table()
    for entry in feature_tuples:
        key, location, qualifier_pairs = entry
        consumer.feature_key(key)
        consumer.location(location)
        for name, raw_value in qualifier_pairs:
            if raw_value is not None:
                raw_value = raw_value.replace("\n", " ")
            consumer.feature_qualifier(name, raw_value)
356 357
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings), passing data to the consumer

    Does nothing here - this should be implemented by the EMBL / GenBank
    specific subclass.

    Used by the parse_records() and parse() methods.
    """
    return None
366
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record
    """
    # Should work with both EMBL and GenBank files provided the
    # equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # The generic methods of this class split the record into sections.
    # The first line, the header lines and any misc lines after the
    # features are interpreted by the GenBank / EMBL specific sub classes.

    # First line and header:
    self._feed_first_line(consumer, self.line)
    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    # Features (common to both EMBL and GenBank):
    feature_tuples = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_tuples)
    # otherwise the feature lines were only read past, ignore the data

    # Footer and sequence
    footer = self.parse_footer()
    misc_lines, sequence_string = footer
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    # Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    # And we are done
    return True
416
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    Returns None if no record was found on the handle.

    See also the method parse_records() for use on multi-record files.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    cleaner = FeatureValueCleaner()
    consumer = _FeatureConsumer(use_fuzziness=1, feature_cleaner=cleaner)

    if not self.feed(handle, consumer, do_features):
        return None
    return consumer.data
432 433
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    # This is a generator function; the next record is only parsed when
    # the caller asks for it.
    record = self.parse(handle, do_features)
    while record is not None:
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
        record = self.parse(handle, do_features)
451
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id', 'locus_tag', 'product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # Got an EMBL or GenBank record...
        self.parse_header()  # ignore header lines!
        feature_tuples = self.parse_features()
        # Ignore the footer and sequence: read up to the // line (or the
        # end of the file) without interpreting anything.
        while True:
            line = self.handle.readline()
            if not line or line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue

            # Create SeqRecord
            # ================
            # SeqRecord objects cannot be created with annotations, they
            # must be added afterwards.  So create an empty record and
            # then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            annotations['raw_location'] = location_string.replace(' ', '')

            for qualifier_name, qualifier_data in qualifiers:
                # startswith/endswith (rather than indexing) so that an
                # empty value, as given by e.g. "/note=", is accepted.
                if qualifier_data is not None \
                        and qualifier_data.startswith('"') \
                        and qualifier_data.endswith('"'):
                    # Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                # Append the data to the annotation qualifier...
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    # its a list, possibly empty.  Its safe to extend
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Turn line breaks into spaces, then squeeze each
                        # double space down to a single one.
                        qualifier_data = \
                            qualifier_data.replace("\n", " ").replace("  ", " ")
                    try:
                        annotations[qualifier_name] += " " + qualifier_data
                    except KeyError:
                        # Not an addition to existing data, its the first bit
                        annotations[qualifier_name] = qualifier_data

            # Fill in the ID, Name, Description
            # =================================
            # A record attribute is left alone when the feature has no
            # qualifier with the matching tag.
            for position, attribute in enumerate(("id", "name", "description")):
                try:
                    setattr(record, attribute, annotations[tags2id[position]])
                except KeyError:
                    pass

            yield record
530 531
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files"""

    # "ID" padded with spaces to HEADER_WIDTH characters
    RECORD_START = "ID   "
    HEADER_WIDTH = 5
    # In the full marker "Key" begins in column 6 and "Location/Qualifiers"
    # right after the FEATURE_QUALIFIER_INDENT (21) characters.
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
    SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Pass the EMBL ID line to the handler for its layout.

    The layout is chosen by counting the semi-colons after the line type:
    six for the style introduced in 2006, three for the older style.

    Raises ValueError for any other number of semi-colons.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semicolons = line[self.HEADER_WIDTH:].count(";")
    if semicolons == 6:
        # Looks like the semi colon separated style introduced in 2006
        handler = self._feed_first_line_new
    elif semicolons == 3:
        # Looks like the pre 2006 style
        handler = self._feed_first_line_old
    else:
        raise ValueError('Did not recognise the ID line layout:\n' + line)
    handler(consumer, line)
588
def _feed_first_line_old(self, consumer, line):
    """Handle an ID line in the style used before 2006.

    For example:

    ID   SC10H5 standard; DNA; PRO; 4870 BP.
    ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    # The first token is separated by white space, the rest by semi-colons
    pieces = line[self.HEADER_WIDTH:].split(None, 1)
    fields = [pieces[0]] + pieces[1].split(";")
    fields = [entry.strip() for entry in fields]
    # The tokens represent:
    #    0. Primary accession number
    #    (space sep)
    #    1. ??? (e.g. standard)
    #    (semi-colon)
    #    2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    #    3. Taxonomic division (e.g. 'PRO')
    #    4. Sequence length (e.g. '4639675 BP.')
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
611
def _feed_first_line_new(self, consumer, line):
    """Handle an ID line in the style introduced in 2006.

    For example:

    ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    content = line[self.HEADER_WIDTH:].strip()
    fields = [data.strip() for data in content.split(";")]
    assert len(fields) == 7
    # The tokens represent:
    #    0. Primary accession number
    #    1. Sequence version number
    #    2. Topology: 'circular' or 'linear'
    #    3. Molecule type (e.g. 'genomic DNA')
    #    4. Data class (e.g. 'STD')
    #    5. Taxonomic division (e.g. 'PRO')
    #    6. Sequence length (e.g. '4639675 BP.')
    accession = fields[0]
    consumer.locus(accession)

    # Call the accession consumer now, to make sure we record
    # something as the record.id, in case there is no AC line
    consumer.accession(accession)

    # TODO - How to deal with the version field?  At the moment the consumer
    # will try and use this for the ID which isn't ideal for EMBL files.
    version_parts = fields[1].split()
    has_version = (len(version_parts) == 2
                   and version_parts[0] == "SV"
                   and version_parts[1].isdigit())
    if has_version:
        consumer.version_suffix(version_parts[1])

    # Based on how the old GenBank parser worked, merge these two:
    topology_and_type = fields[2:4]
    consumer.residue_type(" ".join(topology_and_type))  # TODO - Store as two fields?

    # TODO - What should we do with the data class (fields[4])?

    consumer.data_file_division(fields[5])

    self._feed_seq_length(consumer, fields[6])
652
def _feed_seq_length(self, consumer, text):
    """Pass the sequence length from an ID line to the consumer.

    text - the final ID line field, a number followed by its units,
           e.g. '4639675 BP.'

    Only the number (still as a string) is passed to consumer.size().
    The units are accepted with or without the trailing full stop.
    """
    length_parts = text.split()
    assert len(length_parts) == 2, \
           "Invalid sequence length string %r" % text
    assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."], \
           "Invalid sequence length units in %r" % text
    consumer.size(length_parts[0])
658
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) to the consumer.

    The line type is the text in the first HEADER_WIDTH characters of each
    line and selects which consumer method gets the rest of the line.
    Several line types are reformatted first to look like the equivalent
    GenBank entry.  Line types which are not recognised are ignored.
    """
    EMBL_INDENT = self.HEADER_WIDTH
    # Line types whose data is passed to the named consumer method as is:
    consumer_dict = {
        'AC': 'accession',
        'SV': 'version',  # SV line removed in June 2006, now part of ID line
        'DE': 'definition',
        'RG': 'consrtm',  # optional consortium
        'RL': 'journal',
        'OS': 'organism',
        'OC': 'taxonomy',
        'CC': 'comment',
        }
    for line in lines:
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()
        if line_type == 'XX':
            # Just a splitter line
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            # (startswith/endswith cope with an RN line with no data)
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
            parts = [bases.replace("-", " to ").strip()
                     for bases in data.split(",")]
            consumer.reference_bases("(bases %s)" % "; ".join(parts))
        elif line_type == 'RT':
            # Remove the enclosing quotes and trailing semi colon.
            # Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif line_type == 'RX':
            # EMBL support three reference types at the moment:
            # - PUBMED    PUBMED bibliographic database (NLM)
            # - DOI       Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA  US National Agriculture Library (NAL) of the US
            #             Department of Agriculture (USDA)
            #
            # Format:
            # RX  resource_identifier; identifier.
            #
            # e.g.
            # RX   DOI; 10.1016/0024-3205(83)90010-3.
            # RX   PUBMED; 264242.
            #
            # Currently our reference object only supports PUBMED and MEDLINE
            # (as these were in GenBank files?).
            key, value = data.split(";", 1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            # TODO - Handle other reference types (here and in BioSQL bindings)
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # Database Cross-reference, format:
            # DR   database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR   MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            # Turn it into "database_identifier:primary_identifier" to
            # mimic the GenBank parser. e.g. "MGI:98599"
            consumer.dblink("%s:%s" % (parts[0].strip(),
                                       parts[1].strip()))
        elif line_type == 'RA':
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif line_type == 'PR':
            # Remove trailing ; at end of the project reference
            # In GenBank files this corresponds to the old PROJECT
            # line which is being replaced with the DBLINK line.
            consumer.project(data.rstrip(";"))
        elif line_type in consumer_dict:
            # Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
758
def _feed_misc_lines(self, consumer, lines):
    """Pass any CO (contig) location in the misc lines to the consumer.

    lines - list of strings, which is left unmodified.

    Consecutive CO lines are joined into one location string (without the
    line type or surrounding white space) for consumer.contig_location().
    A run of CO lines ends at an empty line or at the end of the list.
    Lines outside a run which are not CO lines are ignored.

    Raises ValueError if a CO line is followed directly by any other
    non-empty line.
    """
    # TODO - Should we do something with the information on the SQ line(s)?
    line_iter = iter(lines)
    for line in line_iter:
        if not line.startswith("CO "):
            continue
        pieces = [line[5:].strip()]
        # Carry on with the SAME iterator to collect the continuation
        # lines, so the outer loop resumes after this run.
        for line in line_iter:
            if not line:
                break
            if not line.startswith("CO "):
                raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
            # Don't need to preserve the whitespace here.
            pieces.append(line[5:].strip())
        consumer.contig_location("".join(pieces))
781 782
class _ImgtScanner(EmblScanner):
    """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    """

    # The first two markers have "Location/Qualifiers" after 21 characters
    # (as in EMBL files), the third after 25 characters.
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers",
                             "FH   Key             Location/Qualifiers (from EMBL)",
                             "FH   Key                 Location/Qualifiers",
                             "FH"]
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.

    If skip is true the feature lines are read past but not parsed, and an
    empty list is returned.

    Raises ValueError if the file ends, or a '//' record terminator is
    found, inside the feature table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    # Matches a position followed by a greater than sign, e.g. "123>"
    bad_position_re = re.compile(r'([0-9]+)>{1}')

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            # This is an empty feature line between qualifiers. Empty
            # feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
            continue

        assert line[:2] == "FT"
        try:
            feature_key, location_start = line[2:].strip().split()
        except ValueError:
            # e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
            # Assume indent of 25 as per IMGT spec, with the location
            # start in column 26 (one-based).
            feature_key = line[2:25].strip()
            location_start = line[25:].strip()
        feature_lines = [location_start]
        line = self.handle.readline()
        # Cope with blank lines in the midst of a feature.  At the end of
        # the file readline() returns "", which must NOT be mistaken for a
        # blank line or this loop would never finish.
        while line[:indent] == spacer or (line and line.rstrip() == ""):
            # Use strip to remove any harmless trailing white space AND any
            # leading white space (copes with 21 or 26 indents and other
            # variants)
            assert line[:2] == "FT"
            feature_lines.append(line[indent:].strip())
            line = self.handle.readline()
        feature_key, location, qualifiers = \
                     self.parse_feature(feature_key, feature_lines)
        # Try to handle known problems with IMGT locations here:
        if ">" in location:
            # Nasty hack for common IMGT bug, should be >123 not 123>
            # in a location string. At least here the meaning is clear,
            # and since it is so common I don't want to issue a warning.
            location = bad_position_re.sub(r'>\1', location)
        features.append((feature_key, location, qualifiers))
    self.line = line
    return features
876
class GenBankScanner(InsdcScanner):
    """For extracting chunks of information in GenBank files"""

    # "LOCUS" padded with spaces to HEADER_WIDTH characters
    RECORD_START = "LOCUS       "
    HEADER_WIDTH = 12
    # In the full marker "Location/Qualifiers" begins right after the
    # FEATURE_QUALIFIER_INDENT (21) characters.
    FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers", "FEATURES"]
    FEATURE_END_MARKERS = []
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
    SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]  # trailing spaces removed
932 - def _feed_first_line(self, consumer, line):
933 """Scan over and parse GenBank LOCUS line (PRIVATE). 934 935 This must cope with several variants, primarily the old and new column 936 based standards from GenBank. Additionally EnsEMBL produces GenBank 937 files where the LOCUS line is space separated rather that following 938 the column based layout. 939 940 We also try to cope with GenBank like files with partial LOCUS lines. 941 """ 942 ##################################### 943 # LOCUS line # 944 ##################################### 945 GENBANK_INDENT = self.HEADER_WIDTH 946 GENBANK_SPACER = " "*GENBANK_INDENT 947 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 948 'LOCUS line does not start correctly:\n' + line 949 950 #Have to break up the locus line, and handle the different bits of it. 951 #There are at least two different versions of the locus line... 952 if line[29:33] in [' bp ', ' aa ',' rc '] and line[55:62] == ' ': 953 #Old... note we insist on the 55:62 being empty to avoid trying 954 #to parse space separated LOCUS lines from Ensembl etc, see below. 955 # 956 # Positions Contents 957 # --------- -------- 958 # 00:06 LOCUS 959 # 06:12 spaces 960 # 12:?? Locus name 961 # ??:?? space 962 # ??:29 Length of sequence, right-justified 963 # 29:33 space, bp, space 964 # 33:41 strand type 965 # 41:42 space 966 # 42:51 Blank (implies linear), linear or circular 967 # 51:52 space 968 # 52:55 The division code (e.g. 
BCT, VRL, INV) 969 # 55:62 space 970 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 971 # 972 #assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 973 # 'LOCUS line does not contain size units at expected position:\n' + line 974 assert line[41:42] == ' ', \ 975 'LOCUS line does not contain space at position 42:\n' + line 976 assert line[42:51].strip() in ['','linear','circular'], \ 977 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 978 assert line[51:52] == ' ', \ 979 'LOCUS line does not contain space at position 52:\n' + line 980 #assert line[55:62] == ' ', \ 981 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 982 if line[62:73].strip(): 983 assert line[64:65] == '-', \ 984 'LOCUS line does not contain - at position 65 in date:\n' + line 985 assert line[68:69] == '-', \ 986 'LOCUS line does not contain - at position 69 in date:\n' + line 987 988 name_and_length_str = line[GENBANK_INDENT:29] 989 while name_and_length_str.find(' ')!=-1: 990 name_and_length_str = name_and_length_str.replace(' ',' ') 991 name_and_length = name_and_length_str.split(' ') 992 assert len(name_and_length)<=2, \ 993 'Cannot parse the name and length in the LOCUS line:\n' + line 994 assert len(name_and_length)!=1, \ 995 'Name and length collide in the LOCUS line:\n' + line 996 #Should be possible to split them based on position, if 997 #a clear definition of the standard exists THAT AGREES with 998 #existing files. 999 consumer.locus(name_and_length[0]) 1000 consumer.size(name_and_length[1]) 1001 #consumer.residue_type(line[33:41].strip()) 1002 1003 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1004 #Amino acids -> protein (even if there is no residue type given) 1005 #We want to use a protein alphabet in this case, rather than a 1006 #generic one. 
Not sure if this is the best way to achieve this, 1007 #but it works because the scanner checks for this: 1008 consumer.residue_type("PROTEIN") 1009 else: 1010 consumer.residue_type(line[33:51].strip()) 1011 1012 consumer.data_file_division(line[52:55]) 1013 if line[62:73].strip(): 1014 consumer.date(line[62:73]) 1015 elif line[40:44] in [' bp ', ' aa ',' rc '] \ 1016 and line[54:64].strip() in ['','linear','circular']: 1017 #New... linear/circular/big blank test should avoid EnsEMBL style 1018 #LOCUS line being treated like a proper column based LOCUS line. 1019 # 1020 # Positions Contents 1021 # --------- -------- 1022 # 00:06 LOCUS 1023 # 06:12 spaces 1024 # 12:?? Locus name 1025 # ??:?? space 1026 # ??:40 Length of sequence, right-justified 1027 # 40:44 space, bp, space 1028 # 44:47 Blank, ss-, ds-, ms- 1029 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1030 # 54:55 space 1031 # 55:63 Blank (implies linear), linear or circular 1032 # 63:64 space 1033 # 64:67 The division code (e.g. 
BCT, VRL, INV) 1034 # 67:68 space 1035 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1036 # 1037 assert line[40:44] in [' bp ', ' aa ',' rc '] , \ 1038 'LOCUS line does not contain size units at expected position:\n' + line 1039 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1040 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1041 assert line[47:54].strip() == "" \ 1042 or line[47:54].strip().find('DNA') != -1 \ 1043 or line[47:54].strip().find('RNA') != -1, \ 1044 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1045 assert line[54:55] == ' ', \ 1046 'LOCUS line does not contain space at position 55:\n' + line 1047 assert line[55:63].strip() in ['','linear','circular'], \ 1048 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1049 assert line[63:64] == ' ', \ 1050 'LOCUS line does not contain space at position 64:\n' + line 1051 assert line[67:68] == ' ', \ 1052 'LOCUS line does not contain space at position 68:\n' + line 1053 if line[68:79].strip(): 1054 assert line[70:71] == '-', \ 1055 'LOCUS line does not contain - at position 71 in date:\n' + line 1056 assert line[74:75] == '-', \ 1057 'LOCUS line does not contain - at position 75 in date:\n' + line 1058 1059 name_and_length_str = line[GENBANK_INDENT:40] 1060 while name_and_length_str.find(' ')!=-1: 1061 name_and_length_str = name_and_length_str.replace(' ',' ') 1062 name_and_length = name_and_length_str.split(' ') 1063 assert len(name_and_length)<=2, \ 1064 'Cannot parse the name and length in the LOCUS line:\n' + line 1065 assert len(name_and_length)!=1, \ 1066 'Name and length collide in the LOCUS line:\n' + line 1067 #Should be possible to split them based on position, if 1068 #a clear definition of the stand exists THAT AGREES with 1069 #existing files. 
1070 consumer.locus(name_and_length[0]) 1071 consumer.size(name_and_length[1]) 1072 1073 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1074 #Amino acids -> protein (even if there is no residue type given) 1075 #We want to use a protein alphabet in this case, rather than a 1076 #generic one. Not sure if this is the best way to achieve this, 1077 #but it works because the scanner checks for this: 1078 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1079 else: 1080 consumer.residue_type(line[44:63].strip()) 1081 1082 consumer.data_file_division(line[64:67]) 1083 if line[68:79].strip(): 1084 consumer.date(line[68:79]) 1085 elif line[GENBANK_INDENT:].strip().count(" ")==0 : 1086 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1087 # 1088 #e.g. 1089 # 1090 # "LOCUS U00096" 1091 # 1092 #rather than: 1093 # 1094 # "LOCUS U00096 4639675 bp DNA circular BCT" 1095 # 1096 # Positions Contents 1097 # --------- -------- 1098 # 00:06 LOCUS 1099 # 06:12 spaces 1100 # 12:?? Locus name 1101 if line[GENBANK_INDENT:].strip() != "": 1102 consumer.locus(line[GENBANK_INDENT:].strip()) 1103 else: 1104 #Must just have just "LOCUS ", is this even legitimate? 1105 #We should be able to continue parsing... we need real world testcases! 1106 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line) 1107 elif len(line.split())==7 and line.split()[3] in ["aa","bp"]: 1108 #Cope with EnsEMBL genbank files which use space separation rather 1109 #than the expected column based layout. e.g. 1110 #LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1111 #LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1112 #LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1113 #LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1114 #Notice that the 'bp' can occur in the position expected by either 1115 #the old or the new fixed column standards (parsed above). 
1116 splitline = line.split() 1117 consumer.locus(splitline[1]) 1118 consumer.size(splitline[2]) 1119 consumer.residue_type(splitline[4]) 1120 consumer.data_file_division(splitline[5]) 1121 consumer.date(splitline[6]) 1122 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]: 1123 #Cope with EMBOSS seqret output where it seems the locus id can cause 1124 #the other fields to overflow. We just IGNORE the other fields! 1125 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1126 consumer.locus(line.split()[1]) 1127 consumer.size(line.split()[2]) 1128 elif len(line.split())>=4 and line.split()[-1] in ["aa","bp"]: 1129 #Cope with psuedo-GenBank files like this: 1130 # "LOCUS RNA5 complete 1718 bp" 1131 #Treat everything between LOCUS and the size as the identifier. 1132 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1133 consumer.locus(line[5:].rsplit(None,2)[0].strip()) 1134 consumer.size(line.split()[-2]) 1135 else: 1136 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1137 1138
1139 - def _feed_header_lines(self, consumer, lines):
1140 #Following dictionary maps GenBank lines to the associated 1141 #consumer methods - the special cases like LOCUS where one 1142 #genbank line triggers several consumer calls have to be 1143 #handled individually. 1144 GENBANK_INDENT = self.HEADER_WIDTH 1145 GENBANK_SPACER = " "*GENBANK_INDENT 1146 consumer_dict = { 1147 'DEFINITION' : 'definition', 1148 'ACCESSION' : 'accession', 1149 'NID' : 'nid', 1150 'PID' : 'pid', 1151 'DBSOURCE' : 'db_source', 1152 'KEYWORDS' : 'keywords', 1153 'SEGMENT' : 'segment', 1154 'SOURCE' : 'source', 1155 'AUTHORS' : 'authors', 1156 'CONSRTM' : 'consrtm', 1157 'PROJECT' : 'project', 1158 'DBLINK' : 'dblink', 1159 'TITLE' : 'title', 1160 'JOURNAL' : 'journal', 1161 'MEDLINE' : 'medline_id', 1162 'PUBMED' : 'pubmed_id', 1163 'REMARK' : 'remark'} 1164 #We have to handle the following specially: 1165 #ORIGIN (locus, size, residue_type, data_file_division and date) 1166 #COMMENT (comment) 1167 #VERSION (version and gi) 1168 #REFERENCE (eference_num and reference_bases) 1169 #ORGANISM (organism and taxonomy) 1170 lines = filter(None,lines) 1171 lines.append("") #helps avoid getting StopIteration all the time 1172 line_iter = iter(lines) 1173 try: 1174 line = line_iter.next() 1175 while True: 1176 if not line : break 1177 line_type = line[:GENBANK_INDENT].strip() 1178 data = line[GENBANK_INDENT:].strip() 1179 1180 if line_type == 'VERSION': 1181 #Need to call consumer.version(), and maybe also consumer.gi() as well. 1182 #e.g. 1183 # VERSION AC007323.5 GI:6587720 1184 while data.find(' ')!=-1: 1185 data = data.replace(' ',' ') 1186 if data.find(' GI:')==-1: 1187 consumer.version(data) 1188 else: 1189 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]" 1190 consumer.version(data.split(' GI:')[0]) 1191 consumer.gi(data.split(' GI:')[1]) 1192 #Read in the next line! 
1193 line = line_iter.next() 1194 elif line_type == 'REFERENCE': 1195 if self.debug >1 : print "Found reference [" + data + "]" 1196 #Need to call consumer.reference_num() and consumer.reference_bases() 1197 #e.g. 1198 # REFERENCE 1 (bases 1 to 86436) 1199 # 1200 #Note that this can be multiline, see Bug 1968, e.g. 1201 # 1202 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1203 # 28259) 1204 # 1205 #For such cases we will call the consumer once only. 1206 data = data.strip() 1207 1208 #Read in the next line, and see if its more of the reference: 1209 while True: 1210 line = line_iter.next() 1211 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1212 #Add this continuation to the data string 1213 data += " " + line[GENBANK_INDENT:] 1214 if self.debug >1 : print "Extended reference text [" + data + "]" 1215 else: 1216 #End of the reference, leave this text in the variable "line" 1217 break 1218 1219 #We now have all the reference line(s) stored in a string, data, 1220 #which we pass to the consumer 1221 while data.find(' ')!=-1: 1222 data = data.replace(' ',' ') 1223 if data.find(' ')==-1: 1224 if self.debug >2 : print 'Reference number \"' + data + '\"' 1225 consumer.reference_num(data) 1226 else: 1227 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"' 1228 consumer.reference_num(data[:data.find(' ')]) 1229 consumer.reference_bases(data[data.find(' ')+1:]) 1230 elif line_type == 'ORGANISM': 1231 #Typically the first line is the organism, and subsequent lines 1232 #are the taxonomy lineage. However, given longer and longer 1233 #species names (as more and more strains and sub strains get 1234 #sequenced) the oragnism name can now get wrapped onto multiple 1235 #lines. The NCBI say we have to recognise the lineage line by 1236 #the presense of semi-colon delimited entries. In the long term, 1237 #they are considering adding a new keyword (e.g. LINEAGE). 
1238 #See Bug 2591 for details. 1239 organism_data = data 1240 lineage_data = "" 1241 while True: 1242 line = line_iter.next() 1243 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1244 if lineage_data or ";" in line: 1245 lineage_data += " " + line[GENBANK_INDENT:] 1246 else: 1247 organism_data += " " + line[GENBANK_INDENT:].strip() 1248 else: 1249 #End of organism and taxonomy 1250 break 1251 consumer.organism(organism_data) 1252 if lineage_data.strip() == "" and self.debug > 1: 1253 print "Taxonomy line(s) missing or blank" 1254 consumer.taxonomy(lineage_data.strip()) 1255 del organism_data, lineage_data 1256 elif line_type == 'COMMENT': 1257 if self.debug > 1 : print "Found comment" 1258 #This can be multiline, and should call consumer.comment() once 1259 #with a list where each entry is a line. 1260 comment_list=[] 1261 comment_list.append(data) 1262 while True: 1263 line = line_iter.next() 1264 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1265 data = line[GENBANK_INDENT:] 1266 comment_list.append(data) 1267 if self.debug > 2 : print "Comment continuation [" + data + "]" 1268 else: 1269 #End of the comment 1270 break 1271 consumer.comment(comment_list) 1272 del comment_list 1273 elif line_type in consumer_dict: 1274 #Its a semi-automatic entry! 1275 #Now, this may be a multi line entry... 1276 while True: 1277 line = line_iter.next() 1278 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1279 data += ' ' + line[GENBANK_INDENT:] 1280 else: 1281 #We now have all the data for this entry: 1282 getattr(consumer, consumer_dict[line_type])(data) 1283 #End of continuation - return to top of loop! 1284 break 1285 else: 1286 if self.debug: 1287 print "Ignoring GenBank header line:\n" % line 1288 #Read in next line 1289 line = line_iter.next() 1290 except StopIteration: 1291 raise ValueError("Problem in header")
1292
1293 - def _feed_misc_lines(self, consumer, lines):
1294 #Deals with a few misc lines between the features and the sequence 1295 GENBANK_INDENT = self.HEADER_WIDTH 1296 GENBANK_SPACER = " "*GENBANK_INDENT 1297 lines.append("") 1298 line_iter = iter(lines) 1299 try: 1300 for line in line_iter: 1301 if line.find('BASE COUNT')==0: 1302 line = line[10:].strip() 1303 if line: 1304 if self.debug : print "base_count = " + line 1305 consumer.base_count(line) 1306 if line.find("ORIGIN")==0: 1307 line = line[6:].strip() 1308 if line: 1309 if self.debug : print "origin_name = " + line 1310 consumer.origin_name(line) 1311 if line.find("WGS ")==0 : 1312 line = line[3:].strip() 1313 consumer.wgs(line) 1314 if line.find("WGS_SCAFLD")==0 : 1315 line = line[10:].strip() 1316 consumer.add_wgs_scafld(line) 1317 if line.find("CONTIG")==0: 1318 line = line[6:].strip() 1319 contig_location = line 1320 while True: 1321 line = line_iter.next() 1322 if not line: 1323 break 1324 elif line[:GENBANK_INDENT]==GENBANK_SPACER: 1325 #Don't need to preseve the whitespace here. 1326 contig_location += line[GENBANK_INDENT:].rstrip() 1327 else: 1328 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1329 consumer.contig_location(contig_location) 1330 return 1331 except StopIteration: 1332 raise ValueError("Problem in misc lines before sequence")
1333 1334 if __name__ == "__main__": 1335 from StringIO import StringIO 1336 1337 gbk_example = \ 1338 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1339 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1340 (AXL2) and Rev7p (REV7) genes, complete cds. 1341 ACCESSION U49845 1342 VERSION U49845.1 GI:1293613 1343 KEYWORDS . 1344 SOURCE Saccharomyces cerevisiae (baker's yeast) 1345 ORGANISM Saccharomyces cerevisiae 1346 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1347 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1348 REFERENCE 1 (bases 1 to 5028) 1349 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1350 TITLE Cloning and sequence of REV7, a gene whose function is required for 1351 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1352 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1353 PUBMED 7871890 1354 REFERENCE 2 (bases 1 to 5028) 1355 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1356 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1357 plasma membrane glycoprotein 1358 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1359 PUBMED 8846915 1360 REFERENCE 3 (bases 1 to 5028) 1361 AUTHORS Roemer,T. 1362 TITLE Direct Submission 1363 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1364 Haven, CT, USA 1365 FEATURES Location/Qualifiers 1366 source 1..5028 1367 /organism="Saccharomyces cerevisiae" 1368 /db_xref="taxon:4932" 1369 /chromosome="IX" 1370 /map="9" 1371 CDS <1..206 1372 /codon_start=3 1373 /product="TCP1-beta" 1374 /protein_id="AAA98665.1" 1375 /db_xref="GI:1293614" 1376 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1377 AEVLLRVDNIIRARPRTANRQHM" 1378 gene 687..3158 1379 /gene="AXL2" 1380 CDS 687..3158 1381 /gene="AXL2" 1382 /note="plasma membrane glycoprotein" 1383 /codon_start=1 1384 /function="required for axial budding pattern of S. 
1385 cerevisiae" 1386 /product="Axl2p" 1387 /protein_id="AAA98666.1" 1388 /db_xref="GI:1293615" 1389 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1390 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1391 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1392 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1393 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1394 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1395 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1396 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1397 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1398 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1399 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1400 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1401 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1402 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1403 VDFSNKSNVNVGQVKDIHGRIPEML" 1404 gene complement(3300..4037) 1405 /gene="REV7" 1406 CDS complement(3300..4037) 1407 /gene="REV7" 1408 /codon_start=1 1409 /product="Rev7p" 1410 /protein_id="AAA98667.1" 1411 /db_xref="GI:1293616" 1412 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1413 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1414 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1415 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1416 LISGDDKILNGVYSQYEEGESIFGSLF" 1417 ORIGIN 1418 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1419 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1420 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1421 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1422 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1423 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1424 361 attttggcaa 
cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1425 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1426 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1427 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1428 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1429 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1430 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1431 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1432 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1433 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1434 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1435 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1436 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1437 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1438 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1439 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1440 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1441 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1442 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1443 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1444 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1445 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1446 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1447 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1448 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1449 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1450 1921 agaatttcga caagctttca ttaggtttga aagcgaacca 
aggttcacaa tctcaagagc 1451 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1452 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1453 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1454 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1455 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1456 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1457 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1458 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1459 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1460 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1461 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1462 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1463 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1464 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1465 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1466 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1467 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1468 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1469 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1470 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1471 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1472 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1473 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1474 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1475 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1476 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 
1477 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1478 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1479 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1480 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1481 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1482 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1483 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1484 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1485 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1486 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1487 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1488 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1489 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1490 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1491 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1492 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1493 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1494 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1495 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1496 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1497 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1498 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1499 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1500 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1501 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1502 //""" 1503 1504 # GenBank format protein (aka GenPept) file from: 1505 # 
http://www.molecularevolution.org/resources/fileformats/ 1506 gbk_example2 = \ 1507 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1508 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1509 ACCESSION AAD51968 1510 VERSION AAD51968.1 GI:5805369 1511 DBSOURCE locus AF171097 accession AF171097.1 1512 KEYWORDS . 1513 SOURCE Yersinia enterocolitica 1514 ORGANISM Yersinia enterocolitica 1515 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1516 Enterobacteriaceae; Yersinia. 1517 REFERENCE 1 (residues 1 to 143) 1518 AUTHORS Revell,P.A. and Miller,V.L. 1519 TITLE A chromosomally encoded regulator is required for expression of the 1520 Yersinia enterocolitica inv gene and for virulence 1521 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1522 MEDLINE 20138369 1523 PUBMED 10672189 1524 REFERENCE 2 (residues 1 to 143) 1525 AUTHORS Revell,P.A. and Miller,V.L. 1526 TITLE Direct Submission 1527 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1528 University School of Medicine, Campus Box 8230, 660 South Euclid, 1529 St. Louis, MO 63110, USA 1530 COMMENT Method: conceptual translation. 1531 FEATURES Location/Qualifiers 1532 source 1..143 1533 /organism="Yersinia enterocolitica" 1534 /mol_type="unassigned DNA" 1535 /strain="JB580v" 1536 /serotype="O:8" 1537 /db_xref="taxon:630" 1538 Protein 1..143 1539 /product="transcriptional regulator RovA" 1540 /name="regulates inv expression" 1541 CDS 1..143 1542 /gene="rovA" 1543 /coded_by="AF171097.1:380..811" 1544 /note="regulator of virulence" 1545 /transl_table=11 1546 ORIGIN 1547 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1548 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1549 121 deiellsgli dklerniiql qsk 1550 // 1551 """ 1552 1553 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1554 XX 1555 AC X56734; S46826; 1556 XX 1557 DT 12-SEP-1991 (Rel. 29, Created) 1558 DT 25-NOV-2005 (Rel. 
85, Last updated, Version 11) 1559 XX 1560 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1561 XX 1562 KW beta-glucosidase. 1563 XX 1564 OS Trifolium repens (white clover) 1565 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1566 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1567 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1568 XX 1569 RN [5] 1570 RP 1-1859 1571 RX PUBMED; 1907511. 1572 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1573 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1574 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1575 RL Plant Mol. Biol. 17(2):209-219(1991). 1576 XX 1577 RN [6] 1578 RP 1-1859 1579 RA Hughes M.A.; 1580 RT ; 1581 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1582 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1583 RL Upon Tyne, NE2 4HH, UK 1584 XX 1585 FH Key Location/Qualifiers 1586 FH 1587 FT source 1..1859 1588 FT /organism="Trifolium repens" 1589 FT /mol_type="mRNA" 1590 FT /clone_lib="lambda gt10" 1591 FT /clone="TRE361" 1592 FT /tissue_type="leaves" 1593 FT /db_xref="taxon:3899" 1594 FT CDS 14..1495 1595 FT /product="beta-glucosidase" 1596 FT /EC_number="3.2.1.21" 1597 FT /note="non-cyanogenic" 1598 FT /db_xref="GOA:P26204" 1599 FT /db_xref="InterPro:IPR001360" 1600 FT /db_xref="InterPro:IPR013781" 1601 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1602 FT /protein_id="CAA40058.1" 1603 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1604 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1605 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1606 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1607 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1608 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1609 FT 
IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1610 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1611 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1612 FT mRNA 1..1859 1613 FT /experiment="experimental evidence, no additional details 1614 FT recorded" 1615 XX 1616 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1617 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1618 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1619 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1620 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1621 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1622 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1623 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1624 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1625 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1626 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1627 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1628 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1629 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1630 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1631 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1632 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1633 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1634 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1635 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1636 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1637 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1638 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 
1320 1639 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1640 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1641 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1642 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1643 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1644 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1645 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1646 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1647 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1648 // 1649 """ 1650 1651 print "GenBank CDS Iteration" 1652 print "=====================" 1653 1654 g = GenBankScanner() 1655 for record in g.parse_cds_features(StringIO(gbk_example)): 1656 print record 1657 1658 g = GenBankScanner() 1659 for record in g.parse_cds_features(StringIO(gbk_example2), 1660 tags2id=('gene','locus_tag','product')): 1661 print record 1662 1663 g = GenBankScanner() 1664 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1665 tags2id=('gene','locus_tag','product')): 1666 print record 1667 1668 print 1669 print "GenBank Iteration" 1670 print "=================" 1671 g = GenBankScanner() 1672 for record in g.parse_records(StringIO(gbk_example),do_features=False): 1673 print record.id, record.name, record.description 1674 print record.seq 1675 1676 g = GenBankScanner() 1677 for record in g.parse_records(StringIO(gbk_example),do_features=True): 1678 print record.id, record.name, record.description 1679 print record.seq 1680 1681 g = GenBankScanner() 1682 for record in g.parse_records(StringIO(gbk_example2),do_features=False): 1683 print record.id, record.name, record.description 1684 print record.seq 1685 1686 g = GenBankScanner() 1687 for record in g.parse_records(StringIO(gbk_example2),do_features=True): 1688 print record.id, record.name, 
record.description 1689 print record.seq 1690 1691 print 1692 print "EMBL CDS Iteration" 1693 print "==================" 1694 1695 e = EmblScanner() 1696 for record in e.parse_cds_features(StringIO(embl_example)): 1697 print record 1698 1699 print 1700 print "EMBL Iteration" 1701 print "==============" 1702 e = EmblScanner() 1703 for record in e.parse_records(StringIO(embl_example),do_features=True): 1704 print record.id, record.name, record.description 1705 print record.seq 1706