1
2
3
4
5
6 """Internal code for parsing GenBank and EMBL files (PRIVATE).
7
8 This code is NOT intended for direct use. It provides a basic scanner
(for use with an event consumer such as Bio.GenBank._FeatureConsumer)
10 to parse a GenBank or EMBL file (with their shared INSDC feature table).
11
12 It is used by Bio.GenBank to parse GenBank files
13 It is also used by Bio.SeqIO to parse GenBank and EMBL files
14
15 Feature Table Documentation:
16 http://www.insdc.org/files/feature_table.html
17 http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
18 ftp://ftp.ncbi.nih.gov/genbank/docs/
19 """
20
21
22
23
24
25
26
27
28
29 import warnings
30 import re
31 from Bio.Seq import Seq
32 from Bio.SeqRecord import SeqRecord
33 from Bio.Alphabet import generic_protein
34
36 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
37
38 The International Nucleotide Sequence Database Collaboration (INSDC)
39 between the DDBJ, EMBL, and GenBank. These organisations all use the
40 same "Feature Table" layout in their plain text flat file formats.
41
42 However, the header and sequence sections of an EMBL file are very
43 different in layout to those produced by GenBank/DDBJ."""
44
45
# Placeholder layout constants; the EMBL and GenBank sections further down
# assign the same names with real values.
# RECORD_START is compared against the first HEADER_WIDTH characters of a
# line when looking for the start of a record.
RECORD_START = "XXX"
HEADER_WIDTH = 3
# A header line equal to one of these ends the header section.
FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
# A line equal to one of these ends the feature table.
FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
# Width and exact text of the prefix found on feature continuation lines.
FEATURE_QUALIFIER_INDENT = 0
FEATURE_QUALIFIER_SPACER = ""
# Line codes (first HEADER_WIDTH characters, right-stripped) that mark the
# start of the sequence section.
SEQUENCE_HEADERS=["XXX"]
53
61
65
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, and return it.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as are blank lines and a "//" left over from the end
    of a previous record.

    A line already buffered in self.line is examined before anything is
    read from self.handle.

    Returns the record start line (still carrying its newline), which is
    also stored in self.line, or None at end of file.
    """
    while True:
        if self.line:
            # Consume the buffered line first.
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        # Everything else is skipped; the branches differ only in the
        # debug message they emit.
        line = line.rstrip()
        if self.debug > 1:
            if line == "//":
                print("Skipping // marking end of last record")
            elif line == "":
                print("Skipping blank line before record")
            else:
                print("Skipping header line before record:\n" + line)
    self.line = line
    return line
94
def parse_header(self):
    """Return list of strings making up the header.

    Trailing whitespace (including new line characters) is removed.

    Assumes the ID/LOCUS line has just been read and is held in self.line.
    Reading stops at a feature table marker or at a sequence section
    header; that terminating line is left (stripped) in self.line and is
    not part of the returned list.

    Raises ValueError if the file ends, or a "//" line is met, first.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found header table")
            break
        # A record without a feature table runs straight into the
        # sequence section.
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
125
191
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR"
                         /codon_start=1
                         /pseudo

    the input should be feature_key="CDS" and the rest of the data as a
    list of strings with the leading spaces and trailing newlines removed:

        lines=["complement(join(490883..490885,1..879))",
               '/locus_tag="NEQ001"', ..., "/pseudo"]

    Empty strings in lines are ignored.

    Returns (key as string, location string, qualifiers as list), where
    qualifiers is a list of (name, value) string tuples in file order:

        [('locus_tag', '"NEQ001"'),
         ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:Uncharacterized ACR"'),
         ('codon_start', '1'),
         ('pseudo', None)]

    A qualifier with no "=" gets the value None, and one with nothing
    after the "=" gets "". Multi-line values are joined with new line
    characters. If a qualifier is quoted then the quotes are NOT removed,
    and no whitespace is removed from the values.

    Raises ValueError if the lines run out in the middle of a location or
    a quoted value.
    """
    iterator = iter([entry for entry in lines if entry])
    try:
        line = next(iterator)

        feature_location = line.strip()
        # A location ending in a comma continues on the next line.
        while feature_location[-1:] == ",":
            line = next(iterator)
            feature_location += line.strip()

        qualifiers = []

        for position, line in enumerate(iterator):
            if position == 0 and line.startswith(")"):
                # Closing bracket of the location wrapped onto its own line.
                feature_location += line.strip()
            elif line[0] == "/":
                equals = line.find("=")
                key = line[1:equals]
                value = line[equals + 1:]
                if equals == -1:
                    # Flag style qualifier, e.g. /pseudo
                    key = line[1:]
                    qualifiers.append((key, None))
                elif not value:
                    # e.g. /note=
                    qualifiers.append((key, ""))
                elif value[0] == '"':
                    if value != '"':
                        # Pull in lines until one ends with a quote. A
                        # value already ending in a quote skips the loop.
                        while value[-1] != '"':
                            value += "\n" + next(iterator)
                    elif self.debug:
                        # Only the opening quote is on this line; the rest
                        # arrives via the continuation branch below.
                        print("Quoted line %s:%s" % (key, value))
                    # The quotes are deliberately kept.
                    qualifiers.append((key, value))
                else:
                    # Unquoted value, e.g. /codon_start=1
                    qualifiers.append((key, value))
            else:
                # Continuation of the previous qualifier's value.
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        # Ran out of lines while expecting more.
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
301
322
"""Handle the LOCUS/ID line, passing data to the consumer.

This should be implemented by the EMBL / GenBank specific subclass.

Used by the parse_records() and parse() methods.
"""
pass
331
"""Handle the header lines (list of strings), passing data to the consumer.

This should be implemented by the EMBL / GenBank specific subclass.

Used by the parse_records() and parse() methods.
"""
pass
340
341
343 """Handle the feature table (list of tuples), passing data to the comsumer
344
345 Used by the parse_records() and parse() methods.
346 """
347 consumer.start_feature_table()
348 for feature_key, location_string, qualifiers in feature_tuples:
349 consumer.feature_key(feature_key)
350 consumer.location(location_string)
351 for q_key, q_value in qualifiers:
352 if q_value is None:
353 consumer.feature_qualifier(q_key, q_value)
354 else:
355 consumer.feature_qualifier(q_key, q_value.replace("\n"," "))
356
357
"""Handle any lines between features and sequence (list of strings), passing data to the consumer.

This should be implemented by the EMBL / GenBank specific subclass.

Used by the parse_records() and parse() methods.
"""
pass
366
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is then set to None)
    """
    self.set_handle(handle)
    if not self.find_start():
        # No record here; signal that through the consumer as well.
        consumer.data = None
        return False

    # find_start() left the ID/LOCUS line in self.line.
    self._feed_first_line(consumer, self.line)
    self._feed_header_lines(consumer, self.parse_header())

    # The feature table is always read; skip=True just avoids parsing it
    # and nothing is passed to the consumer.
    if do_features:
        feature_tuples = self.parse_features(skip=False)
        self._feed_feature_table(consumer, feature_tuples)
    else:
        self.parse_features(skip=True)

    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    consumer.record_end("//")

    # Sanity check: the scanner should now be on the record terminator.
    assert self.line == "//"

    return True
416
417 - def parse(self, handle, do_features=True):
432
433
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    # parse() returns None once there are no more records in the handle.
    while True:
        record = self.parse(handle, do_features)
        if record is None:
            break
        # Sanity checks that the record was given real identifiers.
        assert record.id is not None
        assert record.name != "<unknown name>"
        assert record.description != "<unknown description>"
        yield record
451
455 """Returns SeqRecord object iterator
456
457 Each CDS feature becomes a SeqRecord.
458
459 alphabet - Used for any sequence found in a translation field.
460 tags2id - Tupple of three strings, the feature keys to use
461 for the record id, name and description,
462
463 This method is intended for use in Bio.SeqIO
464 """
465 self.set_handle(handle)
466 while self.find_start():
467
468 self.parse_header()
469 feature_tuples = self.parse_features()
470
471 while True:
472 line = self.handle.readline()
473 if not line : break
474 if line[:2]=="//" : break
475 self.line = line.rstrip()
476
477
478 for key, location_string, qualifiers in feature_tuples:
479 if key=="CDS":
480
481
482
483
484
485 record = SeqRecord(seq=None)
486 annotations = record.annotations
487
488
489
490
491 annotations['raw_location'] = location_string.replace(' ','')
492
493 for (qualifier_name, qualifier_data) in qualifiers:
494 if qualifier_data is not None \
495 and qualifier_data[0]=='"' and qualifier_data[-1]=='"':
496
497 qualifier_data = qualifier_data[1:-1]
498
499 if qualifier_name == "translation":
500 assert record.seq is None, "Multiple translations!"
501 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
502 elif qualifier_name == "db_xref":
503
504 record.dbxrefs.append(qualifier_data)
505 else:
506 if qualifier_data is not None:
507 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
508 try:
509 annotations[qualifier_name] += " " + qualifier_data
510 except KeyError:
511
512 annotations[qualifier_name]= qualifier_data
513
514
515
516 try:
517 record.id = annotations[tags2id[0]]
518 except KeyError:
519 pass
520 try:
521 record.name = annotations[tags2id[1]]
522 except KeyError:
523 pass
524 try:
525 record.description = annotations[tags2id[2]]
526 except KeyError:
527 pass
528
529 yield record
530
531
533 """For extracting chunks of information in EMBL files"""
534
# EMBL lines start with a two letter code padded to five columns.
# RECORD_START is compared with the first HEADER_WIDTH characters of a line
# and the feature table header is matched as a whole line, so the padding
# inside these strings is significant. It is built with ljust() so the
# column positions are explicit.
HEADER_WIDTH = 5
RECORD_START = "ID".ljust(HEADER_WIDTH)
FEATURE_QUALIFIER_INDENT = 21
FEATURE_START_MARKERS = [
    "FH   Key".ljust(FEATURE_QUALIFIER_INDENT) + "Location/Qualifiers",
    "FH",
]
FEATURE_END_MARKERS = ["XX"]
FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
SEQUENCE_HEADERS = ["SQ", "CO"]
542
577
588
590
591
592
593 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
594 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
595 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
596 fields = [entry.strip() for entry in fields]
597 """
598 The tokens represent:
599 0. Primary accession number
600 (space sep)
601 1. ??? (e.g. standard)
602 (semi-colon)
603 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
604 3. Taxonomic division (e.g. 'PRO')
605 4. Sequence length (e.g. '4639675 BP.')
606 """
607 consumer.locus(fields[0])
608 consumer.residue_type(fields[2])
609 consumer.data_file_division(fields[3])
610 self._feed_seq_length(consumer, fields[4])
611
613
614
615
616 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
617 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
618 assert len(fields) == 7
619 """
620 The tokens represent:
621 0. Primary accession number
622 1. Sequence version number
623 2. Topology: 'circular' or 'linear'
624 3. Molecule type (e.g. 'genomic DNA')
625 4. Data class (e.g. 'STD')
626 5. Taxonomic division (e.g. 'PRO')
627 6. Sequence length (e.g. '4639675 BP.')
628 """
629
630 consumer.locus(fields[0])
631
632
633
634 consumer.accession(fields[0])
635
636
637
638 version_parts = fields[1].split()
639 if len(version_parts)==2 \
640 and version_parts[0]=="SV" \
641 and version_parts[1].isdigit():
642 consumer.version_suffix(version_parts[1])
643
644
645 consumer.residue_type(" ".join(fields[2:4]))
646
647
648
649 consumer.data_file_division(fields[5])
650
651 self._feed_seq_length(consumer, fields[6])
652
654 length_parts = text.split()
655 assert len(length_parts) == 2
656 assert length_parts[1].upper() in ["BP", "BP.", "AA."]
657 consumer.size(length_parts[0])
658
660 EMBL_INDENT = self.HEADER_WIDTH
661 EMBL_SPACER = " " * EMBL_INDENT
662 consumer_dict = {
663 'AC' : 'accession',
664 'SV' : 'version',
665 'DE' : 'definition',
666
667
668
669
670 'RG' : 'consrtm',
671
672
673 'RL' : 'journal',
674 'OS' : 'organism',
675 'OC' : 'taxonomy',
676
677 'CC' : 'comment',
678
679 }
680
681
682 for line in lines:
683 line_type = line[:EMBL_INDENT].strip()
684 data = line[EMBL_INDENT:].strip()
685 if line_type == 'XX':
686 pass
687 elif line_type == 'RN':
688
689
690 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
691 consumer.reference_num(data)
692 elif line_type == 'RP':
693
694
695
696 parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
697 consumer.reference_bases("(bases %s)" % "; ".join(parts))
698 elif line_type == 'RT':
699
700
701 if data.startswith('"'):
702 data = data[1:]
703 if data.endswith('";'):
704 data = data[:-2]
705 consumer.title(data)
706 elif line_type == 'RX':
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722 key, value = data.split(";",1)
723 if value.endswith(".") : value = value[:-1]
724 value = value.strip()
725 if key == "PUBMED":
726 consumer.pubmed_id(value)
727
728 elif line_type == 'CC':
729
730 consumer.comment([data])
731 elif line_type == 'DR':
732
733
734
735
736
737
738
739 parts = data.rstrip(".").split(";")
740
741
742 consumer.dblink("%s:%s" % (parts[0].strip(),
743 parts[1].strip()))
744 elif line_type == 'RA':
745
746 consumer.authors(data.rstrip(";"))
747 elif line_type == 'PR':
748
749
750
751 consumer.project(data.rstrip(";"))
752 elif line_type in consumer_dict:
753
754 getattr(consumer, consumer_dict[line_type])(data)
755 else:
756 if self.debug:
757 print "Ignoring EMBL header line:\n%s" % line
758
760
761 lines.append("")
762 line_iter = iter(lines)
763 try:
764 for line in line_iter:
765 if line.startswith("CO "):
766 line = line[5:].strip()
767 contig_location = line
768 while True:
769 line = line_iter.next()
770 if not line:
771 break
772 elif line.startswith("CO "):
773
774 contig_location += line[5:].strip()
775 else:
776 raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
777 consumer.contig_location(contig_location)
778 return
779 except StopIteration:
780 raise ValueError("Problem in misc lines before sequence")
781
782
784 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
785
786 IMGT files are like EMBL files but in order to allow longer feature types
787 the features should be indented by 25 characters not 21 characters. In
788 practice the IMGT flat files tend to use either 21 or 25 characters, so we
789 must cope with both.
790
791 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
792 """
793
# IMGT feature tables are headed either in the EMBL layout (location in
# column 21) or in the wider IMGT layout (location in column 25). The
# header is matched as a whole line, so the padding is significant and is
# built with ljust() to keep the two variants distinct.
FEATURE_START_MARKERS = ["FH   Key".ljust(21) + "Location/Qualifiers",
                         "FH   Key".ljust(21) + "Location/Qualifiers (from EMBL)",
                         "FH   Key".ljust(25) + "Location/Qualifiers",
                         "FH"]
798
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present)

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.
    If self.line is not a feature table header an empty list is returned.
    With skip=True the table is read past and the list stays empty.

    On return self.line holds the line that ended the table: the sequence
    section header, or the line following the end-of-features marker.

    Raises ValueError if the file ends, or "//" is met, inside the table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Step over the header line(s) of the table.
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    # Matches a position written as "123>".
    bad_position_re = re.compile(r'([0-9]+)>{1}')

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            # No feature key in the key columns, so this is not the start
            # of a feature. Skip it.
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            assert line[:2] == "FT"
            try:
                feature_key, location_start = line[2:].strip().split()
            except ValueError:
                # Key and location are not separated by whitespace (or
                # the location contains some). Fall back to a fixed split
                # at column 25, the wider IMGT indent.
                feature_key = line[2:25].strip()
                location_start = line[25:].strip()
            feature_lines = [location_start]
            line = self.handle.readline()
            # Gather continuation lines. The "line and" guard stops at end
            # of file, so the ValueError at the top of the outer loop is
            # raised instead of the assert below firing on "".
            while line and (line[:indent] == spacer
                            or line.rstrip() == ""):
                assert line[:2] == "FT"
                # strip() copes with both the 21 and 25 column indents.
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            feature_key, location, qualifiers = \
                self.parse_feature(feature_key, feature_lines)
            if ">" in location:
                # Rewrite positions written as "123>" to ">123".
                location = bad_position_re.sub(r'>\1', location)
            features.append((feature_key, location, qualifiers))
    self.line = line
    return features
876
878 """For extracting chunks of information in GenBank files"""
879
# GenBank header keywords are padded to twelve columns.
# RECORD_START is compared with the first HEADER_WIDTH characters of a line
# and the FEATURES header is matched as a whole line, so the padding inside
# these strings is significant. It is built with ljust() so the column
# positions are explicit.
HEADER_WIDTH = 12
RECORD_START = "LOCUS".ljust(HEADER_WIDTH)
FEATURE_QUALIFIER_INDENT = 21
FEATURE_START_MARKERS = [
    "FEATURES".ljust(FEATURE_QUALIFIER_INDENT) + "Location/Qualifiers",
    "FEATURES",
]
# There is no explicit end marker; the feature table runs up to the
# sequence section.
FEATURE_END_MARKERS = []
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
887
931
933 """Scan over and parse GenBank LOCUS line (PRIVATE).
934
935 This must cope with several variants, primarily the old and new column
936 based standards from GenBank. Additionally EnsEMBL produces GenBank
937 files where the LOCUS line is space separated rather that following
938 the column based layout.
939
940 We also try to cope with GenBank like files with partial LOCUS lines.
941 """
942
943
944
945 GENBANK_INDENT = self.HEADER_WIDTH
946 GENBANK_SPACER = " "*GENBANK_INDENT
947 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
948 'LOCUS line does not start correctly:\n' + line
949
950
951
952 if line[29:33] in [' bp ', ' aa ',' rc '] and line[55:62] == ' ':
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974 assert line[41:42] == ' ', \
975 'LOCUS line does not contain space at position 42:\n' + line
976 assert line[42:51].strip() in ['','linear','circular'], \
977 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
978 assert line[51:52] == ' ', \
979 'LOCUS line does not contain space at position 52:\n' + line
980
981
982 if line[62:73].strip():
983 assert line[64:65] == '-', \
984 'LOCUS line does not contain - at position 65 in date:\n' + line
985 assert line[68:69] == '-', \
986 'LOCUS line does not contain - at position 69 in date:\n' + line
987
988 name_and_length_str = line[GENBANK_INDENT:29]
989 while name_and_length_str.find(' ')!=-1:
990 name_and_length_str = name_and_length_str.replace(' ',' ')
991 name_and_length = name_and_length_str.split(' ')
992 assert len(name_and_length)<=2, \
993 'Cannot parse the name and length in the LOCUS line:\n' + line
994 assert len(name_and_length)!=1, \
995 'Name and length collide in the LOCUS line:\n' + line
996
997
998
999 consumer.locus(name_and_length[0])
1000 consumer.size(name_and_length[1])
1001
1002
1003 if line[33:51].strip() == "" and line[29:33] == ' aa ':
1004
1005
1006
1007
1008 consumer.residue_type("PROTEIN")
1009 else:
1010 consumer.residue_type(line[33:51].strip())
1011
1012 consumer.data_file_division(line[52:55])
1013 if line[62:73].strip():
1014 consumer.date(line[62:73])
1015 elif line[40:44] in [' bp ', ' aa ',' rc '] \
1016 and line[54:64].strip() in ['','linear','circular']:
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
1038 'LOCUS line does not contain size units at expected position:\n' + line
1039 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
1040 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
1041 assert line[47:54].strip() == "" \
1042 or line[47:54].strip().find('DNA') != -1 \
1043 or line[47:54].strip().find('RNA') != -1, \
1044 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
1045 assert line[54:55] == ' ', \
1046 'LOCUS line does not contain space at position 55:\n' + line
1047 assert line[55:63].strip() in ['','linear','circular'], \
1048 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
1049 assert line[63:64] == ' ', \
1050 'LOCUS line does not contain space at position 64:\n' + line
1051 assert line[67:68] == ' ', \
1052 'LOCUS line does not contain space at position 68:\n' + line
1053 if line[68:79].strip():
1054 assert line[70:71] == '-', \
1055 'LOCUS line does not contain - at position 71 in date:\n' + line
1056 assert line[74:75] == '-', \
1057 'LOCUS line does not contain - at position 75 in date:\n' + line
1058
1059 name_and_length_str = line[GENBANK_INDENT:40]
1060 while name_and_length_str.find(' ')!=-1:
1061 name_and_length_str = name_and_length_str.replace(' ',' ')
1062 name_and_length = name_and_length_str.split(' ')
1063 assert len(name_and_length)<=2, \
1064 'Cannot parse the name and length in the LOCUS line:\n' + line
1065 assert len(name_and_length)!=1, \
1066 'Name and length collide in the LOCUS line:\n' + line
1067
1068
1069
1070 consumer.locus(name_and_length[0])
1071 consumer.size(name_and_length[1])
1072
1073 if line[44:54].strip() == "" and line[40:44] == ' aa ':
1074
1075
1076
1077
1078 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
1079 else:
1080 consumer.residue_type(line[44:63].strip())
1081
1082 consumer.data_file_division(line[64:67])
1083 if line[68:79].strip():
1084 consumer.date(line[68:79])
1085 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101 if line[GENBANK_INDENT:].strip() != "":
1102 consumer.locus(line[GENBANK_INDENT:].strip())
1103 else:
1104
1105
1106 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line)
1107 elif len(line.split())==7 and line.split()[3] in ["aa","bp"]:
1108
1109
1110
1111
1112
1113
1114
1115
1116 splitline = line.split()
1117 consumer.locus(splitline[1])
1118 consumer.size(splitline[2])
1119 consumer.residue_type(splitline[4])
1120 consumer.data_file_division(splitline[5])
1121 consumer.date(splitline[6])
1122 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]:
1123
1124
1125 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1126 consumer.locus(line.split()[1])
1127 consumer.size(line.split()[2])
1128 elif len(line.split())>=4 and line.split()[-1] in ["aa","bp"]:
1129
1130
1131
1132 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1133 consumer.locus(line[5:].rsplit(None,2)[0].strip())
1134 consumer.size(line.split()[-2])
1135 else:
1136 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1137
1138
1140
1141
1142
1143
1144 GENBANK_INDENT = self.HEADER_WIDTH
1145 GENBANK_SPACER = " "*GENBANK_INDENT
1146 consumer_dict = {
1147 'DEFINITION' : 'definition',
1148 'ACCESSION' : 'accession',
1149 'NID' : 'nid',
1150 'PID' : 'pid',
1151 'DBSOURCE' : 'db_source',
1152 'KEYWORDS' : 'keywords',
1153 'SEGMENT' : 'segment',
1154 'SOURCE' : 'source',
1155 'AUTHORS' : 'authors',
1156 'CONSRTM' : 'consrtm',
1157 'PROJECT' : 'project',
1158 'DBLINK' : 'dblink',
1159 'TITLE' : 'title',
1160 'JOURNAL' : 'journal',
1161 'MEDLINE' : 'medline_id',
1162 'PUBMED' : 'pubmed_id',
1163 'REMARK' : 'remark'}
1164
1165
1166
1167
1168
1169
1170 lines = filter(None,lines)
1171 lines.append("")
1172 line_iter = iter(lines)
1173 try:
1174 line = line_iter.next()
1175 while True:
1176 if not line : break
1177 line_type = line[:GENBANK_INDENT].strip()
1178 data = line[GENBANK_INDENT:].strip()
1179
1180 if line_type == 'VERSION':
1181
1182
1183
1184 while data.find(' ')!=-1:
1185 data = data.replace(' ',' ')
1186 if data.find(' GI:')==-1:
1187 consumer.version(data)
1188 else:
1189 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
1190 consumer.version(data.split(' GI:')[0])
1191 consumer.gi(data.split(' GI:')[1])
1192
1193 line = line_iter.next()
1194 elif line_type == 'REFERENCE':
1195 if self.debug >1 : print "Found reference [" + data + "]"
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206 data = data.strip()
1207
1208
1209 while True:
1210 line = line_iter.next()
1211 if line[:GENBANK_INDENT] == GENBANK_SPACER:
1212
1213 data += " " + line[GENBANK_INDENT:]
1214 if self.debug >1 : print "Extended reference text [" + data + "]"
1215 else:
1216
1217 break
1218
1219
1220
1221 while data.find(' ')!=-1:
1222 data = data.replace(' ',' ')
1223 if data.find(' ')==-1:
1224 if self.debug >2 : print 'Reference number \"' + data + '\"'
1225 consumer.reference_num(data)
1226 else:
1227 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
1228 consumer.reference_num(data[:data.find(' ')])
1229 consumer.reference_bases(data[data.find(' ')+1:])
1230 elif line_type == 'ORGANISM':
1231
1232
1233
1234
1235
1236
1237
1238
1239 organism_data = data
1240 lineage_data = ""
1241 while True:
1242 line = line_iter.next()
1243 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1244 if lineage_data or ";" in line:
1245 lineage_data += " " + line[GENBANK_INDENT:]
1246 else:
1247 organism_data += " " + line[GENBANK_INDENT:].strip()
1248 else:
1249
1250 break
1251 consumer.organism(organism_data)
1252 if lineage_data.strip() == "" and self.debug > 1:
1253 print "Taxonomy line(s) missing or blank"
1254 consumer.taxonomy(lineage_data.strip())
1255 del organism_data, lineage_data
1256 elif line_type == 'COMMENT':
1257 if self.debug > 1 : print "Found comment"
1258
1259
1260 comment_list=[]
1261 comment_list.append(data)
1262 while True:
1263 line = line_iter.next()
1264 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1265 data = line[GENBANK_INDENT:]
1266 comment_list.append(data)
1267 if self.debug > 2 : print "Comment continuation [" + data + "]"
1268 else:
1269
1270 break
1271 consumer.comment(comment_list)
1272 del comment_list
1273 elif line_type in consumer_dict:
1274
1275
1276 while True:
1277 line = line_iter.next()
1278 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1279 data += ' ' + line[GENBANK_INDENT:]
1280 else:
1281
1282 getattr(consumer, consumer_dict[line_type])(data)
1283
1284 break
1285 else:
1286 if self.debug:
1287 print "Ignoring GenBank header line:\n" % line
1288
1289 line = line_iter.next()
1290 except StopIteration:
1291 raise ValueError("Problem in header")
1292
1333
1334 if __name__ == "__main__":
1335 from StringIO import StringIO
1336
1337 gbk_example = \
1338 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1339 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1340 (AXL2) and Rev7p (REV7) genes, complete cds.
1341 ACCESSION U49845
1342 VERSION U49845.1 GI:1293613
1343 KEYWORDS .
1344 SOURCE Saccharomyces cerevisiae (baker's yeast)
1345 ORGANISM Saccharomyces cerevisiae
1346 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1347 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1348 REFERENCE 1 (bases 1 to 5028)
1349 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1350 TITLE Cloning and sequence of REV7, a gene whose function is required for
1351 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1352 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1353 PUBMED 7871890
1354 REFERENCE 2 (bases 1 to 5028)
1355 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1356 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1357 plasma membrane glycoprotein
1358 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1359 PUBMED 8846915
1360 REFERENCE 3 (bases 1 to 5028)
1361 AUTHORS Roemer,T.
1362 TITLE Direct Submission
1363 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1364 Haven, CT, USA
1365 FEATURES Location/Qualifiers
1366 source 1..5028
1367 /organism="Saccharomyces cerevisiae"
1368 /db_xref="taxon:4932"
1369 /chromosome="IX"
1370 /map="9"
1371 CDS <1..206
1372 /codon_start=3
1373 /product="TCP1-beta"
1374 /protein_id="AAA98665.1"
1375 /db_xref="GI:1293614"
1376 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1377 AEVLLRVDNIIRARPRTANRQHM"
1378 gene 687..3158
1379 /gene="AXL2"
1380 CDS 687..3158
1381 /gene="AXL2"
1382 /note="plasma membrane glycoprotein"
1383 /codon_start=1
1384 /function="required for axial budding pattern of S.
1385 cerevisiae"
1386 /product="Axl2p"
1387 /protein_id="AAA98666.1"
1388 /db_xref="GI:1293615"
1389 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1390 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1391 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1392 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1393 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1394 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1395 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1396 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1397 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1398 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1399 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1400 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1401 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1402 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1403 VDFSNKSNVNVGQVKDIHGRIPEML"
1404 gene complement(3300..4037)
1405 /gene="REV7"
1406 CDS complement(3300..4037)
1407 /gene="REV7"
1408 /codon_start=1
1409 /product="Rev7p"
1410 /protein_id="AAA98667.1"
1411 /db_xref="GI:1293616"
1412 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1413 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1414 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1415 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1416 LISGDDKILNGVYSQYEEGESIFGSLF"
1417 ORIGIN
1418 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1419 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1420 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1421 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1422 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1423 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1424 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1425 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1426 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1427 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1428 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1429 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1430 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1431 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1432 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1433 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1434 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1435 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1436 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1437 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1438 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1439 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1440 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1441 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1442 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1443 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1444 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1445 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1446 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1447 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1448 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1449 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1450 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1451 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1452 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1453 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1454 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1455 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1456 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1457 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1458 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1459 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1460 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1461 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1462 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1463 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1464 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1465 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1466 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1467 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1468 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1469 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1470 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1471 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1472 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1473 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1474 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1475 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1476 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1477 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1478 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1479 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1480 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1481 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1482 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1483 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1484 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1485 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1486 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1487 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1488 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1489 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1490 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1491 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1492 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1493 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1494 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1495 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1496 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1497 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1498 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1499 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1500 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1501 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1502 //"""
1503
1504
1505
1506 gbk_example2 = \
1507 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1508 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1509 ACCESSION AAD51968
1510 VERSION AAD51968.1 GI:5805369
1511 DBSOURCE locus AF171097 accession AF171097.1
1512 KEYWORDS .
1513 SOURCE Yersinia enterocolitica
1514 ORGANISM Yersinia enterocolitica
1515 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1516 Enterobacteriaceae; Yersinia.
1517 REFERENCE 1 (residues 1 to 143)
1518 AUTHORS Revell,P.A. and Miller,V.L.
1519 TITLE A chromosomally encoded regulator is required for expression of the
1520 Yersinia enterocolitica inv gene and for virulence
1521 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1522 MEDLINE 20138369
1523 PUBMED 10672189
1524 REFERENCE 2 (residues 1 to 143)
1525 AUTHORS Revell,P.A. and Miller,V.L.
1526 TITLE Direct Submission
1527 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1528 University School of Medicine, Campus Box 8230, 660 South Euclid,
1529 St. Louis, MO 63110, USA
1530 COMMENT Method: conceptual translation.
1531 FEATURES Location/Qualifiers
1532 source 1..143
1533 /organism="Yersinia enterocolitica"
1534 /mol_type="unassigned DNA"
1535 /strain="JB580v"
1536 /serotype="O:8"
1537 /db_xref="taxon:630"
1538 Protein 1..143
1539 /product="transcriptional regulator RovA"
1540 /name="regulates inv expression"
1541 CDS 1..143
1542 /gene="rovA"
1543 /coded_by="AF171097.1:380..811"
1544 /note="regulator of virulence"
1545 /transl_table=11
1546 ORIGIN
1547 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1548 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1549 121 deiellsgli dklerniiql qsk
1550 //
1551 """
1552
1553 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1554 XX
1555 AC X56734; S46826;
1556 XX
1557 DT 12-SEP-1991 (Rel. 29, Created)
1558 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1559 XX
1560 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1561 XX
1562 KW beta-glucosidase.
1563 XX
1564 OS Trifolium repens (white clover)
1565 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1566 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1567 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1568 XX
1569 RN [5]
1570 RP 1-1859
1571 RX PUBMED; 1907511.
1572 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1573 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1574 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1575 RL Plant Mol. Biol. 17(2):209-219(1991).
1576 XX
1577 RN [6]
1578 RP 1-1859
1579 RA Hughes M.A.;
1580 RT ;
1581 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1582 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1583 RL Upon Tyne, NE2 4HH, UK
1584 XX
1585 FH Key Location/Qualifiers
1586 FH
1587 FT source 1..1859
1588 FT /organism="Trifolium repens"
1589 FT /mol_type="mRNA"
1590 FT /clone_lib="lambda gt10"
1591 FT /clone="TRE361"
1592 FT /tissue_type="leaves"
1593 FT /db_xref="taxon:3899"
1594 FT CDS 14..1495
1595 FT /product="beta-glucosidase"
1596 FT /EC_number="3.2.1.21"
1597 FT /note="non-cyanogenic"
1598 FT /db_xref="GOA:P26204"
1599 FT /db_xref="InterPro:IPR001360"
1600 FT /db_xref="InterPro:IPR013781"
1601 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1602 FT /protein_id="CAA40058.1"
1603 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1604 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1605 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1606 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1607 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1608 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1609 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1610 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1611 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1612 FT mRNA 1..1859
1613 FT /experiment="experimental evidence, no additional details
1614 FT recorded"
1615 XX
1616 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1617 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1618 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1619 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1620 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1621 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1622 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1623 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1624 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1625 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1626 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1627 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1628 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1629 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1630 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1631 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1632 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1633 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1634 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1635 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1636 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1637 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1638 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1639 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1640 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1641 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1642 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1643 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1644 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1645 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1646 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1647 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1648 //
1649 """
1650
# Self-test, part 1: run the GenBank scanner's CDS-feature parser over the
# example records and print whatever it yields.
print("GenBank CDS Iteration")
print("=====================")

# Each entry is (input text, keyword arguments for parse_cds_features).
# The first run passes no tags2id; the other two pass it explicitly.
# The last input is both examples joined with a newline.
id_tags = ('gene', 'locus_tag', 'product')
cds_runs = [
    (gbk_example, {}),
    (gbk_example2, {'tags2id': id_tags}),
    (gbk_example + "\n" + gbk_example2, {'tags2id': id_tags}),
]
for text, options in cds_runs:
    scanner = GenBankScanner()  # a fresh scanner for every input
    for cds_record in scanner.parse_cds_features(StringIO(text), **options):
        print(cds_record)
1667
# Self-test, part 2: parse each GenBank example as whole records, first with
# do_features=False and then with do_features=True, printing the id, name,
# description and sequence of every record.
print("")
print("GenBank Iteration")
print("=================")
for text in (gbk_example, gbk_example2):
    for with_features in (False, True):
        scanner = GenBankScanner()  # a fresh scanner for every pass
        handle = StringIO(text)
        for rec in scanner.parse_records(handle, do_features=with_features):
            print("%s %s %s" % (rec.id, rec.name, rec.description))
            print(rec.seq)
1690
# Self-test, part 3: run the EMBL scanner's CDS-feature parser over the
# EMBL example and print whatever it yields.
print("")
print("EMBL CDS Iteration")
print("==================")

embl_scanner = EmblScanner()
embl_handle = StringIO(embl_example)
for cds_record in embl_scanner.parse_cds_features(embl_handle):
    print(cds_record)
1698
# Self-test, part 4: parse the EMBL example as whole records with
# do_features=True, printing the id, name, description and sequence of each.
print("")
print("EMBL Iteration")
print("==============")
embl_scanner = EmblScanner()
embl_handle = StringIO(embl_example)
for rec in embl_scanner.parse_records(embl_handle, do_features=True):
    print("%s %s %s" % (rec.id, rec.name, rec.description))
    print(rec.seq)
1706