1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import warnings
21 from Bio import BiopythonDeprecationWarning
22
# Emit the deprecation notice when this module is imported, pointing users at
# the replacement parser named in the message. Each fragment ends with a
# trailing space so the concatenated sentences do not run together.
warnings.warn("The module Bio.UniGene.UniGene is now deprecated, "
              "and will be removed in a future release of Biopython. "
              "To parse UniGene flat files, please use the parser in "
              "Bio.UniGene instead",
              BiopythonDeprecationWarning)
28
29 import string
30 import sgmllib
31 import UserDict
32 import Bio.File
33
34
36
38 sgmllib.SGMLParser.reset( self )
39 self.text = ''
40 self.queue = UserDict.UserDict()
41 self.open_tag_stack = []
42 self.open_tag = 'open_html'
43 self.key_waiting = ''
44 self.master_key = ''
45 self.context = 'general_info'
46
def parse( self, handle ):
    """Parse the data on handle and return the collected queue.

    The parser state is reset, handle is passed to feed(), and then
    self.queue is tidied before being returned:

    - every entry whose value compares equal to an empty dict is removed;
    - if such an entry's key starts with 'UniGene Cluster', the rest of
      the key (from index 16 on) is first stored under the key
      'UniGene Cluster'.

    Returns self.queue (the same object, modified in place).
    """
    self.reset()
    self.feed( handle )
    # Walk a snapshot of the keys: the loop body inserts and deletes
    # entries, which is not safe while iterating the mapping itself, and
    # a plain UserDict.UserDict does not support direct iteration.
    # NOTE(review): the nesting of the 'del' below (applies to every empty
    # entry, not only 'UniGene Cluster' ones) was reconstructed from a
    # copy without indentation -- confirm against the repository.
    for key in list( self.queue.keys() ):
        if( self.queue[ key ] == {} ):
            if( key[ :15 ] == 'UniGene Cluster' ):
                self.queue[ 'UniGene Cluster' ] = key[ 16: ]
            del self.queue[ key ]
    return self.queue
56
57
58
59
60 - def feed( self, handle ):
74
76 newtext = string.strip( newtext )
77 self.text = self.text + newtext
78
80 if( self.context == 'seq_info' ):
81 if( self.open_tag != 'open_b' ):
82 self.text = ''
83
84
85
87 if( self.context == 'seq_info' ):
88 if( self.open_tag != 'open_b' ):
89 if( self.key_waiting == '' ):
90 self.key_waiting = self.text
91 self.text = ''
92
94
95 self.open_tag_stack.append( self.open_tag )
96 self.open_tag = 'open_b'
97 if( self.key_waiting == '' ):
98 self.text = ''
99
101 if( self.text[ :15 ] == 'UniGene Cluster' ):
102 self.queue[ 'UniGene Cluster' ] = self.text[ 16: ]
103 self.text = ''
104 elif( self.key_waiting == '' ):
105 self.extract_key()
106
108 text = string.strip( self.text )
109 key = string.join( string.split( text ) )
110 words = string.split( key )
111 key = string.join( words[ :2 ] )
112 self.text = ''
113
114 try:
115 self.open_tag = self.open_tag_stack.pop()
116 except:
117 self.open_tag = 'open_html'
118 if( self.open_tag == 'open_table_data' ):
119 if( self.context == 'general_info' ):
120 if( self.key_waiting == '' ):
121 self.key_waiting = key
122 self.text = ''
123 elif( self.context == 'seq_info' ):
124 if( text == 'Key to Symbols' ):
125 self.context = 'legend'
126 self.master_key = key
127 elif( self.context == 'general_info' ):
128 self.master_key = key
129 if 'SEQUENCE' in key:
130 self.context = 'seq_info'
131 self.queue[ key ] = UserDict.UserDict()
132 elif( self.context == 'seq_info' ):
133 self.queue[ key ] = UserDict.UserDict()
134 self.master_key = key
135
137 self.open_tag_stack.append( self.open_tag )
138 self.open_tag = 'open_table'
139
141 try:
142 self.open_tag = self.open_tag_stack.pop()
143 except:
144 self.open_tag = 'open_html'
145 self.key_waiting = ''
146
148 self.open_tag_stack.append( self.open_tag )
149 self.open_tag = 'open_table_row'
150 self.text = ''
151
153 try:
154 self.open_tag = self.open_tag_stack.pop()
155 except:
156 self.open_tag = 'open_html'
157 text = self.text
158 if text:
159 self.text = ''
160 if( text[ 0 ] == ':' ):
161 text = text[ 1: ]
162 text = string.join( string.split( text ) )
163 if self.context == 'general_info' or \
164 self.context == 'seq_info':
165 try:
166 contents = self.queue[ self.master_key ][ self.key_waiting ]
167 if isinstance(contents, list):
168 contents.append( text )
169 else:
170 self.queue[ self.master_key ][ self.key_waiting ] = \
171 [ contents , text ]
172 except:
173 self.queue[ self.master_key ][ self.key_waiting ] = text
174
175 self.key_waiting = ''
176
178 self.open_tag_stack.append( self.open_tag )
179 self.open_tag = 'open_table_data'
180
182 try:
183 self.open_tag = self.open_tag_stack.pop()
184 except:
185 self.open_tag = 'open_html'
186 if( self.context == 'seq_info' ):
187 self.text = self.text + ' '
188
190 indent = ' '
191 for j in range( 0, level ):
192 indent = indent + ' '
193 if isinstance(item, str):
194 if( item != '' ):
195 print '%s%s' % ( indent, item )
196 elif isinstance(item, list):
197 for subitem in item:
198 self.print_item( subitem, level + 1 )
199 elif( isinstance( item, UserDict.UserDict ) ):
200 for subitem in item:
201 print '%skey is %s' % ( indent, subitem )
202 self.print_item( item[ subitem ], level + 1 )
203 else:
204 print item
205
210
211
if( __name__ == '__main__' ):
    # Ad-hoc manual check: parse the file 'Hs13225.htm' from the current
    # working directory and print what the parser collected.
    handle = open( 'Hs13225.htm')
    try:
        # Parse through the UndoHandle wrapper rather than the raw file
        # object, so the wrapper that is built here is actually used.
        undo_handle = Bio.File.UndoHandle( handle )
        unigene_parser = UniGeneParser()
        unigene_parser.parse( undo_handle )
        unigene_parser.print_tags()
    finally:
        # Release the file even if parsing or printing raises.
        handle.close()
218