Package Bio :: Package UniGene :: Module UniGene
[hide private]
[frames | no frames]

Source Code for Module Bio.UniGene.UniGene

  1  # Copyright 2001 by Katharine Lindner. All Rights Reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from __future__ import print_function 
  7   
  8  import warnings 
  9  from Bio import BiopythonDeprecationWarning 
 10   
 11  warnings.warn("The module Bio.UniGene.UniGene is now deprecated, " 
 12                "and will be removed in a future release of Biopython." 
 13                "To parse UniGene flat files, please use the parser in " 
 14                "Bio.UniGene instead", 
 15                BiopythonDeprecationWarning) 
 16   
 17  import string 
 18  import sgmllib 
 19  import UserDict 
 20  import Bio.File 
 21   
 22   
class UniGeneParser(sgmllib.SGMLParser):
    """Collect the text of a UniGene HTML record into nested dictionaries.

    The parser reacts to ``<a>``, ``<b>``, ``<table>``, ``<tr>`` and ``<td>``
    tags.  Bold text supplies section names (``master_key``) and field names
    (``key_waiting``); the text accumulated for a table row is stored as
    ``queue[master_key][key_waiting]`` when the row closes.

    Call :meth:`parse` with a file-like handle; it returns ``self.queue``.
    """

    def reset(self):
        """Reset the SGML parser and clear all per-record state."""
        sgmllib.SGMLParser.reset(self)
        # Character data accumulated since the last point it was consumed.
        self.text = ''
        # Parsed result: section name -> UserDict of field name -> value(s).
        self.queue = UserDict.UserDict()
        # Enclosing tags, innermost last; ``open_tag`` is the current one.
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        # Field name waiting for the end of its table row ('' when none).
        self.key_waiting = ''
        # Section that newly completed rows are stored under.
        self.master_key = ''
        # One of 'general_info', 'seq_info' or 'legend'.
        self.context = 'general_info'

    def parse(self, handle):
        """Parse one record from *handle* and return the result mapping.

        Sections that stayed empty are removed.  If such a section's name
        starts with 'UniGene Cluster', the remainder of the name is kept as
        the value of the 'UniGene Cluster' entry.
        """
        self.reset()
        self.feed(handle)
        # Iterate over a snapshot of the keys: the loop body inserts and
        # deletes entries, and UserDict.UserDict is not directly iterable.
        for key in list(self.queue.keys()):
            if self.queue[key] == {}:
                if key[:15] == 'UniGene Cluster':
                    self.queue['UniGene Cluster'] = key[16:]
                del self.queue[key]
        return self.queue

    def feed(self, handle):
        """Read one record from *handle* and feed it to the SGML parser.

        Assumes an empty line between records: reading stops at the first
        line that is empty once stripped (which includes end of file).
        Each stripped line is prefixed with a single space before the lines
        are concatenated and passed to ``SGMLParser.feed``.
        """
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        pieces = []
        while True:
            line = uhandle.readline().strip()
            if line == '':
                break
            pieces.append(' ' + line)

        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def _pop_open_tag(self):
        """Make the enclosing tag current again ('open_html' if none is left)."""
        if self.open_tag_stack:
            self.open_tag = self.open_tag_stack.pop()
        else:
            self.open_tag = 'open_html'

    def handle_data(self, newtext):
        """Append stripped character data to the pending text."""
        self.text = self.text + newtext.strip()

    def start_a(self, attrs):
        """In the sequence section, a link outside bold text starts fresh text."""
        if self.context == 'seq_info' and self.open_tag != 'open_b':
            self.text = ''

    def end_a(self):
        """In the sequence section, use the link text as the pending field name.

        Only applies outside bold text and when no field name is waiting.
        """
        if (self.context == 'seq_info'
                and self.open_tag != 'open_b'
                and self.key_waiting == ''):
            self.key_waiting = self.text
            self.text = ''

    def start_b(self, attrs):
        """Enter bold text; discard pending text unless a field name is waiting."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_b'
        if self.key_waiting == '':
            self.text = ''

    def end_b(self):
        """Handle the end of bold text.

        Text starting with 'UniGene Cluster' is stored directly under that
        key; otherwise, if no field name is waiting, the bold text is
        interpreted as a key by :meth:`extract_key`.
        """
        # NOTE(review): the tag pushed by start_b is only popped when
        # extract_key runs; the other paths leave it on the stack.
        if self.text[:15] == 'UniGene Cluster':
            self.queue['UniGene Cluster'] = self.text[16:]
            self.text = ''
        elif self.key_waiting == '':
            self.extract_key()

    def extract_key(self):
        """Turn the pending bold text into a field name or a section name.

        The key is the first two whitespace-separated words of the text.
        After restoring the enclosing tag:

        * inside a table cell, the key becomes the pending field name
          (general section), or the text 'Key to Symbols' switches the
          context to 'legend' (sequence section);
        * elsewhere, the key starts a new section in ``queue``; in the
          general section a key containing 'SEQUENCE' switches the context
          to 'seq_info'.
        """
        text = self.text.strip()
        key = ' '.join(text.split()[:2])
        self.text = ''

        self._pop_open_tag()
        if self.open_tag == 'open_table_data':
            if self.context == 'general_info':
                if self.key_waiting == '':
                    self.key_waiting = key
                    self.text = ''
            elif self.context == 'seq_info':
                if text == 'Key to Symbols':
                    self.context = 'legend'
                    self.master_key = key
        elif self.context == 'general_info':
            self.master_key = key
            if 'SEQUENCE' in key:
                self.context = 'seq_info'
            self.queue[key] = UserDict.UserDict()
        elif self.context == 'seq_info':
            self.queue[key] = UserDict.UserDict()
            self.master_key = key

    def start_table(self, attrs):
        """Enter a table."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table'

    def end_table(self):
        """Leave a table and forget any pending field name."""
        self._pop_open_tag()
        self.key_waiting = ''

    def start_tr(self, attrs):
        """Enter a table row and start collecting its text from scratch."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_row'
        self.text = ''

    def end_tr(self):
        """Leave a table row and store its text under the pending field name.

        A leading ':' is dropped and runs of whitespace are collapsed.  In
        the 'general_info' and 'seq_info' contexts the text is stored as
        ``queue[master_key][key_waiting]``; a second value for the same
        field turns the entry into a list, and later values are appended.
        """
        self._pop_open_tag()
        text = self.text
        if not text:
            return
        self.text = ''
        if text[0] == ':':
            text = text[1:]
        text = ' '.join(text.split())
        if self.context in ('general_info', 'seq_info'):
            section = self.queue[self.master_key]
            try:
                contents = section[self.key_waiting]
            except KeyError:
                # First value seen for this field.
                section[self.key_waiting] = text
            else:
                if isinstance(contents, list):
                    contents.append(text)
                else:
                    section[self.key_waiting] = [contents, text]

            self.key_waiting = ''

    def start_td(self, attrs):
        """Enter a table cell."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_data'

    def end_td(self):
        """Leave a table cell; in the sequence section, separate cells by a space."""
        self._pop_open_tag()
        if self.context == 'seq_info':
            self.text = self.text + ' '

    def print_item(self, item, level=1):
        """Print *item* recursively, indenting by nesting *level*.

        Non-empty strings are printed as they are, lists are printed element
        by element, and UserDict instances are printed as 'key is ...' lines
        followed by their values.  Anything else is printed unindented.
        """
        # One base unit of indentation plus one unit per nesting level.
        indent = ' ' + ' ' * level
        if isinstance(item, str):
            if item != '':
                print('%s%s' % (indent, item))
        elif isinstance(item, list):
            for subitem in item:
                self.print_item(subitem, level + 1)
        elif isinstance(item, UserDict.UserDict):
            # UserDict.UserDict is not directly iterable; go through keys().
            for subitem in item.keys():
                print('%skey is %s' % (indent, subitem))
                self.print_item(item[subitem], level + 1)
        else:
            print(item)

    def print_tags(self):
        """Print every top-level key of ``queue`` followed by its contents."""
        for key in self.queue.keys():
            print('key %s' % key)
            self.print_item(self.queue[key])


if __name__ == '__main__':
    # Demo: parse the file 'Hs13225.htm' from the working directory and
    # print everything that was collected.
    with open('Hs13225.htm') as handle:
        undo_handle = Bio.File.UndoHandle(handle)
        unigene_parser = UniGeneParser()
        # Pass the UndoHandle itself so that feed() does not have to wrap
        # the raw file handle a second time.
        unigene_parser.parse(undo_handle)
        unigene_parser.print_tags()