Package Bio :: Package UniGene :: Module UniGene
[hide private]
[frames | no frames]

Source Code for Module Bio.UniGene.UniGene

  1   
  2  # Permission to use, copy, modify, and distribute this software and 
  3  # its documentation with or without modifications and for any purpose 
  4  # and without fee is hereby granted, provided that any copyright 
  5  # notices appear in all copies and that both those copyright notices 
  6  # and this permission notice appear in supporting documentation, and 
  7  # that the names of the contributors or copyright holders not be used 
  8  # in advertising or publicity pertaining to distribution of the software 
  9  # without specific prior permission. 
 10  # 
 11  # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL 
 12  # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED 
 13  # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE 
 14  # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT 
 15  # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 
 16  # LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 
 17  # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 
 18  # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 19   
 20  import warnings 
 21  from Bio import BiopythonDeprecationWarning 
 22   
 23  warnings.warn("The module Bio.UniGene.UniGene is now deprecated, " 
 24                "and will be removed in a future release of Biopython." 
 25                "To parse UniGene flat files, please use the parser in " 
 26                "Bio.UniGene instead", 
 27                BiopythonDeprecationWarning) 
 28   
 29  import string 
 30  import sgmllib 
 31  import UserDict 
 32  import Bio.File 
 33   
 34   
class UniGeneParser(sgmllib.SGMLParser):
    """SGML parser that collects the contents of a marked-up UniGene record.

    Results accumulate in ``self.queue``, a ``UserDict.UserDict``.  Bold
    headings become top-level keys whose values are nested ``UserDict``
    objects; the text of each table row is stored inside the current
    section under the pending row key.

    State kept while walking the markup:

    * ``open_tag`` / ``open_tag_stack`` -- the innermost open element
      ('open_html', 'open_b', 'open_table', 'open_table_row' or
      'open_table_data') and the elements enclosing it.
    * ``context`` -- 'general_info', 'seq_info' or 'legend'.
    * ``master_key`` -- key of the section currently being filled.
    * ``key_waiting`` -- key under which the next row text is stored.
    * ``text`` -- character data accumulated since it was last cleared.
    """

    def reset(self):
        """Reset the underlying SGML parser and clear all parsing state."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        self.queue = UserDict.UserDict()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.key_waiting = ''
        self.master_key = ''
        self.context = 'general_info'

    def parse(self, handle):
        """Parse one record from *handle* and return the collected data.

        *handle* must provide ``readline()``; it is wrapped in a
        ``Bio.File.UndoHandle`` by ``feed`` if it is not one already.

        Sections that ended up empty are removed from the result.  An
        empty section whose key starts with 'UniGene Cluster' is turned
        into a 'UniGene Cluster' entry holding the remainder of the key.

        Returns the ``UserDict.UserDict`` stored in ``self.queue``.
        """
        self.reset()
        self.feed(handle)
        # Iterate over a snapshot of the keys: UserDict.UserDict is not
        # iterable on Python 2 (only IterableUserDict is), and entries
        # are added and deleted inside the loop.
        for key in list(self.queue.keys()):
            if self.queue[key] == {}:
                if key[:15] == 'UniGene Cluster':
                    self.queue['UniGene Cluster'] = key[16:]
                del self.queue[key]
        return self.queue

    #
    # Assumes an empty line between records
    #
    def feed(self, handle):
        """Read lines from *handle* up to the first blank line and parse them.

        Each line is stripped and prefixed with a single space before
        being joined into one string, which is then passed to
        ``sgmllib.SGMLParser.feed``.  Reading stops at the first line
        that is empty after stripping (which includes end of file).
        """
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        pieces = []
        while True:
            line = uhandle.readline().strip()
            if line == '':
                break
            pieces.append(' ' + line)
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def handle_data(self, newtext):
        """Append stripped character data to the text accumulated so far."""
        self.text = self.text + newtext.strip()

    def start_a(self, attrs):
        """Start of an anchor: in 'seq_info', outside bold, clear the text."""
        if self.context == 'seq_info' and self.open_tag != 'open_b':
            self.text = ''

    def end_a(self):
        """End of an anchor: in 'seq_info', outside bold, use its text as the
        pending row key if no key is waiting yet."""
        if self.context == 'seq_info' and self.open_tag != 'open_b':
            if self.key_waiting == '':
                self.key_waiting = self.text
                self.text = ''

    def start_b(self, attrs):
        """Start of bold text: push the current tag and mark 'open_b'.

        The accumulated text is cleared only when no row key is waiting.
        """
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_b'
        if self.key_waiting == '':
            self.text = ''

    def end_b(self):
        """End of bold text: record the cluster id or extract a key.

        Text starting with 'UniGene Cluster' is stored directly under the
        'UniGene Cluster' key (everything after the 16th character).
        Otherwise, when no row key is waiting, the text is handed to
        ``extract_key``.
        """
        if self.text[:15] == 'UniGene Cluster':
            self.queue['UniGene Cluster'] = self.text[16:]
            self.text = ''
        elif self.key_waiting == '':
            self.extract_key()

    def _pop_open_tag(self):
        """Restore the enclosing open tag, or 'open_html' if none is left."""
        try:
            self.open_tag = self.open_tag_stack.pop()
        except IndexError:
            self.open_tag = 'open_html'

    def extract_key(self):
        """Turn the accumulated bold text into a row key or a section key.

        The key is the first two whitespace-separated words of the text.
        After the enclosing tag is restored:

        * inside a table cell, the key becomes the pending row key (in
          'general_info'), or the literal text 'Key to Symbols' switches
          the context to 'legend' (in 'seq_info');
        * elsewhere, the key starts a new section in ``self.queue``; a
          'general_info' key containing 'SEQUENCE' also switches the
          context to 'seq_info'.
        """
        text = self.text.strip()
        key = ' '.join(text.split()[:2])
        self.text = ''

        self._pop_open_tag()
        if self.open_tag == 'open_table_data':
            if self.context == 'general_info':
                if self.key_waiting == '':
                    self.key_waiting = key
                    self.text = ''
            elif self.context == 'seq_info':
                if text == 'Key to Symbols':
                    self.context = 'legend'
                    self.master_key = key
        elif self.context == 'general_info':
            self.master_key = key
            if 'SEQUENCE' in key:
                self.context = 'seq_info'
            self.queue[key] = UserDict.UserDict()
        elif self.context == 'seq_info':
            self.queue[key] = UserDict.UserDict()
            self.master_key = key

    def start_table(self, attrs):
        """Start of a table: push the current tag and mark 'open_table'."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table'

    def end_table(self):
        """End of a table: restore the enclosing tag and drop any pending
        row key."""
        self._pop_open_tag()
        self.key_waiting = ''

    def start_tr(self, attrs):
        """Start of a table row: push the current tag, mark
        'open_table_row' and clear the accumulated text."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_row'
        self.text = ''

    def end_tr(self):
        """End of a table row: store the row text in the current section.

        A leading ':' is dropped and runs of whitespace are collapsed to
        single spaces.  In the 'general_info' and 'seq_info' contexts the
        text is stored under ``key_waiting`` in the ``master_key``
        section; a second value for the same key turns the entry into a
        list, and later values are appended to that list.  Rows without
        text are ignored.
        """
        self._pop_open_tag()
        text = self.text
        if not text:
            return
        self.text = ''
        if text[0] == ':':
            text = text[1:]
        text = ' '.join(text.split())
        if self.context in ('general_info', 'seq_info'):
            section = self.queue[self.master_key]
            try:
                contents = section[self.key_waiting]
            except KeyError:
                # First value seen for this key.
                section[self.key_waiting] = text
            else:
                if isinstance(contents, list):
                    contents.append(text)
                else:
                    section[self.key_waiting] = [contents, text]
            self.key_waiting = ''

    def start_td(self, attrs):
        """Start of a table cell: push the current tag and mark
        'open_table_data'."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_data'

    def end_td(self):
        """End of a table cell: restore the enclosing tag; in 'seq_info'
        add a space so the text of adjacent cells stays separated."""
        self._pop_open_tag()
        if self.context == 'seq_info':
            self.text = self.text + ' '

    def print_item(self, item, level=1):
        """Recursively print *item*, indented according to *level*.

        Non-empty strings are printed as they are, lists are printed
        element by element, and ``UserDict`` objects are printed as
        'key is ...' lines followed by their values.  Anything else is
        printed without indentation.
        """
        indent = ' ' * (level + 1)
        if isinstance(item, str):
            if item != '':
                print('%s%s' % (indent, item))
        elif isinstance(item, list):
            for subitem in item:
                self.print_item(subitem, level + 1)
        elif isinstance(item, UserDict.UserDict):
            # UserDict.UserDict has no __iter__ on Python 2, so go
            # through keys() explicitly.
            for subitem in item.keys():
                print('%skey is %s' % (indent, subitem))
                self.print_item(item[subitem], level + 1)
        else:
            print(item)

    def print_tags(self):
        """Print every top-level key in ``self.queue`` with its contents."""
        # keys() is required: UserDict.UserDict is not directly iterable
        # on Python 2.
        for key in self.queue.keys():
            print('key %s' % key)
            self.print_item(self.queue[key])


if __name__ == '__main__':
    # Self-test: parse the local file 'Hs13225.htm' (presumably a saved
    # UniGene page -- confirm) and print everything that was collected.
    handle = open('Hs13225.htm')
    try:
        undo_handle = Bio.File.UndoHandle(handle)
        unigene_parser = UniGeneParser()
        unigene_parser.parse(undo_handle)
        unigene_parser.print_tags()
    finally:
        # Close the file even if parsing or printing fails.
        handle.close()