Package Bio :: Package InterPro
[hide private]
[frames] | [no frames]

Source Code for Package Bio.InterPro

  1   
  2  # Copyright 2001 by Katharine Lindner.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """ 
  8  This module provides code to work with HTML files from InterPro. 
  9  http://www.ebi.ac.uk/interpro/ 
 10   
 11   
 12  Classes: 
 13  Record             Holds interpro sequence data. 
 14  InterProParser     Parses interpro sequence data into a Record object. 
 15  """ 
 16  from types import * 
 17  import string 
 18  from Bio import File 
 19  from Bio import Index 
 20  import urllib 
 21  import sgmllib 
 22  from Bio.ParserSupport import * 
 23  from Bio.SeqFeature import Reference 
 24   
class Record(dict):
    """Hold the fields of one InterPro entry, keyed by field name.

    Values are plain strings or lists; ``str(record)`` renders them as a
    human-readable, multi-line report.
    """

    def __str__(self):
        """Return a text dump of the record with keys in sorted order.

        Formatting rules, checked in this order for every key:

        * 'References': a heading line, one line per reference (converted
          with ``str``), then a blank line.
        * 'Abstract': a heading line, then the first 80 characters of the
          value followed by '...'.
        * 'Examples', or any value that is a list: a heading line, then
          one line per item.
        * anything else: a single ``key: value`` line.
        """
        pieces = []
        # sorted() works on any iterable; the old keys().sort() only works
        # when keys() returns a list.
        for key in sorted(self):
            val = self[key]
            if key == 'References':
                pieces.append('\n%s\n' % key)
                for reference in val:
                    pieces.append('%s\n' % str(reference))
                # NOTE(review): the blank line is emitted once after the
                # whole list; nesting was reconstructed from a flattened
                # listing -- confirm against the upstream file.
                pieces.append('\n')
            elif key == 'Abstract':
                pieces.append('\n%s\n' % key)
                pieces.append('%s...\n' % val[:80])
            elif key == 'Examples' or isinstance(val, list):
                pieces.append('\n%s\n' % key)
                for item in val:
                    # One-tuple so that an item which is itself a tuple is
                    # formatted as a single value.
                    pieces.append('%s\n' % (item,))
            else:
                pieces.append('%s: %s\n' % (key, val))
        return ''.join(pieces)
53
class InterProParser(sgmllib.SGMLParser):
    """Parses InterPro sequence data into a Record object.

    The parser is a small state machine driven by sgmllib tag callbacks:

    * ``self._state`` is one of 'title', 'chugging_along', 'waiting_tag',
      'waiting_inf', 'examples' or 'references'.
    * ``self._reference_state`` names the field of the current literature
      reference that is expected next ('pubmed_id', 'authors', 'title',
      'journal', 'medline_id').
    * ``self._key_waiting`` is the Record key whose value is being read.
    * Character data accumulates in ``self.text`` until ``_flush_text``
      consumes it.

    NOTE(review): statement nesting in this class was reconstructed from a
    listing that had lost its indentation; the spots where more than one
    nesting was plausible are marked below.
    """

    def reset(self):
        """Reset the SGML parser state and start a new, empty Record."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        record = Record()
        record['Database'] = ''
        record['Accession'] = ''
        record['Name'] = ''
        record['Dates'] = ''
        record['Type'] = ''
        record['Parent'] = ''
        record['Process'] = ''
        record['Function'] = ''
        record['Component'] = ''
        record['Signatures'] = []
        record['Abstract'] = ''
        record['Examples'] = []
        record['References'] = []
        record['Database links'] = []
        self.inter_pro_dict = record
        self._state = 'title'
        self._reference_state = ''
        self._key_waiting = ''
        # '' means "no reference has been started yet".
        self._current_reference = ''

    def parse(self, handle):
        """Parse the InterPro page readable from handle; return the Record."""
        self.reset()
        self.feed(handle)
        return self.inter_pro_dict

    def feed(self, handle):
        """feed(self, handle)

        Read InterPro HTML from handle and run it through the SGML parser.

        handle is a file-like object; it is wrapped in File.UndoHandle
        unless it already is one.  Every line is stripped and prefixed
        with a single space before the lines are joined into one string.
        Reading stops at end of input or at the first line that ends with
        '</HTML>'; that closing line itself is not passed to the parser.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        pieces = []
        while True:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line.endswith('</HTML>'):
                break
            pieces.append(' ' + line)
        # Join once instead of re-copying a growing string on every line.
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def handle_data(self, newtext):
        """Append stripped character data to the pending text buffer."""
        self.text = self.text + newtext.strip()

    def start_table(self, attrs):
        """Accept <table> start tags; the attributes are not used."""
        pass

    def start_h2(self, attrs):
        """Accept <h2> start tags; nothing is recorded."""
        pass

    def end_h2(self):
        """Leave the initial state once an <h2> element has closed."""
        self._state = 'chugging_along'

    def start_td(self, attrs):
        """Begin a key cell (class="tag") or a value cell (class="inf")."""
        attributes = pairlist_to_dict(attrs)
        if self._state != 'chugging_along':
            return
        cell_class = attributes.get('class')
        if cell_class == 'tag':
            self._state = 'waiting_tag'
            self._flush_text()
        elif cell_class == 'inf':
            self._state = 'waiting_inf'
            self._flush_text()

    def end_td(self):
        """Finish a key cell or a value cell.

        A key cell's text becomes the pending key.  A value cell's text is
        stored under the pending key when that key exists in the Record,
        except for 'Signatures' and 'Database links', whose entries are
        collected in do_br; for those two keys the remaining text is
        flushed and dropped.
        """
        if self._state == 'waiting_tag':
            self._key_waiting = self._flush_text()
            self._state = 'chugging_along'
        elif self._state == 'waiting_inf':
            key = self._key_waiting
            if key in self.inter_pro_dict:
                val = self._flush_text()
                if key not in ('Signatures', 'Database links'):
                    self.inter_pro_dict[key] = val
            # NOTE(review): the two resets below are assumed to run for
            # every value cell, not only for known keys -- confirm.
            self._key_waiting = ''
            self._state = 'chugging_along'

    def start_ul(self, attrs):
        """Enter the examples list when the pending key is 'Examples'."""
        if self._key_waiting == 'Examples':
            self._state = 'examples'
            self._flush_text()

    def end_ul(self):
        """Clear the pending key and return to the default state."""
        self._key_waiting = ''
        self._state = 'chugging_along'

    def start_ol(self, attrs):
        """Enter the references list when the pending key is 'References'."""
        if self._key_waiting == 'References':
            self._state = 'references'
            self._reference_state = 'pubmed_id'
            self._flush_text()
            self._references = []

    def end_ol(self):
        """Store the collected references and return to the default state."""
        if self._state == 'references':
            # start_li only stores the *previous* reference, so the last
            # one is stored here.  Skip the '' placeholder (a list without
            # items) and clear the slot so it cannot be appended again.
            if self._current_reference != '':
                self._references.append(self._current_reference)
                self._current_reference = ''
            self.inter_pro_dict['References'] = self._references
        # NOTE(review): assumed unconditional, mirroring end_ul -- confirm.
        self._state = 'chugging_along'

    def start_li(self, attrs):
        """Start a new Reference for each item of the references list."""
        if self._state == 'references':
            self._reference_state = 'pubmed_id'
            self._flush_text()
            if self._current_reference != '':
                self._references.append(self._current_reference)
            self._current_reference = Reference()

    def end_li(self):
        """Append the item text to 'Examples' while in the examples list."""
        if self._state == 'examples':
            self.inter_pro_dict['Examples'].append(self._flush_text())

    def start_a(self, attrs):
        """Handle anchors inside a reference.

        While a PubMed id is expected, an anchor's ``name`` attribute is
        taken as the id.  While the journal is expected, the pending text
        is stored as the journal and the Medline id is expected next.
        """
        attributes = pairlist_to_dict(attrs)
        if self._state != 'references':
            return
        if self._reference_state == 'pubmed_id':
            if 'name' in attributes:
                self._current_reference.pubmed_id = attributes['name']
                # NOTE(review): assumed to advance only when a name
                # attribute was found -- confirm.
                self._reference_state = 'authors'
        elif self._reference_state == 'journal':
            self._current_reference.journal = self._flush_text()
            self._reference_state = 'medline_id'

    def end_a(self):
        """Extract the Medline id from the link text of a reference.

        The text is split on ':'; the second field minus its last
        character is the id, or None when the text contains no ':'.
        """
        if self._state == 'references' and \
           self._reference_state == 'medline_id':
            cols = self._flush_text().split(':')
            if len(cols) > 1:
                medline_id = cols[1][:-1]
            else:
                medline_id = None
            self._current_reference.medline_id = medline_id

    def do_br(self, attrs):
        """Use <br> as a separator for authors and for list-valued cells."""
        if self._state == 'references':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
        elif self._key_waiting == 'Signatures':
            self.inter_pro_dict['Signatures'].append(self._flush_text())
        elif self._key_waiting == 'Database links':
            self.inter_pro_dict['Database links'].append(self._flush_text())

    def start_i(self, attrs):
        """Accept <i> start tags; the text is consumed in end_i."""
        pass

    def end_i(self):
        """Store the italic text as the title of the current reference."""
        if self._state == 'references' and self._reference_state == 'title':
            self._current_reference.title = self._flush_text()
            self._reference_state = 'journal'

    def handle_starttag(self, tag, method, attrs):
        """Call the start handler, first un-stacking some reference tags.

        Inside the references list, the top of ``self.stack`` is popped
        for every <li> and for an <a> seen while a PubMed id is expected.
        Presumably these tags are never closed in the InterPro pages and
        would otherwise pile up on the parser's open-tag stack -- TODO
        confirm against real input.
        """
        if self._state == 'references':
            if tag == 'li':
                self.stack.pop()
            elif tag == 'a' and self._reference_state == 'pubmed_id':
                self.stack.pop()
        method(attrs)

    def _flush_text(self):
        """Return the stripped pending text and empty the buffer."""
        text = self.text.strip()
        self.text = ''
        return text
249
def pairlist_to_dict(pairs):
    """Convert a sequence of (key, value) pairs into a dictionary.

    Only the first two elements of each pair are used.  When a key occurs
    more than once, the value from the last occurrence wins.
    """
    # Named 'mapping' so the builtin dict type is not shadowed.
    mapping = {}
    for pair in pairs:
        mapping[pair[0]] = pair[1]
    return mapping
if __name__ == '__main__':
    # Small manual check: parse a locally saved InterPro page and print it.
    handle = open('IPR001064.htm')
    try:
        # Only 'File' is imported from Bio, so it is referenced directly;
        # 'Bio.File' would raise NameError here.
        undo_handle = File.UndoHandle(handle)
        interpro_parser = InterProParser()
        record = interpro_parser.parse(undo_handle)
    finally:
        # Close the file even if parsing fails.
        handle.close()
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print(str(record))