1
2
3
4
5
6
7 """
8 This module provides code to work with html files from InterPro.
9 http://www.ebi.ac.uk/interpro/
10
11
12 Classes:
13 Record Holds interpro sequence data.
14 InterProParser Parses interpro sequence data into a Record object.
15 """
16 from types import *
17 import string
18 from Bio import File
19 from Bio import Index
20 import urllib
21 import sgmllib
22 from Bio.ParserSupport import *
23 from Bio.SeqFeature import Reference
24
26
28 keys = self.keys()
29 keys.sort()
30 out = ''
31 for key in keys:
32 val = self[ key ]
33 if( key == 'References' ):
34 out = out + '\n%s\n' % key
35 for reference in val:
36 out = out + '%s\n' % str( reference )
37 out = out + '\n'
38 elif( key == 'Examples' ):
39 out = out + '\n%s\n' % key
40 for example in val:
41 out = out + '%s\n' % example
42 elif( key == 'Abstract' ):
43 out = out + '\n%s\n' % key
44 out = out + '%s...\n' % val[ : 80 ]
45 elif( type( self[ key ] ) == type( [] ) ):
46 out = out + '\n%s\n' % key
47 for item in val:
48 out = out + '%s\n' % item
49
50 else:
51 out = out + '%s: %s\n' % ( key, self[ key ] )
52 return out
53
55 """Parses InterPro sequence data into a Record object.
56
57 """
59 sgmllib.SGMLParser.reset( self )
60 self.text = ''
61 self.inter_pro_dict = Record()
62 self.inter_pro_dict[ 'Database' ] = ''
63 self.inter_pro_dict[ 'Accession' ] = ''
64 self.inter_pro_dict[ 'Name' ] = ''
65 self.inter_pro_dict[ 'Dates' ] = ''
66 self.inter_pro_dict[ 'Type' ] = ''
67 self.inter_pro_dict[ 'Parent' ] = ''
68 self.inter_pro_dict[ 'Process' ] = ''
69 self.inter_pro_dict[ 'Function' ] = ''
70 self.inter_pro_dict[ 'Component' ] = ''
71 self.inter_pro_dict[ 'Signatures' ] = []
72 self.inter_pro_dict[ 'Abstract' ] = ''
73 self.inter_pro_dict[ 'Examples' ] = []
74 self.inter_pro_dict[ 'References' ] = []
75 self.inter_pro_dict[ 'Database links' ] = []
76 self._state = 'title'
77 self._reference_state = ''
78 self._key_waiting = ''
79 self._current_reference = ''
80
85
def feed(self, handle):
    """Read InterPro HTML from a handle and pass it to the SGML parser.

    handle is a file-like object containing InterPro data.  It is wrapped
    in a File.UndoHandle unless it already is one.

    Lines are read until the handle is exhausted or a line ending in
    '</HTML>' is seen; that closing line is not parsed.  Each line is
    stripped of surrounding whitespace, every line is prefixed with a
    single space, and the combined text is handed to
    sgmllib.SGMLParser.feed in one call.

    """
    if isinstance(handle, File.UndoHandle):
        uhandle = handle
    else:
        uhandle = File.UndoHandle(handle)
    # Seed with an empty piece so the join below puts a space in front of
    # every line, including the first, and yields '' when nothing is read.
    pieces = ['']
    while True:
        line = uhandle.readline()
        if not line:
            break
        line = line.strip()
        if line.endswith('</HTML>'):
            break
        pieces.append(line)
    sgmllib.SGMLParser.feed(self, ' '.join(pieces))
109
110
112 newtext = string.strip( newtext )
113 self.text = self.text + newtext
114
119
122
124 self._state = 'chugging_along'
125
127 dict = pairlist_to_dict( attrs )
128 if( self._state == 'chugging_along' ):
129 if( dict.has_key( 'class' ) ):
130 if( dict[ 'class' ] == 'tag' ):
131 self._state = 'waiting_tag'
132 self._flush_text()
133 elif( dict[ 'class' ] == 'inf' ):
134 self._state = 'waiting_inf'
135 self._flush_text()
136
138 if( self._state == 'waiting_tag' ):
139 self._key_waiting = self._flush_text()
140 self._state = 'chugging_along'
141 elif( self._state == 'waiting_inf' ):
142 key = self._key_waiting
143 if( self.inter_pro_dict.has_key( key ) ):
144 val = self._flush_text()
145 if( key == 'Signatures' ):
146 pass
147 elif( key == 'Database links' ):
148 pass
149 else:
150 self.inter_pro_dict[ key ] = val
151 self._key_waiting = ''
152 self._state = 'chugging_along'
153
154
156 if( self._key_waiting == 'Examples' ):
157 self._state = 'examples'
158 self._flush_text()
159
161 self._key_waiting = ''
162 self._state = 'chugging_along'
163
165 if( self._key_waiting == 'References' ):
166 self._state = 'references'
167 self._reference_state = 'pubmed_id'
168 self._flush_text()
169 self._references = []
170
172 if( self._state == 'references' ):
173 self._references.append( self._current_reference )
174 self.inter_pro_dict[ 'References' ] = self._references
175 self._state = 'chugging_along'
176
178 if( self._state == 'references' ):
179 self._reference_state = 'pubmed_id'
180 self._flush_text()
181 if( self._current_reference != '' ):
182 self._references.append( self._current_reference )
183 self._current_reference = Reference()
184
189
191 dict = pairlist_to_dict( attrs )
192 if( self._state == 'references' ):
193 if( self._reference_state == 'pubmed_id' ):
194 if( dict.has_key( 'name' ) ):
195 self._current_reference.pubmed_id = dict[ 'name' ]
196 self._reference_state = 'authors'
197 elif( self._reference_state == 'journal' ):
198 self._current_reference.journal = self._flush_text()
199 self._reference_state = 'medline_id'
200
213
def do_br(self, attrs):
    """Flush the pending text when a <br> tag is handled.

    While in the 'references' state, the flushed text becomes the authors
    of the current reference (only if the reference sub-state is
    'authors'), and the sub-state advances to 'title'.

    Outside the 'references' state, if the key being waited on is
    'Signatures' or 'Database links', the flushed text is appended to
    that list in inter_pro_dict.  Otherwise nothing happens.

    attrs is not used.
    """
    if self._state != 'references':
        pending_key = self._key_waiting
        if pending_key == 'Signatures' or pending_key == 'Database links':
            self.inter_pro_dict[pending_key].append(self._flush_text())
        return
    if self._reference_state == 'authors':
        self._current_reference.authors = self._flush_text()
        self._reference_state = 'title'
223
226
228 if( self._state == 'references' ):
229 if( self._reference_state == 'title' ):
230 text = self._flush_text()
231 self._current_reference.title = text
232 self._reference_state = 'journal'
233
234
236 if( self._state == 'references' ):
237 if( tag == 'li' ):
238 self.stack.pop()
239 elif( tag == 'a' ):
240 if( self._reference_state == 'pubmed_id' ):
241 self.stack.pop()
242 method(attrs)
243
244
def _flush_text(self):
    """Return the buffered text, stripped, and empty the buffer.

    Reads self.text, resets it to the empty string, and returns the
    previous contents with leading and trailing whitespace removed.
    """
    # The str method replaces the deprecated string.strip() function.
    text = self.text.strip()
    self.text = ''
    return text
249
251 dict = {}
252 for pair in pairs:
253 key = pair[ 0 ]
254 val = pair[ 1 ]
255 dict[key ] = val
256 return dict
257
258
259
if __name__ == '__main__':
    # Demo: parse a saved InterPro HTML page and print the resulting record.
    handle = open('IPR001064.htm')
    try:
        # The raw handle is passed straight to parse().  The former
        # Bio.File.UndoHandle(...) line was dropped: its result was never
        # used, and no visible import binds the bare name Bio.
        interpro_parser = InterProParser()
        record = interpro_parser.parse(handle)
        print(str(record))
    finally:
        # Always release the file, even if parsing fails.
        handle.close()
266