1
2
3
4
5
6 """
7 This module provides code to work with html files from NDB.
8 http://ndbserver.rutgers.edu/NDB/structure-finder/ndb/index.html
9
10
11 Classes:
12 Record Holds NDB sequence data.
13 NdbParser Parses NDB sequence data into a Record object.
14
15 The algorithm is based on a state machine because the record has multiple
16 sections and the handling of tags varies depending on the section.
17 Citations have their own state machine.
18 """
19
20 from types import *
21 import string
22 from Bio import File
23 from Bio import Index
24 from Bio.Crystal import Hetero
25 from Bio.Crystal import Chain
26 from Bio.Crystal import Crystal
27 from Bio.SeqFeature import Reference
28 import urllib
29 import sgmllib
30 from Bio.ParserSupport import *
31 from Bio.SeqFeature import Reference
32
33
35
37 self[ 'Id' ] = ''
38 self[ 'Features' ] = ''
39 self[ 'Name' ] = ''
40 self[ 'Sequence' ] = Crystal( {} )
41 self[ 'Citation' ] = Reference()
42 self[ 'Space Group' ] = ''
43 self[ 'Cell Constants' ] = {}
44 self[ 'Crystallization Conditions' ] = []
45 self[ 'Refinement' ] = ''
46 self[ 'Coordinates' ] = ''
47
49 keys = self.keys()
50 keys.sort()
51 out = ''
52 for key in keys:
53 val = self[ key ]
54 if( type( val ) == type( [] ) ):
55 out = out + '\n%s\n' % key
56 for item in val:
57 out = out + '%s\n' % item
58
59 elif( type( val ) == type( {} ) ):
60 out = out + '\n%s\n' % key
61 subkeys = val.keys()
62 subkeys.sort()
63 for item in subkeys:
64 out = out + '%s : %s\n' % ( item, val[ item ] )
65 elif( isinstance( val, dict ) ):
66 out = out + '\n%s\n' % key
67 subkeys = val.keys()
68 subkeys.sort()
69 for item in subkeys:
70 out = out + '%s : %s\n' % ( item, val[ item ] )
71
72 else:
73 out = out + '%s: %s\n' % ( key, self[ key ] )
74 return out
75
94
95
96
97
98
100 """Parses Ndb sequence data into a Record object.
101 data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html
102 """
104 sgmllib.SGMLParser.reset( self )
105 self.ndb_dict = Record()
106 self.text = ''
107 self._space_group = ''
108 self._state = 'id'
109 self._reference_state = 'authors'
110 self._current_reference = Reference()
111
112 - def parse(self, handle):
116
def feed(self, handle):
    """Read NDB HTML from handle and pass it to the SGML parser.

    handle is a file-like object containing NDB data.  It is wrapped in
    a File.UndoHandle unless it already is one.

    Lines are read until the input is exhausted or until a line whose
    stripped text ends with '</HTML>' is seen; that closing line is not
    passed on.  Every other line is stripped, prefixed with a single
    space, and the concatenation of all lines is handed to
    sgmllib.SGMLParser.feed in one call.
    """
    if isinstance(handle, File.UndoHandle):
        uhandle = handle
    else:
        uhandle = File.UndoHandle(handle)

    # Collect the pieces and join once instead of repeatedly growing a
    # string, which is quadratic in the size of the page.
    pieces = []
    while True:
        line = uhandle.readline()
        if not line:
            break
        line = line.strip()
        # The comparison is case-sensitive: only an upper-case closing
        # tag at the end of the line stops the scan.
        if line[-7:] == '</HTML>':
            break
        pieces.append(' ' + line)

    sgmllib.SGMLParser.feed(self, ''.join(pieces))
140
141
143 newtext = string.strip( newtext )
144 self.text = self.text + newtext
145
148
150 text = self._flush_text()
151 if( self._state == 'id' ):
152 cols = text.split( ':' )
153 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
154 self._state = 'id_found'
155
157 text = self._flush_text()
158 if( self._state == 'features' ):
159 self.ndb_dict[ 'Features' ] = text
160 elif( self._state == 'name' ):
161 self.ndb_dict[ 'Name' ] = text
162 elif( self._state == 'sequence' ):
163 pass
164 elif( self._state == 'citation' ):
165 if( self._reference_state == 'journal' ):
166 self._current_reference.journal = text
167 self.ndb_dict[ 'Citation' ] = self._current_reference
168 elif( self._state == 'space' ):
169 self._space_group = self._space_group + text
170 self.ndb_dict[ 'Space Group' ] = self._space_group
171 elif( self._state == 'constants' ):
172 self.ndb_dict[ 'Cell Constants' ] = _parse_constants( text )
173 elif( self._state == 'crystallization' ):
174 pass
175 elif( self._state == 'refinement' ):
176 self.ndb_dict[ 'Refinement' ] = text
177 elif( self._state == 'coordinates' ):
178 self.ndb_dict[ 'Coordinates' ] = text
179
181 text = self._flush_text()
182 text = text.lower()
183 if( self._state == 'id' ):
184 if( text.find( 'id' ) >= 0 ):
185 cols = text.split( ':' )
186 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
187 self._state = 'id_found'
188 elif( text.find( 'feature' ) >= 0 ):
189 self._state = 'features'
190 elif( text.find( 'name' ) >= 0 ):
191 self._state = 'name'
192 elif( text.find( 'sequence' ) >= 0 ):
193 self._state = 'sequence'
194 elif( text.find( 'citation' ) >= 0 ):
195 self._state = 'citation'
196 elif( text.find( 'space' ) >= 0 ):
197 self._state = 'space'
198 elif( text.find( 'constants' ) >= 0 ):
199 self._state = 'constants'
200 elif( text.find( 'crystallization' ) >= 0 ):
201 self._state = 'crystallization'
202 elif( text.find( 'refinement' ) >= 0 ):
203 self._state = 'refinement'
204 elif( text.find( 'coordinates' ) >= 0 ):
205 self._state = 'coordinates'
206
207
209 if( self._state == 'sequence' ):
210 self._flush_text()
211
212 elif( self._state == 'crystallization' ):
213 self._flush_text()
214
216 if( self._state == 'sequence' ):
217 self._parse_chain()
218 elif( self._state == 'crystallization' ):
219 text = self._flush_text()
220 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
221 elif( self._state == 'citation' ):
222 if( self._reference_state == 'journal' ):
223 self._current_reference.journal = self._flush_text()
224 self._reference_state = 'done'
225
227 if( self._state == 'space' ):
228 self._space_group = self._space_group + self._flush_text()
229
231 if( self._state == 'space' ):
232 self._space_group = self._space_group + '(%s) ' % self._flush_text()
233
235 if( self._state == 'sequence' ):
236 self._parse_chain()
237 elif( self._state == 'crystallization' ):
238 text = self._flush_text()
239 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
240
242 if( self._state == 'sequence' ):
243 self._parse_chain()
244 elif( self._state == 'crystallization' ):
245 text = self._flush_text()
246 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
247
def do_br(self, attrs):
    """Advance the citation sub-state machine on a <br> tag.

    Only acts while the parser state is 'citation':

    * reference state 'authors': the buffered text becomes the authors
      of the current reference and the reference state moves to 'title'.
    * reference state 'title': the buffered text becomes the title of
      the current reference and the reference state moves to 'journal'.

    In every other state the tag is ignored and the text buffer is left
    untouched.  attrs is accepted for the sgmllib handler signature but
    is not used.
    """
    if self._state == 'citation':
        if self._reference_state == 'authors':
            self._current_reference.authors = self._flush_text()
            self._reference_state = 'title'
        elif self._reference_state == 'title':
            self._current_reference.title = self._flush_text()
            self._reference_state = 'journal'
256
259
261 if( self._state == 'references' ):
262 if( self._reference_state == 'title' ):
263 text = self._flush_text()
264 self._current_reference.title = text
265 self._reference_state = 'journal'
266
267
277
278
279
def _flush_text(self):
    """Return the buffered text, stripped, and empty the buffer.

    Reads self.text, resets it to the empty string, and returns the
    previous contents with leading and trailing whitespace removed.
    """
    # Use the str method rather than the deprecated string.strip
    # function, matching the str-method calls used elsewhere in this
    # file (split, lower, find).
    text = self.text.strip()
    self.text = ''
    return text
284
285
if __name__ == '__main__':
    # Ad-hoc smoke test: parse a local NDB page and print the record.
    handle = open('PR0004.htm')
    try:
        ndb_parser = NdbParser()
        # The raw handle is passed straight to parse(), as before; the
        # UndoHandle that used to be built here was never used.
        record = ndb_parser.parse(handle)
    finally:
        # Close the file even if parsing raises.
        handle.close()
    # Parenthesised single-argument form prints the same text whether
    # print is a statement or a function.
    print(str(record))
292