Package Bio :: Package Ndb
[hide private]
[frames | no frames]

Source Code for Package Bio.Ndb

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with html files from NDB. 
  8  http://ndbserver.rutgers.edu/NDB/structure-finder/ndb/index.html 
  9   
 10   
 11  Classes: 
 12  Record             Holds NDB sequence data. 
 13  NdbParser          Parses NDB sequence data into a Record object. 
 14   
 15  The algorithm is based on a state machine because the record has multiple  
 16  sections and the handling of tags varies depending on the section.   
 17  Citations have their own state machine. 
 18  """ 
 19   
 20  from types import * 
 21  import string 
 22  from Bio import File 
 23  from Bio import Index 
 24  from Bio.Crystal import Hetero 
 25  from Bio.Crystal import Chain 
 26  from Bio.Crystal import Crystal 
 27  from Bio.SeqFeature import Reference 
 28  import urllib 
 29  import sgmllib 
 30  from Bio.ParserSupport import * 
 31  from Bio.SeqFeature import Reference 
 32   
 33   
class Record(dict):
    """Dictionary holding the fields parsed from one NDB page.

    A new record is pre-populated with these keys:

    Id, Features, Name, Space Group, Refinement, Coordinates
                                  empty strings
    Sequence                      an empty Crystal
    Citation                      an empty Reference
    Cell Constants                an empty dict
    Crystallization Conditions    an empty list
    """

    def __init__(self):
        dict.__init__(self)
        self['Id'] = ''
        self['Features'] = ''
        self['Name'] = ''
        self['Sequence'] = Crystal({})
        self['Citation'] = Reference()
        self['Space Group'] = ''
        self['Cell Constants'] = {}
        self['Crystallization Conditions'] = []
        self['Refinement'] = ''
        self['Coordinates'] = ''

    def __str__(self):
        """Return the record as text, one entry per key in sorted key order.

        List values are written as a heading line (the key) followed by one
        item per line.  Dict values (including dict subclasses) are written
        as a heading line followed by ``subkey : value`` lines in sorted
        subkey order.  Anything else is written as ``key: value``.
        """
        chunks = []
        # sorted() works whether keys() returns a list or a view.
        for key in sorted(self):
            val = self[key]
            if isinstance(val, list):
                chunks.append('\n%s\n' % key)
                # One-tuple so that a tuple item is formatted, not unpacked.
                chunks.extend(['%s\n' % (item,) for item in val])
            elif isinstance(val, dict):
                chunks.append('\n%s\n' % key)
                chunks.extend(['%s : %s\n' % (subkey, val[subkey])
                               for subkey in sorted(val)])
            else:
                chunks.append('%s: %s\n' % (key, val))
        return ''.join(chunks)
75
76 -def _parse_constants( text ):
77 items = text.split( '=' ) 78 constants = {} 79 key = '' 80 for i in range( 0, ( len( items ) - 1 ) ): 81 item = items[ i ] 82 item = item.strip() 83 separator = item.rfind( ' ' ) 84 if( separator < 0 ): 85 separator = 0 86 val = item[ :separator ] 87 val = val.strip() 88 if( key != '' ): 89 constants[ key ] = val 90 key = item[ separator: ] 91 key = key.strip() 92 constants[ key ] = items[ -1 ] 93 return constants
94 95 96 97 98
class NdbParser(sgmllib.SGMLParser):
    """Parses Ndb sequence data into a Record object.

    data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html

    The parser is a state machine.  ``_state`` names the record section
    currently being read; it starts as 'id' and is switched by the text of
    each <h2> heading.  ``_reference_state`` tracks the position inside a
    citation ('authors' -> 'title' -> 'journal' -> 'done').  Character data
    is accumulated in ``text`` and consumed by the tag handlers through
    _flush_text().
    """

    # (keyword looked for in a lower-cased <h2> heading, state it selects).
    # Checked in order; the first keyword found wins.
    _SECTION_STATES = (
        ('feature', 'features'),
        ('name', 'name'),
        ('sequence', 'sequence'),
        ('citation', 'citation'),
        ('space', 'space'),
        ('constants', 'constants'),
        ('crystallization', 'crystallization'),
        ('refinement', 'refinement'),
        ('coordinates', 'coordinates'),
    )

    def reset(self):
        """Reset the SGML parser and start a fresh, empty Record."""
        sgmllib.SGMLParser.reset(self)
        self.ndb_dict = Record()
        self.text = ''
        self._space_group = ''
        self._state = 'id'
        self._reference_state = 'authors'
        self._current_reference = Reference()

    def parse(self, handle):
        """Parse the ndb data read from handle and return the Record."""
        self.reset()
        self.feed(handle)
        return self.ndb_dict

    def feed(self, handle):
        """feed(self, handle )

        Feed in ndb data for scanning.  handle is a file-like object
        containing ndb data.

        Lines are read until end of input or until a line ending in
        '</HTML>' (any letter case); that terminating line is not scanned.
        Each line is stripped and the lines are joined with single spaces
        before being handed to the SGML parser in one piece.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        lines = []
        while True:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line[-7:].upper() == '</HTML>':
                break
            lines.append(line)
        # Every line is preceded by one space, as in ' line1 line2 ...'.
        text = ''.join([' ' + line for line in lines])
        sgmllib.SGMLParser.feed(self, text)

    def handle_data(self, newtext):
        """Append stripped character data to the text buffer."""
        self.text = self.text + newtext.strip()

    def start_h1(self, attrs):
        """Discard any text buffered before the heading."""
        self._flush_text()

    def end_h1(self):
        """Take the record id from the heading while still looking for it."""
        text = self._flush_text()
        if self._state == 'id':
            self._store_id(text)

    def start_h2(self, attrs):
        """Store the text buffered for the section that precedes this heading."""
        text = self._flush_text()
        state = self._state
        if state == 'features':
            self.ndb_dict['Features'] = text
        elif state == 'name':
            self.ndb_dict['Name'] = text
        elif state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = text
                self.ndb_dict['Citation'] = self._current_reference
        elif state == 'space':
            self._space_group = self._space_group + text
            self.ndb_dict['Space Group'] = self._space_group
        elif state == 'constants':
            self.ndb_dict['Cell Constants'] = _parse_constants(text)
        elif state == 'refinement':
            self.ndb_dict['Refinement'] = text
        elif state == 'coordinates':
            self.ndb_dict['Coordinates'] = text
        # 'sequence' and 'crystallization' are collected by the list
        # handlers, so there is nothing to store for them here.

    def end_h2(self):
        """Use the heading text to pick the section that follows."""
        text = self._flush_text().lower()
        if self._state == 'id':
            # Until the id is found, headings are only examined for it.
            if 'id' in text:
                self._store_id(text)
            return
        for keyword, state in self._SECTION_STATES:
            if keyword in text:
                self._state = state
                break

    def start_ul(self, attrs):
        """Discard text buffered before a sequence or crystallization list."""
        if self._state in ('sequence', 'crystallization'):
            self._flush_text()

    def end_ul(self):
        """Finish the last list item, or the journal part of a citation."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()
        elif self._state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = self._flush_text()
                self._reference_state = 'done'

    def start_sub(self, attrs):
        """Add the text preceding a subscript to the space group."""
        if self._state == 'space':
            self._space_group = self._space_group + self._flush_text()

    def end_sub(self):
        """Add the subscript text, in parentheses, to the space group."""
        if self._state == 'space':
            self._space_group = self._space_group + '(%s) ' % self._flush_text()

    def start_li(self, attrs):
        """Close off the previous list item, if any text is pending."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()

    def end_li(self):
        """Record the list item that just ended."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()

    def do_br(self, attrs):
        """Within a citation, a line break ends the authors, then the title."""
        if self._state == 'citation':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
            elif self._reference_state == 'title':
                self._current_reference.title = self._flush_text()
                self._reference_state = 'journal'

    def start_i(self, attrs):
        pass

    def end_i(self):
        # NOTE(review): nothing in this class ever sets _state to
        # 'references', so this branch never runs; 'citation' may have been
        # intended.  Left unchanged because acting on </i> inside a citation
        # would cut a title at its first italic phrase -- confirm against
        # real NDB pages before changing.
        if self._state == 'references':
            if self._reference_state == 'title':
                text = self._flush_text()
                self._current_reference.title = text
                self._reference_state = 'journal'

    def _store_id(self, text):
        """Store the part of text after the first ':' as the upper-cased Id.

        Text without a ':' is ignored and the state is left unchanged, so a
        later heading can still supply the id.
        """
        cols = text.split(':')
        if len(cols) < 2:
            return
        self.ndb_dict['Id'] = cols[1].upper()
        self._state = 'id_found'

    def _append_condition(self):
        """Append the buffered text to the crystallization conditions.

        Nothing is appended when the buffer is empty, which is the case
        when <li>, </li> and </ul> follow each other directly.
        """
        text = self._flush_text()
        if text:
            self.ndb_dict['Crystallization Conditions'].append(text)

    def _parse_chain(self):
        """Store buffered text of the form 'chain <name> : <sequence>'.

        Text that does not start with 'chain' (case-insensitive), or that
        lacks the name or the ':', is discarded.
        """
        text = self._flush_text().strip()
        if not text.lower().startswith('chain'):
            return
        fields = text.split(':')
        words = fields[0].split()
        if len(fields) < 2 or len(words) < 2:
            return
        self.ndb_dict['Sequence'][words[1]] = fields[1]

    def _flush_text(self):
        """Return the stripped text buffer and empty it."""
        text = self.text.strip()
        self.text = ''
        return text
if __name__ == '__main__':
    # Ad-hoc check: parse the sample page 'PR0004.htm' from the current
    # directory and print the resulting record.
    handle = open('PR0004.htm')
    try:
        # parse() wraps the handle in a File.UndoHandle itself when needed.
        record = NdbParser().parse(handle)
    finally:
        handle.close()
    print(str(record))