Package Bio :: Package CDD
[hide private]
[frames | no frames]

Source Code for Package Bio.CDD

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Deal with Conserved Domain Database (CDD) entries from NCBI. 
  6  """ 
  7   
  8  # standard library 
  9  import string 
 10  import array 
 11  import os 
 12  import re 
 13  import sgmllib 
 14  import urlparse 
 15   
 16   
 17  # XML from python 2.0 
 18  from xml.sax import handler 
 19   
 20  # Martel 
 21  import Martel 
 22  from Martel import RecordReader 
 23   
 24  from Bio.FilteredReader import FilteredReader 
 25  from Bio.FilteredReader import remove_empty_line 
 26  from Bio.FilteredReader import remove_leading_whitespace 
 27  from Bio.SGMLExtractor import SGMLExtractorHandle 
 28  from Bio import File 
 29  from Bio.Seq import Seq 
 30  from Martel.Dispatch import Dispatcher 
 31  import cdd_format 
 32  import Record 
 33   
class Iterator:
    """Iterator interface to move over a file of CDD entries one at a time.

    Iterator expects a handle to an sgml file. It extracts data bracketed
    by the 'title' and 'table' tag pairs, then removes blank lines and
    leading white space. The optional parser operates on the filtered data.
    """

    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with CDD entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        # Keep only the text found inside <title> and <table> elements.
        record_handle = SGMLExtractorHandle( handle, [ 'title', 'table', ] )
        # Strip empty lines and leading whitespace from the extracted text.
        filtered_handle = FilteredReader( record_handle )
        filtered_handle.filter_chain = [ remove_empty_line, remove_leading_whitespace ]
        self.handle = File.UndoHandle( filtered_handle )
        self._reader = RecordReader.Everything( self.handle )
        self._parser = parser

    def next(self):
        """Return the next CDD record from the handle.

        Returns whatever the underlying reader yields when no parser was
        supplied, or when the reader yields an empty/None value; otherwise
        returns the result of parser.parse() on that data.  None marks the
        end of the records (it is the sentinel used by __iter__).
        """
        data = self._reader.next()

        # NOTE: an earlier version wrote every raw record to a file called
        # 'dump' in the current working directory at this point.  That was
        # debugging residue: it overwrote unrelated files, failed in
        # read-only directories and leaked the handle on a write error, so
        # it has been removed.  The returned value is unaffected.
        if data and self._parser is not None:
            return self._parser.parse(File.StringHandle(data))

        return data

    def __iter__(self):
        """Iterate by calling next() until it returns None."""
        return iter(self.next, None)
73
74 -class _Scanner:
75 """Start up Martel to do the scanning of the file. 76 77 This initialzes the Martel based parser and connects it to a handler 78 that will generate events for a Feature Consumer. 79 """
80 - def __init__(self, debug_level = 0):
81 """Initialize the scanner by setting up our caches. 82 83 Creating the parser takes a long time, so we want to cache it 84 to reduce parsing time. 85 86 Arguments: 87 o debug - The level of debugging that the parser should 88 display. Level 0 is no debugging, Level 2 displays the most 89 debugging info (but is much slower). See Martel documentation 90 for more info on this. 91 """ 92 # a listing of all tags we are interested in scanning for 93 # in the MartelParser 94 self.interest_tags = [ "cd_tag", \ 95 "description_tag", \ 96 "status_tag", \ 97 "source_tag", \ 98 "date_tag", \ 99 "taxonomy_tag", \ 100 "aligned_tag", \ 101 "representative_tag", \ 102 "range_tag", \ 103 "sequence_tag", \ 104 "description_contents_multiline", \ 105 "status_contents_multiline", \ 106 "source_contents_multiline", \ 107 "date_contents_multiline", \ 108 "reference_contents_multiline", \ 109 "taxonomy_contents_multiline", \ 110 "aligned_contents_multiline", \ 111 "representative_contents_multiline", \ 112 "range_contents_multiline", \ 113 "cd_contents_multiline", \ 114 "sequence_contents_multiline", \ 115 "table_entry" ] 116 117 # make a parser that returns only the tags we are interested in 118 expression = Martel.select_names( cdd_format.cdd_record, self.interest_tags) 119 self._parser = expression.make_parser(debug_level )
120
121 - def feed(self, handle, consumer):
122 """Feeed a set of data into the scanner. 123 124 Arguments: 125 o handle - A handle with the information to parse. 126 o consumer - The consumer that should be informed of events. 127 """ 128 consumer.set_interest_tags( self.interest_tags ) 129 self._parser.setContentHandler( consumer ) 130 # self._parser.setErrorHandler(handle.ErrorHandler()) 131 132 self._parser.parseFile(handle)
133
class _RecordConsumer( Dispatcher ):
    """Create a CDD Record object from scanner generated information.

    Most fields arrive as two consecutive elements.  The '*_tag' element
    supplies the key: its captured text, minus the final character, is
    remembered in _pending_key (see save_key).  The following
    '*_contents_multiline' element supplies the value, which add_entry
    stores in self.data under that key.

    The start_*/end_* methods are kept as explicit, individually named
    methods because they are looked up by name by the Dispatcher base
    class (not shown in this file); save_characters() and get_characters()
    are inherited from it as well.
    """

    def __init__(self):
        Dispatcher.__init__( self )
        # The record being built; returned to callers via RecordParser.
        self.data = Record.Record()
        # Key captured from the most recent '*_tag' element.
        self._pending_key = ''

    def set_interest_tags( self, interest_tags ):
        """Remember the list of tag names the scanner will report."""
        self.interest_tags = interest_tags

    # --- cd ---------------------------------------------------------------

    def start_cd_tag( self, line, attrs ):
        self.save_characters()

    def end_cd_tag( self, cdd_record ):
        self.save_key()

    def start_cd_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_cd_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- description ------------------------------------------------------

    def start_description_tag( self, text, attrs ):
        self.save_characters()

    def end_description_tag( self, cdd_record ):
        self.save_key()

    def start_description_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_description_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- status -----------------------------------------------------------

    def start_status_tag( self, text, attrs ):
        self.save_characters()

    def end_status_tag( self, cdd_record ):
        self.save_key()

    def start_status_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_status_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- source -----------------------------------------------------------

    def start_source_tag( self, text, attrs ):
        self.save_characters()

    def end_source_tag( self, cdd_record ):
        self.save_key()

    def start_source_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_source_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- date -------------------------------------------------------------

    def start_date_tag( self, text, attrs ):
        self.save_characters()

    def end_date_tag( self, cdd_record ):
        self.save_key()

    def start_date_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_date_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- reference (contents only; accumulated in a list) -----------------

    def start_reference_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_reference_contents_multiline( self, cdd_record ):
        # NOTE(review): assumes Record pre-populates 'references' with a
        # list -- confirm against Record.Record.
        reference = self.get_characters()
        self.data[ 'references' ].append( reference )

    # --- taxonomy ---------------------------------------------------------

    def start_taxonomy_tag( self, text, attrs ):
        self.save_characters()

    def end_taxonomy_tag( self, cdd_record ):
        self.save_key()

    def start_taxonomy_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_taxonomy_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- aligned ----------------------------------------------------------

    def start_aligned_tag( self, text, attrs ):
        self.save_characters()

    def end_aligned_tag( self, cdd_record ):
        self.save_key()

    def start_aligned_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_aligned_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- representative ---------------------------------------------------

    def start_representative_tag( self, text, attrs ):
        self.save_characters()

    def end_representative_tag( self, cdd_record ):
        self.save_key()

    def start_representative_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_representative_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- range ------------------------------------------------------------

    def start_range_tag( self, text, attrs ):
        self.save_characters()

    def end_range_tag( self, cdd_record ):
        self.save_key()

    def start_range_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_range_contents_multiline( self, cdd_record ):
        self.add_entry()

    # --- sequence (value is stored as a Seq) ------------------------------

    def start_sequence_tag( self, text, attrs ):
        self.save_characters()

    def end_sequence_tag( self, cdd_record ):
        self.save_key()

    def start_sequence_contents_multiline( self, text, attrs ):
        self.save_characters()

    def end_sequence_contents_multiline( self, cdd_record ):
        """Join the stripped lines of the sequence text into one Seq.

        The result is stored under the pending key.  Unlike add_entry,
        the pending key is left in place afterwards.
        """
        raw = self.get_characters()
        residues = ''.join( [ piece.strip() for piece in raw.splitlines() ] )
        self.data[ self._pending_key ] = Seq( residues )

    # --- table entry (alignment lookup) -----------------------------------

    def start_table_entry( self, text, attrs ):
        self.save_characters()

    def end_table_entry( self, cdd_record ):
        """Split a table entry into a key and a value and record the pair.

        Every line is stripped and then classified:
        o a line ending in '[CD]' (case-insensitive) contributes to the
          value, with those last four characters removed;
        o any other line longer than 60 characters contributes to the value;
        o every remaining line contributes to the key.
        Key pieces and value pieces are each concatenated without a
        separator and stored in self.data[ 'alignment_lookup' ].
        """
        key_parts = []
        val_parts = []
        for line in self.get_characters().splitlines():
            line = line.strip()
            if line.upper().endswith( '[CD]' ):
                val_parts.append( line[ :-4 ] )
            elif len( line ) > 60:
                val_parts.append( line )
            else:
                key_parts.append( line )
        key = ''.join( key_parts )
        val = ''.join( val_parts )
        self.data[ 'alignment_lookup' ][ key ] = val

    # --- helpers ----------------------------------------------------------

    def save_key( self ):
        """Store the captured tag text, minus its last character, as the key.

        Returns None.  NOTE(review): the dropped character is presumably a
        trailing colon in the tag text -- confirm against cdd_format.
        """
        key = self.get_characters()
        self._pending_key = key[ : -1 ]

    def add_entry( self ):
        """Store the captured text under the pending key and clear the key."""
        key = self._pending_key
        self._pending_key = ""
        self.data[ key ] = self.get_characters()
313
class RecordParser:
    """Parse CDD files into Record objects.
    """

    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information Martel should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        # The scanner is built once here and reused by every parse() call.
        self._scanner = _Scanner( debug_level )

    def parse(self, handle):
        """Parse the specified handle into a CDD record.

        A fresh consumer is created for every call (and kept on the
        instance as _consumer); the record it builds is returned.
        """
        consumer = _RecordConsumer()
        self._consumer = consumer
        self._scanner.feed( handle, consumer )
        return consumer.data
334