1
2
3
4
5 """Deal with Conserved Domain Database (CDD) entries from NCBI.
6 """
7
8
9 import string
10 import array
11 import os
12 import re
13 import sgmllib
14 import urlparse
15
16
17
18 from xml.sax import handler
19
20
21 import Martel
22 from Martel import RecordReader
23
24 from Bio.FilteredReader import FilteredReader
25 from Bio.FilteredReader import remove_empty_line
26 from Bio.FilteredReader import remove_leading_whitespace
27 from Bio.SGMLExtractor import SGMLExtractorHandle
28 from Bio import File
29 from Bio.Seq import Seq
30 from Martel.Dispatch import Dispatcher
31 import cdd_format
32 import Record
33
35 """Iterator interface to move over a file of CDD entries one at a time.
36 Iterator expects a handle to an sgml file. It extracts data bracketed
37 by specified tag pairs, then removes blank lines and leading white space.
38 The parser operates on the filtered data.
39 """
40 - def __init__(self, handle, parser = None):
54
56 """Return the next CDD record from the handle.
57
58 Will return None if we ran out of records.
59 """
60 data = self._reader.next()
61
62 if self._parser is not None:
63 if data:
64 dumpfile = open( 'dump', 'w' )
65 dumpfile.write( data )
66 dumpfile.close()
67 return self._parser.parse(File.StringHandle(data))
68
69 return data
70
72 return iter(self.next, None)
73
75 """Start up Martel to do the scanning of the file.
76
77 This initializes the Martel based parser and connects it to a handler
78 that will generate events for a Feature Consumer.
79 """
81 """Initialize the scanner by setting up our caches.
82
83 Creating the parser takes a long time, so we want to cache it
84 to reduce parsing time.
85
86 Arguments:
87 o debug_level - The level of debugging that the parser should
88 display. Level 0 is no debugging, Level 2 displays the most
89 debugging info (but is much slower). See Martel documentation
90 for more info on this.
91 """
92
93
94 self.interest_tags = [ "cd_tag", \
95 "description_tag", \
96 "status_tag", \
97 "source_tag", \
98 "date_tag", \
99 "taxonomy_tag", \
100 "aligned_tag", \
101 "representative_tag", \
102 "range_tag", \
103 "sequence_tag", \
104 "description_contents_multiline", \
105 "status_contents_multiline", \
106 "source_contents_multiline", \
107 "date_contents_multiline", \
108 "reference_contents_multiline", \
109 "taxonomy_contents_multiline", \
110 "aligned_contents_multiline", \
111 "representative_contents_multiline", \
112 "range_contents_multiline", \
113 "cd_contents_multiline", \
114 "sequence_contents_multiline", \
115 "table_entry" ]
116
117
118 expression = Martel.select_names( cdd_format.cdd_record, self.interest_tags)
119 self._parser = expression.make_parser(debug_level )
120
def feed(self, handle, consumer):
    """Feed a set of data into the scanner.

    The consumer is first told which tags the scanner is interested in,
    then installed as the content handler of the cached parser, and
    finally the parser is run over the handle.

    Arguments:
    o handle - A handle with the information to parse.
    o consumer - The consumer that should be informed of events.
    """
    # Tell the consumer which tag names it will receive events for.
    tags = self.interest_tags
    consumer.set_interest_tags(tags)

    # Route the parser's events to the consumer, then run the parse.
    parser = self._parser
    parser.setContentHandler(consumer)
    parser.parseFile(handle)
133
135 """Create a CDD Record object from scanner generated information.
136 """
141
142
145
148
151
152 - def start_cd_contents_multiline( self, text, attrs ):
154
155 - def end_cd_contents_multiline( self, cdd_record ):
157
160
163
166
169
172
175
178
179 - def end_status_contents_multiline( self, cdd_record ):
181
184
187
190
191 - def end_source_contents_multiline( self, cdd_record ):
193
196
199
200 - def start_date_contents_multiline( self, text, attrs ):
202
203 - def end_date_contents_multiline( self, cdd_record ):
205
208
210 reference = self.get_characters()
211 self.data[ 'references' ].append( reference )
212
215
218
221
224
227
230
233
236
239
242
245
248
251
254
255 - def start_range_contents_multiline( self, text, attrs ):
257
258 - def end_range_contents_multiline( self, cdd_record ):
260
263
266
269
271 line = self.get_characters()
272 ( lines ) = line.splitlines()
273 key = self._pending_key
274 val = ''
275 for line in lines:
276 line = line.strip()
277 val = val + line
278 self.data[ key ] = Seq( val )
279
280 - def start_table_entry( self, text, attrs ):
282
def end_table_entry( self, cdd_record ):
    """Store one table entry in the 'alignment_lookup' dictionary.

    The text returned by get_characters() is split into lines and each
    line is stripped of surrounding whitespace, then classified:

    o a line ending in '[CD]' (compared case-insensitively) has those
      last four characters removed and is treated as value text;
    o any other line longer than 60 characters is treated as value text;
    o every remaining line is treated as key text.

    Key pieces and value pieces are each concatenated in order of
    appearance, and the result is stored as
    self.data[ 'alignment_lookup' ][ key ] = value.

    Arguments:
    o cdd_record - Not used by this method.
    """
    key_parts = []
    val_parts = []
    for raw_line in self.get_characters().splitlines():
        text = raw_line.strip()
        if text.upper().endswith( '[CD]' ):
            # Drop the trailing four-character '[CD]' marker.
            val_parts.append( text[ :-4 ] )
        elif len( text ) > 60:
            val_parts.append( text )
        else:
            key_parts.append( text )
    self.data[ 'alignment_lookup' ][ ''.join( key_parts ) ] = ''.join( val_parts )
304
308
def add_entry( self ):
    """Store the collected characters under the pending key.

    The pending key is read and reset to the empty string, then the
    text returned by get_characters() is saved in self.data under the
    key that was pending.
    """
    # Take the key and clear it in one step, before fetching the text.
    key, self._pending_key = self._pending_key, ""
    text = self.get_characters()
    self.data[ key ] = text
313
315 """Parse CDD files into Record objects
316 """
318 """Initialize the parser.
319
320 Arguments:
321 o debug_level - An optional argument that specifies the amount of
322 debugging information Martel should spit out. By default we have
323 no debugging info (the fastest way to do things), but if you want
324 you can set this as high as two and see exactly where a parse fails.
325 """
326 self._scanner = _Scanner(debug_level)
327
def parse(self, handle):
    """Parse the specified handle into a CDD record.

    A new _RecordConsumer is created for each call and kept on the
    parser as self._consumer; the scanner feeds the handle's contents
    to it, and the consumer's data attribute is returned.

    Arguments:
    o handle - A handle with the information to parse.
    """
    consumer = _RecordConsumer()
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
334