1
2
3
4
5 """Deal with Conserved Domain Database (CDD) entries from NCBI.
6 """
7
8 import warnings
9 warnings.warn("Bio.CDD was deprecated, as it cannot parse recent HTML files from the CDD database. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
10
11
12
13 import string
14 import array
15 import os
16 import re
17 import sgmllib
18 import urlparse
19
20
21
22 from xml.sax import handler
23
24
25 import Martel
26 from Martel import RecordReader
27
28 from Bio.FilteredReader import FilteredReader
29 from Bio.FilteredReader import remove_empty_line
30 from Bio.FilteredReader import remove_leading_whitespace
31 from Bio.SGMLExtractor import SGMLExtractorHandle
32 from Bio import File
33 from Bio.Seq import Seq
34 from Martel.Dispatch import Dispatcher
35 import cdd_format
36 import Record
37
39 """Iterator interface to move over a file of CDD entries one at a time.
40 Iterator expects a handle to an sgml file. It extracts data bracketed
41 by specified tag pairs, then removes blank lines and leading white space.
42 The parser operates on the filtered data.
43 """
44 - def __init__(self, handle, parser = None):
58
60 """Return the next CDD record from the handle.
61
62 Will return None if we ran out of records.
63 """
64 data = self._reader.next()
65
66 if self._parser is not None:
67 if data:
68 dumpfile = open( 'dump', 'w' )
69 dumpfile.write( data )
70 dumpfile.close()
71 return self._parser.parse(File.StringHandle(data))
72
73 return data
74
76 return iter(self.next, None)
77
79 """Start up Martel to do the scanning of the file.
80
81 This initializes the Martel based parser and connects it to a handler
82 that will generate events for a Feature Consumer.
83 """
85 """Initialize the scanner by setting up our caches.
86
87 Creating the parser takes a long time, so we want to cache it
88 to reduce parsing time.
89
90 Arguments:
91 o debug - The level of debugging that the parser should
92 display. Level 0 is no debugging, Level 2 displays the most
93 debugging info (but is much slower). See Martel documentation
94 for more info on this.
95 """
96
97
98 self.interest_tags = [ "cd_tag", \
99 "description_tag", \
100 "status_tag", \
101 "source_tag", \
102 "date_tag", \
103 "taxonomy_tag", \
104 "aligned_tag", \
105 "representative_tag", \
106 "range_tag", \
107 "sequence_tag", \
108 "description_contents_multiline", \
109 "status_contents_multiline", \
110 "source_contents_multiline", \
111 "date_contents_multiline", \
112 "reference_contents_multiline", \
113 "taxonomy_contents_multiline", \
114 "aligned_contents_multiline", \
115 "representative_contents_multiline", \
116 "range_contents_multiline", \
117 "cd_contents_multiline", \
118 "sequence_contents_multiline", \
119 "table_entry" ]
120
121
122 expression = Martel.select_names( cdd_format.cdd_record, self.interest_tags)
123 self._parser = expression.make_parser(debug_level )
124
def feed(self, handle, consumer):
    """Feed a set of data into the scanner.

    The consumer is first told which tags this scanner selects, then it
    is installed as the content handler of the cached parser, and
    finally the parser is run over the handle.

    Arguments:
    o handle - A handle with the information to parse.
    o consumer - The consumer that should be informed of events.
    """
    # Tell the consumer about the tags before any event can reach it.
    consumer.set_interest_tags(self.interest_tags)

    martel_parser = self._parser
    martel_parser.setContentHandler(consumer)
    martel_parser.parseFile(handle)
137
139 """Create a CDD Record object from scanner generated information.
140 """
145
146
149
152
155
156 - def start_cd_contents_multiline( self, text, attrs ):
158
159 - def end_cd_contents_multiline( self, cdd_record ):
161
164
167
170
173
176
179
182
183 - def end_status_contents_multiline( self, cdd_record ):
185
188
191
194
195 - def end_source_contents_multiline( self, cdd_record ):
197
200
203
204 - def start_date_contents_multiline( self, text, attrs ):
206
207 - def end_date_contents_multiline( self, cdd_record ):
209
212
214 reference = self.get_characters()
215 self.data[ 'references' ].append( reference )
216
219
222
225
228
231
234
237
240
243
246
249
252
255
258
259 - def start_range_contents_multiline( self, text, attrs ):
261
262 - def end_range_contents_multiline( self, cdd_record ):
264
267
270
273
275 line = self.get_characters()
276 ( lines ) = line.splitlines()
277 key = self._pending_key
278 val = ''
279 for line in lines:
280 line = line.strip()
281 val = val + line
282 self.data[ key ] = Seq( val )
283
284 - def start_table_entry( self, text, attrs ):
286
def end_table_entry(self, cdd_record):
    """Store one table entry in the 'alignment_lookup' mapping.

    The accumulated characters are split into lines and each line is
    stripped of surrounding whitespace.  A line ending in '[CD]'
    (compared case-insensitively) is added to the value with those last
    four characters removed; a line longer than 60 characters is added
    to the value unchanged; any other line is added to the key.  The
    pieces are concatenated without separators.

    Arguments:
    o cdd_record - Not used by this method.
    """
    key_parts = []
    value_parts = []
    for raw_line in self.get_characters().splitlines():
        text = raw_line.strip()
        if text.upper().endswith('[CD]'):
            # Drop the trailing '[CD]' marker, keep the rest as value.
            value_parts.append(text[:-4])
        elif len(text) > 60:
            value_parts.append(text)
        else:
            key_parts.append(text)
    self.data['alignment_lookup'][''.join(key_parts)] = ''.join(value_parts)
308
312
def add_entry(self):
    """Store the accumulated characters under the pending key.

    The pending key is read and reset to the empty string before the
    characters are fetched; the text is then stored in self.data under
    the key that was pending.
    """
    pending, self._pending_key = self._pending_key, ""
    self.data[pending] = self.get_characters()
317
319 """Parse CDD files into Record objects
320 """
322 """Initialize the parser.
323
324 Arguments:
325 o debug_level - An optional argument that specifies the amount of
326 debugging information Martel should spit out. By default we have
327 no debugging info (the fastest way to do things), but if you want
328 you can set this as high as two and see exactly where a parse fails.
329 """
330 self._scanner = _Scanner(debug_level)
331
def parse(self, handle):
    """Parse the specified handle into a CDD record.

    A fresh _RecordConsumer is created for every call, kept on the
    parser as self._consumer, and fed by the scanner.

    Arguments:
    o handle - A handle with the information to parse.

    Returns the `data` attribute of the consumer once scanning is done.
    """
    consumer = _RecordConsumer()
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
338