1
2
3
4
5
6 """
7 This module provides code to work with files from Gobase.
8 http://megasun.bch.umontreal.ca/gobase/
9
10
11 Classes:
12 Record Holds gobase sequence data.
13 Iterator Iterates over sequence data in a gobase file.
14 Dictionary Accesses a gobase file using a dictionary interface.
15 RecordParser Parses gobase sequence data into a Record object.
16
17 _Scanner Scans a gobase-format stream.
18 _RecordConsumer Consumes gobase data to a Record object.
19
20
21 Functions:
22 index_file Index a gobase file for a Dictionary.
23
24 """
25 from types import *
26 import string
27 import re
28 from Bio import File
29 from Bio import Index
30 from Bio.ParserSupport import *
31
class Record:
    """Holds the information shared by every kind of Gobase record.

    Members (each starts out as an empty string):
    species_name
    taxon_division
    gobase_id
    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new Record with every member set to ''.

        colwidth is accepted but not used by this class.

        """
        self.species_name = ''
        self.taxon_division = ''
        # gobase_id is listed as a member, so it must exist on every
        # instance instead of raising AttributeError on first access.
        self.gobase_id = ''
49
class SequenceRecord(Record):
    """Gobase record describing a sequence entry.

    Members added on top of Record (each starts out as ''):
    molecule_type
    is_plasmid
    shape
    submission_date
    update_date
    entrez_record
    genbank_accession
    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty SequenceRecord.  colwidth is not used here.

        """
        Record.__init__(self)
        own_members = (
            'molecule_type',
            'is_plasmid',
            'shape',
            'submission_date',
            'update_date',
            'entrez_record',
            'genbank_accession',
        )
        for member in own_members:
            setattr(self, member, '')
77
class GeneRecord(Record):
    """Gobase record describing a gene entry.

    Members added on top of Record (each starts out as ''):
    gene_class
    plasmid_encoded
    is_plasmid
    is_partial_gene
    is_pseudo_gene
    is_transpliced_gene
    chloroplast_origin
    contains_intron
    orf
    included_in_intron
    published_info
    genbank_accession
    entrez_record
    product_type
    product_class
    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty GeneRecord.  colwidth is not used here.

        """
        Record.__init__(self)
        self.gene_class = ''
        self.plasmid_encoded = ''
        # The gene scanner stores the 'Plasmid encoded:' value under
        # is_plasmid, so that attribute has to exist on a fresh record too.
        self.is_plasmid = ''
        self.is_partial_gene = ''
        self.is_pseudo_gene = ''
        self.is_transpliced_gene = ''
        self.chloroplast_origin = ''
        self.contains_intron = ''
        self.orf = ''
        self.included_in_intron = ''
        self.published_info = ''
        self.genbank_accession = ''
        self.entrez_record = ''
        self.product_type = ''
        self.product_class = ''
105
class ProteinRecord(Record):
    """Gobase record describing a protein entry.

    Members added on top of Record (each starts out as ''):
    product_class
    gene_class
    is_partial_protein
    is_plasmid
    is_pseudo
    function
    entry_record
    entrez_record
    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty ProteinRecord.  colwidth is not used here.

        """
        Record.__init__(self)
        self.product_class = ''
        self.gene_class = ''
        self.is_partial_protein = ''
        self.is_plasmid = ''
        self.is_pseudo = ''
        self.function = ''
        self.entry_record = ''
        # The protein scanner stores the 'NCBI Entrez record:' value under
        # entrez_record; initialise it so it exists before any scan.
        self.entrez_record = ''
132
134 """Returns one record at a time from a Gobase file.
135
136 Methods:
137 next Return the next record from the stream, or None.
138
139 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object.  parser
    is an optional Parser object to change the results into another form.
    If set to None, then the raw contents of the file will be returned.

    Raises ValueError when handle is neither a file, an old-style
    instance, nor an object exposing readline().

    """
    # Files and old-style instances were always accepted; new-style
    # file-like objects are recognised by their readline() method.
    known_type = type(handle) is FileType or type(handle) is InstanceType
    if not (known_type or hasattr(handle, 'readline')):
        raise ValueError("I expected a file handle or file-like object")
    # Qualified through File, the same way the scanner's feed() builds
    # its SGMLHandle.
    self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
    self._parser = parser
152
def next(self):
    """next(self) -> object

    Return the next gobase record from the file.  If no more records,
    return None.

    Lines are collected until the input ends, or until a line starting
    with the record tag shows up after at least one line was collected;
    that line is pushed back so the following call starts with it.  The
    collected text is returned as a string, or passed through the parser
    when one was supplied.

    """
    # NOTE(review): this tag does not look gobase-specific -- confirm the
    # real record delimiter for gobase files.
    first_tag = 'Recognition Sequence'
    lines = []
    while 1:
        line = self._uhandle.readline()
        if not line:
            break
        # Compare against the tag's value; only treat it as a boundary
        # once the current record already has content.
        if lines and line[:len(first_tag)] == first_tag:
            self._uhandle.saveline(line)
            break
        lines.append(line)

    if not lines:
        return None

    data = ''.join(lines)
    if self._parser is not None:
        return self._parser.parse(File.StringHandle(data))
    return data
176
178 return iter(self.next, None)
179
181 """Accesses a gobase file using a dictionary interface.
182
183 """
184 __filename_key = '__filename'
185
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Gobase Dictionary backed by the index named indexname.
    The name of the data file is read from the index under the
    dictionary's filename key, and that file is opened for reading.
    parser is an optional Parser object applied to retrieved data;
    it is stored as given (None by default).

    """
    self._index = Index.Index(indexname)
    data_path = self._index[Dictionary.__filename_key]
    self._handle = open(data_path)
    self._parser = parser
199
201 return len(self._index)
202
210
212 return getattr(self._index, name)
213
215 """Parses Gobase sequence data into a Record object.
216
217 """
221
def parse(self, handle):
    """Feed handle through the scanner and return the consumer's data."""
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
225
227 """Scans a gobase file.
228
229 Methods:
230 feed Feed in one gobase record.
231
232 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan gobase data from handle, a file-like object, reporting to
    consumer.  The handle is wrapped in a File.UndoHandle unless it
    already is one, and then in a File.SGMLHandle.  A record is
    scanned only when the wrapped handle still has a line to peek at.

    """
    undo = handle
    if not isinstance(undo, File.UndoHandle):
        undo = File.UndoHandle(undo)
    sgml = File.SGMLHandle(undo)

    # Nothing to scan when no line is left.
    if not sgml.peekline():
        return
    self._scan_record(sgml, consumer)
249
def _scan_line(self, uhandle):
    """Read one line through safe_readline and normalise its spacing.

    Whitespace runs are collapsed to single spaces and one trailing
    space is appended, so consecutive lines can be glued together.

    """
    words = safe_readline(uhandle).split()
    return ' '.join(words) + ' '
254
255 - def _text_in( self, uhandle, text, count ):
256 for j in range( count ):
257 try:
258 line = self._scan_line( uhandle )
259 text = text + line
260 except:
261 if( line == '' ):
262 return text
263 return text
264
266 data = consumer.data
267 next_item = self._scan_field( text, 'Molecule type:', 'Species name:' )
268 data.molecule_type = consumer.text_field( next_item )
269
270 next_item = self._scan_field( text, 'Shape of molecule:', 'Sequence length:' )
271 data.shape = consumer.text_field( next_item )
272
273 next_item = self._scan_field( text, 'Plasmid:', 'Complete genome:' )
274 data.is_plasmid = consumer.text_field( next_item )
275
276 next_item = self._scan_field( text, 'NCBI Entrez record:', 'Genbank accession:' )
277 data.entrez_record = consumer.text_field( next_item )
278
279 next_item = self._scan_field( text, 'Genbank accession:', 'Coding gene(s):' )
280 data.genbank_accession = consumer.text_field( next_item )
281 consumer.data = data
282
284 data = consumer.data
285 next_item = self._scan_field( text, 'Gene Class:', 'Species name:' )
286 data.gene_class = consumer.text_field( next_item )
287
288 next_item = self._scan_field( text, 'Plasmid encoded:', 'Partial gene:' )
289 data.is_plasmid = consumer.word_field( next_item )
290
291 next_item = self._scan_field( text, 'Partial gene:', 'Pseudo:' )
292 data.is_partial_gene = consumer.text_field( next_item )
293
294 next_item = self._scan_field( text, 'Pseudo:', 'Transpliced gene:' )
295 data.is_pseudo_gene = consumer.text_field( next_item )
296
297 next_item = self._scan_field( text, 'Transpliced gene:', 'Chloroplast origin:' )
298 data.is_transpliced_gene = consumer.text_field( next_item )
299
300 next_item = self._scan_field( text, 'Chloroplast origin:', 'Contains intron(s):' )
301 data.chloroplast_origin = consumer.word_field( next_item )
302
303 next_item = self._scan_field( text, 'Contains intron(s):' )
304 data.contains_intron = consumer.word_field( next_item )
305
306 next_item = self._scan_field( text, 'Included in intron:' )
307 data.included_in_intron = consumer.word_field( next_item )
308
309 next_item = self._scan_field( text, 'ORF:' )
310 data.orf = consumer.word_field( next_item )
311
312 next_item = self._scan_field( text, 'NCBI Entrez record:' )
313 data.entrez_record = consumer.word_field( next_item )
314
315 next_item = self._scan_field( text, 'Genbank accession:', 'Product type:' )
316 data.genbank_accession = consumer.word_field( next_item )
317
318 next_item = self._scan_field( text, 'Product type:', 'Product Class:' )
319 data.product_type = consumer.text_field( next_item )
320
321 next_item = self._scan_field( text, 'Product Class:' )
322 data.product_class = consumer.text_field( next_item )
323
324 consumer.data = data
325
327 data = consumer.data
328 next_item = self._scan_field( text, 'Product Class:', 'Species name:' )
329 data.product_class = consumer.text_field( next_item )
330
331 next_item = self._scan_field( text, 'Gene Class:', 'Partial protein:' )
332 data.gene_class = consumer.text_field( next_item )
333
334 next_item = self._scan_field( text, 'Partial protein:', 'Conflict:' )
335 data.is_partial_protein = consumer.text_field( next_item )
336
337 next_item = self._scan_field( text, 'Plasmid:', 'Sequence length:' )
338 data.is_plasmid = consumer.text_field( next_item )
339
340 next_item = self._scan_field( text, 'General function:' )
341 data.function = consumer.text_field( next_item )
342
343 next_item = self._scan_field( text, 'NCBI Entrez record:' )
344 data.entrez_record = consumer.word_field( next_item )
345
346 consumer.data = data
347
def _scan_record(self, uhandle, consumer):
    """Scan one record from uhandle into consumer.data.

    Up to 100 normalised lines are read into a single text.  The
    leading word of that text selects the record class (Sequence, Gene
    or Protein) and the matching field scanner; the species name and
    taxon division are then filled in for every record.

    """
    text = self._text_in(uhandle, '', 100).lstrip()

    if text.startswith('Sequence'):
        consumer.data = SequenceRecord()
        self._scan_sequence_record(text, consumer)
    elif text.startswith('Gene'):
        consumer.data = GeneRecord()
        self._scan_gene_record(text, consumer)
    elif text.startswith('Protein'):
        consumer.data = ProteinRecord()
        self._scan_protein_record(text, consumer)
    else:
        # NOTE(review): an unrecognised record type is only reported;
        # the common fields below are then written onto whatever
        # consumer.data already holds.
        print('UNKNOWN!!!!!!')

    data = consumer.data
    species = self._scan_field(text, 'Species name:', 'Taxon division')
    data.species_name = consumer.text_field(species)

    # No diagnostic output here: parsing must not write to stdout.
    taxon = self._scan_field(text, 'Taxon division:')
    data.taxon_division = consumer.word_field(taxon)
    consumer.data = data
373
374
375
376
377 - def _scan_field(self, text, field, next_field = None ):
395
396
398 """Consumer that converts a gobase record to a Record object.
399
400 Members:
401 data Record with gobase data.
402
403 """
406
409
def text_field(self, line):
    """Return the value part of a 'label: value' line.

    The line is split on ': ' and the piece right after the first
    separator is returned.  An empty line, or a line without the
    separator, yields ''.

    """
    if not line:
        return ''
    cols = line.split(': ')
    if len(cols) < 2:
        # Label with no value: report it as empty instead of failing.
        return ''
    return cols[1]
415
417 if( line == '' ):
418 return None
419 cols = string.split( line, ': ' )
420 return( int( cols[ 1 ] ) )
421
def word_field(self, line):
    """Return the first word of the value in a 'label: value' line.

    The line is split on ': ' and the first whitespace-delimited word
    of the piece after the first separator is returned.  An empty
    line, a missing separator, or a blank value all yield ''.

    """
    if not line:
        return ''
    cols = line.split(': ')
    if len(cols) < 2:
        return ''
    words = cols[1].split()
    if not words:
        # The label was present but carried no value.
        return ''
    return words[0]
428
430 if( line == '' ):
431 return ''
432 cols = string.split( line, ':' )
433 cols = string.split( cols[ 1 ] )
434 return( string.join( cols[ :3 ] ) )
435
436
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a gobase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's title attribute is used when it has one.

    Raises ValueError if filename does not exist, and KeyError when a
    record produces an empty key or a key that was already indexed.

    """
    # Imported locally: the module-level imports do not provide os.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Positions are taken around the read so that the index
            # stores where each record starts and how long it is.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # The record classes in this module define no title, so
                # a missing one falls through to the empty-key error.
                key = getattr(rec, 'title', None)

            if not key:
                raise KeyError("empty sequence key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Release the data file even when indexing fails part-way.
        handle.close()
472