1
2
3
4
5
6 """
7 This module provides code to work with files from Gobase.
8 http://megasun.bch.umontreal.ca/gobase/
9
10
11 Classes:
12 Record Holds gobase sequence data.
13 Iterator Iterates over sequence data in a gobase file.
14 Dictionary Accesses a gobase file using a dictionary interface.
15 RecordParser Parses gobase sequence data into a Record object.
16
17 _Scanner Scans a gobase-format stream.
18 _RecordConsumer Consumes gobase data to a Record object.
19
20
21 Functions:
22 index_file Index a gobase file for a Dictionary.
23
24 """
25
26 import warnings
27 warnings.warn("Bio.Gobase is deprecated, as this module doesn't seem to have any users. If you are using Bio.Gobase, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module.", DeprecationWarning)
28
29 from types import *
30 import string
31 import re
32 from Bio import File
33 from Bio import Index
34 from Bio.ParserSupport import *
35
37 """Holds information from a Gobase record.
38
39 Members:
40 species_name
41 taxon_division
42 gobase_id
43 """
45 """__init__(self, colwidth=60)
46
47 Create a new Record. colwidth specifies the number of residues
48 to put on each line.
49
50 """
51 self.species_name = ''
52 self.taxon_division = ''
53
55 """Holds information from a Gobase record.
56
57 Members:
58 molecule_type
59 is_plasmid
60 shape
61 submission_date
62 update_date
63 entrez_record
64 genbank_accession
65 """
67 """__init__(self, colwidth=60)
68
69 Create a new Record. colwidth specifies the number of residues
70 to put on each line.
71
72 """
73 Record.__init__( self )
74 self.molecule_type = ''
75 self.is_plasmid = ''
76 self.shape = ''
77 self.submission_date = ''
78 self.update_date = ''
79 self.entrez_record = ''
80 self.genbank_accession = ''
81
83 """Holds information from a Gobase record.
84
85 Members:
86 """
88 """__init__(self, colwidth=60)
89
90 Create a new Record. colwidth specifies the number of residues
91 to put on each line.
92
93 """
94 Record.__init__( self )
95 self.gene_class = ''
96 self.plasmid_encoded = ''
97 self.is_partial_gene = ''
98 self.is_pseudo_gene = ''
99 self.is_transpliced_gene = ''
100 self.chloroplast_origin = ''
101 self.contains_intron = ''
102 self.orf = ''
103 self.included_in_intron = ''
104 self.published_info = ''
105 self.genbank_accession = ''
106 self.entrez_record = ''
107 self.product_type = ''
108 self.product_class = ''
109
111 """Holds information from a Gobase record.
112
113 Members:
114 product_class
115 gene_class
116 is_partial_protein
117 is_plasmid
118 function
119 entry_record
120 """
122 """__init__(self, colwidth=60)
123
124 Create a new Record. colwidth specifies the number of residues
125 to put on each line.
126
127 """
128 Record.__init__( self )
129 self.product_class = ''
130 self.gene_class = ''
131 self.is_partial_protein = ''
132 self.is_plasmid = ''
133 self.is_pseudo = ''
134 self.function = ''
135 self.entry_record = ''
136
138 """Returns one record at a time from a Gobase file.
139
140 Methods:
141 next Return the next record from the stream, or None.
142
143 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator over a Gobase file.  handle is a file or
    file-like object to read from.  parser is an optional Parser object
    used to change the results into another form; it is stored on the
    instance for use when records are returned.

    Raises ValueError if handle is neither a file, an old-style class
    instance, nor an object that provides a readline method.

    """
    # The original test only admitted real files and old-style class
    # instances.  Anything offering readline is also accepted so that
    # new-style file-like objects match the promise in the error message.
    if not isinstance(handle, (FileType, InstanceType)) \
            and not hasattr(handle, 'readline'):
        raise ValueError("I expected a file handle or file-like object")
    # SGMLHandle is referenced through the File module, the same way the
    # scanner's feed method in this file does; the bare name is not bound
    # by any import visible here.
    self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
    self._parser = parser
156
158 """next(self) -> object
159
160 Return the next gobase record from the file. If no more records,
161 return None.
162
163 """
164 lines = []
165 first_tag = 'Recognition Sequence'
166 while 1:
167 line = self._uhandle.readline()
168 if not line:
169 break
170 if line[:len( first_tag )] == 'first_tag':
171 self._uhandle.saveline(line)
172 break
173
174 if not line:
175 return None
176
177 if self._parser is not None:
178 return self._parser.parse(File.StringHandle(data))
179 return data
180
182 return iter(self.next, None)
183
185 """Accesses a gobase file using a dictionary interface.
186
187 """
188 __filename_key = '__filename'
189
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Gobase Dictionary.  indexname names the index backing the
    dictionary; the index is expected to hold, under the class's private
    filename key, the path of the data file, which is opened here.
    parser is an optional Parser object kept on the instance for
    converting results into another form.

    """
    self._parser = parser
    self._index = Index.Index(indexname)
    # The explicit _Dictionary__ prefix is the spelled-out form of the
    # class-private name __filename_key.
    data_path = self._index[Dictionary._Dictionary__filename_key]
    self._handle = open(data_path)
203
205 return len(self._index)
206
214
216 return getattr(self._index, name)
217
219 """Parses Gobase sequence data into a Record object.
220
221 """
225
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle to the parser's scanner, with the parser's consumer
    receiving the events, then return the consumer's data attribute.

    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
229
231 """Scans a gobase file.
232
233 Methods:
234 feed Feed in one gobase record.
235
236 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan gobase data.  handle is a file-like object holding the data and
    consumer is the Consumer object that receives events while the data
    is scanned.  Nothing is scanned when the handle has no line to peek.

    """
    # A handle that is already a File.UndoHandle is used as-is; any
    # other handle is wrapped in an UndoHandle and then an SGMLHandle.
    if not isinstance(handle, File.UndoHandle):
        handle = File.SGMLHandle(File.UndoHandle(handle))

    if handle.peekline():
        self._scan_record(handle, consumer)
253
255 line = safe_readline( uhandle )
256 line = string.join( string.split( line ), ' ' ) + ' '
257 return line
258
def _text_in(self, uhandle, text, count):
    """_text_in(self, uhandle, text, count) -> string

    Return text with up to count scanned lines from uhandle appended.
    Lines are obtained through self._scan_line.  Reading stops at the
    first line that cannot be read (presumably the end of the input),
    and whatever was gathered so far is returned.

    """
    pieces = [text]
    for _ in range(count):
        try:
            pieces.append(self._scan_line(uhandle))
        except Exception:
            # The original handler inspected the previous line, which is
            # unbound when the very first read fails, and then kept
            # retrying.  A failed read now simply ends the collection.
            break
    return ''.join(pieces)
268
270 data = consumer.data
271 next_item = self._scan_field( text, 'Molecule type:', 'Species name:' )
272 data.molecule_type = consumer.text_field( next_item )
273
274 next_item = self._scan_field( text, 'Shape of molecule:', 'Sequence length:' )
275 data.shape = consumer.text_field( next_item )
276
277 next_item = self._scan_field( text, 'Plasmid:', 'Complete genome:' )
278 data.is_plasmid = consumer.text_field( next_item )
279
280 next_item = self._scan_field( text, 'NCBI Entrez record:', 'Genbank accession:' )
281 data.entrez_record = consumer.text_field( next_item )
282
283 next_item = self._scan_field( text, 'Genbank accession:', 'Coding gene(s):' )
284 data.genbank_accession = consumer.text_field( next_item )
285 consumer.data = data
286
288 data = consumer.data
289 next_item = self._scan_field( text, 'Gene Class:', 'Species name:' )
290 data.gene_class = consumer.text_field( next_item )
291
292 next_item = self._scan_field( text, 'Plasmid encoded:', 'Partial gene:' )
293 data.is_plasmid = consumer.word_field( next_item )
294
295 next_item = self._scan_field( text, 'Partial gene:', 'Pseudo:' )
296 data.is_partial_gene = consumer.text_field( next_item )
297
298 next_item = self._scan_field( text, 'Pseudo:', 'Transpliced gene:' )
299 data.is_pseudo_gene = consumer.text_field( next_item )
300
301 next_item = self._scan_field( text, 'Transpliced gene:', 'Chloroplast origin:' )
302 data.is_transpliced_gene = consumer.text_field( next_item )
303
304 next_item = self._scan_field( text, 'Chloroplast origin:', 'Contains intron(s):' )
305 data.chloroplast_origin = consumer.word_field( next_item )
306
307 next_item = self._scan_field( text, 'Contains intron(s):' )
308 data.contains_intron = consumer.word_field( next_item )
309
310 next_item = self._scan_field( text, 'Included in intron:' )
311 data.included_in_intron = consumer.word_field( next_item )
312
313 next_item = self._scan_field( text, 'ORF:' )
314 data.orf = consumer.word_field( next_item )
315
316 next_item = self._scan_field( text, 'NCBI Entrez record:' )
317 data.entrez_record = consumer.word_field( next_item )
318
319 next_item = self._scan_field( text, 'Genbank accession:', 'Product type:' )
320 data.genbank_accession = consumer.word_field( next_item )
321
322 next_item = self._scan_field( text, 'Product type:', 'Product Class:' )
323 data.product_type = consumer.text_field( next_item )
324
325 next_item = self._scan_field( text, 'Product Class:' )
326 data.product_class = consumer.text_field( next_item )
327
328 consumer.data = data
329
331 data = consumer.data
332 next_item = self._scan_field( text, 'Product Class:', 'Species name:' )
333 data.product_class = consumer.text_field( next_item )
334
335 next_item = self._scan_field( text, 'Gene Class:', 'Partial protein:' )
336 data.gene_class = consumer.text_field( next_item )
337
338 next_item = self._scan_field( text, 'Partial protein:', 'Conflict:' )
339 data.is_partial_protein = consumer.text_field( next_item )
340
341 next_item = self._scan_field( text, 'Plasmid:', 'Sequence length:' )
342 data.is_plasmid = consumer.text_field( next_item )
343
344 next_item = self._scan_field( text, 'General function:' )
345 data.function = consumer.text_field( next_item )
346
347 next_item = self._scan_field( text, 'NCBI Entrez record:' )
348 data.entrez_record = consumer.word_field( next_item )
349
350 consumer.data = data
351
353 text = ''
354 text = self._text_in( uhandle, text, 100 )
355 text = string.lstrip( text )
356
357 if( string.find( text, 'Sequence' ) == 0 ):
358 consumer.data = SequenceRecord()
359 self._scan_sequence_record( text, consumer )
360 elif( string.find( text, 'Gene' ) == 0 ):
361 consumer.data = GeneRecord()
362 self._scan_gene_record( text, consumer )
363 elif( string.find( text, 'Protein' ) == 0 ):
364 consumer.data = ProteinRecord()
365 self._scan_protein_record( text, consumer )
366 else:
367 print 'UNKNOWN!!!!!!'
368
369 data = consumer.data
370 next_item = self._scan_field( text, 'Species name:', 'Taxon division' )
371 data.species_name = consumer.text_field( next_item )
372
373 next_item = self._scan_field( text, 'Taxon division:' )
374 print next_item
375 data.taxon_division = consumer.word_field( next_item )
376 consumer.data = data
377
378
379
380
381 - def _scan_field(self, text, field, next_field = None ):
399
400
402 """Consumer that converts a gobase record to a Record object.
403
404 Members:
405 data Record with gobase data.
406
407 """
410
413
def text_field(self, line):
    """text_field(self, line) -> string

    Return the piece of line between its first and second ': '
    separators (or to the end when there is only one).  An empty line
    yields an empty string.

    """
    if line == '':
        return ''
    return line.split(': ')[1]
419
421 if( line == '' ):
422 return None
423 cols = string.split( line, ': ' )
424 return( int( cols[ 1 ] ) )
425
427 if( line == '' ):
428 return ''
429 cols = string.split( line, ': ' )
430 cols = string.split( cols[ 1 ] )
431 return( cols[ 0 ] )
432
434 if( line == '' ):
435 return ''
436 cols = string.split( line, ':' )
437 cols = string.split( cols[ 1 ] )
438 return( string.join( cols[ :3 ] ) )
439
440
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a gobase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's title attribute is used.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty key or a key that is already in the index.

    """
    # Imported here because no module-level import of os is visible in
    # this file, yet the existence check below needs it.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Remember which data file the index describes.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Bracket each read with tell() so the record's offset and
            # length in the data file can be stored.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # NOTE(review): the Record initialisers visible in this
                # file do not set a title attribute -- confirm that the
                # default key works, or always pass rec2key.
                key = rec.title

            if not key:
                raise KeyError("empty sequence key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # The data file is only needed while indexing.
        handle.close()
476