1
2
3
4
5
6 """
7 This module provides code to work with files from Rebase.
8 http://rebase.neb.com/rebase/rebase.html
9
10
11 Classes:
12 Record Holds rebase sequence data.
13 Iterator Iterates over sequence data in a rebase file.
14 Dictionary Accesses a rebase file using a dictionary interface.
15 RecordParser Parses rebase sequence data into a Record object.
16
17 _Scanner Scans a rebase-format stream.
18 _RecordConsumer Consumes rebase data to a Record object.
19
20
21 Functions:
22 index_file Index a rebase file for a Dictionary.
23
24 """
25 from types import *
26 import string
27 from Bio import File
28 from Bio import Index
29 from Bio.ParserSupport import *
30
class Record:
    """Holds information from a rebase record.

    Members:
    seq_5_to_3      The sequence.
    seq_3_to_5
    methylation
    enzyme_num      The enzyme number
    pos             Position of cleavage (not initialised by __init__)
    prototype       Prototype
    source
    microorganism
    temperature     Growth temperature
    misc            Miscellaneous information
    date_entered
    date_modified
    num_Adeno2
    num_Lambda
    num_pBR322
    num_PhiX174
    num_SV40

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new Record.  colwidth specifies the number of residues
        to put on each line.

        """
        # Text fields start out as empty strings.
        self.seq_5_to_3 = ''
        self.seq_3_to_5 = ''
        self.methylation = ''
        self.prototype = ''
        self.source = ''
        self.microorganism = ''
        self.misc = ''
        self.date_entered = ''
        self.date_modified = ''
        # Fields with no value yet are None rather than an empty string.
        self.enzyme_num = None
        self.temperature = None
        self._colwidth = colwidth
        # Per-genome counters start at zero.
        self.num_Adeno2 = 0
        self.num_Lambda = 0
        self.num_pBR322 = 0
        self.num_PhiX174 = 0
        self.num_SV40 = 0
77
class Iterator:
    """Returns one record at a time from a Rebase file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises ValueError if handle does not provide a readline method.

        """
        # Check for the one capability that is actually needed instead of
        # testing concrete types, so any file-like object is accepted.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        # SGMLHandle is reached through the File module, as feed() does
        # elsewhere in this file; the bare name is not imported here.
        self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next rebase record from the file.  If no more records,
        return None.

        A record runs up to, but not including, the next line that starts
        with 'Recognition Sequence'.  Text found before the first such line
        is returned as a chunk of its own.  When a parser was supplied the
        record text is passed through it; otherwise the raw text is returned.

        """
        first_tag = 'Recognition Sequence'
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line:
                break
            # A tag line only ends the record once something has been
            # collected; otherwise it is the first line of this record.
            if lines and line.startswith(first_tag):
                self._uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)
124
126 """Accesses a rebase file using a dictionary interface.
127
128 """
129 __filename_key = '__filename'
130
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a rebase Dictionary.  indexname is the name of the
    index for the dictionary.  The index should have been created
    using the index_file function.  parser is an optional Parser
    object to change the results into another form.  If set to None,
    then the raw contents of the file will be returned.

    """
    self._index = Index.Index(indexname)
    # index_file stores the path of the indexed data file under this
    # reserved key; the mangled name is written out so the lookup does not
    # depend on being compiled inside the class body.
    self._handle = open(self._index[Dictionary._Dictionary__filename_key])
    self._parser = parser
144
146 return len(self._index)
147
155
157 return getattr(self._index, name)
158
160 """Parses rebase sequence data into a Record object.
161
162 """
166
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle to this parser's scanner, using this parser's consumer,
    and return whatever the consumer holds in its data attribute afterwards.

    """
    self._scanner.feed(handle, self._consumer)
    return self._consumer.data
170
172 """Scans a rebase file.
173
174 Methods:
175 feed Feed in one rebase record.
176
177 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Feed in rebase data for scanning.  handle is a file-like object
    containing rebase data.  consumer is a Consumer object that will
    receive events as the rebase data is scanned.

    Nothing is scanned when peeking at the handle yields no line.

    """
    if isinstance(handle, File.UndoHandle):
        # Already an UndoHandle: use it as given.
        uhandle = handle
    else:
        uhandle = File.UndoHandle(handle)
        # NOTE(review): the original indentation was lost, so whether the
        # SGML wrapping applies only on this branch or to every handle
        # cannot be read off the source -- confirm against history.
        uhandle = File.SGMLHandle(uhandle)

    if uhandle.peekline():
        self._scan_record(uhandle, consumer)
194
def _scan_line(self, uhandle):
    """_scan_line(self, uhandle) -> string

    Read one line from uhandle with safe_readline and return it with every
    run of whitespace collapsed to a single space, followed by one trailing
    space so that consecutive lines can be concatenated directly.

    """
    line = safe_readline(uhandle)
    # split() with no argument drops leading/trailing whitespace (including
    # the newline) and splits on any whitespace run.
    return ' '.join(line.split()) + ' '
199
def _text_in(self, uhandle, text, count):
    """_text_in(self, uhandle, text, count) -> string

    Read count lines from uhandle through _scan_line and return text with
    those lines appended in the order they were read.  With a count of
    zero, text is returned unchanged.

    """
    # Collect first and join once rather than growing the string per line.
    pieces = [self._scan_line(uhandle) for _ in range(count)]
    return text + ''.join(pieces)
205
def _scan_record(self, uhandle, consumer):
    """_scan_record(self, uhandle, consumer)

    Scan one record.  Signals the start of a sequence to the consumer,
    gathers 100 lines from uhandle into a single text via _text_in, and
    then runs each field scanner over that same text in a fixed order.

    No scanner for the Record's misc field is invoked here.

    """
    consumer.start_sequence()
    # Every field scanner below receives the same block of text.
    text = self._text_in(uhandle, '', 100)
    self._scan_sequence(text, consumer)
    self._scan_methylation(text, consumer)
    self._scan_enzyme_num(text, consumer)
    self._scan_prototype(text, consumer)
    self._scan_source(text, consumer)
    self._scan_microorganism(text, consumer)
    self._scan_temperature(text, consumer)
    self._scan_date_entered(text, consumer)
    self._scan_date_modified(text, consumer)
    self._scan_Adeno2(text, consumer)
    self._scan_Lambda(text, consumer)
    self._scan_pBR322(text, consumer)
    self._scan_PhiX174(text, consumer)
    self._scan_SV40(text, consumer)
224
225
226
234
241
247
253
259
260
266
272
273
279
286
292
298
304
310
316
317
319 """Consumer that converts a rebase record to a Record object.
320
321 Members:
322 data Record with rebase data.
323
324 """
327
330
333
345
349
353
357
361
365
370
372 cols = string.split( line, ':' )
373 cols = string.split( cols[ 1 ] )
374 self.data.date_entered = string.join( cols[ :3 ] )
375
377 cols = string.split( line, ':' )
378 cols = string.split( cols[ 1 ] )
379 self.data.date_modified = string.join( cols[ :3 ] )
380
384
388
392
396
398 cols = string.split( line, ':' )
399 cols = string.split( cols[ 1 ], ' ' )
400 self.data.num_SV40 = cols[ 1 ]
401
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a rebase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's title attribute will be used.

    Raises ValueError if filename does not exist, and KeyError if a
    record yields an empty key or a key that was already indexed.

    """
    # Imported here because no module-level import of os is visible.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Remember which data file this index describes.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while True:
            # Offsets are taken around next() so each entry records where
            # its record starts and how many bytes it spans.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # NOTE(review): Record.__init__ in this file does not set
                # a title attribute, so this default presumably needs a
                # rec2key callback in practice -- confirm.
                key = rec.title

            if not key:
                raise KeyError("empty sequence key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Close the data file even when indexing stops on an error.
        handle.close()
437