Package Bio :: Package Rebase
[hide private]
[frames | no frames]

Source Code for Package Bio.Rebase

  1  # Copyright 2000 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with files from Rebase. 
  8  http://rebase.neb.com/rebase/rebase.html 
  9   
 10   
 11  Classes: 
 12  Record             Holds rebase sequence data. 
 13  Iterator           Iterates over sequence data in a rebase file. 
 14  Dictionary         Accesses a rebase file using a dictionary interface. 
 15  RecordParser       Parses rebase sequence data into a Record object. 
 16   
 17  _Scanner           Scans a rebase-format stream. 
 18  _RecordConsumer    Consumes rebase data to a Record object. 
 19   
 20   
 21  Functions: 
 22  index_file         Index a Rebase file for a Dictionary. 
 23   
 24  """ 
 25  from types import * 
 26  import string 
 27  from Bio import File 
 28  from Bio import Index 
 29  from Bio.ParserSupport import * 
 30   
class Record:
    """Holds the fields of a single Rebase record.

    Members:
    seq_5_to_3      The sequence.
    seq_3_to_5
    methylation
    enzyme_num      The enzyme number
    prototype       Prototype
    source
    microorganism
    temperature     Growth temperature
    misc            Miscellaneous information
    date_entered
    date_modified
    num_Adeno2
    num_Lambda
    num_pBR322
    num_PhiX174
    num_SV40

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty Record.  colwidth is stored on the instance
        as _colwidth; nothing else in this class reads it.

        """
        self._colwidth = colwidth

        # Textual fields start out as empty strings.
        self.seq_5_to_3 = self.seq_3_to_5 = ''
        self.methylation = ''
        self.prototype = self.source = self.microorganism = ''
        self.misc = ''
        self.date_entered = self.date_modified = ''

        # Fields with no value until a record has been consumed.
        self.enzyme_num = self.temperature = None

        # Per-genome counters start at zero.
        self.num_Adeno2 = self.num_Lambda = self.num_pBR322 = 0
        self.num_PhiX174 = self.num_SV40 = 0
77
class Iterator:
    """Returns one record at a time from a Rebase file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises ValueError if handle does not provide a readline method.

        """
        # Duck-type the handle instead of comparing against FileType /
        # InstanceType, which rejected new-style file-like objects.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        # Same wrapping as _Scanner.feed: SGMLHandle is reached through File.
        self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next rebase record from the file.  If no more records,
        return None.

        Lines are collected until a line starting with
        'Recognition Sequence' is seen after at least one line has been
        gathered; that line is pushed back so it starts the next record.
        The collected text is returned as is, or passed through the
        parser when one was supplied.

        """
        first_tag = 'Recognition Sequence'
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            # Start of the following record: put the line back and stop.
            if lines and line[:len(first_tag)] == first_tag:
                self._uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate by calling next() until it returns None."""
        return iter(self.next, None)
124
class Dictionary:
    """Accesses a rebase file using a dictionary interface.

    """
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Rebase Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # The index stores the data file's name under a reserved key.
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the underlying index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key.

        The index maps key to a (start, length) pair; that span is read
        from the data file and returned raw, or parsed when a parser was
        supplied.

        """
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        """Delegate unknown attribute lookups to the underlying index."""
        # Without this guard, looking up anything before _index exists
        # would re-enter __getattr__ for '_index' without end.
        if name == '_index':
            raise AttributeError(name)
        return getattr(self._index, name)
158
class RecordParser:
    """Parses rebase sequence data into a Record object.

    """
    def __init__(self):
        """Create the scanner/consumer pair used by parse."""
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """parse(self, handle) -> object

        Feed handle through the scanner into the consumer and return
        whatever the consumer holds in its data attribute afterwards.

        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
170
171 -class _Scanner:
172 """Scans a rebase file. 173 174 Methods: 175 feed Feed in one rebase record. 176 177 """
178 - def feed(self, handle, consumer):
179 """feed(self, handle, consumer) 180 181 Feed in rebase data for scanning. handle is a file-like object 182 containing rebase data. consumer is a Consumer object that will 183 receive events as the rebase data is scanned. 184 185 """ 186 if isinstance(handle, File.UndoHandle): 187 uhandle = handle 188 else: 189 uhandle = File.UndoHandle(handle) 190 uhandle = File.SGMLHandle( uhandle ) 191 192 if uhandle.peekline(): 193 self._scan_record(uhandle, consumer)
194
195 - def _scan_line(self, uhandle ):
196 line = safe_readline( uhandle ) 197 line = string.join( string.split( line ), ' ' ) + ' ' 198 return line
199
200 - def _text_in( self, uhandle, text, count ):
201 for j in range( count ): 202 line = self._scan_line( uhandle ) 203 text = text + line 204 return text
205
206 - def _scan_record(self, uhandle, consumer):
207 consumer.start_sequence() 208 text = '' 209 text = self._text_in( uhandle, text, 100 ) 210 self._scan_sequence( text, consumer) 211 self._scan_methylation( text, consumer) 212 self._scan_enzyme_num( text, consumer ) 213 self._scan_prototype( text, consumer ) 214 self._scan_source( text, consumer ) 215 self._scan_microorganism( text, consumer ) 216 self._scan_temperature( text, consumer) 217 self._scan_date_entered( text, consumer) 218 self._scan_date_modified( text, consumer) 219 self._scan_Adeno2( text, consumer) 220 self._scan_Lambda( text, consumer) 221 self._scan_pBR322( text, consumer) 222 self._scan_PhiX174( text, consumer) 223 self._scan_SV40( text, consumer)
224 # consumer.end_sequence() 225 226
227 - def _scan_sequence(self, text, consumer ):
228 start = string.find( text, 'Recognition Sequence:' ) 229 end = string.find( text, 'Base (Type of methylation):' ) 230 if( end == -1 ): 231 end = string.find( text, 'REBASE enzyme #:' ) 232 next_item = text[ start:end ] 233 consumer.sequence( next_item )
234
235 - def _scan_methylation(self, text, consumer ):
236 start = string.find( text, 'Base (Type of methylation):' ) 237 if( start != -1 ): 238 end = string.find( text, 'REBASE enzyme #:' ) 239 next_item = text[ start:end ] 240 consumer.methylation( next_item )
241
242 - def _scan_enzyme_num(self, text, consumer ):
243 start = string.find( text, 'REBASE enzyme #:' ) 244 end = string.find( text, 'Prototype:' ) 245 next_item = text[ start:end ] 246 consumer.enzyme_num( next_item )
247
248 - def _scan_prototype(self, text, consumer ):
249 start = string.find( text, 'Prototype:' ) 250 end = string.find( text, 'Source:' ) 251 next_item = text[ start:end ] 252 consumer.prototype( next_item )
253
254 - def _scan_source(self, text, consumer ):
255 start = string.find( text, 'Source:' ) 256 end = string.find( text, 'Microorganism:' ) 257 next_item = text[ start:end ] 258 consumer.source( next_item )
259 260
261 - def _scan_microorganism(self, text, consumer ):
262 start = string.find( text, 'Microorganism:' ) 263 end = string.find( text, 'Growth Temperature:' ) 264 next_item = text[ start:end ] 265 consumer.microorganism( next_item )
266
267 - def _scan_temperature(self, text, consumer):
268 start = string.find( text, 'Growth Temperature:' ) 269 end = start + 30 270 next_item = text[ start:end ] 271 consumer.temperature( next_item )
272 273
274 - def _scan_date_entered(self, text, consumer):
275 start = string.find( text, 'Entered:' ) 276 end = start + 30 277 next_item = text[ start:end ] 278 consumer.data_entered( next_item )
279
280 - def _scan_date_modified(self, text, consumer):
281 start = string.find( text, 'Modified:' ) 282 if( start != -1 ): 283 end = start + 30 284 next_item = text[ start:end ] 285 consumer.data_modified( next_item )
286
287 - def _scan_Adeno2( self, text, consumer ):
288 start = string.find( text, 'Adeno2:' ) 289 end = string.find( text, 'Lambda:' ) 290 next_item = text[ start:end ] 291 consumer.num_Adeno2( next_item )
292
293 - def _scan_Lambda( self, text, consumer ):
294 start = string.find( text, 'Lambda:' ) 295 end = string.find( text, 'pBR322:' ) 296 next_item = text[ start:end ] 297 consumer.num_Lambda( next_item )
298
299 - def _scan_pBR322(self, text, consumer ):
300 start = string.find( text, 'pBR322:' ) 301 end = string.find( text, 'PhiX174:' ) 302 next_item = text[ start:end ] 303 consumer.num_pBR322( next_item )
304
305 - def _scan_PhiX174(self, text, consumer ):
306 start = string.find( text, 'PhiX174:' ) 307 end = string.find( text, 'SV40:' ) 308 next_item = text[ start:end ] 309 consumer.num_PhiX174( next_item )
310
311 - def _scan_SV40(self, text, consumer ):
312 start = string.find( text, 'SV40:' ) 313 end = start + 30 314 next_item = text[ start:end ] 315 consumer.num_SV40( next_item )
316 317
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a rebase record to a Record object.

    Members:
    data    Record with rebase data.

    Each handler receives the text slice produced by the scanner for one
    field, i.e. the field tag followed by its value.

    """
    def __init__(self):
        self.data = None

    def start_sequence(self):
        """Begin a new record by replacing data with a fresh Record."""
        self.data = Record()

    def end_sequence(self):
        """Do nothing at the end of a record."""
        pass

    def _value(self, line):
        """Return the text that follows the first ': ' separator in line."""
        return line.split(': ')[1]

    def sequence(self, line):
        """Store the recognition sequence.

        A value containing ' ...' is split on '...' and its second part
        is stored as seq_5_to_3.  Any other value except 'unknown' (case
        insensitive) is cut in half: the first half becomes seq_5_to_3
        and the second half seq_3_to_5.

        """
        sequence = self._value(line).strip()
        if sequence.find(' ...') != -1:
            self.data.seq_5_to_3 = sequence.split('...')[1]
        elif sequence.lower() != 'unknown':
            # Floor division keeps the slice index an int under true division.
            seq_len = len(sequence) // 2
            self.data.seq_5_to_3 = sequence[:seq_len].strip()
            self.data.seq_3_to_5 = sequence[seq_len:].strip()

    def methylation(self, line):
        """Store the text after ': ' as methylation."""
        self.data.methylation = self._value(line)

    def enzyme_num(self, line):
        """Store the text after ': ' as an integer enzyme_num."""
        self.data.enzyme_num = int(self._value(line))

    def prototype(self, line):
        """Store the text after ': ' as prototype."""
        self.data.prototype = self._value(line)

    def source(self, line):
        """Store the text after ': ' as source."""
        self.data.source = self._value(line)

    def microorganism(self, line):
        """Store the text after ': ' as microorganism."""
        self.data.microorganism = self._value(line)

    def temperature(self, line):
        """Store the first space-delimited token after ':' as temperature."""
        cols = line.split(':')[1].split(' ')
        self.data.temperature = cols[1]

    def data_entered(self, line):
        """Store the first three words after ':' as date_entered."""
        cols = line.split(':')[1].split()
        self.data.date_entered = ' '.join(cols[:3])

    def data_modified(self, line):
        """Store the first three words after ':' as date_modified."""
        cols = line.split(':')[1].split()
        self.data.date_modified = ' '.join(cols[:3])

    def num_Adeno2(self, line):
        """Store the text after ': ' as an integer num_Adeno2."""
        self.data.num_Adeno2 = int(self._value(line))

    def num_Lambda(self, line):
        """Store the text after ': ' as an integer num_Lambda."""
        self.data.num_Lambda = int(self._value(line))

    def num_pBR322(self, line):
        """Store the text after ': ' as an integer num_pBR322."""
        self.data.num_pBR322 = int(self._value(line))

    def num_PhiX174(self, line):
        """Store the text after ': ' as an integer num_PhiX174."""
        self.data.num_PhiX174 = int(self._value(line))

    def num_SV40(self, line):
        """Store the first space-delimited token after ':' as an integer num_SV40."""
        cols = line.split(':')[1].split(' ')
        # Converted to int so it agrees with the other num_* fields,
        # which are all stored as integers.
        self.data.num_SV40 = int(cols[1])
401
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a rebase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the sequence title will be used.

    Raises ValueError if filename does not exist, and KeyError if an
    empty or duplicate key is produced.

    """
    # Imported here because os is not imported explicitly at module level.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Record the data file's name under the Dictionary's reserved key.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Offsets before and after the read give the record's span.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # NOTE(review): Record.__init__ in this module does not set
                # a title attribute -- confirm, or always pass rec2key.
                key = rec.title

            if not key:
                raise KeyError("empty sequence key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Close the data file even when indexing fails part-way.
        handle.close()
437