Package Bio :: Package Rebase
[hide private]
[frames | no frames]

Source Code for Package Bio.Rebase

  1  # Copyright 2000 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with files from Rebase. 
  8  http://rebase.neb.com/rebase/rebase.html 
  9   
 10   
 11  Classes: 
 12  Record             Holds rebase sequence data. 
 13  Iterator           Iterates over sequence data in a rebase file. 
 14  Dictionary         Accesses a rebase file using a dictionary interface. 
 15  RecordParser       Parses rebase sequence data into a Record object. 
 16   
 17  _Scanner           Scans a rebase-format stream. 
 18  _RecordConsumer    Consumes rebase data to a Record object. 
 19   
 20   
 21  Functions: 
 22  index_file         Index a rebase file for a Dictionary. 
 23   
 24  """ 
 25   
 26  import warnings 
# Module-level statement, so it runs when Bio.Rebase is imported: tells the
# user the module is deprecated and how to ask for it to be kept.
warnings.warn("Bio.Rebase was deprecated, as it does not seem to be able to parse recent HTML files from Rebase. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
 28   
 29   
 30  from types import * 
 31  import string 
 32  from Bio import File 
 33  from Bio import Index 
 34  from Bio.ParserSupport import * 
 35   
class Record:
    """Holds information from a rebase record.

    Members:
    seq_5_to_3       The sequence.
    seq_3_to_5
    methylation      Methylation information
                     ('Base (Type of methylation)' field).
    enzyme_num       The enzyme number
    pos              Position of cleavage (None unless set by the caller;
                     the parser in this module does not fill it in).
    prototype        Prototype
    source
    microorganism
    temperature      Growth temperature
    misc             Miscellaneous information
    date_entered
    date_modified
    num_Adeno2
    num_Lambda
    num_pBR322
    num_PhiX174
    num_SV40

    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty Record.  colwidth specifies the number of
        residues to put on each line; it is only stored (as _colwidth)
        by this class.

        """
        self.seq_5_to_3 = ''
        self.seq_3_to_5 = ''
        self.methylation = ''
        self.enzyme_num = None
        # Documented as a member, so make sure the attribute always exists
        # instead of raising AttributeError on first access.
        self.pos = None
        self.prototype = ''
        self.source = ''
        self.microorganism = ''
        self.temperature = None
        self.misc = ''
        self.date_entered = ''
        self.date_modified = ''
        self._colwidth = colwidth
        self.num_Adeno2 = 0
        self.num_Lambda = 0
        self.num_pBR322 = 0
        self.num_PhiX174 = 0
        self.num_SV40 = 0
82
class Iterator:
    """Returns one record at a time from a Rebase file.

    Methods:
    next   Return the next record from the stream, or None.

    """

    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises ValueError if handle has no readline method.

        """
        # Duck-type the handle rather than comparing against FileType /
        # InstanceType, which rejects new-style file-like objects.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        # SGMLHandle lives in Bio.File (it is referenced the same way by
        # _Scanner.feed); the bare name is not defined in this module.
        self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next rebase record from the file.  If no more records,
        return None.

        A record is the run of lines up to, but not including, the next
        line that starts with 'Recognition Sequence'.  With a parser the
        text is handed to parser.parse(); otherwise the raw text is returned.

        """
        first_tag = 'Recognition Sequence'
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            # Reached the start of the following record: push the line back
            # so the next call sees it, and stop.  The "lines and" guard
            # lets a record begin with its own tag line.
            if lines and line[:len(first_tag)] == first_tag:
                self._uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # Two-argument iter(): call next() until it returns the None sentinel.
        return iter(self.next, None)
129
class Dictionary:
    """Accesses a rebase file using a dictionary interface.

    Lookups go through an index that maps each key to a
    (start offset, length) pair in the indexed file.

    """
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a rebase Dictionary.  indexname is the name of the index for
        the dictionary; the index should have been created using the
        index_file function.  parser is an optional Parser object to change
        the results into another form.  If set to None, then the raw
        contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # The index stores the name of the data file under a reserved key.
        data_filename = self._index[Dictionary.__filename_key]
        self._handle = open(data_filename)
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        offset, size = self._index[key]
        self._handle.seek(offset)
        raw = self._handle.read(size)
        if self._parser is None:
            return raw
        return self._parser.parse(File.StringHandle(raw))

    def __getattr__(self, name):
        # Anything not defined here (keys, has_key, ...) is answered by
        # the underlying index object.
        return getattr(self._index, name)
163
class RecordParser:
    """Parses rebase sequence data into a Record object.

    """

    def __init__(self):
        # One scanner/consumer pair is created here and reused by every
        # parse() call.
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """Scan handle and return whatever the consumer holds in .data."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
175
176 -class _Scanner:
177 """Scans a rebase file. 178 179 Methods: 180 feed Feed in one rebase record. 181 182 """
183 - def feed(self, handle, consumer):
184 """feed(self, handle, consumer) 185 186 Feed in rebase data for scanning. handle is a file-like object 187 containing rebase data. consumer is a Consumer object that will 188 receive events as the rebase data is scanned. 189 190 """ 191 if isinstance(handle, File.UndoHandle): 192 uhandle = handle 193 else: 194 uhandle = File.UndoHandle(handle) 195 uhandle = File.SGMLHandle( uhandle ) 196 197 if uhandle.peekline(): 198 self._scan_record(uhandle, consumer)
199
200 - def _scan_line(self, uhandle ):
201 line = safe_readline( uhandle ) 202 line = string.join( string.split( line ), ' ' ) + ' ' 203 return line
204
205 - def _text_in( self, uhandle, text, count ):
206 for j in range( count ): 207 line = self._scan_line( uhandle ) 208 text = text + line 209 return text
210
211 - def _scan_record(self, uhandle, consumer):
212 consumer.start_sequence() 213 text = '' 214 text = self._text_in( uhandle, text, 100 ) 215 self._scan_sequence( text, consumer) 216 self._scan_methylation( text, consumer) 217 self._scan_enzyme_num( text, consumer ) 218 self._scan_prototype( text, consumer ) 219 self._scan_source( text, consumer ) 220 self._scan_microorganism( text, consumer ) 221 self._scan_temperature( text, consumer) 222 self._scan_date_entered( text, consumer) 223 self._scan_date_modified( text, consumer) 224 self._scan_Adeno2( text, consumer) 225 self._scan_Lambda( text, consumer) 226 self._scan_pBR322( text, consumer) 227 self._scan_PhiX174( text, consumer) 228 self._scan_SV40( text, consumer)
229 # consumer.end_sequence() 230 231
232 - def _scan_sequence(self, text, consumer ):
233 start = string.find( text, 'Recognition Sequence:' ) 234 end = string.find( text, 'Base (Type of methylation):' ) 235 if( end == -1 ): 236 end = string.find( text, 'REBASE enzyme #:' ) 237 next_item = text[ start:end ] 238 consumer.sequence( next_item )
239
240 - def _scan_methylation(self, text, consumer ):
241 start = string.find( text, 'Base (Type of methylation):' ) 242 if( start != -1 ): 243 end = string.find( text, 'REBASE enzyme #:' ) 244 next_item = text[ start:end ] 245 consumer.methylation( next_item )
246
247 - def _scan_enzyme_num(self, text, consumer ):
248 start = string.find( text, 'REBASE enzyme #:' ) 249 end = string.find( text, 'Prototype:' ) 250 next_item = text[ start:end ] 251 consumer.enzyme_num( next_item )
252
253 - def _scan_prototype(self, text, consumer ):
254 start = string.find( text, 'Prototype:' ) 255 end = string.find( text, 'Source:' ) 256 next_item = text[ start:end ] 257 consumer.prototype( next_item )
258
259 - def _scan_source(self, text, consumer ):
260 start = string.find( text, 'Source:' ) 261 end = string.find( text, 'Microorganism:' ) 262 next_item = text[ start:end ] 263 consumer.source( next_item )
264 265
266 - def _scan_microorganism(self, text, consumer ):
267 start = string.find( text, 'Microorganism:' ) 268 end = string.find( text, 'Growth Temperature:' ) 269 next_item = text[ start:end ] 270 consumer.microorganism( next_item )
271
272 - def _scan_temperature(self, text, consumer):
273 start = string.find( text, 'Growth Temperature:' ) 274 end = start + 30 275 next_item = text[ start:end ] 276 consumer.temperature( next_item )
277 278
279 - def _scan_date_entered(self, text, consumer):
280 start = string.find( text, 'Entered:' ) 281 end = start + 30 282 next_item = text[ start:end ] 283 consumer.data_entered( next_item )
284
285 - def _scan_date_modified(self, text, consumer):
286 start = string.find( text, 'Modified:' ) 287 if( start != -1 ): 288 end = start + 30 289 next_item = text[ start:end ] 290 consumer.data_modified( next_item )
291
292 - def _scan_Adeno2( self, text, consumer ):
293 start = string.find( text, 'Adeno2:' ) 294 end = string.find( text, 'Lambda:' ) 295 next_item = text[ start:end ] 296 consumer.num_Adeno2( next_item )
297
298 - def _scan_Lambda( self, text, consumer ):
299 start = string.find( text, 'Lambda:' ) 300 end = string.find( text, 'pBR322:' ) 301 next_item = text[ start:end ] 302 consumer.num_Lambda( next_item )
303
304 - def _scan_pBR322(self, text, consumer ):
305 start = string.find( text, 'pBR322:' ) 306 end = string.find( text, 'PhiX174:' ) 307 next_item = text[ start:end ] 308 consumer.num_pBR322( next_item )
309
310 - def _scan_PhiX174(self, text, consumer ):
311 start = string.find( text, 'PhiX174:' ) 312 end = string.find( text, 'SV40:' ) 313 next_item = text[ start:end ] 314 consumer.num_PhiX174( next_item )
315
316 - def _scan_SV40(self, text, consumer ):
317 start = string.find( text, 'SV40:' ) 318 end = start + 30 319 next_item = text[ start:end ] 320 consumer.num_SV40( next_item )
321 322
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a rebase record to a Record object.

    Members:
    data   Record with rebase data.

    Each handler receives a "Label: value" slice of the record text and
    stores the value on self.data.

    """

    def __init__(self):
        self.data = None

    def start_sequence(self):
        self.data = Record()

    def end_sequence(self):
        pass

    def _value(self, line):
        """Return the piece of line that follows the first ': '."""
        return line.split(': ')[1]

    def sequence(self, line):
        sequence = self._value(line).strip()
        if ' ...' in sequence:
            # Only the part after the first '...' is kept.
            self.data.seq_5_to_3 = sequence.split('...')[1]
        elif sequence.lower() != 'unknown':
            # The value is cut in the middle: first half is the 5'->3'
            # strand, second half the 3'->5' strand.  Floor division keeps
            # the index an int where "/" is true division.
            seq_len = len(sequence) // 2
            self.data.seq_5_to_3 = sequence[:seq_len].strip()
            self.data.seq_3_to_5 = sequence[seq_len:].strip()

    def methylation(self, line):
        self.data.methylation = self._value(line)

    def enzyme_num(self, line):
        self.data.enzyme_num = int(self._value(line))

    def prototype(self, line):
        self.data.prototype = self._value(line)

    def source(self, line):
        self.data.source = self._value(line)

    def microorganism(self, line):
        self.data.microorganism = self._value(line)

    def temperature(self, line):
        # The text after ':' starts with a space, so splitting on ' '
        # gives an empty first item; the value is the second one.
        cols = line.split(':')[1].split(' ')
        self.data.temperature = cols[1]

    def data_entered(self, line):
        # Keep the first three whitespace-separated words after the label.
        cols = line.split(':')[1].split()
        self.data.date_entered = ' '.join(cols[:3])

    def data_modified(self, line):
        # Keep the first three whitespace-separated words after the label.
        cols = line.split(':')[1].split()
        self.data.date_modified = ' '.join(cols[:3])

    def num_Adeno2(self, line):
        self.data.num_Adeno2 = int(self._value(line))

    def num_Lambda(self, line):
        self.data.num_Lambda = int(self._value(line))

    def num_pBR322(self, line):
        self.data.num_pBR322 = int(self._value(line))

    def num_PhiX174(self, line):
        self.data.num_PhiX174 = int(self._value(line))

    def num_SV40(self, line):
        # NOTE(review): unlike the other num_* handlers this stores the
        # value as a string, not an int; left unchanged so the attribute
        # type seen by existing callers stays the same -- confirm intent.
        cols = line.split(':')[1].split(' ')
        self.data.num_SV40 = cols[1]
406
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a rebase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's title attribute will be used.

    Raises ValueError if filename does not exist, and KeyError if a record
    yields an empty key or a key that is already in the index.

    """
    # Local import: this module's import block does not import os.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Store the data file name under the reserved (name-mangled) key that
    # Dictionary reads back when it opens the index.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Take the stream position before and after each record so the
            # index can store its (start, length).
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # NOTE(review): the Record class in this module defines no
                # 'title' attribute, so a call without rec2key fails here
                # with AttributeError -- confirm the intended default key.
                key = rec.title

            if not key:
                raise KeyError("empty sequence key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Always release the data file, including when a KeyError is raised.
        handle.close()
442