Package Bio :: Package Gobase
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Gobase

  1  # Copyright 2000 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with files from Gobase. 
  8  http://megasun.bch.umontreal.ca/gobase/ 
  9   
 10   
 11  Classes: 
 12  Record             Holds gobase sequence data. 
 13  Iterator           Iterates over sequence data in a gobase file. 
 14  Dictionary         Accesses a gobase file using a dictionary interface. 
 15  RecordParser       Parses gobase sequence data into a Record object. 
 16   
 17  _Scanner           Scans a gobase-format stream. 
 18  _RecordConsumer    Consumes gobase data to a Record object. 
 19   
 20   
 21  Functions: 
 22  index_file         Index a Gobase file for a Dictionary. 
 23   
 24  """ 
 25   
 26  import warnings 
 27  warnings.warn("Bio.Gobase is deprecated, as this module doesn't seem to have any users. If you are using Bio.Gobase, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module.", DeprecationWarning) 
 28   
 29  from types import * 
 30  import string 
 31  import re 
 32  from Bio import File 
 33  from Bio import Index 
 34  from Bio.ParserSupport import * 
 35   
class Record:
    """Hold the fields common to every kind of Gobase record.

    Members:
    species_name    Species name ('' until a record has been parsed).
    taxon_division  Taxon division ('' until a record has been parsed).
    gobase_id       Gobase identifier.  Defaults to ''; the scanner in this
                    module never assigns it.
    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty Record.

        colwidth is kept in the signature for backward compatibility only;
        it is not used by this class.

        """
        self.species_name = ''
        self.taxon_division = ''
        # Documented as a member above but previously never initialised, so
        # reading it on a fresh Record raised AttributeError.
        self.gobase_id = ''
53
class SequenceRecord(Record):
    """Record describing a Gobase sequence entry.

    Members (all initialised to the empty string):
    molecule_type
    is_plasmid
    shape
    submission_date
    update_date
    entrez_record
    genbank_accession
    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty SequenceRecord.  The base Record initialiser is run
        first (without forwarding colwidth), then every sequence-specific
        member is set to ''.

        """
        Record.__init__(self)
        sequence_members = (
            'molecule_type',
            'is_plasmid',
            'shape',
            'submission_date',
            'update_date',
            'entrez_record',
            'genbank_accession',
        )
        for member in sequence_members:
            setattr(self, member, '')
81
class GeneRecord(Record):
    """Record describing a Gobase gene entry.

    Members (all initialised to the empty string):
    gene_class
    plasmid_encoded
    is_partial_gene
    is_pseudo_gene
    is_transpliced_gene
    chloroplast_origin
    contains_intron
    orf
    included_in_intron
    published_info
    genbank_accession
    entrez_record
    product_type
    product_class
    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty GeneRecord.  The base Record initialiser is run
        first (without forwarding colwidth), then every gene-specific member
        is set to ''.

        """
        Record.__init__(self)
        # Classification and origin flags.
        self.gene_class = self.plasmid_encoded = ''
        self.is_partial_gene = self.is_pseudo_gene = ''
        self.is_transpliced_gene = self.chloroplast_origin = ''
        # Structural annotations.
        self.contains_intron = self.orf = self.included_in_intron = ''
        # Cross references.
        self.published_info = self.genbank_accession = ''
        self.entrez_record = ''
        # Product description.
        self.product_type = self.product_class = ''
109
class ProteinRecord(Record):
    """Record describing a Gobase protein entry.

    Members (all initialised to the empty string):
    product_class
    gene_class
    is_partial_protein
    is_plasmid
    is_pseudo
    function
    entrez_record   Value of the 'NCBI Entrez record:' field; this is the
                    attribute the scanner in this module assigns.
    entry_record    Historical spelling, kept so existing callers that read
                    it do not break.
    """

    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create an empty ProteinRecord.  The base Record initialiser is run
        first (without forwarding colwidth), then every protein-specific
        member is set to ''.

        """
        Record.__init__(self)
        self.product_class = ''
        self.gene_class = ''
        self.is_partial_protein = ''
        self.is_plasmid = ''
        self.is_pseudo = ''
        self.function = ''
        self.entry_record = ''
        # The scanner and the sibling record classes use 'entrez_record';
        # initialise it so a fresh ProteinRecord has the attribute too.
        self.entrez_record = ''
136
class Iterator:
    """Returns one record at a time from a Gobase file.

    Methods:
    next   Return the next record from the stream, or None.

    """

    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object (anything with
        a readline method).  parser is an optional Parser object to change
        the results into another form.  If set to None, then the raw
        contents of the file will be returned.

        Raises ValueError if handle does not look like a file.

        """
        # Duck-type the handle: the former FileType/InstanceType test only
        # exists on Python 2 and rejected new-style file-like objects.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        # Qualified as File.SGMLHandle, matching _Scanner.feed; the bare
        # name SGMLHandle is not imported by this module.
        self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next gobase record from the file.  If no more records,
        return None.

        Lines are accumulated until the end of the stream, or until a line
        starting with the record tag is met after at least one line has been
        collected; that line is pushed back for the following call.

        """
        lines = []
        # NOTE(review): this tag is what the original code searched for; it
        # looks inherited from another parser -- confirm the real Gobase
        # record delimiter.
        first_tag = 'Recognition Sequence'
        while True:
            line = self._uhandle.readline()
            if not line:
                break
            # Compare against the variable (the original compared against
            # the literal string 'first_tag', which never matched).
            if lines and line[:len(first_tag)] == first_tag:
                self._uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate over the records by calling next() until it returns None."""
        return iter(self.next, None)
183
class Dictionary:
    """Dictionary-style, read-only access to an indexed gobase file."""

    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open the index called indexname (as written by index_file) and the
        data file whose name is stored in it.  parser, when given, converts
        the raw text of each looked-up record through its parse method;
        when None the raw text is returned.

        """
        self._index = Index.Index(indexname)
        data_path = self._index[Dictionary.__filename_key]
        self._handle = open(data_path)
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the underlying index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key, parsed if a parser was given."""
        # Each index entry is a (start offset, length) pair into the file.
        offset, size = self._index[key]
        source = self._handle
        source.seek(offset)
        raw = source.read(size)
        if self._parser is None:
            return raw
        return self._parser.parse(File.StringHandle(raw))

    def __getattr__(self, name):
        """Delegate any other attribute lookup to the underlying index."""
        return getattr(self._index, name)
217
class RecordParser:
    """Parser turning gobase data into a record object.

    A _Scanner reads the handle and reports to a _RecordConsumer; the
    consumer's data attribute is what parse() hands back.

    """

    def __init__(self):
        """Create the scanner/consumer pair reused by every parse call."""
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Feed handle to the scanner and return the consumer's data."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
229
class _Scanner:
    """Scans a gobase file.

    Methods:
    feed   Feed in one gobase record.

    The record text is flattened into a single string in which every input
    line is whitespace-normalised and followed by one space; fields are then
    located in that string by their 'Label:' text.

    """

    # Pattern used to find the start of the next 'Label:' when the caller
    # does not name the following field.  Compiled once for the class.
    _label_pattern = re.compile('[A-Z][a-z0-9 ]+:')

    # Number of lines read for one record by _scan_record.
    _max_record_lines = 100

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in gobase data for scanning.  handle is a file-like object
        containing gobase data.  consumer is a Consumer object that will
        receive events as the gobase data is scanned.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        uhandle = File.SGMLHandle(uhandle)

        # Nothing to scan when the stream is already exhausted.
        if uhandle.peekline():
            self._scan_record(uhandle, consumer)

    def _scan_line(self, uhandle):
        """Read one line; return it with whitespace runs collapsed plus a space."""
        line = safe_readline(uhandle)
        return ' '.join(line.split()) + ' '

    def _text_in(self, uhandle, text, count):
        """Append up to count normalised lines from uhandle to text.

        Reading stops early, returning what was collected so far, as soon
        as a line cannot be read.

        """
        pieces = [text]
        for _ in range(count):
            try:
                pieces.append(self._scan_line(uhandle))
            except Exception:
                # The exception type raised by safe_readline at end of
                # stream is not visible from this module, so any read
                # failure is treated as the end of the record.  The former
                # bare 'except' kept looping and could read an unbound name.
                break
        return ''.join(pieces)

    def _scan_sequence_record(self, text, consumer):
        """Fill the SequenceRecord-specific members of consumer.data."""
        data = consumer.data
        next_item = self._scan_field(text, 'Molecule type:', 'Species name:')
        data.molecule_type = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Shape of molecule:', 'Sequence length:')
        data.shape = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Plasmid:', 'Complete genome:')
        data.is_plasmid = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'NCBI Entrez record:', 'Genbank accession:')
        data.entrez_record = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Genbank accession:', 'Coding gene(s):')
        data.genbank_accession = consumer.text_field(next_item)
        consumer.data = data

    def _scan_gene_record(self, text, consumer):
        """Fill the GeneRecord-specific members of consumer.data."""
        data = consumer.data
        next_item = self._scan_field(text, 'Gene Class:', 'Species name:')
        data.gene_class = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Plasmid encoded:', 'Partial gene:')
        # GeneRecord declares 'plasmid_encoded'; the value used to be stored
        # only under 'is_plasmid', which is kept for existing callers.
        data.plasmid_encoded = consumer.word_field(next_item)
        data.is_plasmid = data.plasmid_encoded

        next_item = self._scan_field(text, 'Partial gene:', 'Pseudo:')
        data.is_partial_gene = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Pseudo:', 'Transpliced gene:')
        data.is_pseudo_gene = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Transpliced gene:', 'Chloroplast origin:')
        data.is_transpliced_gene = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Chloroplast origin:', 'Contains intron(s):')
        data.chloroplast_origin = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'Contains intron(s):')
        data.contains_intron = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'Included in intron:')
        data.included_in_intron = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'ORF:')
        data.orf = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'NCBI Entrez record:')
        data.entrez_record = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'Genbank accession:', 'Product type:')
        data.genbank_accession = consumer.word_field(next_item)

        next_item = self._scan_field(text, 'Product type:', 'Product Class:')
        data.product_type = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Product Class:')
        data.product_class = consumer.text_field(next_item)

        consumer.data = data

    def _scan_protein_record(self, text, consumer):
        """Fill the ProteinRecord-specific members of consumer.data."""
        data = consumer.data
        next_item = self._scan_field(text, 'Product Class:', 'Species name:')
        data.product_class = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Gene Class:', 'Partial protein:')
        data.gene_class = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Partial protein:', 'Conflict:')
        data.is_partial_protein = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Plasmid:', 'Sequence length:')
        data.is_plasmid = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'General function:')
        data.function = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'NCBI Entrez record:')
        data.entrez_record = consumer.word_field(next_item)

        consumer.data = data

    def _scan_record(self, uhandle, consumer):
        """Read one record from uhandle and store the result in consumer.data.

        The record type is chosen from the first word of the flattened text
        ('Sequence', 'Gene' or 'Protein').  Any other text yields a plain
        Record carrying only the common fields, and a warning is issued.

        """
        text = self._text_in(uhandle, '', self._max_record_lines).lstrip()

        if text.startswith('Sequence'):
            consumer.data = SequenceRecord()
            self._scan_sequence_record(text, consumer)
        elif text.startswith('Gene'):
            consumer.data = GeneRecord()
            self._scan_gene_record(text, consumer)
        elif text.startswith('Protein'):
            consumer.data = ProteinRecord()
            self._scan_protein_record(text, consumer)
        else:
            # Previously this printed 'UNKNOWN!!!!!!' and went on to use
            # whatever consumer.data already held (None, or the record of an
            # earlier parse).  Start from a fresh base record instead.
            warnings.warn("Unknown Gobase record type; only the common "
                          "fields will be parsed.")
            consumer.data = Record()

        data = consumer.data
        next_item = self._scan_field(text, 'Species name:', 'Taxon division')
        data.species_name = consumer.text_field(next_item)

        next_item = self._scan_field(text, 'Taxon division:')
        data.taxon_division = consumer.word_field(next_item)
        consumer.data = data

    def _scan_field(self, text, field, next_field=None):
        """Return the slice of text holding field and its value.

        The slice starts at the first occurrence of field.  It ends at the
        first occurrence of next_field found after field or, when next_field
        is None, at the next 'Label:' match (falling back to 40 characters
        from the start of field when nothing matches).  Returns '' when
        field, or a requested next_field, cannot be found.

        """
        start = text.find(field)
        if start == -1:
            return ''
        offset = start + len(field)
        if next_field is None:
            match = self._label_pattern.search(text, offset)
            if match:
                end = match.start()
            else:
                end = start + 40
        else:
            # Look only after the field itself; an earlier occurrence of
            # next_field used to produce an empty slice.
            end = text.find(next_field, offset)
            if end == -1:
                return ''
        return text[start:end]
399 400
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a gobase record to a Record object.

    Members:
    data    Record with gobase data.

    The *_field helpers take a 'Label: value' slice (as produced by the
    scanner) and return the value part.  A slice with a label but no value
    yields the same result as an empty slice instead of raising IndexError.

    """

    def __init__(self):
        self.data = None

    def end_sequence(self):
        """Do nothing; present so callers may signal the end of a record."""
        pass

    def _value_after(self, line, separator):
        """Return the piece of line after the first separator, or ''."""
        cols = line.split(separator)
        if len(cols) < 2:
            return ''
        return cols[1]

    def text_field(self, line):
        """Return the text between the first and the second ': ' in line."""
        if line == '':
            return ''
        return self._value_after(line, ': ')

    def int_field(self, line):
        """Return the value after ': ' as an int, or None if there is none.

        A value that is present but not numeric still raises ValueError.

        """
        if line == '':
            return None
        value = self._value_after(line, ': ')
        if not value.strip():
            return None
        return int(value)

    def word_field(self, line):
        """Return the first whitespace-delimited word after ': ', or ''."""
        if line == '':
            return ''
        words = self._value_after(line, ': ').split()
        if not words:
            return ''
        return words[0]

    def date_field(self, line):
        """Return up to the first three words after ':' joined by spaces."""
        if line == '':
            return ''
        words = self._value_after(line, ':').split()
        return ' '.join(words[:3])
439 440
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a gobase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's 'title' attribute is used when it has one.

    Raises ValueError if filename does not exist, and KeyError if a record
    produces an empty key or a key that is already in the index.

    """
    # os is not among this module's top-level imports, so bring it in here.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while True:
            # Position before and after reading gives the record's extent.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # None of the record classes in this module defines 'title';
                # a missing title falls through to the empty-key error below
                # rather than raising AttributeError.
                key = getattr(rec, 'title', None)

            if not key:
                raise KeyError("empty sequence key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # The data file was previously left open.
        handle.close()
476