Package Bio :: Package Gobase
[hide private]
[frames] | no frames]

Source Code for Package Bio.Gobase

  1  # Copyright 2000 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with files from Gobase. 
  8  http://megasun.bch.umontreal.ca/gobase/ 
  9   
 10   
 11  Classes: 
 12  Record             Holds gobase sequence data. 
 13  Iterator           Iterates over sequence data in a gobase file. 
 14  Dictionary         Accesses a gobase file using a dictionary interface. 
 15  RecordParser       Parses gobase sequence data into a Record object. 
 16   
 17  _Scanner           Scans a gobase-format stream. 
 18  _RecordConsumer    Consumes gobase data to a Record object. 
 19   
 20   
 21  Functions: 
 22  index_file         Index a gobase file for a Dictionary. 
 23   
 24  """ 
 25  from types import * 
 26  import string 
 27  import re 
 28  from Bio import File 
 29  from Bio import Index 
 30  from Bio.ParserSupport import * 
 31   
class Record:
    """Holds the information shared by every kind of Gobase record.

    Members:
    species_name       Text of the 'Species name:' field.
    taxon_division     First word of the 'Taxon division:' field.
    gobase_id          Identifier of the record; starts out empty.

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty Record.  colwidth is accepted so existing
        callers keep working, but this class neither reads nor stores it.

        """
        self.species_name = ''
        self.taxon_division = ''
        # The class docstring lists gobase_id as a member, so create it
        # here; otherwise reading it raises AttributeError.
        self.gobase_id = ''
49
class SequenceRecord(Record):
    """Holds information from a Gobase sequence record.

    Members (all start out as empty strings):
    molecule_type
    is_plasmid
    shape
    submission_date
    update_date
    entrez_record
    genbank_accession

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty SequenceRecord.  colwidth is accepted but is
        not used and is not forwarded to Record.__init__.

        """
        Record.__init__(self)
        # Same attributes, same creation order, one loop instead of seven
        # separate assignments.
        for attribute in ('molecule_type', 'is_plasmid', 'shape',
                          'submission_date', 'update_date',
                          'entrez_record', 'genbank_accession'):
            setattr(self, attribute, '')
77
class GeneRecord(Record):
    """Holds information from a Gobase gene record.

    Members (all start out as empty strings):
    gene_class
    plasmid_encoded
    is_partial_gene
    is_pseudo_gene
    is_transpliced_gene
    chloroplast_origin
    contains_intron
    orf
    included_in_intron
    published_info
    genbank_accession
    entrez_record
    product_type
    product_class

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty GeneRecord.  colwidth is accepted but is not
        used and is not forwarded to Record.__init__.

        """
        Record.__init__(self)
        blank_members = (
            'gene_class',
            'plasmid_encoded',
            'is_partial_gene',
            'is_pseudo_gene',
            'is_transpliced_gene',
            'chloroplast_origin',
            'contains_intron',
            'orf',
            'included_in_intron',
            'published_info',
            'genbank_accession',
            'entrez_record',
            'product_type',
            'product_class',
        )
        for member in blank_members:
            setattr(self, member, '')
105
class ProteinRecord(Record):
    """Holds information from a Gobase protein record.

    Members (all start out as empty strings):
    product_class
    gene_class
    is_partial_protein
    is_plasmid
    is_pseudo
    function
    entry_record

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new, empty ProteinRecord.  colwidth is accepted but is
        not used and is not forwarded to Record.__init__.

        """
        Record.__init__(self)
        # Chained assignment binds targets left to right, so attributes
        # are created in the same order as before.
        self.product_class = self.gene_class = ''
        self.is_partial_protein = self.is_plasmid = self.is_pseudo = ''
        self.function = self.entry_record = ''
132
class Iterator:
    """Returns one record at a time from a Gobase file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another
        form.  If set to None, then the raw contents of the file will be
        returned.

        Raises ValueError if handle has no readline method.

        """
        # Duck-type the handle.  The previous FileType/InstanceType test
        # rejected perfectly good file-like objects (for example StringIO
        # or any new-style class) and only exists on Python 2.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        # SGMLHandle lives in Bio.File (as used by _Scanner.feed); the
        # bare name is not defined or visibly imported in this file.
        self._uhandle = File.SGMLHandle(File.UndoHandle(handle))
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next gobase record from the file.  If no more records,
        return None.

        The record is the text of every line read up to, but excluding,
        the next line that starts with the record tag, or up to the end
        of the stream.  When a parser was supplied, the text is handed to
        parser.parse() wrapped in a File.StringHandle and the parser's
        result is returned instead.

        """
        # NOTE(review): this tag looks inherited from another format's
        # iterator; confirm it really opens a gobase record.  When it
        # never occurs, the whole stream is returned as a single record.
        first_tag = 'Recognition Sequence'
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            # Compare against the variable (the old code compared with
            # the literal string 'first_tag', which never matched).  The
            # tag only ends a record once that record has some content.
            if lines and line[:len(first_tag)] == first_tag:
                self._uhandle.saveline(line)
                break
            lines.append(line)

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate over records by calling next() until it returns None."""
        return iter(self.next, None)
179
class Dictionary:
    """Accesses a gobase file using a dictionary interface.

    Keys are looked up in an index that stores, for every record, the
    (start, length) pair locating its text inside the data file.
    Attributes this class does not define itself are looked up on the
    index object.

    """
    # Index entry holding the name of the data file.  The name is
    # private (mangled to _Dictionary__filename_key); index_file relies
    # on that mangled spelling.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Gobase Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key.

        The raw text is read from the data file; if a parser was given it
        is parsed from a File.StringHandle and the parsed object is
        returned instead.

        """
        # 'length' rather than 'len' so the builtin is not shadowed.
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        """Delegate unknown attribute lookups to the index object."""
        # __getattr__ only runs when normal lookup fails.  If _index
        # itself is missing (the instance was never initialised, or
        # __init__ failed early), delegating would re-enter this method
        # forever, so fail cleanly instead.
        if name == '_index':
            raise AttributeError(name)
        return getattr(self._index, name)
213
class RecordParser:
    """Parses Gobase sequence data into a Record object.

    One scanner and one consumer are created per parser and reused for
    every call to parse.

    """
    def __init__(self):
        """Create the scanner/consumer pair used by parse."""
        self._scanner, self._consumer = _Scanner(), _RecordConsumer()

    def parse(self, handle):
        """parse(self, handle) -> record

        Feed handle through the scanner and return whatever the consumer
        holds in its data attribute afterwards.

        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
225
class _Scanner:
    """Scans a gobase file.

    Methods:
    feed   Feed in one gobase record.

    The record text is flattened into one whitespace-normalised string
    and each value is cut out of that string by locating its 'Label:'.

    """
    # Start of the next 'Label:' in the flattened text; used when a field
    # has no explicit successor.  Compiled once instead of on every call.
    _label_pattern = re.compile('[A-Z][a-z0-9 ]+:')

    # Each entry: (record attribute, field label, label of the following
    # field or None, name of the consumer method that extracts the value).
    _SEQUENCE_FIELDS = (
        ('molecule_type', 'Molecule type:', 'Species name:', 'text_field'),
        ('shape', 'Shape of molecule:', 'Sequence length:', 'text_field'),
        ('is_plasmid', 'Plasmid:', 'Complete genome:', 'text_field'),
        ('entrez_record', 'NCBI Entrez record:', 'Genbank accession:',
         'text_field'),
        ('genbank_accession', 'Genbank accession:', 'Coding gene(s):',
         'text_field'),
    )

    _GENE_FIELDS = (
        ('gene_class', 'Gene Class:', 'Species name:', 'text_field'),
        # GeneRecord declares plasmid_encoded, but this value used to be
        # stored only as is_plasmid.  Fill the declared member and keep
        # the old name so existing readers still find it.
        ('plasmid_encoded', 'Plasmid encoded:', 'Partial gene:',
         'word_field'),
        ('is_plasmid', 'Plasmid encoded:', 'Partial gene:', 'word_field'),
        ('is_partial_gene', 'Partial gene:', 'Pseudo:', 'text_field'),
        ('is_pseudo_gene', 'Pseudo:', 'Transpliced gene:', 'text_field'),
        ('is_transpliced_gene', 'Transpliced gene:', 'Chloroplast origin:',
         'text_field'),
        ('chloroplast_origin', 'Chloroplast origin:', 'Contains intron(s):',
         'word_field'),
        ('contains_intron', 'Contains intron(s):', None, 'word_field'),
        ('included_in_intron', 'Included in intron:', None, 'word_field'),
        ('orf', 'ORF:', None, 'word_field'),
        ('entrez_record', 'NCBI Entrez record:', None, 'word_field'),
        ('genbank_accession', 'Genbank accession:', 'Product type:',
         'word_field'),
        ('product_type', 'Product type:', 'Product Class:', 'text_field'),
        ('product_class', 'Product Class:', None, 'text_field'),
    )

    _PROTEIN_FIELDS = (
        ('product_class', 'Product Class:', 'Species name:', 'text_field'),
        ('gene_class', 'Gene Class:', 'Partial protein:', 'text_field'),
        ('is_partial_protein', 'Partial protein:', 'Conflict:',
         'text_field'),
        ('is_plasmid', 'Plasmid:', 'Sequence length:', 'text_field'),
        ('function', 'General function:', None, 'text_field'),
        ('entrez_record', 'NCBI Entrez record:', None, 'word_field'),
        # ProteinRecord declares entry_record; mirror the value there so
        # the declared member is populated as well.
        ('entry_record', 'NCBI Entrez record:', None, 'word_field'),
    )

    _COMMON_FIELDS = (
        ('species_name', 'Species name:', 'Taxon division', 'text_field'),
        ('taxon_division', 'Taxon division:', None, 'word_field'),
    )

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in gobase data for scanning.  handle is a file-like object
        containing gobase data.  consumer is a Consumer object that will
        receive events as the gobase data is scanned.

        Nothing is scanned when the handle has no line left to peek at.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        uhandle = File.SGMLHandle(uhandle)

        if uhandle.peekline():
            self._scan_record(uhandle, consumer)

    def _scan_line(self, uhandle):
        """Read one line and collapse each whitespace run to one space.

        A single trailing space is appended so consecutive lines can be
        concatenated without gluing words together.

        """
        line = safe_readline(uhandle)
        return ' '.join(line.split()) + ' '

    def _text_in(self, uhandle, text, count):
        """Append up to count normalised lines from uhandle to text.

        Reading stops at the first line that cannot be read, normally the
        end of the stream.  The text gathered so far is returned.

        """
        pieces = [text]
        for _ in range(count):
            try:
                pieces.append(self._scan_line(uhandle))
            except Exception:
                # Best effort, as before: a failed read ends the record.
                # The old handler inspected 'line', which is unbound when
                # the very first read fails, and otherwise kept retrying
                # a stream that was already exhausted.
                break
        return ''.join(pieces)

    def _assign_fields(self, text, consumer, specs):
        """Extract every field described by specs into consumer.data."""
        data = consumer.data
        for attribute, label, next_label, converter in specs:
            item = self._scan_field(text, label, next_label)
            setattr(data, attribute, getattr(consumer, converter)(item))
        consumer.data = data

    def _scan_sequence_record(self, text, consumer):
        """Fill consumer.data with the sequence-specific fields of text."""
        self._assign_fields(text, consumer, self._SEQUENCE_FIELDS)

    def _scan_gene_record(self, text, consumer):
        """Fill consumer.data with the gene-specific fields of text."""
        self._assign_fields(text, consumer, self._GENE_FIELDS)

    def _scan_protein_record(self, text, consumer):
        """Fill consumer.data with the protein-specific fields of text."""
        self._assign_fields(text, consumer, self._PROTEIN_FIELDS)

    def _scan_record(self, uhandle, consumer):
        """Read one record from uhandle and store it in consumer.data.

        At most 100 lines are read.  The first word of the flattened text
        selects the record class: 'Sequence', 'Gene' or 'Protein'.  Any
        other text yields a plain Record carrying only the common fields.

        """
        text = self._text_in(uhandle, '', 100).lstrip()

        if text.startswith('Sequence'):
            consumer.data = SequenceRecord()
            self._scan_sequence_record(text, consumer)
        elif text.startswith('Gene'):
            consumer.data = GeneRecord()
            self._scan_gene_record(text, consumer)
        elif text.startswith('Protein'):
            consumer.data = ProteinRecord()
            self._scan_protein_record(text, consumer)
        else:
            # Previously this printed 'UNKNOWN!!!!!!' and carried on with
            # whatever consumer.data held: None on a first parse, which
            # crashed below, or the previous record, which was silently
            # overwritten.  A fresh generic Record avoids both.
            consumer.data = Record()

        self._assign_fields(text, consumer, self._COMMON_FIELDS)

    def _scan_field(self, text, field, next_field=None):
        """Return the slice of text that starts at the label field.

        The slice includes the label itself.  With next_field given, it
        ends where next_field first occurs in text; '' is returned when
        next_field is absent.  Without next_field, it ends at the next
        'Label:' found after the field, or 40 characters after the start
        of the field when no further label exists.  '' is returned when
        field does not occur in text at all.

        """
        start = text.find(field)
        if start == -1:
            return ''
        if next_field is None:
            offset = start + len(field)
            match = self._label_pattern.search(text[offset:])
            if match:
                end = offset + match.start()
            else:
                end = start + 40
        else:
            end = text.find(next_field)
            if end == -1:
                return ''
        return text[start:end]
395 396
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a gobase record to a Record object.

    Members:
    data   Record with gobase data.

    The *_field methods take a 'Label: value' string and return the
    value.  An empty string, or a label with no value after it, yields
    an empty result instead of raising IndexError.

    """
    def __init__(self):
        self.data = None

    def end_sequence(self):
        pass

    def _value(self, line, separator=': '):
        """Return the piece of line after the first separator, or ''.

        Only the piece between the first and second separator is kept,
        exactly as before; '' is returned when there is no separator.

        """
        cols = line.split(separator)
        if len(cols) < 2:
            return ''
        return cols[1]

    def text_field(self, line):
        """Return the value of the field as text ('' when absent)."""
        return self._value(line)

    def int_field(self, line):
        """Return the value of the field as an int (None when absent)."""
        value = self._value(line)
        if not value.strip():
            return None
        return int(value)

    def word_field(self, line):
        """Return the first word of the value ('' when absent)."""
        words = self._value(line).split()
        if not words:
            # A label directly followed by the next label has no words.
            return ''
        return words[0]

    def date_field(self, line):
        """Return the first three words of the value joined by spaces.

        Unlike the other fields, the label is split off at a bare ':'.

        """
        words = self._value(line, ':').split()
        return ' '.join(words[:3])
435 436
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a gobase file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's title attribute is used.

    NOTE(review): none of the record classes in this module define a
    title attribute, so callers will normally need to pass rec2key.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty key or a key that was already indexed.

    """
    # Imported here because the module header does not import os.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # The index stores where each record starts and how long it
            # is, so measure the stream position around every read.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                # A missing title becomes the empty-key error below
                # rather than an AttributeError.
                key = getattr(rec, 'title', None)

            if not key:
                raise KeyError("empty sequence key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # The data file is only needed while indexing.
        handle.close()
472