Package Bio :: Package Mindy :: Module SimpleSeqRecord
[hide private]
[frames | no frames]

Source Code for Module Bio.Mindy.SimpleSeqRecord

"""Index a file based on information in a SeqRecord object.

This indexer tries to make it simple to index a file of records (i.e. like
a GenBank file full of entries) so that individual records can be
readily retrieved.

The indexing in this file takes place by converting the elements in the
file into SeqRecord objects, and then indexing by some item in these
SeqRecords. This is a slower method, but is very flexible.

We have two default functions to index by the id and name elements of a
SeqRecord (i.e. LOCUS and accession number from GenBank). There is also
a base class which you can derive from to create your own indexer
which allows you to index by anything you feel like using python code.
"""
 16  from Bio.builders.SeqRecord.sequence import BuildSeqRecord 
 17   
# --- base class for indexing using SeqRecord information
class BaseSeqRecordIndexer:
    """Base class for indexing using SeqRecord information.

    This is the class you should derive from to index using some type of
    information in a SeqRecord. This is an abstract base class, so it needs
    to be subclassed to be useful.

    Subclasses must override primary_key_name, secondary_key_names and
    get_id_dictionary; the versions here only raise NotImplementedError.
    """

    def __init__(self):
        pass

    def get_builder(self):
        """Return a builder that turns each parsed record into id info.

        The builder is a FixDocumentBuilder given this indexer's
        get_id_dictionary method as its callback.
        """
        return FixDocumentBuilder(self.get_id_dictionary)

    def primary_key_name(self):
        """Return the name of the primary key (must be overridden)."""
        raise NotImplementedError("Please implement in derived classes")

    def secondary_key_names(self):
        """Return the list of secondary key names (must be overridden)."""
        raise NotImplementedError("Please implement in derived classes")

    def get_id_dictionary(self, seq_record):
        """Return a dictionary of id information for seq_record.

        Must be overridden. The concrete indexers in this module return a
        dictionary mapping each key name to a list of values.
        """
        raise NotImplementedError("Please implement in derived classes")
41
class SimpleIndexer(BaseSeqRecordIndexer):
    """Index a file based on .id and .name attributes of a SeqRecord.

    A simple-minded indexing scheme which should work for simple cases. The
    easiest way to use this is through the create_*db functions of this
    module.
    """

    def __init__(self):
        # The initializer needs a body to be valid Python; delegate to the
        # base class in the same explicit style FunctionIndexer uses.
        BaseSeqRecordIndexer.__init__(self)

    def primary_key_name(self):
        """Return "id", the name of the primary key."""
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases"."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the id information for seq_record.

        The result maps "id" and "name" to one-element lists holding
        seq_record.id and seq_record.name; "aliases" is always empty.
        """
        # XXX implement aliases once we have this attribute in SeqRecords
        id_info = {"id": [seq_record.id],
                   "name": [seq_record.name],
                   "aliases": []}
        return id_info
64
class FunctionIndexer(BaseSeqRecordIndexer):
    """Indexer to index based on values returned by a function.

    This class is passed a function which will return id, name and alias
    information from a SeqRecord object. It needs to return either one item,
    which is an id from the title, or three items which are (in order), the id,
    a list of names, and a list of aliases.

    This indexer allows indexing to be completely flexible based on passed
    functions.
    """

    def __init__(self, index_function):
        """Store index_function, the callable applied to each SeqRecord."""
        BaseSeqRecordIndexer.__init__(self)
        self.index_function = index_function

    def primary_key_name(self):
        """Return "id", the name of the primary key."""
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases"."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the id information produced by the index function.

        A bare (non-list, non-tuple) return value is treated as a single id.
        A one-item sequence gives an id with no names or aliases; a
        three-item sequence is unpacked as (id, names, aliases).

        Raises ValueError if the index function returns a list or tuple of
        any other length.
        """
        items = self.index_function(seq_record)
        # isinstance also accepts list/tuple subclasses (e.g. namedtuples),
        # which an exact type() comparison would wrap as a single id.
        if not isinstance(items, (list, tuple)):
            items = [items]

        if len(items) == 1:
            seq_id = items[0]
            name = []
            aliases = []
        elif len(items) == 3:
            seq_id, name, aliases = items
        else:
            # Wrap in a one-element tuple: formatting a bare tuple with a
            # single %s raises TypeError and hides the intended ValueError.
            raise ValueError("Unexpected items from index function: %s" %
                             (items,))

        return {"id": [seq_id],
                "name": name,
                "aliases": aliases}
103
class FixDocumentBuilder(BuildSeqRecord):
    """A SAX builder-style class to make a parsed SeqRecord available.

    This class does a lot of trickery to make things fit in the SAX
    framework and still have the flexibility to use a built SeqRecord
    object.

    You shouldn't really need to use this class unless you are doing
    something really fancy-pants; otherwise, just use the
    BaseSeqRecordIndexer interfaces.
    """

    def __init__(self, get_ids_callback):
        """Initialize with a callback function to get id info from a SeqRecord.

        get_ids_callback should be a callable function that will take a
        SeqRecord object and return a dictionary mapping id names to
        the valid ids for these names.
        """
        BuildSeqRecord.__init__(self)
        self._ids_callback = get_ids_callback

    def end_record(self, tag):
        """Override the builder function to muck with the document attribute.

        After the parent class has finished the record, self.document is
        replaced by the result of passing it to the ids callback.
        """
        # first build up the SeqRecord
        BuildSeqRecord.end_record(self, tag)
        # now convert the SeqRecord into the dictionary that the indexer needs
        self.document = self._ids_callback(self.document)

# --- convenience functions for indexing
# you should just use these unless you are doing something fancy
def create_berkeleydb(files, db_name, indexer = SimpleIndexer()):
    """Index the given files into a new Bio.Mindy BerkeleyDB database.

    files is an iterable of filenames to load, db_name is passed straight
    to BerkeleyDB.create, and indexer supplies the primary key name, the
    secondary key names and the builder used while loading.

    The default indexer is one SimpleIndexer instance created when this
    function is defined. SimpleIndexer as defined in this module stores
    nothing on the instance, so sharing it between calls is harmless.
    """
    from Bio.Mindy import BerkeleyDB

    unique_name = indexer.primary_key_name()
    alias_names = indexer.secondary_key_names()
    creator = BerkeleyDB.create(db_name, unique_name, alias_names)
    # Close the database even if loading a file fails, so a bad input file
    # does not leave the creator open.
    try:
        builder = indexer.get_builder()
        for filename in files:
            creator.load(filename, builder = builder, fileid_info = {})
    finally:
        creator.close()
144
def create_flatdb(files, db_name, indexer = SimpleIndexer()):
    """Index the given files into a new Bio.Mindy FlatDB database.

    files is an iterable of filenames to load, db_name is passed straight
    to FlatDB.create, and indexer supplies the primary key name, the
    secondary key names and the builder used while loading.

    The default indexer is one SimpleIndexer instance created when this
    function is defined. SimpleIndexer as defined in this module stores
    nothing on the instance, so sharing it between calls is harmless.
    """
    from Bio.Mindy import FlatDB

    unique_name = indexer.primary_key_name()
    alias_names = indexer.secondary_key_names()
    creator = FlatDB.create(db_name, unique_name, alias_names)
    # Close the database even if loading a file fails, so a bad input file
    # does not leave the creator open.
    # NOTE(review): confirm that FlatDB's close() is safe to call after a
    # partially completed load.
    try:
        builder = indexer.get_builder()
        for filename in files:
            creator.load(filename, builder = builder, fileid_info = {})
    finally:
        creator.close()
154