Package Bio :: Package Mindy :: Module FlatDB
[hide private]
[frames | no frames]

Source Code for Module Bio.Mindy.FlatDB

  1   
  2  import os, bisect 
  3  import BaseDB, Location 
  4  import Bio 
  5   
  6  _open = open 
  7  INDEX_TYPE = "flat/1" 
  8   
9 -def _parse_primary_table_entry(s):
10 name, filetag, startpos, length = s.rstrip().split("\t") 11 return name, filetag, long(startpos), long(length)
12
def _read_primary_table(filename):
    """Load a whole primary index file into a dictionary.

    The file starts with a 4-character record width, followed by records
    of exactly that many characters each.

    Returns a dict mapping each name to a (filetag, startpos, length)
    tuple.

    Raises AssertionError if the file ends in a partial record.
    """
    infile = _open(filename, "rb")
    # Release the handle even when a record turns out to be malformed.
    try:
        size = int(infile.read(4))
        table = {}
        while 1:
            s = infile.read(size)
            if not s:
                break
            assert len(s) == size, (repr(s), size)
            name, filetag, startpos, length = _parse_primary_table_entry(s)
            table[name] = filetag, startpos, length
    finally:
        infile.close()
    return table
25
def _write_primary_table(filename, primary_table):
    """Write the primary index as sorted, fixed-width records.

    primary_table maps each name to its location.  A location may be an
    already tab-joined string or a (filetag, startpos, length) tuple/list
    such as _read_primary_table returns; sequences are tab-joined here so
    that a table that was read back can be written again unchanged.

    The file starts with the record width as four digits; every record is
    then left-justified to that width.

    Raises AssertionError if the widest record exceeds 9999 characters,
    the most the four-digit header can describe.
    """
    records = []
    for k, v in sorted(primary_table.items()):
        if isinstance(v, (tuple, list)):
            # "%s" on a tuple would write its repr, not the field values.
            v = "\t".join([str(x) for x in v])
        records.append("%s\t%s" % (k, v))

    # Find the longest width (never below 1, so an empty table still gets
    # a usable record size).
    n = 1
    longest = ""
    for s in records:
        if len(s) > n:
            n = len(s)
            longest = s
    if n > 9999:
        raise AssertionError(
            "Primary index record too large for format spec! " +
            " %s bytes in %r" % (n, longest))

    outfile = _open(filename, "wb")
    try:
        outfile.write("%04d" % n)
        for s in records:
            outfile.write(s.ljust(n))
    finally:
        outfile.close()
45
46 -def _parse_secondary_table_entry(s):
47 return s.rstrip().split("\t")
48
def _read_secondary_table(filename):
    """Load a secondary index file into memory.

    The file starts with a 4-character record width followed by records
    of that width, each holding an alias and a name.

    Returns a dict mapping every alias to the list of names recorded for
    it, in file order.
    """
    handle = _open(filename, "rb")
    width = int(handle.read(4))
    aliases = {}
    record = handle.read(width)
    while record:
        assert len(record) == width, (repr(record), width)
        alias, name = _parse_secondary_table_entry(record)
        if alias in aliases:
            aliases[alias].append(name)
        else:
            aliases[alias] = [name]
        record = handle.read(width)
    handle.close()
    return aliases
62
def _write_secondary_table(filename, table):
    """Write a secondary index as sorted, fixed-width records.

    table maps each alias to a list of names; one "alias<TAB>name" record
    is written per pair.  The file starts with the record width as four
    digits and every record is left-justified to that width.

    Raises AssertionError if the widest record exceeds 9999 characters.
    """
    # Build every record once, in sorted order.
    records = []
    for alias, names in sorted(table.items()):
        for name in names:
            records.append("%s\t%s" % (alias, name))

    # The widest record fixes the record size (0 when there are none).
    width = 0
    for record in records:
        width = max(width, len(record))
    if width > 9999:
        raise AssertionError(
            "Secondary index record too large for format spec! " +
            " %s bytes in %r" % (width, records[-1]))

    outfile = _open(filename, "wb")
    outfile.write("%04d" % width)
    for record in records:
        outfile.write(record.ljust(width))
    outfile.close()
85
class BaseFlatDB(BaseDB.OpenDB):
    """Shared base of the flat-file databases in this module."""

    def __init__(self, dbname):
        """Open dbname as a "flat/1" index and note its primary key file.

        Delegates to BaseDB.OpenDB.__init__ and then stores the path of
        the primary key file in self.key_filename.
        """
        BaseDB.OpenDB.__init__(self, dbname, INDEX_TYPE)
        # NOTE(review): primary_namespace is presumably set by
        # OpenDB.__init__ -- confirm against BaseDB.
        key_basename = "key_%s.key" % self.primary_namespace
        self.key_filename = os.path.join(dbname, key_basename)
91
class PrimaryTable(BaseDB.DictLookup):
    """Lookup of primary identifiers backed by an in-memory table."""

    def __init__(self, db, namespace, table):
        """Remember the owning database, the namespace and its table."""
        self.table = table
        self.namespace = namespace
        self.db = db

    def __getitem__(self, name):
        """Return a one-element list holding the Location of name.

        Raises KeyError if name is not in the table.
        """
        fileid, startpos, length = self.table[name]
        file_entry = self.db.fileid_info[fileid][0]
        location = Location.Location(self.namespace, name, file_entry,
                                     startpos, length)
        return [location]

    def keys(self):
        """Return the identifiers stored in the table."""
        table = self.table
        return table.keys()
107
class SecondaryTable(BaseDB.DictLookup):
    """Lookup of secondary identifiers backed by an in-memory table."""

    def __init__(self, db, namespace, table):
        """Remember the owning database, the namespace and its table."""
        self.table = table
        self.namespace = namespace
        self.db = db

    def __getitem__(self, name):
        """Return one Location per primary entry recorded for name.

        Each entry listed under name is resolved through the database's
        primary table.  Raises KeyError if name is not in the table.
        """
        locations = []
        for primary_key in self.table[name]:
            fileid, startpos, length = self.db.primary_table[primary_key]
            file_entry = self.db.fileid_info[fileid][0]
            locations.append(Location.Location(self.namespace, name,
                                               file_entry, startpos, length))
        return locations

    def keys(self):
        """Return the identifiers stored in the table."""
        table = self.table
        return table.keys()
125
class MemoryFlatDB(BaseDB.WriteDB, BaseFlatDB):
    """Read-write flat database that keeps every index table in memory.

    The primary table maps each primary key to a (filetag, startpos,
    length) tuple, both for entries loaded from disk and for entries added
    through add_record, so lookups and flush treat them alike.  Changes
    are written back to disk by flush(), which close() and __del__ call.
    """

    def __init__(self, dbname):
        """Load the primary and all secondary tables of dbname."""
        # __del__ must not try to flush a half-constructed object.
        self.__in_constructor = 1
        self._need_flush = 0
        BaseFlatDB.__init__(self, dbname)

        primary_filename = os.path.join(
            self.dbname, "key_%s.key" % (self.primary_namespace,))
        self.primary_table = _read_primary_table(primary_filename)

        self.secondary_tables = {}
        for namespace in self.secondary_namespaces:
            filename = os.path.join(self.dbname, "id_%s.index" % namespace)
            self.secondary_tables[namespace] = _read_secondary_table(filename)

        self.__in_constructor = 0

    def add_record(self, filetag, startpos, length, table):
        """Register one record under its primary and secondary identifiers.

        table maps namespace names to lists of identifiers.  It must hold
        exactly one identifier for the primary namespace, and that
        identifier must not be in the database yet.

        Raises TypeError if the primary identifier is missing its
        uniqueness (zero or several values, or already present).
        """
        key_list = table[self.primary_namespace]
        if len(key_list) != 1:
            raise TypeError(
                "Field %s has %d entries but must have only one "
                "(must be unique)" % (repr(self.primary_namespace),
                                      len(key_list)))
        key = key_list[0]
        if key in self.primary_table:
            raise TypeError("Field %r = %r already exists; must be unique" %
                            (self.primary_namespace, key))
        # Same shape as the entries loaded by _read_primary_table, so the
        # lookup tables can unpack new records as well.
        self.primary_table[key] = (filetag, startpos, length)

        for namespace in self.secondary_namespaces:
            lookup = self.secondary_tables[namespace]
            # Get the list of secondary identifiers for this identifier
            for val in table.get(namespace, ()):
                # Go from secondary identifier to list of primary identifiers
                lookup.setdefault(val, []).append(key)
        self._need_flush = 1

    def flush(self):
        """Write the configuration and all tables to disk if anything changed."""
        if not self._need_flush:
            return

        config_filename = os.path.join(self.dbname, "config.dat")
        BaseDB.write_config(config_filename = config_filename,
                            index_type = INDEX_TYPE,
                            primary_namespace = self.primary_namespace,
                            secondary_namespaces =
                                  self.secondary_tables.keys(),
                            fileid_info = self.fileid_info,
                            formatname = self.formatname,
                            )

        # Serialize every location to the tab-joined text form before
        # handing the table to the writer.
        serialized = {}
        for key, entry in self.primary_table.items():
            if isinstance(entry, (tuple, list)):
                filetag, startpos, length = entry
                entry = "%s\t%s\t%s" % (filetag,
                                        BaseDB._int_str(startpos),
                                        BaseDB._int_str(length))
            serialized[key] = entry

        primary_filename = os.path.join(
            self.dbname, "key_%s.key" % (self.primary_namespace,))
        _write_primary_table(filename = primary_filename,
                             primary_table = serialized)

        # Write the secondary identifier information
        for namespace, table in self.secondary_tables.items():
            filename = os.path.join(self.dbname, "id_%s.index" % namespace)
            _write_secondary_table(filename = filename,
                                   table = table)

        self._need_flush = 0

    def close(self):
        """Flush pending changes and drop the in-memory tables."""
        self.flush()
        self.primary_table = self.fileid_info = self.filename_map = \
                             self.secondary_tables = None

    def __del__(self):
        """Close the database unless construction never completed."""
        if not self.__in_constructor:
            self.close()

    def __getitem__(self, namespace):
        """Return the database table lookup for the given namespace.

        Raises KeyError for a namespace that is neither the primary one
        nor a loaded secondary one.
        """
        if namespace == self.primary_namespace:
            return PrimaryTable(self, namespace, self.primary_table)
        return SecondaryTable(self, namespace, self.secondary_tables[namespace])
208 209
class BisectFile:
    """Sequence-like view of the keys in a fixed-width index file.

    The file starts with a 4-character record width followed by records of
    that width.  Indexing returns the first tab-separated field of a
    record, which lets the bisect module search the file directly.
    """

    def __init__(self, infile, size):
        """Wrap the open file infile whose total size is size.

        Raises AssertionError if size does not match a whole number of
        records.
        """
        self.infile = infile
        self.size = size
        infile.seek(0)
        self.record_size = int(infile.read(4))
        if self.record_size == 0:
            # An index written without any record has a zero width; the
            # modulus below would divide by zero, so check it separately.
            assert size == 4, "record size is wrong"
        else:
            assert (size - 4) % self.record_size == 0, "record size is wrong"

    def __len__(self):
        """Return the number of records in the file."""
        if self.record_size == 0:
            return 0
        return (self.size - 4) // self.record_size

    def __getitem__(self, i):
        """Return the key (first tab-separated field) of record i."""
        self.infile.seek(i * self.record_size + 4)
        return self.infile.read(self.record_size).split("\t")[0]

    def get_entry(self, i):
        """Return the complete, still padded, record i."""
        self.infile.seek(i * self.record_size + 4)
        return self.infile.read(self.record_size)
227
def _find_entry(filename, wantword):
    """Binary-search a fixed-width index file for the record keyed wantword.

    Returns the raw, still padded, record, or None when no record has
    exactly that key.
    """
    size = os.path.getsize(filename)
    infile = _open(filename, "rb")
    try:
        bf = BisectFile(infile, size)
        left = bisect.bisect_left(bf, wantword)
        if left >= len(bf):
            # The insertion point is past the last record: nothing to read.
            return None
        line = bf.get_entry(left)
    finally:
        infile.close()
    # Compare the whole key.  A prefix test would also accept the record
    # for "AB12" when looking up the absent key "AB1".
    if line.split("\t")[0] != wantword:
        return None
    return line
238
def _find_range(filename, wantword):
    """Return every record of a fixed-width index file keyed wantword.

    The records come back raw (still padded), in file order.  Returns None
    when no record has exactly that key.
    """
    size = os.path.getsize(filename)
    infile = _open(filename, "rb")
    try:
        bf = BisectFile(infile, size)
        left = bisect.bisect_left(bf, wantword)
        # Require an exact key match; a prefix test would wrongly accept
        # the records of a longer key that starts with wantword.
        if left >= len(bf) or bf[left] != wantword:
            return None

        # Matching records cannot start before left, so search from there.
        right = bisect.bisect_right(bf, wantword, left)
        return [bf.get_entry(i) for i in range(left, right)]
    finally:
        infile.close()
255 256
def _lookup_location(key_filename, word):
    """Look up word in the primary key file.

    Returns the parsed record without its leading name field, i.e.
    (filetag, startpos, length), or None when the key file has no entry
    for word.
    """
    record = _find_entry(key_filename, word)
    if record is not None:
        fields = _parse_primary_table_entry(record)
        return fields[1:]
    return None
262
def _lookup_alias(id_filename, word):
    """Return the primary keys recorded for the alias word.

    Returns None when the secondary index file has no record for word.
    Raises AssertionError if a returned record is keyed by another alias.
    """
    records = _find_range(id_filename, word)
    if records:
        found = []
        for record in records:
            fields = _parse_secondary_table_entry(record)
            alias, primary_key = fields
            assert alias == word, (alias, word)
            found += [primary_key]
        return found
    return None
273
def create(dbname, primary_namespace, secondary_namespaces,
           formatname = "unknown"):
    """Create a new, empty flat database and return it opened read-write.

    Makes the directory dbname, writes its config.dat, an empty primary
    key file and one empty index file per secondary namespace, then opens
    the result in "rw" mode.
    """
    os.mkdir(dbname)

    BaseDB.write_config(config_filename = os.path.join(dbname, "config.dat"),
                        index_type = INDEX_TYPE,
                        primary_namespace = primary_namespace,
                        secondary_namespaces = secondary_namespaces,
                        fileid_info = {},
                        formatname = formatname,
                        )

    key_path = os.path.join(dbname, "key_%s.key" % (primary_namespace,))
    _write_primary_table(key_path, {})

    # One empty index file per secondary namespace.
    for namespace in secondary_namespaces:
        index_path = os.path.join(dbname, "id_%s.index" % namespace)
        _write_secondary_table(index_path, {})

    return open(dbname, "rw")
298 299
def open(dbname, mode = "r"):
    """Open an existing flat database.

    mode "r" returns a DiskFlatDB and mode "rw" a MemoryFlatDB.

    Raises TypeError for mode "a" (databases are made with create) and
    for any other mode.
    """
    if mode == "r":
        return DiskFlatDB(dbname)
    if mode == "rw":
        return MemoryFlatDB(dbname)
    if mode == "a":
        raise TypeError("Must call FlatDB.create to create the database")
    raise TypeError("Unknown mode: %r" % (mode,))
309
def _get_first_words(filename):
    """Return the distinct keys of a fixed-width index file, in file order.

    The key of a record is its first tab-separated field.  Consecutive
    records sharing a key contribute that key once.

    Raises AssertionError if the file ends in a partial record.
    """
    infile = _open(filename, "rb")
    # Release the handle even when a record turns out to be malformed.
    try:
        size = int(infile.read(4))
        data = []
        while 1:
            s = infile.read(size)
            if not s:
                break
            assert len(s) == size, (repr(s), size)
            s = s.split("\t")[0]
            # Only adjacent duplicates are collapsed.
            if not data or data[-1] != s:
                data.append(s)
    finally:
        infile.close()
    return data
323
class PrimaryNamespace(BaseDB.DictLookup):
    """Lookup of primary identifiers that searches the key file on disk."""

    def __init__(self, db, namespace):
        """Remember the owning database and the namespace name."""
        self.namespace = namespace
        self.db = db

    def __getitem__(self, name):
        """Return a one-element list holding the Location of name.

        Raises KeyError if the key file has no entry for name.
        """
        found = _lookup_location(self.db.key_filename, name)
        if found is None:
            raise KeyError("Cannot find primary key %r" % (name,))
        file_entry = self.db.fileid_info[found[0]][0]
        location = Location.Location(self.namespace, name, file_entry,
                                     found[1], found[2])
        return [location]

    def keys(self):
        """Return the keys listed in the database's key file."""
        key_filename = self.db.key_filename
        return _get_first_words(key_filename)
342 343
class SecondaryNamespace(BaseDB.DictLookup):
    """Lookup of secondary identifiers that searches index files on disk."""

    def __init__(self, db, namespace):
        """Remember the owning database and the namespace name."""
        self.namespace = namespace
        self.db = db

    def __getitem__(self, name):
        """Return one Location per primary key recorded for name.

        Raises KeyError if the namespace's index file has no record for
        name, and AssertionError if a listed primary key is missing from
        the key file.
        """
        index_basename = "id_%s.index" % self.namespace
        id_filename = os.path.join(self.db.dbname, index_basename)
        primary_keys = _lookup_alias(id_filename, name)
        if primary_keys is None:
            raise KeyError("Cannot find %r key %r" % (self.namespace, name))

        locations = []
        for key in primary_keys:
            found = _lookup_location(self.db.key_filename, key)
            if found is None:
                raise AssertionError("Cannot find primary key %r -- "
                                     "lost database integrety" % (key,))
            file_entry = self.db.fileid_info[found[0]][0]
            locations.append(Location.Location(self.namespace, name,
                                               file_entry,
                                               found[1],
                                               found[2]))
        return locations

    def keys(self):
        """Return the identifiers listed in the namespace's index file."""
        index_basename = "id_%s.index" % self.namespace
        return _get_first_words(os.path.join(self.db.dbname, index_basename))
371 372
class DiskFlatDB(BaseFlatDB):
    """Read-only flat database that answers lookups from the files on disk."""

    def __init__(self, dbname):
        """Open the database stored in the directory dbname."""
        BaseFlatDB.__init__(self, dbname)

    def __getitem__(self, namespace):
        """Return the lookup object for the given namespace.

        Raises KeyError if namespace is neither the primary namespace nor
        one of the secondary namespaces.
        """
        if namespace == self.primary_namespace:
            return PrimaryNamespace(self, namespace)
        if namespace not in self.secondary_namespaces:
            raise KeyError(namespace)
        return SecondaryNamespace(self, namespace)
385