Package Bio :: Package config :: Module DBRegistry
[hide private]
[frames | no frames]

Source Code for Module Bio.config.DBRegistry

  1  # Copyright 2002 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # The SQL and Corba code was modified from an original implementation by 
  7  # Brad Chapman. 
  8   
  9  """Implements Registry to access databases.  These objects access 
 10  databases using a dictionary-like interface, where the key is the ID 
 11  of the thing to look up, and the value returned is the data associated 
 12  with the key. 
 13   
 14  Classes: 
 15  DBRegistry     Accesses databases with a dictionary-like interface. 
 16  DBObject       Base class for Registry objects for databases. 
 17  DBGroup        Groups DBObjects. 
 18   
 19  CGIDB          Accesses CGI databases. 
 20  EUtilsDB       Accesses NCBI using EUtils. 
 21  BioSQLDB       Accesses a BioSQL database. 
 22  BioCorbaDB     Accesses a BioCorba database. 
 23  IndexedFileDB  Accesses a Mindy Indexed file. 
 24  """ 
 25  from Bio.config.Registry import * 
 26   
class DBRegistry(Registry):
    """Registry flavour used for database objects.

    The dictionary-like interface to databases comes from Registry; this
    subclass adds no behaviour of its own and only forwards construction.

    """

    def __init__(self, name, load_path=None):
        """DBRegistry(name[, load_path])

        Both arguments are handed unchanged to Registry.__init__.
        """
        Registry.__init__(self, name, load_path=load_path)
# Create a registry for access to databases.
# This is the module-level DBRegistry instance; "db" is passed as its name
# and "Bio.dbdefs" as its load_path.
db = DBRegistry("db", "Bio.dbdefs")
37 -def _clean_abbrev(abbrev):
38 return abbrev.replace("-", "_")
39
class DBObject(RegisterableObject):
    """Base class for dictionary-like interfaces to databases.

    Methods:
    set          Store data under a key (delegates to _set).
    get          Lookup a key in a database, with a default value.
    get_as       Lookup a key and convert to an object.
    __getitem__  Lookup a key in a database.

    THE FOLLOWING SHOULD BE IMPLEMENTED IN A DERIVED CLASS.
    _get         Return the data indicated by key.
    _convert_to  Convert the data to another object.
    IMPLEMENT THESE ONLY IF TIMEOUT OR CONCURRENT ACCESS IS NEEDED.
    _make_pickleable    Make the object returned by _get to a pickleable.
    _unmake_pickleable  Turn the pickleable object back into the original

    """

    def __init__(self, name, abbrev=None, doc=None, delay=None, timeout=None):
        """DBObject(name[, abbrev][, doc][, delay][, timeout])

        abbrev falls back to name and has "-" replaced by "_" before being
        registered.  When delay is given, _get is replaced on this instance
        by a rate-limited wrapper; when timeout is given, _get (possibly
        already rate limited) is additionally replaced by a timed wrapper
        and a DeprecationWarning is issued.  Both wrappers come from the
        _support module.
        """
        import _support

        RegisterableObject.__init__(
            self, name, _clean_abbrev(abbrev or name), doc)

        if delay is not None:
            self._get = _support.make_rate_limited_function(self._get, delay)

        if timeout is not None:
            import warnings
            warnings.warn(
                "Using timeouts has been deprecated, as this code relies on "
                "Bio.MultiProc, which itself has been deprecated. If you "
                "need this functionality, please let the Biopython "
                "developers know by sending an email to "
                "biopython-dev@biopython.org.",
                DeprecationWarning)
            # The timed wrapper is given the pickling hooks as well; they
            # raise NotImplementedError unless a subclass overrides them.
            self._get = _support.make_timed_function(
                self._get, timeout,
                self._make_pickleable, self._unmake_pickleable)

    def set(self, key, data):
        """S.set(key, data) -- hand the pair to _set."""
        self._set(key, data)

    def get(self, key, default=None):
        """S.get(key[, default]) -> data"""
        try:
            return self[key]
        except KeyError:
            return default

    def get_as(self, key, to_io=None, default=None):
        """S.get_as(key[, to_io][, default]) -> object"""
        # Note that the default is converted too when the lookup fails.
        return self._convert_to(self.get(key, default=default), to_io)

    def __getitem__(self, key):
        """S[key] -> data, with an IOError "timed out" reported as KeyError."""
        try:
            return self._get(key)
        except IOError as err:
            # Only a time-out is translated; any other IOError propagates.
            if str(err) != "timed out":
                raise
            raise KeyError(err)

    # THESE FUNCTIONS CAN BE OVERLOADED IN A DERIVED CLASS.

    def _get(self, key):
        """S._get(key) -> data"""
        # Look up a key in the DB and return the data.
        raise NotImplementedError("Please implement in a derived class.")

    def _convert_to(self, data, to_io):
        """S._convert_to(data, to_io) -> another data type"""
        # Convert the data returned by _get to the type specified by
        # to_io, which is a FormatIO object.
        # This base implementation performs no conversion and returns None.
        return None

    def _set(self, key, data):
        """S._set(key, data)"""
        # Not used.  May be used in the future to support caching.
        raise NotImplementedError("Caching not supported here.")

    def _make_pickleable(self, data):
        """S._make_pickleable(data) -> pickleable_obj"""
        # Make the handle a pickle-able python object.
        # Only need to implement if supporting timeout or concurrent
        # access.
        raise NotImplementedError("pickling not supported.")

    def _unmake_pickleable(self, pickleable_obj):
        """S._unmake_pickleable(pickleable_obj) -> data"""
        # Turn the pickle-able python object back into a handle.
        # Only need to implement if supporting timeout or concurrent
        # access.
        raise NotImplementedError("pickling not supported.")
124
class DBGroup(RegisterableGroup):
    """Groups DBObjects that return the same kind of data.

    Lookups are forwarded to the member objects (self.objs, which is not
    set in this class -- presumably maintained by RegisterableGroup) either
    one after another or all at once, depending on the behavior setting.

    """

    def __init__(self, name, abbrev=None, doc=None,
                 behavior="serial", cache=None):
        """DBGroup(name[, abbrev][, doc][, behavior][, cache])

        name is the name of the object, and abbrev is an abbreviation
        for the name.

        behavior is either "serial" or "concurrent".  "serial" means
        that I'll run each object until I get one that finishes
        successfully.  "concurrent" means that I'll run each object at
        the same time and return the one that finishes.

        cache is accepted but not used by this class.

        Raises ValueError for any other behavior.  Choosing "concurrent"
        issues a DeprecationWarning.

        """
        abbrev = _clean_abbrev(abbrev or name)
        RegisterableGroup.__init__(self, name, abbrev, doc)
        if behavior not in ['concurrent', 'serial']:
            raise ValueError("behavior must be 'concurrent' or 'serial'")
        if behavior == 'concurrent':
            import warnings
            warnings.warn(
                "Concurrent behavior has been deprecated, as this "
                "functionality needs Bio.MultiProc, which itself has been "
                "deprecated. If you need the concurrent behavior, please "
                "let the Biopython developers know by sending an email to "
                "biopython-dev@biopython.org to avoid permanent removal of "
                "this feature.",
                DeprecationWarning)
        self.behavior = behavior
        # Member object that produced the most recent successful lookup;
        # get_as uses it to convert the data.
        self._last_object_used = None

    def __getitem__(self, key):
        """S[key] -> data from the first member that answers.

        Raises KeyError when no member produces a result.
        """
        if self.behavior == "concurrent":
            data = self._run_concurrent(key)
        else:
            data = self._run_serial(key)
        return data

    def get(self, key, default=None):
        """S.get(key[, default]) -> data"""
        try:
            data = self[key]
        except KeyError:
            data = default
        return data

    def get_as(self, key, to_io=None, default=None):
        """S.get_as(key[, to_io][, default]) -> object"""
        data = self.get(key, default=default)
        # NOTE(review): if no lookup on this group has ever succeeded,
        # _last_object_used is still None and this raises AttributeError;
        # after a failed lookup it refers to the object from an earlier
        # successful one.
        return self._last_object_used._convert_to(data, to_io)

    def _run_concurrent(self, key):
        """Start the lookup on every member and return the first result.

        Raises KeyError when every member fails.
        """
        import time
        from Bio.MultiProc.copen import copen_fn

        def get_pickleable(obj, key):
            return obj._make_pickleable(obj[key])

        def unpickleable(obj, data):
            return obj._unmake_pickleable(data)

        fnhandles = []   # list of (obj, running function)
        for obj in self.objs:
            fnhandles.append((obj, copen_fn(get_pickleable, obj, key)))
        # Check each of the function handles until one of them
        # finishes or they all fail.
        i = 0
        while fnhandles:
            if i >= len(fnhandles):
                # Completed a full pass without a result; pause, restart.
                i = 0
                time.sleep(0.1)
            try:
                ready = fnhandles[i][1].poll()
            except (SystemError, KeyboardInterrupt):
                # The tuple is required: "except SystemError,
                # KeyboardInterrupt" would catch only SystemError and bind
                # it to the name KeyboardInterrupt.
                raise
            except Exception:
                # This handle failed, so get rid of it.
                del fnhandles[i]
                continue
            if ready:
                obj, fnhandle = fnhandles.pop(i)
                retval = unpickleable(obj, fnhandle.read())
                self._last_object_used = obj
                break
            else:
                i += 1
        else:
            # The loop ended because every handle was dropped.
            raise KeyError("I could not get any results.")
        # Shut down all the other requests that didn't finish.
        for x, h in fnhandles:
            h.close()
        return retval

    def _run_serial(self, key):
        """Try each member in turn and return the first successful result.

        Raises KeyError when every member fails.
        """
        for obj in self.objs:
            try:
                handle = obj[key]
            except (SystemError, KeyboardInterrupt):
                # Never swallow these; see the note in _run_concurrent on
                # why the tuple form is required.
                raise
            except Exception:
                # Any other failure just means: try the next object.
                continue
            else:
                self._last_object_used = obj
                return handle
        raise KeyError("I could not get any results.")
225
class TextLikeMixin:
    """Mixin class with useful functionality for retrieval of text files.

    This implements some useful helper functions and overrides of DBObject
    for those implementations which need to retrieve text, check for errors in
    the retrieved text, and then convert that text to other formats.
    """

    def _check_for_errors(self, handle, failure_cases):
        """Scan the text in handle for known error patterns.

        failure_cases is a sequence of (expression, error message) pairs;
        each expression must provide make_parser().  If the handler
        reports that an expression was recognized, KeyError is raised with
        the corresponding message.  Otherwise a handle positioned back at
        the start of the checked text is returned.

        When failure_cases is empty or None the handle is returned
        untouched and none of the parsing modules are imported.
        """
        if not failure_cases:
            return handle

        # Imported only when there is something to check, so that objects
        # without failure cases do not need Martel at all.
        from Martel import Parser
        from Bio import StdHandler
        from Bio.EUtils.ReseekFile import ReseekFile

        # Wrapped so that the same text can be re-read for each pattern.
        handle = ReseekFile(handle)
        pos = handle.tell()
        for expression, errormsg in failure_cases:
            handle.seek(pos)
            parser = expression.make_parser()
            handler = StdHandler.RecognizeHandler()
            parser.setContentHandler(handler)
            parser.setErrorHandler(handler)
            try:
                parser.parseFile(handle)
            except Parser.ParserException:
                # A parse failure is deliberately ignored here; only
                # handler.recognized decides whether this is an error.
                pass
            if handler.recognized:
                raise KeyError(errormsg)
        handle.seek(pos)
        return handle

    def _convert_to(self, handle, to_io):
        """Read handle with to_io and return exactly one record.

        If to_io.read() gives a FormatIOIterator it must yield exactly one
        record; AssertionError is raised for zero or for several records.
        Any other return value of to_io.read() is passed back as is.
        """
        from Bio import FormatIO
        x = to_io.read(handle)
        if not isinstance(x, FormatIO.FormatIOIterator):
            return x
        found = 0
        rec = None
        for rec in x:
            if found > 0:
                raise AssertionError("Multiple records returned")
            found += 1
        if found == 0:
            # Previously this fell through to an unbound local variable.
            raise AssertionError("No records returned")
        return rec
269
class CGIDB(DBObject, TextLikeMixin):
    """This class implements DBObject for accessing CGI databases.

    """

    def __init__(self, name, cgi, url=None, key=None, params=None,
                 abbrev=None, doc=None, delay=None, timeout=None,
                 getmethod=1, failure_cases=None):
        """CGIDB(name, cgi[, url][, key][, params][, abbrev][, doc]
        [, delay][, timeout][, getmethod][, failure_cases])

        name is the name of the object, abbrev is an abbreviation for
        the name, and doc is some documentation describing the object.

        cgi is the URL for the cgi script.  url points to the
        human-readable URL of the form.

        params is a list of (key, value) tuples indicating the
        parameters that should be passed to the CGI script.  key is
        the name of the parameter for the CGI script whose value is
        the ID of the object to retrieve.

        getmethod is a boolean describing whether a GET or POST should
        be used.  By default, GET is used.

        failure_cases is a list of (Martel Expression, error message)
        describing patterns of errors in the text returned by the
        script.

        """
        import _support
        DBObject.__init__(self, name=name, abbrev=abbrev,
                          doc=doc, delay=delay, timeout=timeout)
        self.cgi = cgi
        self.key = key or ''
        self.params = params or []
        self.url = url
        self.getmethod = getmethod
        # Each expression is passed through _support.make_cached_expression
        # before being stored alongside its message.
        self.failure_cases = []
        for exp, message in failure_cases or []:
            exp = _support.make_cached_expression(exp)
            self.failure_cases.append((exp, message))

    def _normalize_params(self, key):
        """Return the fixed params plus the (id parameter name, key) pair."""
        return self.params + [(self.key, key)]

    def _get(self, key):
        """Fetch key from the CGI script and return a checked handle.

        Raises KeyError when the returned text matches a failure case.
        """
        handle = self._cgiopen(key)
        handle = self._check_for_errors(handle, self.failure_cases)
        return handle

    def _convert_to(self, handle, to_io):
        """Convert the text in handle to a record using to_io."""
        # DBObject is listed before TextLikeMixin in the bases, so without
        # this override attribute lookup finds DBObject._convert_to (which
        # does no conversion) and the mixin's implementation is never used.
        return TextLikeMixin._convert_to(self, handle, to_io)

    def _cgiopen(self, key):
        """Open the CGI script for key, by GET or POST, and return a handle."""
        import urllib
        params = self._normalize_params(key)
        options = _my_urlencode(params)
        if self.getmethod:
            fullcgi = self.cgi
            if options:
                fullcgi = "%s?%s" % (self.cgi, options)
            handle = urllib.urlopen(fullcgi)
        else:    # do a POST
            handle = urllib.urlopen(self.cgi, options)
        return handle

    def _make_pickleable(self, handle):
        """Return the whole content of handle as a string."""
        return handle.read()

    def _unmake_pickleable(self, obj):
        """Wrap the string obj in a file-like StringIO handle."""
        import StringIO
        return StringIO.StringIO(obj)
339
class EUtilsDB(DBObject, TextLikeMixin):
    """Implement DBObject for accessing EUtils databases at NCBI.
    """

    def __init__(self, name, db, rettype, abbrev = None, doc = None,
                 failure_cases = None, delay = None, timeout = None):
        """Initialize an EUtilsDB connection for retrieval.

        name is the name of the object, abbrev is an abbreviation for
        the name, and doc is some documentation describing the object.

        db is the name of the database at NCBI you want to retrieve from
        (ie. protein, nucleotide, pubmed)

        rettype is the type of information to return
        (ie. gp, gb, fasta, medline)

        failure_cases is a list of (Martel Expression, error message)
        describing patterns of errors in the text returned by the
        script.
        """
        import _support
        DBObject.__init__(self, name=name, abbrev=abbrev,
                          doc=doc, delay=delay, timeout=timeout)
        self.db = db
        self.rettype = rettype
        # Each expression is passed through _support.make_cached_expression
        # before being stored alongside its message.
        self.failure_cases = []
        for exp, message in failure_cases or []:
            exp = _support.make_cached_expression(exp)
            self.failure_cases.append((exp, message))

    def _get(self, key):
        """Implementation of retrieval -- uses the DBIds client from EUtils.

        Returns a handle to the fetched text; raises KeyError when the
        text matches one of the failure cases.
        """
        from Bio.EUtils import DBIds
        from Bio.EUtils import DBIdsClient
        db_id = DBIds(self.db, [key])
        eutils_client = DBIdsClient.from_dbids(db_id)
        handle = eutils_client.efetch(retmode = "text",
                                      rettype = self.rettype)
        handle = self._check_for_errors(handle, self.failure_cases)
        return handle

    def _convert_to(self, handle, to_io):
        """Convert the text in handle to a record using to_io."""
        # DBObject is listed before TextLikeMixin in the bases, so without
        # this override attribute lookup finds DBObject._convert_to (which
        # does no conversion) and the mixin's implementation is never used.
        return TextLikeMixin._convert_to(self, handle, to_io)
381
class BioSQLDB(DBObject):
    """Represent a BioSQL-style database to retrieve SeqRecord objects.

    This returns a SeqRecord-like object from _get() instead of a
    handle (since BioSQL is not going to give you a handle).

    """

    def __init__(self, name, doc = "", db_host = 'localhost', db_port = '',
                 db_user = 'root', db_passwd = '', sql_db = '',
                 namespace_db = '', db_type = 'mysql'):
        """Initialize with information for connecting to the BioSQL db.

        The connection settings are only stored here; the database is
        opened when _get is called.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.db_type = db_type
        self.db_host = db_host
        self.db_port = db_port
        self.db_user = db_user
        self.db_passwd = db_passwd
        self.sql_db = sql_db
        self.namespace_db = namespace_db

    def _get_db_module(self, db_type):
        """Return the name of the driver module for a database type.

        Only 'mysql' is supported (giving 'MySQLdb'); the Postgres
        spellings and anything unknown raise ValueError.
        """
        if db_type in ('pg', 'postgres', 'postgresql'):
            raise ValueError("Postgres not supported yet. Sorry.")
        if db_type not in ('mysql',):
            raise ValueError("Unknown database type: %s" % db_type)
        return 'MySQLdb'

    def _get(self, key):
        """Open the database and look key up, returning the item found.

        Raises KeyError when no id type yields an item.
        """
        # do the import here to prevent circular import problems
        from BioSQL import BioSeqDatabase

        driver = self._get_db_module(self.db_type)
        connect_kwargs = dict(user=self.db_user,
                              passwd=self.db_passwd,
                              host=self.db_host,
                              db=self.sql_db,
                              driver=driver)
        # The port is passed on only when one was configured.
        if self.db_port:
            connect_kwargs["port"] = self.db_port
        server = BioSeqDatabase.open_database(**connect_kwargs)
        namespace = server[self.namespace_db]

        # We don't worry about what the id is called, and just try to find
        # it in the database under each id type.  Every id type is tried,
        # so a hit under a later type replaces an earlier one.
        found = None
        for id_type in ("accession", "display_id"):
            try:
                found = namespace.lookup(**{id_type: key})
            except IndexError:
                continue
        if found is None:
            raise KeyError("Could not get item with id: %s" % key)
        return found

    def _convert_to(self, data, to_io):
        """Return data unchanged; only SeqRecord.io is accepted as to_io."""
        from Bio import SeqRecord
        if to_io != SeqRecord.io:
            raise ValueError("format %s not supported" % to_io.name)
        return data

    def _make_pickleable(self, item):
        """Return item unchanged."""
        return item

    def _unmake_pickleable(self, item):
        """Return item unchanged."""
        return item
457
class BioCorbaDB(DBObject):
    """Represent a BioCorba BioSequenceCollection for SeqRecord objects.

    Returns SeqRecord-like objects.

    """

    def __init__(self, name, ior_ref, server_type=None, doc=""):
        """Initialize with IOR reference for a BioCorba Collection.

        ior_ref is a URL or file reference to an IOR string. The IOR
        should reference a BioSequenceCollection. This is the top level
        BioCorba object we should use for making objects available.

        server_type is a hack parameter which might be necessary if there
        are server/client issues (ie. as with Perl ORBit) that we need
        to muck around with. If not set, we just use a standard retriever.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.retriever = self._get_retriever(server_type)
        self.ior_ref = ior_ref
        # Created on the first _get call.
        self.corba_dict = None

    def _get_retriever(self, server_type):
        """Return a BioCorba retriever object based on the specified server.

        server_type is matched case-insensitively against "python", "java"
        and "perl" (in that order, as substrings); None selects the generic
        client.  Anything else raises ValueError.
        """
        # do the BioCorba imports here, so we don't have to have it
        # installed to use this module
        from BioCorba.Client.BiocorbaConnect import PerlCorbaClient, \
             PythonCorbaClient, JavaCorbaClient, GenericCorbaClient
        from BioCorba.Client.Seqcore.CorbaCollection import \
             BioSequenceCollection

        if server_type is None:
            client_type = GenericCorbaClient
        else:
            server_type = server_type.lower()
            known_clients = (("python", PythonCorbaClient),
                             ("java", JavaCorbaClient),
                             ("perl", PerlCorbaClient))
            for keyword, candidate in known_clients:
                if keyword in server_type:
                    client_type = candidate
                    break
            else:
                raise ValueError("Unexpected server type specified: %s" %
                                 server_type)

        return client_type(BioSequenceCollection)

    def _get_corba_client(self, ior_ref, retriever):
        """Get a connection to the CORBA server based on the ior_ref
        """
        # do the imports here so we don't need BioCorba for whole module
        from BioCorba.Bio import GenBank

        if "http" in ior_ref:
            # assume it is a url
            client = retriever.from_url_ior(ior_ref)
        else:
            # assume it is a file
            client = retriever.from_file_ior(ior_ref)

        return GenBank.Dictionary(client, GenBank.FeatureParser())

    def _get(self, key):
        """Return the entry for key from the CORBA dictionary."""
        # get the corba dictionary only once when fetched
        if self.corba_dict is None:
            self.corba_dict = self._get_corba_client(self.ior_ref,
                                                     self.retriever)
        return self.corba_dict[key]

    def _convert_to(self, data, to_io):
        """Return data unchanged; only SeqRecord.io is accepted as to_io."""
        from Bio import SeqRecord
        if to_io != SeqRecord.io:
            raise ValueError("format %s not supported" % to_io.name)
        return data
535
class IndexedFileDB(DBObject):
    """Return SeqRecord objects from an indexed file.

    This module deals with both flat file and BerkeleyDB indexes.
    These indexed files can be created by any of the compliant indexing
    implementations from Biopython, BioPerl, BioJava, etc...

    """

    def __init__(self, name, dbname, doc = ""):
        """Initialize with information about loading the database.

        dbname is the name of the database to open. This will likely
        be a filesystem path to a database directory.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.db = self._load_database(dbname)

    def _load_database(self, name):
        """Get a connection with the given database.
        """
        from Bio import Mindy
        db = Mindy.open(dbname = name)
        return db

    def _get_check_names(self, given_name, db):
        """Get a list of all namespaces to search for the file under.

        If given_name is a valid key, then it is returned as the only
        thing to check. Otherwise, we go forward and check all possible
        namespaces.
        """
        if given_name is not None and given_name in db.keys():
            return [given_name]
        else:
            return db.keys()

    def _get(self, key):
        """Do the database retrieval of the sequence, returning a handle.

        key must be a (namespace, key) pair, otherwise ValueError is
        raised.  Raises KeyError when no namespace yields a hit and
        AssertionError when the first namespace that matches yields more
        than one.
        """
        # XXX jchang: how does this namespace/key stuff work?  can we
        # get rid of namespace?
        import operator
        import StringIO
        if not operator.isSequenceType(key) or len(key) != 2:
            raise ValueError("Key should be tuple of (namespace, key)")
        namespace, key = key
        names_to_check = self._get_check_names(namespace, self.db)
        # Start with "no hits" so that an empty list of namespaces is
        # reported as a missing key rather than as an unbound variable.
        location = []
        for check_name in names_to_check:
            location = self.db.lookup(**{check_name: key})
            if len(location) >= 1:
                break
        if len(location) == 0:
            # Not finding the key is a lookup failure, which get() turns
            # into the default value.
            raise KeyError("Could not find key: %s" % (key,))
        if len(location) != 1:
            raise AssertionError("Got multiple hits: %s" % (location,))
        # StringIO is the module here, so the class must be named
        # explicitly; calling the module itself raises TypeError.
        return StringIO.StringIO(location[0].text)

    def _convert_to(self, handle, to_io):
        """Read handle with to_io and return exactly one record.

        If to_io.read() gives a FormatIOIterator it must yield exactly one
        record; AssertionError is raised for zero or for several records.
        Any other return value of to_io.read() is passed back as is.
        """
        from Bio import FormatIO
        x = to_io.read(handle)
        if not isinstance(x, FormatIO.FormatIOIterator):
            return x
        found = 0
        rec = None
        for rec in x:
            if found > 0:
                raise AssertionError("Multiple records returned")
            found += 1
        if found == 0:
            # Previously this fell through to an unbound local variable.
            raise AssertionError("No records returned")
        return rec
602
603 -def _my_urlencode(params):
604 # urllib only handles key=value pairs. However, some CGI 605 # scripts also contain parameters that are passed without the 606 # key= part. Thus, search through the params for empty 607 # strings (or None), and handle these myself. 608 609 # params could be a dictionary of key->value or a list of 610 # (key,value) pairs. If it's a dictionary, convert it to a list. 611 import operator 612 import urllib 613 614 if operator.isMappingType(params) and hasattr(params, "items"): 615 params = params.items() 616 617 paramlist = [] 618 for key, value in params: 619 if key: 620 paramlist.append(urllib.urlencode([(key, value)])) 621 else: 622 paramlist.append(urllib.quote_plus(value)) 623 return '&'.join(paramlist)
624