1
2
3
4
5
6
7
8
9 """Implements Registry to access databases. These objects access
10 databases using a dictionary-like interface, where the key is the ID
11 of the thing to look up, and the value returned is the data associated
12 with the key.
13
14 Classes:
15 DBRegistry Accesses databases with a dictionary-like interface.
16 DBObject Base class for Registry objects for databases.
17 DBGroup Groups DBObjects.
18
19 CGIDB Accesses CGI databases.
20 EUtilsDB Accesses NCBI using EUtils.
21 BioSQLDB Accesses a BioSQL database.
22 BioCorbaDB Accesses a BioCorba database.
23 IndexedFileDB Accesses a Mindy Indexed file.
24 """
25 from Bio.config.Registry import *
26
28 """This implements a dictionary-like interface to databases.
29
30 """
31 - def __init__(self, name, load_path=None):
33
34
35 db = DBRegistry("db", "Bio.dbdefs")
36
38 return abbrev.replace("-", "_")
39
41 """This is a base class for dictionary-like interfaces to
42 databases.
43
44 Methods:
45 get Lookup a key in a database, with a default value.
46 get_as Lookup a key and convert to an object.
47 __getitem__ Lookup a key in a database.
48
49 THE FOLLOWING SHOULD BE IMPLEMENTED IN A DERIVED CLASS.
50 _get Return the data indicated by key.
51 _convert_to Convert the data to another object.
52 IMPLEMENT THESE ONLY IF TIMEOUT OR CONCURRENT ACCESS IS NEEDED.
53 _make_pickleable Make the object returned by _get to a pickleable.
54 _unmake_pickleable Turn the pickleable object back into the original
55
56 """
def __init__(self, name, abbrev=None, doc=None, delay=None, timeout=None):
    """DBObject(name[, abbrev][, doc][, delay][, timeout])

    Registers the object under name, using a cleaned-up abbrev (or the
    cleaned-up name when no abbrev is given), and optionally wraps
    self._get:

    delay    when not None, _get is replaced by the function built by
             _support.make_rate_limited_function(self._get, delay).
    timeout  when not None, a DeprecationWarning is issued and _get is
             replaced by the function built by
             _support.make_timed_function().  If delay was given as
             well, the timed wrapper is built around the rate-limited
             function.

    """
    import _support

    RegisterableObject.__init__(
        self, name, _clean_abbrev(abbrev or name), doc)

    if delay is not None:
        self._get = _support.make_rate_limited_function(self._get, delay)

    if timeout is not None:
        import warnings
        message = (
            "Using timeouts has been deprecated, as this code relies on "
            "Bio.MultiProc, which itself has been deprecated. If you need this "
            "functionality, please let the Biopython developers know by "
            "sending an email to biopython-dev@biopython.org.")
        warnings.warn(message, DeprecationWarning)
        # self._get is read here, so a rate-limited wrapper installed
        # above ends up inside the timed one.
        self._get = _support.make_timed_function(
            self._get, timeout,
            self._make_pickleable, self._unmake_pickleable)
73
74 - def set(self, key, data):
76
def get(self, key, default=None):
    """S.get(key[, default]) -> data

    Looks key up with self[key]; when that raises KeyError, default is
    handed back instead.

    """
    try:
        return self[key]
    except KeyError:
        return default
84
85 - def get_as(self, key, to_io=None, default=None):
89
91 try:
92 return self._get(key)
93 except IOError, x:
94 if str(x) == "timed out":
95 raise KeyError, x
96 raise
97
98
99
100 - def _get(self, key):
101 """S._get(key) -> data"""
102
103 raise NotImplementedError, "Please implement in a derived class."
105 """S._convert_to(data, to_io) -> another data type"""
106
107
108 - def _set(self, key, data):
109 """S._set(key, data)"""
110
111 raise NotImplementedError, "Caching not supported here."
113 """S._make_pickleable(key, data) -> pickleable_obj"""
114
115
116
117 raise NotImplementedError, "pickling not supported."
119 """S._unmake_pickleable(key, pickleable_obj) -> data"""
120
121
122
123 raise NotImplementedError, "pickling not supported."
124
126 """Groups DBObjects that return the same kind of data.
127
128 """
def __init__(self, name, abbrev=None, doc=None,
             behavior="serial", cache=None):
    """DBGroup(name[, abbrev][, doc][, behavior][, cache])

    name is the name of the object, and abbrev is an abbreviation
    for the name.  doc is passed on to RegisterableGroup.__init__.

    behavior is either "serial" or "concurrent".  "serial" means
    that I'll run each object until I get one that finishes
    successfully.  "concurrent" means that I'll run each object at
    the same time and return the one that finishes.  Asking for
    "concurrent" issues a DeprecationWarning.

    cache is accepted but is not used by this constructor.

    Raises ValueError if behavior is neither "serial" nor
    "concurrent".

    """
    abbrev = _clean_abbrev(abbrev or name)
    RegisterableGroup.__init__(self, name, abbrev, doc)
    if behavior not in ('concurrent', 'serial'):
        # Call-style raise; the "raise Class, message" form is
        # Python-2-only syntax.
        raise ValueError("behavior must be 'concurrent' or 'serial'")
    if behavior == 'concurrent':
        import warnings
        warnings.warn(
            "Concurrent behavior has been deprecated, as this functionality "
            "needs Bio.MultiProc, which itself has been deprecated. If you "
            "need the concurrent behavior, please let the Biopython "
            "developers know by sending an email to "
            "biopython-dev@biopython.org to avoid permanent removal of this "
            "feature.",
            DeprecationWarning)
    self.behavior = behavior
    # Starts out as None; nothing in this constructor uses it.
    self._last_object_used = None
152
159
160 - def get(self, key, default=None):
166
167 - def get_as(self, key, to_io=None, default=None):
171
178 def unpickleable(obj, data):
179 return obj._unmake_pickleable(data)
180
181 fnhandles = []
182 for obj in self.objs:
183 fnhandles.append((obj, copen_fn(get_pickleable, obj, key)))
184
185
186 i = 0
187 while fnhandles:
188 if i >= len(fnhandles):
189 i = 0
190 time.sleep(0.1)
191 try:
192 ready = fnhandles[i][1].poll()
193 except SystemError, KeyboardInterrupt:
194 raise
195 except Exception, x:
196
197 del fnhandles[i]
198 continue
199 if ready:
200 obj, fnhandle = fnhandles.pop(i)
201 retval = unpickleable(obj, fnhandle.read())
202 self._last_object_used = obj
203 break
204 else:
205 i += 1
206 else:
207 raise KeyError, "I could not get any results."
208
209 for x, h in fnhandles:
210 h.close()
211 return retval
212
214 for obj in self.objs:
215 try:
216 handle = obj[key]
217 except SystemError, KeyboardInterrupt:
218 raise
219 except Exception, x:
220 continue
221 else:
222 self._last_object_used = obj
223 return handle
224 raise KeyError, "I could not get any results."
225
227 """Mixin class with useful functionality for retrieval of text files.
228
229 This implements some useful helper functions and overrides of DBObject
230 for those implementations which need to retrieve text, check for errors in
231 the retrieved text, and then convert that text to other formats.
232 """
233 - def _check_for_errors(self, handle, failure_cases):
234 from Martel import Parser
235 from Bio import StdHandler
236 from Bio.EUtils.ReseekFile import ReseekFile
237
238 if not failure_cases:
239 return handle
240 handle = ReseekFile(handle)
241 pos = handle.tell()
242 for expression, errormsg in failure_cases:
243 handle.seek(pos)
244 parser = expression.make_parser()
245 handler = StdHandler.RecognizeHandler()
246 parser.setContentHandler(handler)
247 parser.setErrorHandler(handler)
248 try:
249 parser.parseFile(handle)
250 except Parser.ParserException:
251 pass
252 if handler.recognized:
253 raise KeyError, errormsg
254 handle.seek(pos)
255 return handle
256
def _convert_to(self, handle, to_io):
    """S._convert_to(handle, to_io) -> record

    Reads handle with to_io.read().  If the result is a
    FormatIO.FormatIOIterator it must yield exactly one record, which
    is returned; any other result is returned as it is.

    Raises AssertionError if the iterator yields more than one record
    or no record at all.

    """
    from Bio import FormatIO

    result = to_io.read(handle)
    if not isinstance(result, FormatIO.FormatIOIterator):
        return result

    missing = object()
    rec = missing
    for count, item in enumerate(result):
        if count > 0:
            raise AssertionError("Multiple records returned")
        rec = item
    if rec is missing:
        # Previously an empty iterator fell through to "return rec" with
        # rec never assigned, which surfaced as an UnboundLocalError.
        raise AssertionError("No records returned")
    return rec
269
270 -class CGIDB(DBObject, TextLikeMixin):
271 """This class implements DBObject for accessing CGI databases.
272
273 """
def __init__(self, name, cgi, url=None, key=None, params=None,
             abbrev=None, doc=None, delay=None, timeout=None,
             getmethod=1, failure_cases=None):
    """CGIDB(name, cgi[, url][, key][, params][, abbrev][, doc]
    [, delay][, timeout][, getmethod][, failure_cases])

    name, abbrev, doc, delay and timeout are handed on to
    DBObject.__init__: name is the name of the object, abbrev an
    abbreviation for it, and doc some documentation describing it.

    cgi is the URL for the cgi script, while url points to the
    human-readable URL of the form.

    params is a list of (key, value) tuples with the parameters to
    pass to the CGI script (stored as [] when omitted).  key is the
    name of the CGI parameter whose value is the ID of the object to
    retrieve (stored as '' when omitted).

    getmethod is a boolean selecting GET or POST; GET is the default.

    failure_cases is a list of (Martel Expression, error message)
    pairs describing patterns of errors in the text returned by the
    script.  Each expression is passed through
    _support.make_cached_expression() before being stored.

    """
    import _support

    DBObject.__init__(
        self, name=name, abbrev=abbrev, doc=doc,
        delay=delay, timeout=timeout)

    self.cgi, self.url = cgi, url
    self.key = key or ''
    self.params = params or []
    self.getmethod = getmethod

    self.failure_cases = []
    for pattern, errormsg in (failure_cases or []):
        cached = _support.make_cached_expression(pattern)
        self.failure_cases.append((cached, errormsg))
311
313 return self.params + [(self.key, key)]
314
315 - def _get(self, key):
319
321 import urllib
322 params = self._normalize_params(key)
323 options = _my_urlencode(params)
324 if self.getmethod:
325 fullcgi = self.cgi
326 if options:
327 fullcgi = "%s?%s" % (self.cgi, options)
328 handle = urllib.urlopen(fullcgi)
329 else:
330 handle = urllib.urlopen(self.cgi, options)
331 return handle
332
335
337 import StringIO
338 return StringIO.StringIO(obj)
339
340 -class EUtilsDB(DBObject, TextLikeMixin):
341 """Implement DBObject for accessing EUtils databases at NCBI.
342 """
def __init__(self, name, db, rettype, abbrev=None, doc=None,
             failure_cases=None, delay=None, timeout=None):
    """Set up an EUtilsDB object for retrieval.

    name, abbrev, doc, delay and timeout are handed on to
    DBObject.__init__: name is the name of the object, abbrev an
    abbreviation for it, and doc some documentation describing it.

    db is the name of the database at NCBI you want to retrieve from
    (ie. protein, nucleotide, pubmed).

    rettype is the type of information to return
    (ie. gp, gb, fasta, medline).

    failure_cases is a list of (Martel Expression, error message)
    pairs describing patterns of errors in the returned text.  Each
    expression is passed through _support.make_cached_expression()
    before being stored.
    """
    import _support

    DBObject.__init__(
        self, name=name, abbrev=abbrev, doc=doc,
        delay=delay, timeout=timeout)

    self.db, self.rettype = db, rettype

    self.failure_cases = []
    for pattern, errormsg in (failure_cases or []):
        cached = _support.make_cached_expression(pattern)
        self.failure_cases.append((cached, errormsg))
369
370 - def _get(self, key):
381
383 """Represent a BioSQL-style database to retrieve SeqRecord objects.
384
385 This returns a SeqRecord-like object from _get() instead of a
386 handle (since BioSQL is not going to give you a handle).
387
388 """
def __init__(self, name, doc="", db_host='localhost', db_port='',
             db_user='root', db_passwd='', sql_db='',
             namespace_db='', db_type='mysql'):
    """Initialize with information for connecting to the BioSQL db.

    name and doc are handed on to DBObject.__init__.  Every other
    argument is only stored here, on the attribute of the same name.
    """
    DBObject.__init__(self, name=name, doc=doc)

    # Connection settings.
    self.db_host, self.db_port = db_host, db_port
    self.db_user, self.db_passwd = db_user, db_passwd

    # Which database to use, and of what type.
    self.sql_db, self.namespace_db = sql_db, namespace_db
    self.db_type = db_type
402
404 """Retrieve the appropriate module to use for connecting to a database
405
406 This parses a description of the database and tries to determine
407 which module is appropriate for that database type.
408 """
409 if db_type in ['mysql']:
410 return 'MySQLdb'
411 elif db_type in ['pg', 'postgres', 'postgresql']:
412 raise ValueError("Postgres not supported yet. Sorry.")
413 else:
414 raise ValueError("Unknown database type: %s" % db_type)
415
def _get(self, key):
    """S._get(key) -> item looked up in the BioSQL database

    Opens the BioSQL server described by the db_* / sql_db attributes
    (passing the port only when self.db_port is set), selects the
    sub-database self.namespace_db and looks key up, first as an
    "accession" and then as a "display_id".  The first lookup that
    yields an item wins.

    Raises KeyError if no lookup yields an item.

    """
    from BioSQL import BioSeqDatabase

    open_args = {"user": self.db_user,
                 "passwd": self.db_passwd,
                 "host": self.db_host,
                 "db": self.sql_db,
                 "driver": self._get_db_module(self.db_type)}
    if self.db_port:
        open_args["port"] = self.db_port
    server = BioSeqDatabase.open_database(**open_args)
    db = server[self.namespace_db]

    item = None
    for id_type in ("accession", "display_id"):
        try:
            item = db.lookup(**{id_type: key})
        except IndexError:
            # An IndexError from lookup() is taken to mean "no match
            # for this id type"; try the next one.
            continue
        if item is not None:
            # Stop at the first hit.  Without this, the next id type
            # was always queried too and could replace the hit.
            break
    if item is None:
        raise KeyError("Could not get item with id: %s" % key)
    return item
446
452
457
459 """Represent a BioCorba BioSequenceCollection for SeqRecord objects.
460
461 Returns SeqRecord-like objects.
462
463 """
def __init__(self, name, ior_ref, server_type=None, doc=""):
    """Initialize with IOR reference for a BioCorba Collection.

    name and doc are handed on to DBObject.__init__.

    ior_ref is a URL or file reference to an IOR string.  The IOR
    should reference a BioSequenceCollection, the top level BioCorba
    object we should use for making objects available.

    server_type is a hack parameter which might be necessary if there
    are server/client issues (ie. as with Perl ORBit) that we need
    to muck around with.  It is passed to self._get_retriever(); if
    not set, we just use a standard retriever.
    """
    DBObject.__init__(self, name=name, doc=doc)
    retriever = self._get_retriever(server_type)
    self.retriever = retriever
    # corba_dict starts out as None.
    self.ior_ref, self.corba_dict = ior_ref, None
479
481 """Return a BioCorba retriever object based on the specified server.
482
483 This returns a ready-to-go client retriever which can be used to
484 connect to a BioCorba server.
485 """
486
487
488 from BioCorba.Client.BiocorbaConnect import PerlCorbaClient, \
489 PythonCorbaClient, JavaCorbaClient, GenericCorbaClient
490 from BioCorba.Client.Seqcore.CorbaCollection import \
491 BioSequenceCollection
492
493 if server_type is None:
494 client_type = GenericCorbaClient
495 else:
496 server_type = server_type.lower()
497 if server_type.find("python") >= 0:
498 client_type = PythonCorbaClient
499 elif server_type.find("java") >= 0:
500 client_type = JavaCorbaClient
501 elif server_type.find("perl") >= 0:
502 client_type = PerlCorbaClient
503 else:
504 raise ValueError("Unexpected server type specified: %s" %
505 server_type)
506
507 retriever = client_type(BioSequenceCollection)
508 return retriever
509
511 """Get a connection to the CORBA server based on the ior_ref
512 """
513
514 from BioCorba.Bio import GenBank
515
516 if ior_ref.find("http") >= 0:
517 client = retriever.from_url_ior(ior_ref)
518 else:
519 client = retriever.from_file_ior(ior_ref)
520
521 return GenBank.Dictionary(client, GenBank.FeatureParser())
522
523 - def _get(self, key):
524
525 if self.corba_dict is None:
526 self.corba_dict = self._get_corba_client(self.ior_ref,
527 self.retriever)
528 return self.corba_dict[key]
529
535
537 """Return SeqRecord objects from an indexed file.
538
539 This module deals with both flat file and BerkeleyDB indexes.
540 These indexed files can be created by any of the compliant indexing
541 implementations from Biopython, BioPerl, BioJava, etc...
542
543 """
def __init__(self, name, dbname, doc=""):
    """Initialize with information about loading the database.

    name and doc are handed on to DBObject.__init__.

    dbname is the name of the database to open.  This will likely
    be a filesystem path to a database directory.  It is opened with
    self._load_database() and the result is kept in self.db.
    """
    DBObject.__init__(self, name=name, doc=doc)
    database = self._load_database(dbname)
    self.db = database
552
554 """Get a connection with the given database.
555 """
556 from Bio import Mindy
557 db = Mindy.open(dbname = name)
558 return db
559
561 """Get a list of all namespaces to search for the file under.
562
563 If given_name is a valid key, then it is returned as the only
564 thing to check. Otherwise, we go forward and check all possible
565 namespaces.
566 """
567 if given_name is not None and given_name in db.keys():
568 return [given_name]
569 else:
570 return db.keys()
571
572 - def _get(self, key):
573 """Do the database retrieval of the sequence, returning a handle.
574 """
575
576
577 import operator
578 import StringIO
579 if not operator.isSequenceType(key) or len(key) != 2:
580 raise ValueError, "Key should be tuple of (namespace, key)"
581 namespace, key = key
582 names_to_check = self._get_check_names(namespace, self.db)
583 for check_name in names_to_check:
584 location = self.db.lookup( *(), **{check_name : key})
585 if len(location) >= 1:
586 break
587 assert len(location) == 1, "Got multiple hits: %s" % location
588 return StringIO(location[0].text)
589
591 from Bio import FormatIO
592 x = to_io.read(handle)
593 if isinstance(x, FormatIO.FormatIOIterator):
594 i = 0
595 for rec in x:
596 if i > 0:
597 raise AssertionError, "Multiple records returned"
598 i += 1
599 else:
600 rec = x
601 return rec
602
604
605
606
607
608
609
610
611 import operator
612 import urllib
613
614 if operator.isMappingType(params) and hasattr(params, "items"):
615 params = params.items()
616
617 paramlist = []
618 for key, value in params:
619 if key:
620 paramlist.append(urllib.urlencode([(key, value)]))
621 else:
622 paramlist.append(urllib.quote_plus(value))
623 return '&'.join(paramlist)
624