Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  """Connect with a BioSQL database and load Biopython like objects from it. 
  2   
  3  This provides interfaces for loading biological objects from a relational 
  4  database, and is compatible with the BioSQL standards. 
  5  """ 
  6  import BioSeq 
  7  import Loader 
  8  import DBUtils 
  9   
def open_database(driver = "MySQLdb", **kwargs):
    """Main interface for loading an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

    >>> from BioSQL import BioSeqDatabase
    >>> server = BioSeqDatabase.open_database(user = "root", db="minidb")

    the various options are:
    driver -> The name of the database driver to use for connecting. The
    driver should implement the python DB API. By default, the MySQLdb
    driver is used.
    user -> the username to connect to the database with.
    password, passwd -> the password to connect with
    host -> the hostname of the database
    database or db -> the name of the database

    Returns a DBServer wrapping the opened connection and the driver module.
    """
    module = __import__(driver)
    connect = getattr(module, "connect")

    # Different drivers use different keywords, so normalise the aliases on
    # a copy and leave the caller's dictionary untouched.
    kw = kwargs.copy()
    if driver == "MySQLdb":
        if "database" in kw:
            kw["db"] = kw["database"]
            del kw["database"]
        if "password" in kw:
            kw["passwd"] = kw["password"]
            del kw["password"]
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw["db"]
            del kw["db"]
        if "passwd" in kw:
            kw["password"] = kw["passwd"]
            del kw["passwd"]
    if driver == "psycopg" and not kw.get("database"):
        kw["database"] = "template1"
    try:
        conn = connect(**kw)
    except module.InterfaceError:
        # Ok, so let's try building a DSN
        # (older releases of psycopg need this)
        if "database" in kw:
            kw["dbname"] = kw["database"]
            del kw["database"]
        elif "db" in kw:
            kw["dbname"] = kw["db"]
            del kw["db"]

        # Format with %s rather than '='.join() so that non-string option
        # values (an integer port, for instance) do not raise TypeError.
        dsn = ' '.join(["%s=%s" % item for item in kw.items()])
        conn = connect(dsn)

    return DBServer(conn, module)
66
class DBServer:
    """Dictionary-like view of the biodatabases stored on one server.

    Wraps a DB-API connection in an Adaptor and hands out BioSeqDatabase
    objects by name through item access, keys(), values() and items().
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap conn, which was opened with the given driver module.

        module_name defaults to module.__name__; it selects the DBUtils
        helper and the loading strategy used by load_database_sql().
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
        self.module_name = module_name

    def __repr__(self):
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        """Return the BioSeqDatabase called name."""
        return BioSeqDatabase(self.adaptor, name)

    def keys(self):
        """Return the names of all biodatabases on the server."""
        return self.adaptor.list_biodatabase_names()

    def values(self):
        """Return a BioSeqDatabase for every biodatabase name."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (name, BioSeqDatabase) pairs for every biodatabase."""
        return [(key, self[key]) for key in self.keys()]

    def remove_database(self, db_name):
        """Try to remove all references to items in a database.

        Raises whatever Adaptor.fetch_dbid_by_dbname raises when db_name
        is not a known biodatabase.
        """
        db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it.
        """
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError when self.module_name is neither "psycopg" nor
        "MySQLdb".
        """
        # Read the file with all comment and blank lines removed. The handle
        # is closed even if reading fails, and the file is opened in text
        # mode so the string comparisons below see text, not bytes.
        statements = []
        sql_handle = open(sql_file, "r")
        try:
            for line in sql_handle:
                # "--" is the SQL comment marker, "#" the MySQL one; only
                # comments starting in the first column are recognised.
                if line.startswith("--") or line.startswith("#"):
                    continue
                stripped = line.strip()
                if stripped:  # only include non-blank lines
                    statements.append(stripped + ' ')
        finally:
            sql_handle.close()
        sql = "".join(statements)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["MySQLdb"]:
            for sql_line in sql.split(";"):
                # Skip blank fragments (normally only the one after the final
                # semicolon) instead of blindly dropping the last fragment,
                # so a file without a trailing ";" keeps its last statement.
                if sql_line.strip():
                    self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))
140
class Adaptor:
    """Thin query layer over a DB-API connection to a BioSQL schema.

    Holds the connection, a single cursor opened from it, and a dbutils
    helper to which last_id() and autocommit() are delegated.
    """

    def __init__(self, conn, dbutils):
        self.conn = conn
        self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return dbutils.last_id() for table, using this adaptor's cursor."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Delegate the autocommit setting y to dbutils for this connection."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        return self.conn.commit()

    def rollback(self):
        return self.conn.rollback()

    def close(self):
        return self.conn.close()

    def _restrict_to_database(self, sql, fields, dbid):
        """Return (sql, fields) limited to biodatabase dbid when dbid is true.

        A false dbid (None, 0) leaves the query unrestricted.
        """
        if dbid:
            sql += " and biodatabase_id = %s"
            fields = fields + [dbid]
        return sql, fields

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id of the biodatabase called dbname.

        Raises KeyError when no such biodatabase exists.
        """
        self.cursor.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        # Cannot happen (UK)
        ## assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id whose name column equals name.

        Raises IndexError when there is no match or more than one.
        """
        sql, fields = self._restrict_to_database(
            r"select bioentry_id from bioentry where name = %s",
            [name], dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find display id %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with display id %r" % name)
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the bioentry_id whose accession equals name.

        Raises IndexError when there is no match or more than one.
        """
        sql, fields = self._restrict_to_database(
            r"select bioentry_id from bioentry where accession = %s",
            [name], dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find accession %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with accession %r" % name)
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of every bioentry_id whose accession equals name."""
        sql, fields = self._restrict_to_database(
            r"select bioentry_id from bioentry where accession = %s",
            [name], dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a "." is looked up with version "0". Raises
        IndexError for a name with more than one ".", for no match and
        for more than one match.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql, fields = self._restrict_to_database(
            r"SELECT bioentry_id FROM bioentry WHERE accession = %s"
            r" AND version = %s",
            [acc, version], dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find version %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with version %r" % name)
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the first bioentry_id whose identifier column matches.

        Raises IndexError when there is no match.
        """
        # YB: was fetch_seqid_by_seqid
        sql, fields = self._restrict_to_database(
            "SELECT bioentry_id FROM bioentry WHERE identifier = %s",
            [identifier], dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            # The message used to say "display id", copied from
            # fetch_seqid_by_display_id.
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return the name of every row in the biodatabase table."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return every bioentry_id belonging to biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return the name of every bioentry belonging to biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        # execute_and_fetch_col0 is a method of this class, not of the
        # DB-API cursor, so it must be called on self.
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Run sql and return its single result row.

        Raises AssertionError unless exactly one row comes back.
        """
        self.cursor.execute(sql, args or ())
        rv = self.cursor.fetchall()
        # Kept as an assert so callers still see AssertionError.
        assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command.
        """
        self.cursor.execute(sql, args or ())

    def get_subseq_as_string(self, seqid, start, end):
        """Return the stored sequence of bioentry seqid from start up to end.

        start is shifted by one because SQL SUBSTRING counts positions from
        1; the number of characters requested is end - start.
        """
        length = end - start
        return self.execute_one(
            """select SUBSTRING(seq FROM %s FOR %s)
                     from biosequence where bioentry_id = %s""",
            (start+1, length, seqid))[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Run sql and return the first column of every result row."""
        self.cursor.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Run sql and return all result rows as the cursor delivers them."""
        self.cursor.execute(sql, args or ())
        return self.cursor.fetchall()
# Keyword accepted by BioSeqDatabase.lookup() -> name of the Adaptor method
# that turns the supplied value into a bioentry id.
_allowed_lookups = dict([
    ('primary_id', "fetch_seqid_by_identifier"),
    ('gi', "fetch_seqid_by_identifier"),
    ('display_id', "fetch_seqid_by_display_id"),
    ('name', "fetch_seqid_by_display_id"),
    ('accession', "fetch_seqid_by_accession"),
    ('version', "fetch_seqid_by_version"),
])
class BioSeqDatabase:
    """One named biodatabase on a BioSQL server.

    Records are handed out as BioSeq.DBSeqRecord objects; item access,
    keys(), values() and items() work on the internal bioentry ids.
    """

    def __init__(self, adaptor, name):
        """Bind to the biodatabase called name.

        The id is resolved immediately through
        adaptor.fetch_dbid_by_dbname(), so any error it raises for an
        unknown name propagates from here.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a DBSeqRecord by its display name.

        Example: seq = db.get_Seq_by_id('ROA1_HUMAN')

        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a DBSeqRecord by accession number.

        Example: seq = db.get_Seq_by_acc('X77802')

        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a DBSeqRecord by versioned accession.

        Example: seq = db.get_Seq_by_ver('X77802.1')

        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a *list* of DBSeqRecord objects by accession number.

        Example: seqs = db.get_Seqs_by_acc('X77802')

        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_PrimarySeq_stream(self):
        """Not implemented; always raises NotImplementedError."""
        # my @array = $self->get_all_primary_ids;
        # my $stream = Bio::DB::BioDatabasePSeqStream->new(
        #         -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor,
        #         -idlist => \@array);
        raise NotImplementedError("waiting for Python 2.2's iter")

    def get_all_primary_ids(self):
        """Array of all the primary_ids of the sequences in the database.

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.
        """
        return self.adaptor.list_bioentry_ids(self.dbid)

    def __getitem__(self, key):
        """Return the DBSeqRecord for the internal bioentry id key."""
        return BioSeq.DBSeqRecord(self.adaptor, key)

    def keys(self):
        """Return all internal bioentry ids of this database."""
        return self.get_all_primary_ids()

    def values(self):
        """Return a DBSeqRecord for every internal bioentry id."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (bioentry id, DBSeqRecord) pairs for every entry."""
        return [(key, self[key]) for key in self.keys()]

    def lookup(self, **kwargs):
        """Return the DBSeqRecord selected by exactly one keyword argument.

        The keyword must be a key of _allowed_lookups; its value is passed,
        together with this database's id, to the Adaptor method named
        there. Raises TypeError for any other keyword or when the number of
        keyword arguments is not one.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        # list() keeps this working where items() returns a view.
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            # Pass k itself to %r; wrapping it in repr() first quoted the
            # offending keyword twice in the message.
            raise TypeError("lookup() expects one of %s, not %r" % \
                            (", ".join([repr(key) for key in _allowed_lookups]),
                             k))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Gets a DBSeqRecord by the primary (internal) id.

        The primary id in these cases has to come from
        get_all_primary_ids(). There is no other way to get (or
        guess) the primary_ids in a database.
        """
        return self[seqid]

    def load(self, record_iterator):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is an Iterator object that returns SeqRecord objects
        which will be used to populate the database. The Iterator should
        implement next() and either return None or raise StopIteration
        when it is out of objects.

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid)
        num_records = 0
        while True:
            try:
                cur_record = record_iterator.next()
            except StopIteration:
                break
            # Older iterators signal exhaustion by returning None.
            if cur_record is None:
                break
            num_records += 1
            db_loader.load_seqrecord(cur_record)

        return num_records
417