Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

  1  """Load biopython objects into a BioSQL database for persistent storage. 
  2   
  3  This code makes it possible to store biopython objects in a relational 
  4  database and then retrieve them back. You shouldn't use any of the 
  5  classes in this module directly. Rather, call the load() method on 
  6  a database object. 
  7  """ 
  8  # standard modules 
  9  from time import gmtime, strftime 
 10   
 11  # biopython 
 12  from Bio import Alphabet 
 13  from Bio.SeqUtils.CheckSum import crc64 
 14   
 15   
16 -class DatabaseLoader:
17 """Load a database with biopython objects. 18 """
19 - def __init__(self, adaptor, dbid):
20 """Initialize with connection information for the database. 21 22 XXX Figure out what I need to load a database and document it. 23 """ 24 self.adaptor = adaptor 25 self.dbid = dbid
26
27 - def load_seqrecord(self, record):
28 """Load a Biopython SeqRecord into the database. 29 """ 30 bioentry_id = self._load_bioentry_table(record) 31 self._load_bioentry_date(record, bioentry_id) 32 self._load_biosequence(record, bioentry_id) 33 self._load_comment(record, bioentry_id) 34 references = record.annotations.get('references', ()) 35 for reference, rank in zip(references, range(len(references))): 36 self._load_reference(reference, rank, bioentry_id) 37 for seq_feature_num in range(len(record.features)): 38 seq_feature = record.features[seq_feature_num] 39 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
40
41 - def _get_ontology_id(self, name, definition=None):
42 oids = self.adaptor.execute_and_fetch_col0( 43 "SELECT ontology_id FROM ontology WHERE name = %s", 44 (name,)) 45 if oids: 46 return oids[0] 47 self.adaptor.execute( 48 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 49 (name, definition)) 50 return self.adaptor.last_id("ontology")
51 52
53 - def _get_term_id(self, 54 name, 55 ontology_id=None, 56 definition=None, 57 identifier=None):
58 """Get the id that corresponds to a term. 59 60 This looks through the term table for a the given term. If it 61 is not found, a new id corresponding to this term is created. 62 In either case, the id corresponding to that term is returned, so 63 that you can reference it in another table. 64 65 The ontology_id should be used to disambiguate the term. 66 """ 67 68 # try to get the term id 69 sql = r"SELECT term_id FROM term " \ 70 r"WHERE name = %s" 71 fields = [name] 72 if ontology_id: 73 sql += ' AND ontology_id = %s' 74 fields.append(ontology_id) 75 id_results = self.adaptor.execute_and_fetchall(sql, fields) 76 # something is wrong 77 if len(id_results) > 1: 78 raise ValueError("Multiple term ids for %s: %r" % 79 (name, id_results)) 80 elif len(id_results) == 1: 81 return id_results[0][0] 82 else: 83 sql = r"INSERT INTO term (name, definition," \ 84 r" identifier, ontology_id)" \ 85 r" VALUES (%s, %s, %s, %s)" 86 self.adaptor.execute(sql, (name, definition, 87 identifier, ontology_id)) 88 return self.adaptor.last_id("term")
89
90 - def _add_dbxref(self, dbname, accession, version):
91 """Insert a dbxref and return its id""" 92 93 self.adaptor.execute( 94 "INSERT INTO dbxref(dbname, accession, version)" \ 95 " VALUES (%s, %s, %s)", (dbname, accession, version)) 96 return self.adaptor.last_id("dbxref")
97
98 - def _get_taxon_id(self, record):
99 """Get the id corresponding to a taxon. 100 101 If the species isn't in the taxon table, it is created. 102 """ 103 104 ncbi_taxon_id = record.annotations.get("ncbi_taxid") 105 if not ncbi_taxon_id: 106 # Try the hard way... 107 for f in record.features: 108 if f.type == 'source': 109 quals = getattr(f, 'qualifiers', {}) 110 if "db_xref" in quals: 111 for db_xref in f.qualifiers["db_xref"]: 112 if db_xref.startswith("taxon:"): 113 ncbi_taxon_id = int(db_xref[6:]) 114 break 115 if ncbi_taxon_id: break 116 if ncbi_taxon_id: 117 taxa = self.adaptor.execute_and_fetch_col0( 118 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 119 (ncbi_taxon_id,)) 120 if taxa: 121 return taxa[0] 122 123 # Tough luck. Let's try the binomial 124 if record.annotations["organism"]: 125 taxa = self.adaptor.execute_and_fetch_col0( 126 "SELECT taxon_id FROM taxon_name" \ 127 " WHERE name_class = 'scientific name' AND name = %s", 128 (record.annotations["organism"],)) 129 if taxa: 130 return taxa[0] 131 132 133 # Last chance... 134 if record.annotations["source"]: 135 taxa = self.adaptor.execute_and_fetch_col0( 136 "SELECT DISTINCT taxon_id FROM taxon_name" \ 137 " WHERE name = %s", 138 (record.annotations["source"],)) 139 if len(taxa) > 1: 140 raise ValueError("Taxa: %d species have name %r" % ( 141 len(taxa), 142 record.annotations["source"])) 143 if taxa: 144 return taxa[0] 145 146 # OK, let's try inserting the species. 147 # Chances are we don't have enough information ... 148 # Furthermore, it won't be in the hierarchy. 149 150 lineage = [] 151 for c in record.annotations.get("taxonomy", []): 152 lineage.append([None, None, c]) 153 if lineage: 154 lineage[-1][1] = "genus" 155 lineage.append([None, "species", record.annotations["organism"]]) 156 # XXX do we have them? 157 if "subspecies" in record.annotations: 158 lineage.append([None, "subspecies", 159 record.annotations["subspecies"]]) 160 if "variant" in record.annotations: 161 lineage.append([None, "varietas", 162 record.annotations["variant"]]) 163 lineage[-1][0] = ncbi_taxon_id 164 165 left_value = self.adaptor.execute_one( 166 "SELECT MAX(left_value) FROM taxon")[0] 167 if not left_value: 168 left_value = 0 169 left_value += 1 170 171 # XXX -- Brad: Fixing this for now in an ugly way because 172 # I am getting overlaps for right_values. I need to dig into this 173 # more to actually understand how it works. I'm not sure it is 174 # actually working right anyhow. 175 right_start_value = self.adaptor.execute_one( 176 "SELECT MAX(right_value) FROM taxon")[0] 177 if not right_start_value: 178 right_start_value = 0 179 right_value = right_start_value + 2 * len(lineage) - 1 180 181 parent_taxon_id = None 182 for taxon in lineage: 183 self.adaptor.execute( 184 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 185 " left_value, right_value)" \ 186 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 187 taxon[0], 188 taxon[1], 189 left_value, 190 right_value)) 191 taxon_id = self.adaptor.last_id("taxon") 192 self.adaptor.execute( 193 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 194 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2])) 195 left_value += 1 196 right_value -= 1 197 parent_taxon_id = taxon_id 198 if "source" in record.annotations: 199 self.adaptor.execute( 200 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 201 "VALUES (%s, %s, 'common name')", ( 202 taxon_id, record.annotations["source"])) 203 204 return taxon_id
205
206 - def _load_bioentry_table(self, record):
207 """Fill the bioentry table with sequence information. 208 """ 209 # get the pertinent info and insert it 210 211 if record.id.find('.') >= 0: # try to get a version from the id 212 accession, version = record.id.split('.') 213 version = int(version) 214 else: # otherwise just use a version of 0 215 accession = record.id 216 version = 0 217 218 # taxon_id = self._get_taxon_id(record) 219 #taxon_id = "0" # inserted this because the taxon population code is out of date 220 # with the tables 221 identifier = record.annotations.get('gi') 222 description = getattr(record, 'description', None) 223 division = record.annotations.get("data_file_division", "UNK") 224 225 # removed taxon_id field, as it was causing difficulties with the 226 # schema - not inserting a value allows it to default to NULL, 227 # avoiding the foreign key constraint. 228 sql = """ 229 INSERT INTO bioentry ( 230 biodatabase_id, 231 name, 232 accession, 233 identifier, 234 division, 235 description, 236 version) 237 VALUES ( 238 %s, 239 %s, 240 %s, 241 %s, 242 %s, 243 %s)""" 244 self.adaptor.execute(sql, (self.dbid, 245 record.name, 246 accession, 247 identifier, 248 division, 249 description, 250 version)) 251 # now retrieve the id for the bioentry 252 bioentry_id = self.adaptor.last_id('bioentry') 253 254 return bioentry_id
255
256 - def _load_bioentry_date(self, record, bioentry_id):
257 """Add the effective date of the entry into the database. 258 """ 259 # dates are GenBank style, like: 260 # 14-SEP-2000 261 date = record.annotations.get("date", 262 strftime("%d-%b-%Y", gmtime()).upper()) 263 annotation_tags_id = self._get_ontology_id("Annotation Tags") 264 date_id = self._get_term_id("date_changed", annotation_tags_id) 265 sql = r"INSERT INTO bioentry_qualifier_value" \ 266 r" (bioentry_id, term_id, value, rank)" \ 267 r" VALUES (%s, %s, %s, 1)" 268 self.adaptor.execute(sql, (bioentry_id, date_id, date))
269
270 - def _load_biosequence(self, record, bioentry_id):
271 """Load the biosequence table in the database. 272 """ 273 # determine the string representation of the alphabet 274 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 275 alphabet = "dna" 276 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 277 alphabet = "rna" 278 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 279 alphabet = "protein" 280 else: 281 alphabet = "unknown" 282 283 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 284 r"length, seq, alphabet) " \ 285 r"VALUES (%s, 0, %s, %s, %s)" 286 self.adaptor.execute(sql, (bioentry_id, 287 len(record.seq.data), 288 record.seq.data, 289 alphabet))
290
291 - def _load_comment(self, record, bioentry_id):
292 # Assume annotations['comment'] is not a list 293 comment = record.annotations.get('comment') 294 if not comment: 295 return 296 comment = comment.replace('\n', ' ') 297 298 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 299 " VALUES (%s, %s, %s)" 300 self.adaptor.execute(sql, (bioentry_id, comment, 1))
301
302 - def _load_reference(self, reference, rank, bioentry_id):
303 304 refs = None 305 if reference.medline_id: 306 refs = self.adaptor.execute_and_fetch_col0( 307 "SELECT reference_id" \ 308 " FROM reference JOIN dbxref USING (dbxref_id)" \ 309 " WHERE dbname = 'MEDLINE' AND accession = %s", 310 (reference.medline_id,)) 311 if not refs and reference.pubmed_id: 312 refs = self.adaptor.execute_and_fetch_col0( 313 "SELECT reference_id" \ 314 " FROM reference JOIN dbxref USING (dbxref_id)" \ 315 " WHERE dbname = 'PUBMED' AND accession = %s", 316 (reference.pubmed_id,)) 317 if not refs: 318 s = [] 319 for f in reference.authors, reference.title, reference.journal: 320 s.append(f or "<undef>") 321 crc = crc64("".join(s)) 322 refs = self.adaptor.execute_and_fetch_col0( 323 "SELECT reference_id FROM reference" \ 324 r" WHERE crc = %s", (crc,)) 325 if not refs: 326 if reference.medline_id: 327 dbxref_id = self._add_dbxref("MEDLINE", 328 reference.medline_id, 0) 329 elif reference.pubmed_id: 330 dbxref_id = self._add_dbxref("PUBMED", 331 reference.pubmed_id, 0) 332 else: 333 dbxref_id = None 334 authors = reference.authors or None 335 title = reference.title or None 336 journal = reference.journal or None 337 self.adaptor.execute( 338 "INSERT INTO reference (dbxref_id, location," \ 339 " title, authors, crc)" \ 340 " VALUES (%s, %s, %s, %s, %s)", 341 (dbxref_id, journal, title, 342 authors, crc)) 343 reference_id = self.adaptor.last_id("reference") 344 else: 345 reference_id = refs[0] 346 347 if reference.location: 348 start = 1 + int(str(reference.location[0].start)) 349 end = int(str(reference.location[0].end)) 350 else: 351 start = None 352 end = None 353 354 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 355 " start_pos, end_pos, rank)" \ 356 " VALUES (%s, %s, %s, %s, %s)" 357 self.adaptor.execute(sql, (bioentry_id, reference_id, 358 start, end, rank + 1))
359
360 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
361 """Load a biopython SeqFeature into the database. 362 """ 363 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 364 bioentry_id) 365 self._load_seqfeature_locations(feature, seqfeature_id) 366 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
367
368 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
369 """Load the first tables of a seqfeature and returns the id. 370 371 This loads the "key" of the seqfeature (ie. CDS, gene) and 372 the basic seqfeature table itself. 373 """ 374 ontology_id = self._get_ontology_id('SeqFeature Keys') 375 seqfeature_key_id = self._get_term_id(feature_type, 376 ontology_id = ontology_id) 377 378 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 379 # the record (how?) 380 source_cat_id = self._get_ontology_id('SeqFeature Sources') 381 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 382 ontology_id = source_cat_id) 383 384 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 385 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 386 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 387 source_term_id, feature_rank + 1)) 388 seqfeature_id = self.adaptor.last_id('seqfeature') 389 390 return seqfeature_id
391
392 - def _load_seqfeature_locations(self, feature, seqfeature_id):
393 """Load all of the locations for a SeqFeature into tables. 394 395 This adds the locations related to the SeqFeature into the 396 seqfeature_location table. Fuzzies are not handled right now. 397 For a simple location, ie (1..2), we have a single table row 398 with seq_start = 1, seq_end = 2, location_rank = 1. 399 400 For split locations, ie (1..2, 3..4, 5..6) we would have three 401 row tables with: 402 start = 1, end = 2, rank = 1 403 start = 3, end = 4, rank = 2 404 start = 5, end = 6, rank = 3 405 """ 406 # two cases, a simple location or a split location 407 if not feature.sub_features: # simple location 408 self._insert_seqfeature_location(feature, 1, seqfeature_id) 409 else: # split location 410 for rank, cur_feature in enumerate(feature.sub_features): 411 self._insert_seqfeature_location(cur_feature, 412 rank + 1, 413 seqfeature_id)
414
415 - def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
416 """Add a location of a SeqFeature to the seqfeature_location table. 417 """ 418 sql = r"INSERT INTO location (seqfeature_id, " \ 419 r"start_pos, end_pos, strand, rank) " \ 420 r"VALUES (%s, %s, %s, %s, %s)" 421 422 # convert biopython locations to the 1-based location system 423 # used in bioSQL 424 # XXX This could also handle fuzzies 425 start = feature.location.nofuzzy_start + 1 426 end = feature.location.nofuzzy_end 427 # Biopython uses None when we don't know strand information but 428 # BioSQL requires something (non null) and sets this as zero 429 # So we'll use the strand or 0 if Biopython spits out None 430 strand = feature.strand or 0 431 432 self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
433
434 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
435 """Insert the (key, value) pair qualifiers relating to a feature. 436 437 Qualifiers should be a dictionary of the form: 438 {key : [value1, value2]} 439 """ 440 tag_ontology_id = self._get_ontology_id('Annotation Tags') 441 for qualifier_key in qualifiers.keys(): 442 # Treat db_xref qualifiers differently to sequence annotation 443 # qualifiers by populating the seqfeature_dbxref and dbxref 444 # tables. Other qualifiers go into the seqfeature_qualifier_value 445 # and (if new) term tables. 446 if qualifier_key != 'db_xref': 447 qualifier_key_id = self._get_term_id(qualifier_key, 448 ontology_id=tag_ontology_id) 449 # now add all of the values to their table 450 for qual_value_rank in range(len(qualifiers[qualifier_key])): 451 qualifier_value = qualifiers[qualifier_key][qual_value_rank] 452 sql = r"INSERT INTO seqfeature_qualifier_value VALUES" \ 453 r" (%s, %s, %s, %s)" 454 self.adaptor.execute(sql, (seqfeature_id, 455 qualifier_key_id, 456 qual_value_rank + 1, 457 qualifier_value)) 458 else: 459 # The dbxref_id qualifier/value sets go into the dbxref table 460 # as dbname, accession, version tuples, with dbxref.dbxref_id 461 # being automatically assigned, and into the seqfeature_dbxref 462 # table as seqfeature_id, dbxref_id, and rank tuples 463 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 464 seqfeature_id)
465 466
467 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
468 """ _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id) 469 470 o dbxrefs List, dbxref data from the source file in the 471 format <database>:<accession> 472 473 o seqfeature_id Int, the identifier for the seqfeature in the 474 seqfeature table 475 476 Insert dbxref qualifier data for a seqfeature into the 477 seqfeature_dbxref and, if required, dbxref tables. 478 The dbxref_id qualifier/value sets go into the dbxref table 479 as dbname, accession, version tuples, with dbxref.dbxref_id 480 being automatically assigned, and into the seqfeature_dbxref 481 table as seqfeature_id, dbxref_id, and rank tuples 482 """ 483 # Dictionary of database types, keyed by GenBank db_xref abbreviation 484 db_dict = {'GeneID': 'Entrez', 485 'GI': 'GeneIndex', 486 'COG': 'COG', 487 'CDD': 'CDD', 488 'DDBJ': 'DNA Databank of Japan', 489 'Entrez': 'Entrez', 490 'GeneIndex': 'GeneIndex', 491 'PUBMED': 'PubMed', 492 'taxon': 'Taxon', 493 'ATCC': 'ATCC', 494 'ISFinder': 'ISFinder', 495 'GOA': 'Gene Ontology Annotation', 496 'ASAP': 'ASAP', 497 'PSEUDO': 'PSEUDO', 498 'InterPro': 'InterPro', 499 'GEO': 'Gene Expression Omnibus', 500 'EMBL': 'EMBL', 501 'UniProtKB/Swiss-Prot': 'UniProtKB/Swiss-Prot', 502 'ECOCYC': 'EcoCyc', 503 'UniProtKB/TrEMBL': 'UniProtKB/TrEMBL' 504 } 505 for rank, value in enumerate(dbxrefs): 506 # Split the DB:accession format string at colons. We have to 507 # account for multiple-line and multiple-accession entries 508 try: 509 dbxref_data = value.replace(' ','').replace('\n','').split(':') 510 key = dbxref_data[0] 511 accessions = dbxref_data[1:] 512 except: 513 raise Exception("Parsing of db_xref failed: %s; %s" % (key, accession)) 514 if key not in db_dict: 515 # Database is currently unknown, so add it to the db_dict 516 # temporarily and issue a warning 517 import warnings 518 warnings.warn("%s not recognised as database type: temporarily accepting key" % key) 519 db_dict[key] = key 520 db = db_dict[key] 521 # Loop over all the grabbed accessions, and attempt to fill the 522 # table 523 for accession in accessions: 524 # Get the dbxref_id value for the dbxref data 525 dbxref_id = self._get_dbxref_id(db, accession) 526 # Insert the seqfeature_dbxref data 527 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
528
529 - def _get_dbxref_id(self, db, accession):
530 """ _get_dbxref_id(self, db, accession) -> Int 531 532 o db String, the name of the external database containing 533 the accession number 534 535 o accession String, the accession of the dbxref data 536 537 Finds and returns the dbxref_id for the passed data. The method 538 attempts to find an existing record first, and inserts the data 539 if there is no record. 540 """ 541 # Check for an existing record 542 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 543 r'AND accession = %s' 544 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 545 # If there was a record, return the dbxref_id, else create the 546 # record and return the created dbxref_id 547 if dbxref_id: 548 return dbxref_id[0] 549 return self._add_dbxref(db, accession, 0)
550
551 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
552 """ Check for a pre-existing seqfeature_dbxref entry with the passed 553 seqfeature_id and dbxref_id. If one does not exist, insert new 554 data 555 556 """ 557 # Check for an existing record 558 sql = r'SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref ' \ 559 r'WHERE seqfeature_id = "%s" AND dbxref_id = "%s"' 560 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 561 dbxref_id)) 562 # If there was a record, return without executing anything, else create 563 # the record and return 564 if result: 565 return result 566 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
567
568 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
569 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 570 dbxref_id 571 """ 572 sql = r'INSERT INTO seqfeature_dbxref VALUES' \ 573 r'(%s, %s, %s)' 574 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 575 return (seqfeature_id, dbxref_id)
576 577
578 -class DatabaseRemover:
579 """Complement the Loader functionality by fully removing a database. 580 581 This probably isn't really useful for normal purposes, since you 582 can just do a: 583 DROP DATABASE db_name 584 and then recreate the database. But, it's really useful for testing 585 purposes. 586 587 YB: now use the cascaded deletions 588 """
589 - def __init__(self, adaptor, dbid):
590 """Initialize with a database id and adaptor connection. 591 """ 592 self.adaptor = adaptor 593 self.dbid = dbid
594
595 - def remove(self):
596 """Remove everything related to the given database id. 597 """ 598 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 599 self.adaptor.execute(sql, (self.dbid,)) 600 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 601 self.adaptor.execute(sql, (self.dbid,))
602