Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  # 
  7  # Note that BioSQL (including the database schema and scripts) is 
  8  # available and licensed separately.  Please consult www.biosql.org 
  9   
 10  """Load biopython objects into a BioSQL database for persistent storage. 
 11   
 12  This code makes it possible to store biopython objects in a relational 
 13  database and then retrieve them back. You shouldn't use any of the 
 14  classes in this module directly. Rather, call the load() method on 
 15  a database object. 
 16  """ 
 17  # standard modules 
 18  from time import gmtime, strftime 
 19   
 20  # biopython 
 21  from Bio import Alphabet 
 22  from Bio.SeqUtils.CheckSum import crc64 
 23   
 24   
25 -class DatabaseLoader:
26 """Load a database with biopython objects. 27 """
28 - def __init__(self, adaptor, dbid):
29 """Initialize with connection information for the database. 30 31 XXX Figure out what I need to load a database and document it. 32 """ 33 self.adaptor = adaptor 34 self.dbid = dbid
35
36 - def load_seqrecord(self, record):
37 """Load a Biopython SeqRecord into the database. 38 """ 39 bioentry_id = self._load_bioentry_table(record) 40 self._load_bioentry_date(record, bioentry_id) 41 self._load_biosequence(record, bioentry_id) 42 self._load_comment(record, bioentry_id) 43 self._load_dbxrefs(record, bioentry_id) 44 references = record.annotations.get('references', ()) 45 for reference, rank in zip(references, range(len(references))): 46 self._load_reference(reference, rank, bioentry_id) 47 self._load_annotations(record, bioentry_id) 48 for seq_feature_num in range(len(record.features)): 49 seq_feature = record.features[seq_feature_num] 50 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
51
52 - def _get_ontology_id(self, name, definition=None):
53 """Returns the identifier for the named ontology. 54 55 This looks through the onotology table for a the given entry name. 56 If it is not found, a row is added for this ontology (using the 57 definition if supplied). In either case, the id corresponding to 58 the provided name is returned, so that you can reference it in 59 another table. 60 """ 61 oids = self.adaptor.execute_and_fetch_col0( 62 "SELECT ontology_id FROM ontology WHERE name = %s", 63 (name,)) 64 if oids: 65 return oids[0] 66 self.adaptor.execute( 67 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 68 (name, definition)) 69 return self.adaptor.last_id("ontology")
70 71
72 - def _get_term_id(self, 73 name, 74 ontology_id=None, 75 definition=None, 76 identifier=None):
77 """Get the id that corresponds to a term. 78 79 This looks through the term table for a the given term. If it 80 is not found, a new id corresponding to this term is created. 81 In either case, the id corresponding to that term is returned, so 82 that you can reference it in another table. 83 84 The ontology_id should be used to disambiguate the term. 85 """ 86 87 # try to get the term id 88 sql = r"SELECT term_id FROM term " \ 89 r"WHERE name = %s" 90 fields = [name] 91 if ontology_id: 92 sql += ' AND ontology_id = %s' 93 fields.append(ontology_id) 94 id_results = self.adaptor.execute_and_fetchall(sql, fields) 95 # something is wrong 96 if len(id_results) > 1: 97 raise ValueError("Multiple term ids for %s: %r" % 98 (name, id_results)) 99 elif len(id_results) == 1: 100 return id_results[0][0] 101 else: 102 sql = r"INSERT INTO term (name, definition," \ 103 r" identifier, ontology_id)" \ 104 r" VALUES (%s, %s, %s, %s)" 105 self.adaptor.execute(sql, (name, definition, 106 identifier, ontology_id)) 107 return self.adaptor.last_id("term")
108
109 - def _add_dbxref(self, dbname, accession, version):
110 """Insert a dbxref and return its id.""" 111 112 self.adaptor.execute( 113 "INSERT INTO dbxref(dbname, accession, version)" \ 114 " VALUES (%s, %s, %s)", (dbname, accession, version)) 115 return self.adaptor.last_id("dbxref")
116
117 - def _get_taxon_id(self, record):
118 """Get the taxon id for this record. 119 120 record - a SeqRecord object 121 122 This searches the taxon/taxon_name tables using the 123 NCBI taxon ID, scientific name and common name to find 124 the matching taxon table entry's id. 125 126 If the species isn't in the taxon table, and we have at 127 least the NCBI taxon ID, scientific name or common name, 128 a minimal stub entry is created in the table. 129 130 If this information is not in the record's annotation, 131 then None is returned. 132 133 See also the BioSQL script load_ncbi_taxonomy.pl which 134 will populate and update the taxon/taxon_name tables 135 with the latest information from the NCBI. 136 """ 137 138 # To find the NCBI taxid, first check for a top level annotation 139 ncbi_taxon_id = None 140 if "ncbi_taxid" in record.annotations : 141 #Could be a list of IDs. 142 if isinstance(record.annotations["ncbi_taxid"],list) : 143 if len(record.annotations["ncbi_taxid"])==1 : 144 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 145 else : 146 ncbi_taxon_id = record.annotations["ncbi_taxid"] 147 if not ncbi_taxon_id: 148 # Secondly, look for a source feature 149 for f in record.features: 150 if f.type == 'source': 151 quals = getattr(f, 'qualifiers', {}) 152 if "db_xref" in quals: 153 for db_xref in f.qualifiers["db_xref"]: 154 if db_xref.startswith("taxon:"): 155 ncbi_taxon_id = int(db_xref[6:]) 156 break 157 if ncbi_taxon_id: break 158 if ncbi_taxon_id: 159 taxa = self.adaptor.execute_and_fetch_col0( 160 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 161 (ncbi_taxon_id,)) 162 if taxa: 163 #Good, we have mapped the NCBI taxid to a taxon table entry 164 return taxa[0] 165 #Bad, this NCBI taxon id is not in the database! 166 167 168 # Either we didn't find the NCBI taxid in the record, or it 169 # isn't in the database yet. Next, we'll try and find a match 170 # based on the species name (stored in GenBank files as the 171 # the organism and/or the source). 172 173 try : 174 scientific_name = record.annotations["organism"][:255] 175 except KeyError : 176 scientific_name = None 177 try : 178 common_name = record.annotations["source"][:255] 179 except KeyError : 180 common_name = None 181 # Note: The maximum length for taxon names in the schema is 255. 182 # Cropping it now should help in getting a match when searching, 183 # and avoids an error if we try and add these to the database. 184 185 if ncbi_taxon_id is None \ 186 and not common_name and not scientific_name : 187 # Nothing to go on... and there is no point adding 188 # a new entry to the database. We'll just leave this 189 # sequence's taxon as a NULL in the database. 190 return None 191 192 if scientific_name: 193 taxa = self.adaptor.execute_and_fetch_col0( 194 "SELECT taxon_id FROM taxon_name" \ 195 " WHERE name_class = 'scientific name' AND name = %s", 196 (scientific_name,)) 197 if taxa: 198 #Good, mapped the scientific name to a taxon table entry 199 return taxa[0] 200 201 # Last chance... 202 if common_name: 203 taxa = self.adaptor.execute_and_fetch_col0( 204 "SELECT DISTINCT taxon_id FROM taxon_name" \ 205 " WHERE name = %s", 206 (common_name,)) 207 #Its natural that several distinct taxa will have the same common 208 #name - in which case we can't resolve the taxon uniquely. 209 if len(taxa) > 1: 210 raise ValueError("Taxa: %d species have name %r" % ( 211 len(taxa), 212 common_name)) 213 if taxa: 214 #Good, mapped the common name to a taxon table entry 215 return taxa[0] 216 217 # At this point, as far as we can tell, this species isn't 218 # in the taxon table already. So we'll have to add it. 219 220 """ 221 # Possible simplification of the scary code below; pending 222 # discussion on the BioSQL mailing list... 223 # 224 # We will record the bare minimum; as long as the NCBI taxon 225 # id is present, then (re)running load_ncbi_taxonomy.pl should 226 # fill in the taxonomomy lineage. 227 # 228 # I am NOT going to try and record the lineage, even if it 229 # is in the record annotation as a list of names, as we won't 230 # know the NCBI taxon IDs for these parent nodes. 231 self.adaptor.execute( 232 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 233 " left_value, right_value)" \ 234 " VALUES (%s, %s, %s, %s, %s)", (None, 235 ncbi_taxon_id, 236 "species", 237 None, 238 None)) 239 taxon_id = self.adaptor.last_id("taxon") 240 if scientific_name: 241 self.adaptor.execute( 242 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 243 "VALUES (%s, %s, 'scientific name')", ( 244 taxon_id, scientific_name)) 245 if common_name: 246 self.adaptor.execute( 247 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 248 "VALUES (%s, %s, 'common name')", ( 249 taxon_id, common_name)) 250 return taxon_id 251 """ 252 253 # OK, let's try inserting the species. 254 # Chances are we don't have enough information ... 255 # Furthermore, it won't be in the hierarchy. 256 257 lineage = [] 258 for c in record.annotations.get("taxonomy", []): 259 lineage.append([None, None, c]) 260 if lineage: 261 lineage[-1][1] = "genus" 262 lineage.append([None, "species", record.annotations["organism"]]) 263 # XXX do we have them? 264 if "subspecies" in record.annotations: 265 lineage.append([None, "subspecies", 266 record.annotations["subspecies"]]) 267 if "variant" in record.annotations: 268 lineage.append([None, "varietas", 269 record.annotations["variant"]]) 270 lineage[-1][0] = ncbi_taxon_id 271 272 left_value = self.adaptor.execute_one( 273 "SELECT MAX(left_value) FROM taxon")[0] 274 if not left_value: 275 left_value = 0 276 left_value += 1 277 278 # XXX -- Brad: Fixing this for now in an ugly way because 279 # I am getting overlaps for right_values. I need to dig into this 280 # more to actually understand how it works. I'm not sure it is 281 # actually working right anyhow. 282 right_start_value = self.adaptor.execute_one( 283 "SELECT MAX(right_value) FROM taxon")[0] 284 if not right_start_value: 285 right_start_value = 0 286 right_value = right_start_value + 2 * len(lineage) - 1 287 288 parent_taxon_id = None 289 for taxon in lineage: 290 self.adaptor.execute( 291 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 292 " left_value, right_value)" \ 293 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 294 taxon[0], 295 taxon[1], 296 left_value, 297 right_value)) 298 taxon_id = self.adaptor.last_id("taxon") 299 self.adaptor.execute( 300 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 301 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 302 #Note the name field is limited to 255, some SwissProt files 303 #have a multi-species name which can be longer. So truncate this. 304 left_value += 1 305 right_value -= 1 306 parent_taxon_id = taxon_id 307 if common_name: 308 self.adaptor.execute( 309 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 310 "VALUES (%s, %s, 'common name')", ( 311 taxon_id, common_name)) 312 313 return taxon_id
314
315 - def _load_bioentry_table(self, record):
316 """Fill the bioentry table with sequence information. 317 318 record - SeqRecord object to add to the database. 319 """ 320 # get the pertinent info and insert it 321 322 if record.id.find('.') >= 0: # try to get a version from the id 323 accession, version = record.id.split('.') 324 version = int(version) 325 else: # otherwise just use a version of 0 326 accession = record.id 327 version = 0 328 329 #Find the taxon id (this is not just the NCBI Taxon ID) 330 #NOTE - If the species isn't defined in the taxon table, 331 #a new minimal entry is created. 332 taxon_id = self._get_taxon_id(record) 333 334 identifier = record.annotations.get('gi') 335 description = getattr(record, 'description', None) 336 division = record.annotations.get("data_file_division", "UNK") 337 338 sql = """ 339 INSERT INTO bioentry ( 340 biodatabase_id, 341 taxon_id, 342 name, 343 accession, 344 identifier, 345 division, 346 description, 347 version) 348 VALUES ( 349 %s, 350 %s, 351 %s, 352 %s, 353 %s, 354 %s, 355 %s, 356 %s)""" 357 self.adaptor.execute(sql, (self.dbid, 358 taxon_id, 359 record.name, 360 accession, 361 identifier, 362 division, 363 description, 364 version)) 365 # now retrieve the id for the bioentry 366 bioentry_id = self.adaptor.last_id('bioentry') 367 368 return bioentry_id
369
370 - def _load_bioentry_date(self, record, bioentry_id):
371 """Add the effective date of the entry into the database. 372 373 record - a SeqRecord object with an annotated date 374 bioentry_id - corresponding database identifier 375 """ 376 # dates are GenBank style, like: 377 # 14-SEP-2000 378 date = record.annotations.get("date", 379 strftime("%d-%b-%Y", gmtime()).upper()) 380 annotation_tags_id = self._get_ontology_id("Annotation Tags") 381 date_id = self._get_term_id("date_changed", annotation_tags_id) 382 sql = r"INSERT INTO bioentry_qualifier_value" \ 383 r" (bioentry_id, term_id, value, rank)" \ 384 r" VALUES (%s, %s, %s, 1)" 385 self.adaptor.execute(sql, (bioentry_id, date_id, date))
386
387 - def _load_biosequence(self, record, bioentry_id):
388 """Record a SeqRecord's sequence and alphabet in the database. 389 390 record - a SeqRecord object with a seq property 391 bioentry_id - corresponding database identifier 392 """ 393 # determine the string representation of the alphabet 394 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 395 alphabet = "dna" 396 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 397 alphabet = "rna" 398 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 399 alphabet = "protein" 400 else: 401 alphabet = "unknown" 402 403 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 404 r"length, seq, alphabet) " \ 405 r"VALUES (%s, 0, %s, %s, %s)" 406 self.adaptor.execute(sql, (bioentry_id, 407 len(record.seq.data), 408 record.seq.data, 409 alphabet))
410
411 - def _load_comment(self, record, bioentry_id):
412 """Record a SeqRecord's annotated comment in the database. 413 414 record - a SeqRecord object with an annotated comment 415 bioentry_id - corresponding database identifier 416 """ 417 # Assume annotations['comment'] is not a list 418 comment = record.annotations.get('comment') 419 if not comment: 420 return 421 comment = comment.replace('\n', ' ') 422 423 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 424 " VALUES (%s, %s, %s)" 425 self.adaptor.execute(sql, (bioentry_id, comment, 1))
426
427 - def _load_annotations(self, record, bioentry_id) :
428 """Record a SeqRecord's misc annotations in the database. 429 430 The annotation strings are recorded in the bioentry_qualifier_value 431 table, except for special cases like the reference, comment and 432 taxonomy which are handled with their own tables. 433 434 record - a SeqRecord object with an annotations dictionary 435 bioentry_id - corresponding database identifier 436 """ 437 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 438 "(bioentry_id, term_id, value)" \ 439 " VALUES (%s, %s, %s)" 440 many_sql = "INSERT INTO bioentry_qualifier_value" \ 441 "(bioentry_id, term_id, value, rank)" \ 442 " VALUES (%s, %s, %s, %s)" 443 tag_ontology_id = self._get_ontology_id('Annotation Tags') 444 for key, value in record.annotations.iteritems() : 445 if key in ["references", "comment", "ncbi_taxid"] : 446 #Handled separately 447 continue 448 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 449 if isinstance(value, list) : 450 rank = 0 451 for entry in value : 452 if isinstance(entry, str) or isinstance(entry, int): 453 #Easy case 454 rank += 1 455 self.adaptor.execute(many_sql, \ 456 (bioentry_id, term_id, str(entry), rank)) 457 else : 458 pass 459 #print "Ignoring annotation '%s' sub-entry of type '%s'" \ 460 # % (key, str(type(entry))) 461 elif isinstance(value, str) or isinstance(value, int): 462 #Have a simple single entry, leave rank as the DB default 463 self.adaptor.execute(mono_sql, \ 464 (bioentry_id, term_id, str(value))) 465 else : 466 pass
467 #print "Ignoring annotation '%s' entry of type '%s'" \ 468 # % (key, type(value)) 469 470
471 - def _load_reference(self, reference, rank, bioentry_id):
472 """Record a SeqRecord's annotated references in the database. 473 474 record - a SeqRecord object with annotated references 475 bioentry_id - corresponding database identifier 476 """ 477 478 refs = None 479 if reference.medline_id: 480 refs = self.adaptor.execute_and_fetch_col0( 481 "SELECT reference_id" \ 482 " FROM reference JOIN dbxref USING (dbxref_id)" \ 483 " WHERE dbname = 'MEDLINE' AND accession = %s", 484 (reference.medline_id,)) 485 if not refs and reference.pubmed_id: 486 refs = self.adaptor.execute_and_fetch_col0( 487 "SELECT reference_id" \ 488 " FROM reference JOIN dbxref USING (dbxref_id)" \ 489 " WHERE dbname = 'PUBMED' AND accession = %s", 490 (reference.pubmed_id,)) 491 if not refs: 492 s = [] 493 for f in reference.authors, reference.title, reference.journal: 494 s.append(f or "<undef>") 495 crc = crc64("".join(s)) 496 refs = self.adaptor.execute_and_fetch_col0( 497 "SELECT reference_id FROM reference" \ 498 r" WHERE crc = %s", (crc,)) 499 if not refs: 500 if reference.medline_id: 501 dbxref_id = self._add_dbxref("MEDLINE", 502 reference.medline_id, 0) 503 elif reference.pubmed_id: 504 dbxref_id = self._add_dbxref("PUBMED", 505 reference.pubmed_id, 0) 506 else: 507 dbxref_id = None 508 authors = reference.authors or None 509 title = reference.title or None 510 #The location/journal field cannot be Null, so default 511 #to an empty string rather than None: 512 journal = reference.journal or "" 513 self.adaptor.execute( 514 "INSERT INTO reference (dbxref_id, location," \ 515 " title, authors, crc)" \ 516 " VALUES (%s, %s, %s, %s, %s)", 517 (dbxref_id, journal, title, 518 authors, crc)) 519 reference_id = self.adaptor.last_id("reference") 520 else: 521 reference_id = refs[0] 522 523 if reference.location: 524 start = 1 + int(str(reference.location[0].start)) 525 end = int(str(reference.location[0].end)) 526 else: 527 start = None 528 end = None 529 530 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 531 " start_pos, end_pos, rank)" \ 532 " VALUES (%s, %s, %s, %s, %s)" 533 self.adaptor.execute(sql, (bioentry_id, reference_id, 534 start, end, rank + 1))
535
536 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
537 """Load a biopython SeqFeature into the database. 538 """ 539 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 540 bioentry_id) 541 self._load_seqfeature_locations(feature, seqfeature_id) 542 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
543
544 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
545 """Load the first tables of a seqfeature and returns the id. 546 547 This loads the "key" of the seqfeature (ie. CDS, gene) and 548 the basic seqfeature table itself. 549 """ 550 ontology_id = self._get_ontology_id('SeqFeature Keys') 551 seqfeature_key_id = self._get_term_id(feature_type, 552 ontology_id = ontology_id) 553 554 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 555 # the record (how?) 556 source_cat_id = self._get_ontology_id('SeqFeature Sources') 557 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 558 ontology_id = source_cat_id) 559 560 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 561 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 562 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 563 source_term_id, feature_rank + 1)) 564 seqfeature_id = self.adaptor.last_id('seqfeature') 565 566 return seqfeature_id
567
568 - def _load_seqfeature_locations(self, feature, seqfeature_id):
569 """Load all of the locations for a SeqFeature into tables. 570 571 This adds the locations related to the SeqFeature into the 572 seqfeature_location table. Fuzzies are not handled right now. 573 For a simple location, ie (1..2), we have a single table row 574 with seq_start = 1, seq_end = 2, location_rank = 1. 575 576 For split locations, ie (1..2, 3..4, 5..6) we would have three 577 row tables with: 578 start = 1, end = 2, rank = 1 579 start = 3, end = 4, rank = 2 580 start = 5, end = 6, rank = 3 581 """ 582 # two cases, a simple location or a split location 583 if not feature.sub_features: # simple location 584 self._insert_seqfeature_location(feature, 1, seqfeature_id) 585 else: # split location 586 for rank, cur_feature in enumerate(feature.sub_features): 587 self._insert_seqfeature_location(cur_feature, 588 rank + 1, 589 seqfeature_id)
590
591 - def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
592 """Add a location of a SeqFeature to the seqfeature_location table. 593 """ 594 sql = r"INSERT INTO location (seqfeature_id, " \ 595 r"start_pos, end_pos, strand, rank) " \ 596 r"VALUES (%s, %s, %s, %s, %s)" 597 598 # convert biopython locations to the 1-based location system 599 # used in bioSQL 600 # XXX This could also handle fuzzies 601 start = feature.location.nofuzzy_start + 1 602 end = feature.location.nofuzzy_end 603 # Biopython uses None when we don't know strand information but 604 # BioSQL requires something (non null) and sets this as zero 605 # So we'll use the strand or 0 if Biopython spits out None 606 strand = feature.strand or 0 607 608 self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
609
610 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
611 """Insert the (key, value) pair qualifiers relating to a feature. 612 613 Qualifiers should be a dictionary of the form: 614 {key : [value1, value2]} 615 """ 616 tag_ontology_id = self._get_ontology_id('Annotation Tags') 617 for qualifier_key in qualifiers.keys(): 618 # Treat db_xref qualifiers differently to sequence annotation 619 # qualifiers by populating the seqfeature_dbxref and dbxref 620 # tables. Other qualifiers go into the seqfeature_qualifier_value 621 # and (if new) term tables. 622 if qualifier_key != 'db_xref': 623 qualifier_key_id = self._get_term_id(qualifier_key, 624 ontology_id=tag_ontology_id) 625 # now add all of the values to their table 626 entries = qualifiers[qualifier_key] 627 if not isinstance(entries, list) : 628 # Could be a plain string, or an int or a float. 629 # However, we exect a list of strings here. 630 entries = [entries] 631 for qual_value_rank in range(len(entries)): 632 qualifier_value = entries[qual_value_rank] 633 sql = r"INSERT INTO seqfeature_qualifier_value "\ 634 r" (seqfeature_id, term_id, rank, value) VALUES"\ 635 r" (%s, %s, %s, %s)" 636 self.adaptor.execute(sql, (seqfeature_id, 637 qualifier_key_id, 638 qual_value_rank + 1, 639 qualifier_value)) 640 else: 641 # The dbxref_id qualifier/value sets go into the dbxref table 642 # as dbname, accession, version tuples, with dbxref.dbxref_id 643 # being automatically assigned, and into the seqfeature_dbxref 644 # table as seqfeature_id, dbxref_id, and rank tuples 645 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 646 seqfeature_id)
647 648
649 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
650 """Add the database crossreferences of a SeqFeature to the database. 651 652 o dbxrefs List, dbxref data from the source file in the 653 format <database>:<accession> 654 655 o seqfeature_id Int, the identifier for the seqfeature in the 656 seqfeature table 657 658 Insert dbxref qualifier data for a seqfeature into the 659 seqfeature_dbxref and, if required, dbxref tables. 660 The dbxref_id qualifier/value sets go into the dbxref table 661 as dbname, accession, version tuples, with dbxref.dbxref_id 662 being automatically assigned, and into the seqfeature_dbxref 663 table as seqfeature_id, dbxref_id, and rank tuples 664 """ 665 # NOTE - In older versions of Biopython, we would map the GenBank 666 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 667 # for any unknown terms. This was a long term maintainance problem, 668 # and differed from BioPerl and BioJava's implementation. See bug 2405 669 for rank, value in enumerate(dbxrefs): 670 # Split the DB:accession format string at colons. We have to 671 # account for multiple-line and multiple-accession entries 672 try: 673 dbxref_data = value.replace(' ','').replace('\n','').split(':') 674 db = dbxref_data[0] 675 accessions = dbxref_data[1:] 676 except: 677 raise ValueError("Parsing of db_xref failed: '%s'" % value) 678 # Loop over all the grabbed accessions, and attempt to fill the 679 # table 680 for accession in accessions: 681 # Get the dbxref_id value for the dbxref data 682 dbxref_id = self._get_dbxref_id(db, accession) 683 # Insert the seqfeature_dbxref data 684 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
685
686 - def _get_dbxref_id(self, db, accession):
687 """ _get_dbxref_id(self, db, accession) -> Int 688 689 o db String, the name of the external database containing 690 the accession number 691 692 o accession String, the accession of the dbxref data 693 694 Finds and returns the dbxref_id for the passed data. The method 695 attempts to find an existing record first, and inserts the data 696 if there is no record. 697 """ 698 # Check for an existing record 699 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 700 r'AND accession = %s' 701 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 702 # If there was a record, return the dbxref_id, else create the 703 # record and return the created dbxref_id 704 if dbxref_id: 705 return dbxref_id[0] 706 return self._add_dbxref(db, accession, 0)
707
708 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
709 """ Check for a pre-existing seqfeature_dbxref entry with the passed 710 seqfeature_id and dbxref_id. If one does not exist, insert new 711 data 712 713 """ 714 # Check for an existing record 715 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 716 r"WHERE seqfeature_id = '%s' AND dbxref_id = '%s'" 717 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 718 dbxref_id)) 719 # If there was a record, return without executing anything, else create 720 # the record and return 721 if result: 722 return result 723 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
724
725 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
726 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 727 dbxref_id 728 """ 729 sql = r'INSERT INTO seqfeature_dbxref ' \ 730 '(seqfeature_id, dbxref_id, rank) VALUES' \ 731 r'(%s, %s, %s)' 732 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 733 return (seqfeature_id, dbxref_id)
734
735 - def _load_dbxrefs(self, record, bioentry_id) :
736 """Load any sequence level cross references into the database. 737 738 See table bioentry_dbxref""" 739 for rank, value in enumerate(record.dbxrefs): 740 # Split the DB:accession string at first colon. 741 # We have to cope with things like: 742 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 743 # "GO:GO:123" (db="GO", accession="GO:123") 744 # 745 # Annoyingly I have seen the NCBI use both the style 746 # "GO:GO:123" and "GO:123" in different vintages. 747 assert value.count("\n")==0 748 try: 749 db, accession = value.split(':',1) 750 db = db.strip() 751 accession = accession.strip() 752 except: 753 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value) 754 # Get the dbxref_id value for the dbxref data 755 dbxref_id = self._get_dbxref_id(db, accession) 756 # Insert the bioentry_dbxref data 757 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)
758
759 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
760 """ Check for a pre-existing bioentry_dbxref entry with the passed 761 seqfeature_id and dbxref_id. If one does not exist, insert new 762 data 763 764 """ 765 # Check for an existing record 766 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 767 r"WHERE bioentry_id = '%s' AND dbxref_id = '%s'" 768 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 769 dbxref_id)) 770 # If there was a record, return without executing anything, else create 771 # the record and return 772 if result: 773 return result 774 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
775
776 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
777 """ Insert a bioentry_dbxref row and return the seqfeature_id and 778 dbxref_id 779 """ 780 sql = r'INSERT INTO bioentry_dbxref ' \ 781 '(bioentry_id,dbxref_id,rank) VALUES ' \ 782 '(%s, %s, %s)' 783 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 784 return (bioentry_id, dbxref_id)
785
786 -class DatabaseRemover:
787 """Complement the Loader functionality by fully removing a database. 788 789 This probably isn't really useful for normal purposes, since you 790 can just do a: 791 DROP DATABASE db_name 792 and then recreate the database. But, it's really useful for testing 793 purposes. 794 795 YB: now use the cascaded deletions 796 """
797 - def __init__(self, adaptor, dbid):
798 """Initialize with a database id and adaptor connection. 799 """ 800 self.adaptor = adaptor 801 self.dbid = dbid
802
803 - def remove(self):
804 """Remove everything related to the given database id. 805 """ 806 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 807 self.adaptor.execute(sql, (self.dbid,)) 808 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 809 self.adaptor.execute(sql, (self.dbid,))
810