1 """Load biopython objects into a BioSQL database for persistent storage.
2
3 This code makes it possible to store biopython objects in a relational
4 database and then retrieve them back. You shouldn't use any of the
5 classes in this module directly. Rather, call the load() method on
6 a database object.
7 """
8
9 from time import gmtime, strftime
10
11
12 from Bio import Alphabet
13 from Bio.SeqUtils.CheckSum import crc64
14
15
17 """Load a database with biopython objects.
18 """
20 """Initialize with connection information for the database.
21
22 XXX Figure out what I need to load a database and document it.
23 """
24 self.adaptor = adaptor
25 self.dbid = dbid
26
28 """Load a Biopython SeqRecord into the database.
29 """
30 bioentry_id = self._load_bioentry_table(record)
31 self._load_bioentry_date(record, bioentry_id)
32 self._load_biosequence(record, bioentry_id)
33 self._load_comment(record, bioentry_id)
34 references = record.annotations.get('references', ())
35 for reference, rank in zip(references, range(len(references))):
36 self._load_reference(reference, rank, bioentry_id)
37 for seq_feature_num in range(len(record.features)):
38 seq_feature = record.features[seq_feature_num]
39 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
40
42 oids = self.adaptor.execute_and_fetch_col0(
43 "SELECT ontology_id FROM ontology WHERE name = %s",
44 (name,))
45 if oids:
46 return oids[0]
47 self.adaptor.execute(
48 "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
49 (name, definition))
50 return self.adaptor.last_id("ontology")
51
52
53 - def _get_term_id(self,
54 name,
55 ontology_id=None,
56 definition=None,
57 identifier=None):
58 """Get the id that corresponds to a term.
59
60 This looks through the term table for a the given term. If it
61 is not found, a new id corresponding to this term is created.
62 In either case, the id corresponding to that term is returned, so
63 that you can reference it in another table.
64
65 The ontology_id should be used to disambiguate the term.
66 """
67
68
69 sql = r"SELECT term_id FROM term " \
70 r"WHERE name = %s"
71 fields = [name]
72 if ontology_id:
73 sql += ' AND ontology_id = %s'
74 fields.append(ontology_id)
75 id_results = self.adaptor.execute_and_fetchall(sql, fields)
76
77 if len(id_results) > 1:
78 raise ValueError("Multiple term ids for %s: %r" %
79 (name, id_results))
80 elif len(id_results) == 1:
81 return id_results[0][0]
82 else:
83 sql = r"INSERT INTO term (name, definition," \
84 r" identifier, ontology_id)" \
85 r" VALUES (%s, %s, %s, %s)"
86 self.adaptor.execute(sql, (name, definition,
87 identifier, ontology_id))
88 return self.adaptor.last_id("term")
89
91 """Insert a dbxref and return its id"""
92
93 self.adaptor.execute(
94 "INSERT INTO dbxref(dbname, accession, version)" \
95 " VALUES (%s, %s, %s)", (dbname, accession, version))
96 return self.adaptor.last_id("dbxref")
97
99 """Get the id corresponding to a taxon.
100
101 If the species isn't in the taxon table, it is created.
102 """
103
104 ncbi_taxon_id = record.annotations.get("ncbi_taxid")
105 if not ncbi_taxon_id:
106
107 for f in record.features:
108 if f.type == 'source':
109 quals = getattr(f, 'qualifiers', {})
110 if "db_xref" in quals:
111 for db_xref in f.qualifiers["db_xref"]:
112 if db_xref.startswith("taxon:"):
113 ncbi_taxon_id = int(db_xref[6:])
114 break
115 if ncbi_taxon_id: break
116 if ncbi_taxon_id:
117 taxa = self.adaptor.execute_and_fetch_col0(
118 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
119 (ncbi_taxon_id,))
120 if taxa:
121 return taxa[0]
122
123
124 if record.annotations["organism"]:
125 taxa = self.adaptor.execute_and_fetch_col0(
126 "SELECT taxon_id FROM taxon_name" \
127 " WHERE name_class = 'scientific name' AND name = %s",
128 (record.annotations["organism"],))
129 if taxa:
130 return taxa[0]
131
132
133
134 if record.annotations["source"]:
135 taxa = self.adaptor.execute_and_fetch_col0(
136 "SELECT DISTINCT taxon_id FROM taxon_name" \
137 " WHERE name = %s",
138 (record.annotations["source"],))
139 if len(taxa) > 1:
140 raise ValueError("Taxa: %d species have name %r" % (
141 len(taxa),
142 record.annotations["source"]))
143 if taxa:
144 return taxa[0]
145
146
147
148
149
150 lineage = []
151 for c in record.annotations.get("taxonomy", []):
152 lineage.append([None, None, c])
153 if lineage:
154 lineage[-1][1] = "genus"
155 lineage.append([None, "species", record.annotations["organism"]])
156
157 if "subspecies" in record.annotations:
158 lineage.append([None, "subspecies",
159 record.annotations["subspecies"]])
160 if "variant" in record.annotations:
161 lineage.append([None, "varietas",
162 record.annotations["variant"]])
163 lineage[-1][0] = ncbi_taxon_id
164
165 left_value = self.adaptor.execute_one(
166 "SELECT MAX(left_value) FROM taxon")[0]
167 if not left_value:
168 left_value = 0
169 left_value += 1
170
171
172
173
174
175 right_start_value = self.adaptor.execute_one(
176 "SELECT MAX(right_value) FROM taxon")[0]
177 if not right_start_value:
178 right_start_value = 0
179 right_value = right_start_value + 2 * len(lineage) - 1
180
181 parent_taxon_id = None
182 for taxon in lineage:
183 self.adaptor.execute(
184 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
185 " left_value, right_value)" \
186 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
187 taxon[0],
188 taxon[1],
189 left_value,
190 right_value))
191 taxon_id = self.adaptor.last_id("taxon")
192 self.adaptor.execute(
193 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
194 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2]))
195 left_value += 1
196 right_value -= 1
197 parent_taxon_id = taxon_id
198 if "source" in record.annotations:
199 self.adaptor.execute(
200 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
201 "VALUES (%s, %s, 'common name')", (
202 taxon_id, record.annotations["source"]))
203
204 return taxon_id
205
206 - def _load_bioentry_table(self, record):
207 """Fill the bioentry table with sequence information.
208 """
209
210
211 if record.id.find('.') >= 0:
212 accession, version = record.id.split('.')
213 version = int(version)
214 else:
215 accession = record.id
216 version = 0
217
218
219
220
221 identifier = record.annotations.get('gi')
222 description = getattr(record, 'description', None)
223 division = record.annotations.get("data_file_division", "UNK")
224
225
226
227
228 sql = """
229 INSERT INTO bioentry (
230 biodatabase_id,
231 name,
232 accession,
233 identifier,
234 division,
235 description,
236 version)
237 VALUES (
238 %s,
239 %s,
240 %s,
241 %s,
242 %s,
243 %s)"""
244 self.adaptor.execute(sql, (self.dbid,
245 record.name,
246 accession,
247 identifier,
248 division,
249 description,
250 version))
251
252 bioentry_id = self.adaptor.last_id('bioentry')
253
254 return bioentry_id
255
256 - def _load_bioentry_date(self, record, bioentry_id):
257 """Add the effective date of the entry into the database.
258 """
259
260
261 date = record.annotations.get("date",
262 strftime("%d-%b-%Y", gmtime()).upper())
263 annotation_tags_id = self._get_ontology_id("Annotation Tags")
264 date_id = self._get_term_id("date_changed", annotation_tags_id)
265 sql = r"INSERT INTO bioentry_qualifier_value" \
266 r" (bioentry_id, term_id, value, rank)" \
267 r" VALUES (%s, %s, %s, 1)"
268 self.adaptor.execute(sql, (bioentry_id, date_id, date))
269
271 """Load the biosequence table in the database.
272 """
273
274 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
275 alphabet = "dna"
276 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
277 alphabet = "rna"
278 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
279 alphabet = "protein"
280 else:
281 alphabet = "unknown"
282
283 sql = r"INSERT INTO biosequence (bioentry_id, version, " \
284 r"length, seq, alphabet) " \
285 r"VALUES (%s, 0, %s, %s, %s)"
286 self.adaptor.execute(sql, (bioentry_id,
287 len(record.seq.data),
288 record.seq.data,
289 alphabet))
290
301
303
304 refs = None
305 if reference.medline_id:
306 refs = self.adaptor.execute_and_fetch_col0(
307 "SELECT reference_id" \
308 " FROM reference JOIN dbxref USING (dbxref_id)" \
309 " WHERE dbname = 'MEDLINE' AND accession = %s",
310 (reference.medline_id,))
311 if not refs and reference.pubmed_id:
312 refs = self.adaptor.execute_and_fetch_col0(
313 "SELECT reference_id" \
314 " FROM reference JOIN dbxref USING (dbxref_id)" \
315 " WHERE dbname = 'PUBMED' AND accession = %s",
316 (reference.pubmed_id,))
317 if not refs:
318 s = []
319 for f in reference.authors, reference.title, reference.journal:
320 s.append(f or "<undef>")
321 crc = crc64("".join(s))
322 refs = self.adaptor.execute_and_fetch_col0(
323 "SELECT reference_id FROM reference" \
324 r" WHERE crc = %s", (crc,))
325 if not refs:
326 if reference.medline_id:
327 dbxref_id = self._add_dbxref("MEDLINE",
328 reference.medline_id, 0)
329 elif reference.pubmed_id:
330 dbxref_id = self._add_dbxref("PUBMED",
331 reference.pubmed_id, 0)
332 else:
333 dbxref_id = None
334 authors = reference.authors or None
335 title = reference.title or None
336 journal = reference.journal or None
337 self.adaptor.execute(
338 "INSERT INTO reference (dbxref_id, location," \
339 " title, authors, crc)" \
340 " VALUES (%s, %s, %s, %s, %s)",
341 (dbxref_id, journal, title,
342 authors, crc))
343 reference_id = self.adaptor.last_id("reference")
344 else:
345 reference_id = refs[0]
346
347 if reference.location:
348 start = 1 + int(str(reference.location[0].start))
349 end = int(str(reference.location[0].end))
350 else:
351 start = None
352 end = None
353
354 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
355 " start_pos, end_pos, rank)" \
356 " VALUES (%s, %s, %s, %s, %s)"
357 self.adaptor.execute(sql, (bioentry_id, reference_id,
358 start, end, rank + 1))
359
367
369 """Load the first tables of a seqfeature and returns the id.
370
371 This loads the "key" of the seqfeature (ie. CDS, gene) and
372 the basic seqfeature table itself.
373 """
374 ontology_id = self._get_ontology_id('SeqFeature Keys')
375 seqfeature_key_id = self._get_term_id(feature_type,
376 ontology_id = ontology_id)
377
378
379
380 source_cat_id = self._get_ontology_id('SeqFeature Sources')
381 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
382 ontology_id = source_cat_id)
383
384 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
385 r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
386 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
387 source_term_id, feature_rank + 1))
388 seqfeature_id = self.adaptor.last_id('seqfeature')
389
390 return seqfeature_id
391
393 """Load all of the locations for a SeqFeature into tables.
394
395 This adds the locations related to the SeqFeature into the
396 seqfeature_location table. Fuzzies are not handled right now.
397 For a simple location, ie (1..2), we have a single table row
398 with seq_start = 1, seq_end = 2, location_rank = 1.
399
400 For split locations, ie (1..2, 3..4, 5..6) we would have three
401 row tables with:
402 start = 1, end = 2, rank = 1
403 start = 3, end = 4, rank = 2
404 start = 5, end = 6, rank = 3
405 """
406
407 if not feature.sub_features:
408 self._insert_seqfeature_location(feature, 1, seqfeature_id)
409 else:
410 for rank, cur_feature in enumerate(feature.sub_features):
411 self._insert_seqfeature_location(cur_feature,
412 rank + 1,
413 seqfeature_id)
414
416 """Add a location of a SeqFeature to the seqfeature_location table.
417 """
418 sql = r"INSERT INTO location (seqfeature_id, " \
419 r"start_pos, end_pos, strand, rank) " \
420 r"VALUES (%s, %s, %s, %s, %s)"
421
422
423
424
425 start = feature.location.nofuzzy_start + 1
426 end = feature.location.nofuzzy_end
427
428
429
430 strand = feature.strand or 0
431
432 self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
433
435 """Insert the (key, value) pair qualifiers relating to a feature.
436
437 Qualifiers should be a dictionary of the form:
438 {key : [value1, value2]}
439 """
440 tag_ontology_id = self._get_ontology_id('Annotation Tags')
441 for qualifier_key in qualifiers.keys():
442
443
444
445
446 if qualifier_key != 'db_xref':
447 qualifier_key_id = self._get_term_id(qualifier_key,
448 ontology_id=tag_ontology_id)
449
450 for qual_value_rank in range(len(qualifiers[qualifier_key])):
451 qualifier_value = qualifiers[qualifier_key][qual_value_rank]
452 sql = r"INSERT INTO seqfeature_qualifier_value VALUES" \
453 r" (%s, %s, %s, %s)"
454 self.adaptor.execute(sql, (seqfeature_id,
455 qualifier_key_id,
456 qual_value_rank + 1,
457 qualifier_value))
458 else:
459
460
461
462
463 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
464 seqfeature_id)
465
466
468 """ _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id)
469
470 o dbxrefs List, dbxref data from the source file in the
471 format <database>:<accession>
472
473 o seqfeature_id Int, the identifier for the seqfeature in the
474 seqfeature table
475
476 Insert dbxref qualifier data for a seqfeature into the
477 seqfeature_dbxref and, if required, dbxref tables.
478 The dbxref_id qualifier/value sets go into the dbxref table
479 as dbname, accession, version tuples, with dbxref.dbxref_id
480 being automatically assigned, and into the seqfeature_dbxref
481 table as seqfeature_id, dbxref_id, and rank tuples
482 """
483
484 db_dict = {'GeneID': 'Entrez',
485 'GI': 'GeneIndex',
486 'COG': 'COG',
487 'CDD': 'CDD',
488 'DDBJ': 'DNA Databank of Japan',
489 'Entrez': 'Entrez',
490 'GeneIndex': 'GeneIndex',
491 'PUBMED': 'PubMed',
492 'taxon': 'Taxon',
493 'ATCC': 'ATCC',
494 'ISFinder': 'ISFinder',
495 'GOA': 'Gene Ontology Annotation',
496 'ASAP': 'ASAP',
497 'PSEUDO': 'PSEUDO',
498 'InterPro': 'InterPro',
499 'GEO': 'Gene Expression Omnibus',
500 'EMBL': 'EMBL',
501 'UniProtKB/Swiss-Prot': 'UniProtKB/Swiss-Prot',
502 'ECOCYC': 'EcoCyc',
503 'UniProtKB/TrEMBL': 'UniProtKB/TrEMBL'
504 }
505 for rank, value in enumerate(dbxrefs):
506
507
508 try:
509 dbxref_data = value.replace(' ','').replace('\n','').split(':')
510 key = dbxref_data[0]
511 accessions = dbxref_data[1:]
512 except:
513 raise Exception("Parsing of db_xref failed: %s; %s" % (key, accession))
514 if key not in db_dict:
515
516
517 import warnings
518 warnings.warn("%s not recognised as database type: temporarily accepting key" % key)
519 db_dict[key] = key
520 db = db_dict[key]
521
522
523 for accession in accessions:
524
525 dbxref_id = self._get_dbxref_id(db, accession)
526
527 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
528
530 """ _get_dbxref_id(self, db, accession) -> Int
531
532 o db String, the name of the external database containing
533 the accession number
534
535 o accession String, the accession of the dbxref data
536
537 Finds and returns the dbxref_id for the passed data. The method
538 attempts to find an existing record first, and inserts the data
539 if there is no record.
540 """
541
542 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
543 r'AND accession = %s'
544 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
545
546
547 if dbxref_id:
548 return dbxref_id[0]
549 return self._add_dbxref(db, accession, 0)
550
552 """ Check for a pre-existing seqfeature_dbxref entry with the passed
553 seqfeature_id and dbxref_id. If one does not exist, insert new
554 data
555
556 """
557
558 sql = r'SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref ' \
559 r'WHERE seqfeature_id = "%s" AND dbxref_id = "%s"'
560 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
561 dbxref_id))
562
563
564 if result:
565 return result
566 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
567
569 """ Insert a seqfeature_dbxref row and return the seqfeature_id and
570 dbxref_id
571 """
572 sql = r'INSERT INTO seqfeature_dbxref VALUES' \
573 r'(%s, %s, %s)'
574 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
575 return (seqfeature_id, dbxref_id)
576
577
579 """Complement the Loader functionality by fully removing a database.
580
581 This probably isn't really useful for normal purposes, since you
582 can just do a:
583 DROP DATABASE db_name
584 and then recreate the database. But, it's really useful for testing
585 purposes.
586
587 YB: now use the cascaded deletions
588 """
590 """Initialize with a database id and adaptor connection.
591 """
592 self.adaptor = adaptor
593 self.dbid = dbid
594
596 """Remove everything related to the given database id.
597 """
598 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
599 self.adaptor.execute(sql, (self.dbid,))
600 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
601 self.adaptor.execute(sql, (self.dbid,))
602