Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  # 
  7  # Note that BioSQL (including the database schema and scripts) is 
  8  # available and licensed separately.  Please consult www.biosql.org 
  9  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 10   
 11  This allows retrival of items stored in a BioSQL database using 
 12  a biopython-like Seq interface. 
 13  """ 
 14   
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio import SeqFeature 
 18   
19 -class DBSeq(Seq): # This implements the biopython Seq interface
20 - def __init__(self, primary_id, adaptor, alphabet, start, length):
21 self.primary_id = primary_id 22 self.adaptor = adaptor 23 self.alphabet = alphabet 24 self._length = length 25 self.start = start
26
27 - def __len__(self):
28 return self._length
29
30 - def __getitem__(self, index) : # Seq API requirement
31 #Note since Python 2.0, __getslice__ is deprecated 32 #and __getitem__ is used instead. 33 #See http://docs.python.org/ref/sequence-methods.html 34 if isinstance(index, int) : 35 #Return a single letter as a string 36 i = index 37 if i < 0: 38 if -i > self._length: 39 raise IndexError(i) 40 i = i + self._length 41 elif i >= self._length: 42 raise IndexError(i) 43 return self.adaptor.get_subseq_as_string(self.primary_id, 44 self.start + i, 45 self.start + i + 1) 46 if not isinstance(index, slice) : 47 raise ValueError, "Unexpected index type" 48 49 #Return the (sub)sequence as another DBSeq or Seq object 50 #(see the Seq obect's __getitem__ method) 51 if index.start is None : 52 i=0 53 else : 54 i = index.start 55 if i < 0 : 56 #Map to equavilent positive index 57 if -i > self._length: 58 raise IndexError(i) 59 i = i + self._length 60 elif i >= self._length: 61 #Trivial case, should return empty string! 62 i = self._length 63 64 if index.stop is None : 65 j = self._length 66 else : 67 j = index.stop 68 if j < 0 : 69 #Map to equavilent positive index 70 if -j > self._length: 71 raise IndexError(j) 72 j = j + self._length 73 elif j >= self._length: 74 j = self._length 75 76 if i >= j: 77 #Trivial case, empty string. 78 return Seq("", self.alphabet) 79 elif index.step is None or index.step == 1 : 80 #Easy case - can return a DBSeq with the start and end adjusted 81 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 82 self.start + i, j - i) 83 else : 84 #Tricky. Will have to create a Seq object because of the stride 85 full = self.adaptor.get_subseq_as_string(self.primary_id, 86 self.start + i, 87 self.start + j) 88 return Seq(full[::index.step], self.alphabet) 89
90 - def tostring(self):
91 """Returns the full sequence as a python string. 92 93 Although not formally deprecated, you are now encouraged to use 94 str(my_seq) instead of my_seq.tostring().""" 95 return self.adaptor.get_subseq_as_string(self.primary_id, 96 self.start, 97 self.start + self._length)
98 - def __str__(self):
99 """Returns the full sequence as a python string.""" 100 return self.adaptor.get_subseq_as_string(self.primary_id, 101 self.start, 102 self.start + self._length)
103 104 105 data = property(tostring, doc="Sequence as string (DEPRECATED)") 106
107 -def _retrieve_seq(adaptor, primary_id):
108 seqs = adaptor.execute_and_fetchall( 109 "SELECT alphabet, length(seq) FROM biosequence" \ 110 " WHERE bioentry_id = %s", (primary_id,)) 111 if seqs: 112 moltype, length = seqs[0] 113 moltype = moltype.lower() #might be upper case in database 114 from Bio.Alphabet import IUPAC 115 if moltype == "dna": 116 alphabet = IUPAC.unambiguous_dna 117 elif moltype == "rna": 118 alphabet = IUPAC.unambiguous_rna 119 elif moltype == "protein": 120 alphabet = IUPAC.protein 121 else: 122 raise AssertionError("Unknown moltype: %s" % moltype) 123 seq = DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 124 return seq 125 else: 126 return None
127
128 -def _retrieve_dbxrefs(adaptor, primary_id):
129 """Retrieve the database cross references for the sequence.""" 130 _dbxrefs = [] 131 dbxrefs = adaptor.execute_and_fetchall( 132 "SELECT dbname, accession, version" \ 133 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \ 134 " WHERE bioentry_id = %s" \ 135 " ORDER BY rank", (primary_id,)) 136 for dbname, accession, version in dbxrefs: 137 if version and version != "0": 138 v = "%s.%s" % (accession, version) 139 else: 140 v = accession 141 _dbxrefs.append("%s:%s" % (dbname, v)) 142 return _dbxrefs
143
144 -def _retrieve_features(adaptor, primary_id):
145 sql = "SELECT seqfeature_id, type.name, rank" \ 146 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 147 " WHERE bioentry_id = %s" \ 148 " ORDER BY rank" 149 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 150 seq_feature_list = [] 151 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 152 # Get qualifiers [except for db_xref which is stored separately] 153 qvs = adaptor.execute_and_fetchall( 154 "SELECT name, value" \ 155 " FROM seqfeature_qualifier_value join term using (term_id)" \ 156 " WHERE seqfeature_id = %s", (seqfeature_id,)) 157 qualifiers = {} 158 for qv_name, qv_value in qvs: 159 qualifiers.setdefault(qv_name, []).append(qv_value) 160 # Get db_xrefs [special case of qualifiers] 161 qvs = adaptor.execute_and_fetchall( 162 "SELECT dbxref.dbname, dbxref.accession" \ 163 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \ 164 " WHERE seqfeature_dbxref.seqfeature_id = %s", (seqfeature_id,)) 165 for qv_name, qv_value in qvs: 166 value = "%s:%s" % (qv_name, qv_value) 167 qualifiers.setdefault("db_xref", []).append(value) 168 # Get locations 169 results = adaptor.execute_and_fetchall( 170 "SELECT location_id, start_pos, end_pos, strand" \ 171 " FROM location" \ 172 " WHERE seqfeature_id = %s" \ 173 " ORDER BY rank", (seqfeature_id,)) 174 locations = [] 175 # convert to Python standard form 176 for location_id, start, end, strand in results: 177 if start: 178 start -= 1 179 locations.append( (location_id, start, end, strand) ) 180 # Get possible remote reference information 181 remote_results = adaptor.execute_and_fetchall( 182 "SELECT location_id, dbname, accession, version" \ 183 " FROM location join dbxref using (dbxref_id)" \ 184 " WHERE seqfeature_id = %s", (seqfeature_id,)) 185 lookup = {} 186 for location_id, dbname, accession, version in remote_results: 187 if version and version != "0": 188 v = "%s.%s" % (accession, version) 189 else: 190 v = accession 191 lookup[location_id] = (dbname, v) 192 193 feature = SeqFeature.SeqFeature(type = seqfeature_type) 194 feature.qualifiers = qualifiers 195 if len(locations) == 0: 196 pass 197 elif len(locations) == 1: 198 location_id, start, end, strand = locations[0] 199 dbname, version = lookup.get(location_id, (None, None)) 200 201 feature.location = SeqFeature.FeatureLocation(start, end) 202 feature.strand = strand 203 feature.ref_db = dbname 204 feature.ref = version 205 else: 206 min_start = locations[0][1] 207 max_end = locations[0][2] 208 sub_feature_list = [] # (start, sub feature) for sorting 209 for location in locations: 210 location_id, start, end, strand = location 211 dbname, version = lookup.get(location_id, (None, None)) 212 min_start = min(min_start, start) 213 max_end = max(max_end, end) 214 215 subfeature = SeqFeature.SeqFeature() 216 subfeature.type = seqfeature_type 217 subfeature.location_operator = "join" 218 subfeature.location = SeqFeature.FeatureLocation(start, end) 219 subfeature.strand = strand 220 subfeature.ref_db = dbname 221 subfeature.ref = version 222 sub_feature_list.append((start, subfeature)) 223 sub_feature_list.sort() 224 feature.sub_features = [sub_feature[1] 225 for sub_feature in sub_feature_list] 226 feature.location = SeqFeature.FeatureLocation(min_start, max_end) 227 feature.strand = feature.sub_features[0].strand 228 229 seq_feature_list.append( 230 (seqfeature_rank, feature.location.start.position, feature) ) 231 232 # Primary sort is on the feature's rank 233 # .. then on the start position 234 # .. then arbitrary on the feature's id (SeqFeature has no __cmp__) 235 seq_feature_list.sort() 236 237 # Get just the SeqFeature 238 return [x[2] for x in seq_feature_list]
239
240 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
241 annotations = {} 242 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 243 annotations.update(_retrieve_reference(adaptor, primary_id)) 244 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 245 return annotations
246
247 -def _retrieve_qualifier_value(adaptor, primary_id):
248 qvs = adaptor.execute_and_fetchall( 249 "SELECT name, value" \ 250 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \ 251 " WHERE bioentry_id = %s" \ 252 " ORDER BY rank", (primary_id,)) 253 qualifiers = {} 254 for name, value in qvs: 255 if name == "keyword": name = "keywords" 256 elif name == "date_changed": name = "dates" 257 elif name == "secondary_accession": name = "accessions" 258 qualifiers.setdefault(name, []).append(value) 259 return qualifiers
260
261 -def _retrieve_reference(adaptor, primary_id):
262 # XXX dbxref_qualifier_value 263 264 refs = adaptor.execute_and_fetchall( 265 "SELECT start_pos, end_pos, " \ 266 " location, title, authors," \ 267 " dbname, accession" \ 268 " FROM bioentry_reference" \ 269 " JOIN reference USING (reference_id)" \ 270 " LEFT JOIN dbxref USING (dbxref_id)" \ 271 " WHERE bioentry_id = %s" \ 272 " ORDER BY rank", (primary_id,)) 273 references = [] 274 for start, end, location, title, authors, dbname, accession in refs: 275 reference = SeqFeature.Reference() 276 if start: start -= 1 277 reference.location = [SeqFeature.FeatureLocation(start, end)] 278 reference.authors = authors 279 #Don't replace the default "" with None. 280 if title : reference.title = title 281 reference.journal = location 282 if dbname == 'PUBMED': 283 reference.pubmed_id = accession 284 elif dbname == 'MEDLINE': 285 reference.medline_id = accession 286 references.append(reference) 287 return {'references': references}
288
289 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
290 a = {} 291 common_names = adaptor.execute_and_fetch_col0( 292 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 293 " AND name_class = 'genbank common name'", (taxon_id,)) 294 if common_names: 295 a['source'] = common_names[0] 296 scientific_names = adaptor.execute_and_fetch_col0( 297 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 298 " AND name_class = 'scientific name'", (taxon_id,)) 299 if scientific_names: 300 a['organism'] = scientific_names[0] 301 ncbi_taxids = adaptor.execute_and_fetch_col0( 302 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 303 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 304 a['ncbi_taxid'] = ncbi_taxids[0] 305 306 #Old code used the left/right values in the taxon table to get the 307 #taxonomy lineage in one SQL command. This was actually very slow, 308 #and would fail if the (optional) left/right values were missing. 309 # 310 #The following code is based on a contribution from Eric Gibert, and 311 #relies on the taxon table's parent_taxon_id field only (ignoring the 312 #optional left/right values). This means that it has to make a 313 #separate SQL query for each entry in the lineage, but it does still 314 #appear to be *much* faster. See Bug 2494. 315 taxonomy = [] 316 while taxon_id : 317 name, rank, parent_taxon_id = adaptor.execute_one( 318 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \ 319 " FROM taxon, taxon_name" \ 320 " WHERE taxon.taxon_id=taxon_name.taxon_id" \ 321 " AND taxon_name.name_class='scientific name'" \ 322 " AND taxon.taxon_id = %s", (taxon_id,)) 323 if taxon_id == parent_taxon_id : 324 # If the taxon table has been populated by the BioSQL script 325 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 326 # Personally, I would have used a NULL parent_taxon_id here. 327 break 328 if rank <> "no rank" : 329 #For consistency with older versions of Biopython, we are only 330 #interested in taxonomy entries with a stated rank. 331 #Add this to the start of the lineage list. 332 taxonomy.insert(0, name) 333 taxon_id = parent_taxon_id 334 335 if taxonomy: 336 a['taxonomy'] = taxonomy 337 return a
338
339 -class DBSeqRecord(SeqRecord):
340 """BioSQL equivalent of the biopython SeqRecord object. 341 """ 342
343 - def __init__(self, adaptor, primary_id):
344 self._adaptor = adaptor 345 self._primary_id = primary_id 346 347 (self._biodatabase_id, self._taxon_id, self.name, 348 accession, version, self._identifier, 349 self._division, self.description) = self._adaptor.execute_one( 350 "SELECT biodatabase_id, taxon_id, name, accession, version," \ 351 " identifier, division, description" \ 352 " FROM bioentry" \ 353 " WHERE bioentry_id = %s", (self._primary_id,)) 354 if version and version != "0": 355 self.id = "%s.%s" % (accession, version) 356 else: 357 self.id = accession
358
359 - def __get_seq(self):
360 if not hasattr(self, "_seq"): 361 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 362 return self._seq
363 - def __set_seq(self, seq): self._seq = seq
364 - def __del_seq(self): del self._seq
365 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 366
367 - def __get_dbxrefs(self):
368 if not hasattr(self,"_dbxrefs"): 369 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 370 return self._dbxrefs
371 - def __set_dbxrefs(self, dbxrefs): self._dbxrefs = dbxrefs
372 - def __del_dbxrefs(self): del self._dbxrefs
373 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 374 "Database cross references") 375
376 - def __get_features(self):
377 if not hasattr(self, "_features"): 378 self._features = _retrieve_features(self._adaptor, 379 self._primary_id) 380 return self._features
381 - def __set_features(self, features): self._features = features
382 - def __del_features(self): del self._features
383 features = property(__get_features, __set_features, __del_features, 384 "Features") 385
386 - def __get_annotations(self):
387 if not hasattr(self, "_annotations"): 388 self._annotations = _retrieve_annotations(self._adaptor, 389 self._primary_id, 390 self._taxon_id) 391 if self._identifier: 392 self._annotations["gi"] = self._identifier 393 if self._division: 394 self._annotations["data_file_division"] = self._division 395 return self._annotations
396 - def __set_annotations(self, annotations): self._annotations = annotations
397 - def __del_annotations(self): del self._annotations
398 annotations = property(__get_annotations, __set_annotations, 399 __del_annotations, "Annotations")
400