Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  """Implementations of Biopython-like Seq objects on top of BioSQL. 
  2   
  3  This allows retrival of items stored in a BioSQL database using 
  4  a biopython-like Seq interface. 
  5  """ 
  6   
  7  from Bio import SeqFeature 
  8   
9 -class DBSeq: # This implements the biopython Seq interface
10 - def __init__(self, primary_id, adaptor, alphabet, start, length):
11 self.primary_id = primary_id 12 self.adaptor = adaptor 13 self.alphabet = alphabet 14 self._length = length 15 self.start = start
16
17 - def __len__(self):
18 return self._length
19
20 - def __getitem__(self, i):
21 if i < 0: 22 if -i >= self._length: 23 raise IndexError(i) 24 i = i + self._length 25 elif i >= self._length: 26 raise IndexError(i) 27 28 return self.adaptor.get_subseq_as_string(self.primary_id, 29 self.start + i, 30 self.start + i + 1)
31 - def __getslice__(self, i, j):
32 i = max(i, 0) 33 i = min(i, self._length) 34 j = max(j, 0) 35 j = min(j, self._length) 36 if i >= j: 37 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 38 self.start, 0) 39 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 40 self.start + i, j - i)
41 - def tostring(self):
42 return self.adaptor.get_subseq_as_string(self.primary_id, 43 self.start, 44 self.start + self._length)
45 46 data = property(tostring) 47
48 -def _retrieve_seq(adaptor, primary_id):
49 seqs = adaptor.execute_and_fetchall( 50 "SELECT alphabet, length(seq) FROM biosequence" \ 51 " WHERE bioentry_id = %s", (primary_id,)) 52 if seqs: 53 moltype, length = seqs[0] 54 from Bio.Alphabet import IUPAC 55 if moltype == "dna": 56 alphabet = IUPAC.unambiguous_dna 57 elif moltype == "rna": 58 alphabet = IUPAC.unambiguous_rna 59 elif moltype == "protein": 60 alphabet = IUPAC.protein 61 else: 62 raise AssertionError("Unknown moltype: %s" % moltype) 63 seq = DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 64 return seq 65 else: 66 return None
67
68 -def _retrieve_dbxrefs(adaptor, primary_id):
69 _dbxrefs = [] 70 dbxrefs = adaptor.execute_and_fetchall( 71 "SELECT dbname, accession, version" \ 72 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \ 73 " WHERE bioentry_id = %s" \ 74 " ORDER BY rank", (primary_id,)) 75 for dbname, accession, version in dbxrefs: 76 if version and version != "0": 77 v = "%s.%s" % (accession, version) 78 else: 79 v = accession 80 _dbxrefs.append((dbname, v)) 81 return _dbxrefs
82
83 -def _retrieve_features(adaptor, primary_id):
84 sql = "SELECT seqfeature_id, type.name, rank" \ 85 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 86 " WHERE bioentry_id = %s" \ 87 " ORDER BY rank" 88 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 89 seq_feature_list = [] 90 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 91 # Get qualifiers 92 qvs = adaptor.execute_and_fetchall( 93 "SELECT name, value" \ 94 " FROM seqfeature_qualifier_value join term using (term_id)" \ 95 " WHERE seqfeature_id = %s", (seqfeature_id,)) 96 qualifiers = {} 97 for qv_name, qv_value in qvs: 98 qualifiers.setdefault(qv_name, []).append(qv_value) 99 # Get locations 100 results = adaptor.execute_and_fetchall( 101 "SELECT location_id, start_pos, end_pos, strand" \ 102 " FROM location" \ 103 " WHERE seqfeature_id = %s" \ 104 " ORDER BY rank", (seqfeature_id,)) 105 locations = [] 106 # convert to Python standard form 107 for location_id, start, end, strand in results: 108 if start: 109 start -= 1 110 locations.append( (location_id, start, end, strand) ) 111 # Get possible remote reference information 112 remote_results = adaptor.execute_and_fetchall( 113 "SELECT location_id, dbname, accession, version" \ 114 " FROM location join dbxref using (dbxref_id)" \ 115 " WHERE seqfeature_id = %s", (seqfeature_id,)) 116 lookup = {} 117 for location_id, dbname, accession, version in remote_results: 118 if version and version != "0": 119 v = "%s.%s" % (accession, version) 120 else: 121 v = accession 122 lookup[location_id] = (dbname, v) 123 124 feature = SeqFeature.SeqFeature(type = seqfeature_type) 125 feature.qualifiers = qualifiers 126 if len(locations) == 0: 127 pass 128 elif len(locations) == 1: 129 location_id, start, end, strand = locations[0] 130 dbname, version = lookup.get(location_id, (None, None)) 131 132 feature.location = SeqFeature.FeatureLocation(start, end) 133 feature.strand = strand 134 feature.ref_db = dbname 135 feature.ref = version 136 else: 137 min_start = locations[0][1] 138 max_end = locations[0][2] 139 sub_feature_list = [] # (start, sub feature) for sorting 140 for location in locations: 141 location_id, start, end, strand = location 142 dbname, version = lookup.get(location_id, (None, None)) 143 min_start = min(min_start, start) 144 max_end = max(max_end, end) 145 146 subfeature = SeqFeature.SeqFeature() 147 subfeature.type = seqfeature_type 148 subfeature.location_operator = "join" 149 subfeature.location = SeqFeature.FeatureLocation(start, end) 150 subfeature.strand = strand 151 subfeature.ref_db = dbname 152 subfeature.ref = version 153 sub_feature_list.append((start, subfeature)) 154 sub_feature_list.sort() 155 feature.sub_features = [sub_feature[1] 156 for sub_feature in sub_feature_list] 157 feature.location = SeqFeature.FeatureLocation(min_start, max_end) 158 feature.strand = feature.sub_features[0].strand 159 160 seq_feature_list.append( 161 (seqfeature_rank, feature.location.start.position, feature) ) 162 163 # Primary sort is on the feature's rank 164 # .. then on the start position 165 # .. then arbitrary on the feature's id (SeqFeature has no __cmp__) 166 seq_feature_list.sort() 167 168 # Get just the SeqFeature 169 return [x[2] for x in seq_feature_list]
170
171 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
172 annotations = {} 173 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 174 annotations.update(_retrieve_reference(adaptor, primary_id)) 175 annotations.update(_retrieve_dbxref(adaptor, primary_id)) 176 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 177 return annotations
178
179 -def _retrieve_qualifier_value(adaptor, primary_id):
180 qvs = adaptor.execute_and_fetchall( 181 "SELECT name, value" \ 182 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \ 183 " WHERE bioentry_id = %s" \ 184 " ORDER BY rank", (primary_id,)) 185 qualifiers = {} 186 for name, value in qvs: 187 if name == "keyword": name = "keywords" 188 elif name == "date_changed": name = "dates" 189 elif name == "secondary_accession": name = "accessions" 190 qualifiers.setdefault(name, []).append(value) 191 return qualifiers
192
193 -def _retrieve_reference(adaptor, primary_id):
194 # XXX dbxref_qualifier_value 195 196 refs = adaptor.execute_and_fetchall( 197 "SELECT start_pos, end_pos, " \ 198 " location, title, authors," \ 199 " dbname, accession" \ 200 " FROM bioentry_reference" \ 201 " JOIN reference USING (reference_id)" \ 202 " LEFT JOIN dbxref USING (dbxref_id)" \ 203 " WHERE bioentry_id = %s" \ 204 " ORDER BY rank", (primary_id,)) 205 references = [] 206 for start, end, location, title, authors, dbname, accession in refs: 207 reference = SeqFeature.Reference() 208 if start: start -= 1 209 reference.location = [SeqFeature.FeatureLocation(start, end)] 210 reference.authors = authors 211 reference.title = title 212 reference.journal = location 213 if dbname == 'PUBMED': 214 reference.pubmed_id = accession 215 elif dbname == 'MEDLINE': 216 reference.medline_id = accession 217 references.append(reference) 218 return {'references': references}
219
220 -def _retrieve_dbxref(adaptor, primary_id):
221 # XXX dbxref_qualifier_value 222 223 refs = adaptor.execute_and_fetchall( 224 "SELECT dbname, accession, version" \ 225 " FROM bioentry_dbxref JOIN dbxref USING (dbxref_id)" \ 226 " WHERE bioentry_id = %s" \ 227 " ORDER BY rank", (primary_id,)) 228 dbxrefs = [] 229 for dbname, accession, version in refs: 230 if version and version != "0": 231 v = "%s.%s" % (accession, version) 232 else: 233 v = accession 234 dbxrefs.append((dbname, v)) 235 return {'cross_references': dbxrefs}
236
237 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
238 a = {} 239 common_names = adaptor.execute_and_fetch_col0( 240 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 241 " AND name_class = 'genbank common name'", (taxon_id,)) 242 if common_names: 243 a['source'] = common_names[0] 244 scientific_names = adaptor.execute_and_fetch_col0( 245 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 246 " AND name_class = 'scientific name'", (taxon_id,)) 247 if scientific_names: 248 a['organism'] = scientific_names[0] 249 ncbi_taxids = adaptor.execute_and_fetch_col0( 250 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 251 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 252 a['ncbi_taxid'] = ncbi_taxids[0] 253 254 # XXX 'no rank' or not? 255 # If we keep them, perhaps add " AND parent.parent_taxon_id <> 1" 256 taxonomy = adaptor.execute_and_fetch_col0( 257 "SELECT taxon_name.name" \ 258 " FROM taxon t, taxon parent, taxon_name" \ 259 " WHERE parent.taxon_id=taxon_name.taxon_id" \ 260 " AND t.left_value BETWEEN parent.left_value AND parent.right_value" \ 261 " AND name_class='scientific name'" \ 262 " AND parent.node_rank <> 'no rank'" \ 263 " AND t.taxon_id = %s" \ 264 " ORDER BY parent.left_value", (taxon_id,)) 265 if taxonomy: 266 a['taxonomy'] = taxonomy 267 return a
268
269 -class DBSeqRecord(object):
270 """BioSQL equivalent of the biopython SeqRecord object. 271 """ 272
273 - def __init__(self, adaptor, primary_id):
274 self._adaptor = adaptor 275 self._primary_id = primary_id 276 277 (self._biodatabase_id, self._taxon_id, self.name, 278 accession, version, self._identifier, 279 self._division, self.description) = self._adaptor.execute_one( 280 "SELECT biodatabase_id, taxon_id, name, accession, version," \ 281 " identifier, division, description" \ 282 " FROM bioentry" \ 283 " WHERE bioentry_id = %s", (self._primary_id,)) 284 if version and version != "0": 285 self.id = "%s.%s" % (accession, version) 286 else: 287 self.id = accession
288
289 - def __get_seq(self):
290 if not hasattr(self, "_seq"): 291 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 292 return self._seq
293 - def __set_seq(self, seq): self._seq = seq
294 - def __del_seq(self): del self._seq
295 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 296
297 - def __get_dbxrefs(self):
298 if not hasattr(self,"_dbxrefs"): 299 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 300 return self._dbxrefs
301 - def __set_dbxrefs(self, dbxrefs): self._dbxrefs = dbxrefs
302 - def __del_dbxrefs(self): del self._dbxrefs
303 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, "Dbxrefs") 304
305 - def __get_features(self):
306 if not hasattr(self, "_features"): 307 self._features = _retrieve_features(self._adaptor, 308 self._primary_id) 309 return self._features
310 - def __set_features(self, features): self._features = features
311 - def __del_features(self): del self._features
312 features = property(__get_features, __set_features, __del_features, 313 "Features") 314
315 - def __get_annotations(self):
316 if not hasattr(self, "_annotations"): 317 self._annotations = _retrieve_annotations(self._adaptor, 318 self._primary_id, 319 self._taxon_id) 320 if self._identifier: 321 self._annotations["gi"] = self._identifier 322 if self._division: 323 self._annotations["data_file_division"] = self._division 324 return self._annotations
325 - def __set_annotations(self, annotations): self._annotations = annotations
326 - def __del_annotations(self): del self._annotations
327 annotations = property(__get_annotations, __set_annotations, 328 __del_annotations, "Annotations")
329