Package Bio :: Package GFF
[hide private]
[frames] | no frames]

Source Code for Package Bio.GFF

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Access to General Feature Format databases created with Bio::DB:GFF 
 10   
 11  based on documentation for Lincoln Stein's Perl Bio::DB::GFF 
 12   
 13  >>> import os 
 14  >>> import Bio.GFF 
 15  >>> PASSWORD = os.environ['MYSQLPASS'] 
 16  >>> DATABASE_GFF = 'wormbase_ws93' 
 17  >>> FASTADIR = '/local/wormbase_ws93/' 
 18  >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR) 
 19   
 20  """ 
 21   
 22  __version__ = "$Revision: 1.7 $" 
 23  # $Source: /home/repository/biopython/biopython/Bio/GFF/__init__.py,v $ 
 24   
 25  import exceptions 
 26  import operator 
 27  import os.path 
 28  import sys 
 29  import types 
 30   
 31  from Bio import MissingExternalDependencyError 
 32   
 33  try: 
 34      import MySQLdb 
 35  except: 
 36      raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.") 
 37   
 38  from Bio.Alphabet import IUPAC 
 39  from Bio import DocSQL 
 40  from Bio.Seq import Seq 
 41  from Bio.SeqRecord import SeqRecord 
 42  from Bio import Translate 
 43   
 44  import binning 
 45  import easy 
 46  import GenericTools 
 47   
 48   
 49  DEFAULT_ALPHABET = IUPAC.unambiguous_dna 
 50  standard_translator = Translate.unambiguous_dna_by_id[1] 
 51   
52 -class Segment(object):
53 """ 54 this will only work for the simplest of easy.Location objects 55 """
56 - def __init__(self, gff, location=None):
57 self.gff = gff 58 if location is None: 59 self.seqname = None 60 self.coords = None 61 self.complement = None 62 else: 63 self.seqname = location.seqname 64 self.coords = [location.start(), location.end()] 65 self.complement = location.complement
66
67 - def features(self, *args, **keywds):
68 return FeatureQuery(self.seqname, self.coords, self.complement, connection=self.gff.db, *args, **keywds)
69
70 - def single_feature(self, *args, **keywds):
71 features = self.features(*args, **keywds) 72 all_features = GenericTools.all(features) 73 assert len(all_features) == 1 74 return all_features[0]
75
76 -class Connection(Segment):
77 """ 78 Connection to GFF database 79 80 also functions as whole segment 81 """ 82
83 - def __init__(self, *args, **keywds):
84 try: 85 RetrieveSeqname._dir = keywds['fastadir'] 86 del keywds['fastadir'] 87 except KeyError: 88 RetrieveSeqname._dir = '.' 89 self.db = MySQLdb.connect(*args, **keywds) 90 Segment.__init__(self, self)
91
92 - def segment(self, *args, **keywds):
93 return Segment(self, *args, **keywds)
94
95 -class RetrieveSeqname(GenericTools.Surrogate, SeqRecord):
96 """ 97 Singleton: contain records of loaded FASTA files 98 99 >>> RetrieveSeqname._dir = 'GFF' 100 >>> RetrieveSeqname._diagnostics = 1 101 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them 102 >>> record1 = RetrieveSeqname('NC_001802.fna') 103 Loading GFF/NC_001802.fna... Done. 104 >>> record1.id 105 'gi|9629357|ref|NC_001802.1|' 106 >>> record2 = RetrieveSeqname('NC_001802.fna') 107 >>> record1.seq is record2.seq # should be same space in memory 108 1 109 """ 110 __records = {} 111 _diagnostics = 0 112
113 - def __init__(self, seqname):
114 try: 115 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 116 except KeyError: 117 filename = os.path.join(self._dir, seqname) 118 if self._diagnostics: 119 sys.stderr.write("Loading %s..." % filename) 120 record = easy.fasta_single(filename) 121 self.__records[seqname] = record 122 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 123 if self._diagnostics: 124 print >>sys.stderr, " Done."
125
126 -class Feature(object):
127 """ 128 strand may be: 129 +/0 = Watson 130 -/1 = Crick 131 132 I propose that we start calling these the Rosalind and Franklin strands 133 134 >>> RetrieveSeqname._dir = 'GFF' 135 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test 136 >>> feature.seq() 137 Seq('AATAAA', Alphabet()) 138 >>> print feature.location() 139 NC_001802x.fna:73..78 140 >>> from Bio.SeqIO.FASTA import FastaWriter 141 >>> writer = FastaWriter(sys.stdout) 142 >>> writer.write(feature.record()) 143 > NC_001802x.fna:73..78 144 AATAAA 145 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78")) 146 >>> writer.write(feature2.record()) 147 > NC_001802x.fna:73..78 148 AATAAA 149 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)") 150 >>> feature3 = Feature(location=location3) 151 >>> writer.write(feature3.record()) 152 > NC_001802x.fna:complement(73..78) 153 TTTATT 154 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631") 155 >>> feature4 = Feature(location=location4, frame=0) 156 >>> feature4.frame 157 0 158 >>> feature4.translate()[:7] 159 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 160 >>> feature4.frame = 6 # can't happen, but a useful demonstration 161 >>> feature4.translate()[:5] 162 Seq('ARASV', HasStopCodon(IUPACProtein(), '*')) 163 >>> feature4.frame = 1 164 >>> feature4.translate()[:5] 165 Seq('WVRER', HasStopCodon(IUPACProtein(), '*')) 166 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data 167 >>> feature5 = Feature(location=location5, frame=0) 168 >>> feature5.translate()[:7] 169 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 170 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351") 171 >>> feature6 = Feature(location=location6, frame=1) 172 >>> feature6.translate() 173 Seq('MGARA', HasStopCodon(IUPACProtein(), '*')) 174 """
175 - def __init__(self, 176 seqname=None, 177 start=None, 178 end=None, 179 strand="+", 180 frame=None, 181 location=None, 182 alphabet=DEFAULT_ALPHABET):
183 if not isinstance(seqname, (types.StringType, types.NoneType)): 184 raise exceptions.TypeError, "seqname needs to be string" 185 self.frame = frame 186 self.alphabet = alphabet 187 if location is None: 188 self.seqname = seqname 189 self.start = start 190 self.end = end 191 self.strand = strand 192 return 193 else: 194 self.seqname = location.seqname 195 self.start = location.start() + 1 196 self.end = location.end() + 1 197 self.strand = location.complement
198
199 - def __hash__(self):
200 return hash((self.seqname, 201 self.start, 202 self.end, 203 self.strand, 204 self.frame, 205 self.alphabet))
206
207 - def seq(self):
208 rec = RetrieveSeqname(self.seqname) 209 return easy.record_subseq(rec, self.location(), upper=1)
210
211 - def translate(self):
212 seq = self.seq() 213 try: 214 seq = Seq(seq.tostring()[self.frame:], self.alphabet) 215 except TypeError: 216 seq.alphabet = self.alphabet 217 try: 218 return standard_translator.translate(seq) 219 except AssertionError: 220 # if the feature was pickled then we have problems 221 import cPickle 222 if cPickle.dumps(seq.alphabet) == cPickle.dumps(DEFAULT_ALPHABET): 223 seq.alphabet = DEFAULT_ALPHABET 224 return standard_translator.translate(seq) 225 else: 226 raise
227
228 - def location(self):
229 return easy.LocationFromCoords(self.start-1, self.end-1, self.strand, seqname=self.seqname)
230
231 - def target_location(self):
232 if self.target_start <= self.target_end: 233 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname) 234 else: 235 # switch start and end and make it complement: 236 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
237
238 - def id(self):
239 try: 240 return "%s:%s" % (self.gclass, self.gname) 241 except AttributeError: 242 return ""
243
244 - def record(self):
245 return SeqRecord(self.seq(), self.id(), "", self.location())
246
247 - def xrange(self):
248 return xrange(self.start, self.end)
249
250 - def __str__(self):
251 return "Feature(%s)" % self.location()
252
253 -class FeatureQueryRow(DocSQL.QueryRow, Feature):
254 """ 255 row of FeatureQuery results 256 works like a Feature 257 """
258 - def __init__(self, *args, **keywds):
259 DocSQL.QueryRow.__init__(self, *args, **keywds) 260 try: 261 self.frame = int(self.frame) 262 except ValueError: 263 self.frame = '' 264 except TypeError: 265 self.frame = None 266 self.alphabet = DEFAULT_ALPHABET
267
268 -class FeatureQuery(DocSQL.Query):
269 """ 270 SELECT fdata.fref AS seqname, 271 ftype.fsource AS source, 272 ftype.fmethod AS feature, 273 fdata.fstart AS start, 274 fdata.fstop AS end, 275 fdata.fscore AS score, 276 fdata.fstrand AS strand, 277 fdata.fphase AS frame, 278 fdata.ftarget_start AS target_start, 279 fdata.ftarget_stop AS target_end, 280 fdata.gid, 281 fgroup.gclass, 282 fgroup.gname 283 FROM fdata, ftype, fgroup 284 WHERE fdata.ftypeid = ftype.ftypeid 285 AND fdata.gid = fgroup.gid 286 """ 287 288 row_class = FeatureQueryRow
289 - def __init__(self, 290 seqname=None, 291 coords=None, 292 complement=0, 293 method=None, 294 source=None, 295 object=None, # "class:name" 296 gid=None, 297 *args, 298 **keywds):
299 300 DocSQL.Query.__init__(self, *args, **keywds) 301 302 if seqname is not None: 303 self.statement += ' AND fref="%s"\n' % seqname 304 if coords is not None: 305 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1]) 306 if coords[0] is not None: 307 self.statement += (" AND fstop >= %s\n" % coords[0]) 308 if coords[1] is not None: 309 self.statement += (" AND fstart <= %s\n" % coords[1]) 310 if method is not None: 311 self.statement += ' AND fmethod LIKE "%s"\n' % method # LIKE allows SQL "%" wildcards 312 if source is not None: 313 self.statement += ' AND fsource LIKE "%s"\n' % source 314 if object is not None: 315 self.statement += ' AND %s\n' % object2fgroup_sql(object) 316 if gid is not None: 317 self.statement += ' AND fgroup.gid = %s\n' % gid 318 319 if complement: 320 self.statement += " ORDER BY 0-fstart, 0-fstop" 321 else: 322 self.statement += " ORDER BY fstart, fstop"
323
324 - def aggregate(self):
325 return FeatureAggregate(self)
326
327 -def object2fgroup_sql(object):
328 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
329
330 -class FeatureAggregate(list, Feature):
331 """ 332 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol 333 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage 334 >>> aggregate = FeatureAggregate([feature1_1, feature1_2]) 335 >>> print aggregate.location() 336 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642) 337 >>> xlate_str = aggregate.translate().tostring() 338 >>> xlate_str[:5], xlate_str[-5:] 339 ('MGARA', 'RQDED') 340 341 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)") 342 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)") 343 >>> feature2_1 = Feature(location=location1, frame=0) 344 >>> feature2_2 = Feature(location=location2, frame=0) 345 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2]) 346 >>> print aggregate2.location() 347 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)) 348 >>> print aggregate2.translate() 349 Seq('TRET', HasStopCodon(IUPACProtein(), '*')) 350 >>> location1.reverse() 351 >>> location2.reverse() 352 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]]) 353 >>> print aggregate3.location() 354 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12) 355 >>> print aggregate3.translate() 356 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 357 >>> aggregate3[0].frame = 3 358 >>> print aggregate3.translate() 359 Seq('LSG', HasStopCodon(IUPACProtein(), '*')) 360 361 >>> aggregate4 = FeatureAggregate() 362 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0)) 363 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2)) 364 >>> aggregate4.seq() 365 Seq('GGTCTCTCTGGT', Alphabet()) 366 >>> aggregate4.translate() 367 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 368 """
369 - def __init__(self, feature_query=None):
370 if feature_query is None: 371 list.__init__(self, []) 372 else: 373 list.__init__(self, map(lambda x: x, feature_query))
374
375 - def location(self):
376 loc = easy.LocationJoin(map(lambda x: x.location(), self)) 377 loc.reorient() 378 return loc
379
380 - def map(self, func):
381 mapped = map(func, self) 382 if self.location().complement: 383 mapped.reverse() 384 return reduce(operator.add, mapped)
385
386 - def seq(self):
387 return self.map(lambda x: x.seq())
388
389 - def translate(self):
390 attributes = ['alphabet', 'frame'] 391 index = [0, -1][self.location().complement] 392 for attribute in attributes: 393 self.__dict__[attribute] = self[index].__dict__[attribute] 394 translation = Feature.translate(self) 395 try: 396 assert translation.tostring().index("*") == len(translation) - 1 397 return translation[:translation.tostring().index("*")] 398 except ValueError: 399 return translation
400
401 -def object_split(object):
402 """ 403 >>> object_split("Sequence:F02E9.2a") 404 ('Sequence', 'F02E9.2a') 405 """ 406 return tuple(object.split(":"))
407
408 -def _test(*args, **keywds):
409 import doctest, sys 410 doctest.testmod(sys.modules[__name__], *args, **keywds)
411 412 if __name__ == "__main__": 413 if __debug__: 414 _test() 415