1
2
3
4
5
6
7
8 """
9 Access to General Feature Format databases created with Bio::DB:GFF
10
11 based on documentation for Lincoln Stein's Perl Bio::DB::GFF
12
13 >>> import os
14 >>> import Bio.GFF
15 >>> PASSWORD = os.environ['MYSQLPASS']
16 >>> DATABASE_GFF = 'wormbase_ws93'
17 >>> FASTADIR = '/local/wormbase_ws93/'
18 >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR)
19
20 """
21
22 __version__ = "$Revision: 1.7 $"
23
24
25 import exceptions
26 import operator
27 import os.path
28 import sys
29 import types
30
31 from Bio import MissingExternalDependencyError
32
33 try:
34 import MySQLdb
35 except:
36 raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.")
37
38 from Bio.Alphabet import IUPAC
39 from Bio import DocSQL
40 from Bio.Seq import Seq
41 from Bio.SeqRecord import SeqRecord
42 from Bio import Translate
43
44 import binning
45 import easy
46 import GenericTools
47
48
49 DEFAULT_ALPHABET = IUPAC.unambiguous_dna
50 standard_translator = Translate.unambiguous_dna_by_id[1]
51
53 """
54 this will only work for the simplest of easy.Location objects
55 """
66
69
75
77 """
78 Connection to GFF database
79
80 also functions as whole segment
81 """
82
91
92 - def segment(self, *args, **keywds):
93 return Segment(self, *args, **keywds)
94
96 """
97 Singleton: contain records of loaded FASTA files
98
99 >>> RetrieveSeqname._dir = 'GFF'
100 >>> RetrieveSeqname._diagnostics = 1
101 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them
102 >>> record1 = RetrieveSeqname('NC_001802.fna')
103 Loading GFF/NC_001802.fna... Done.
104 >>> record1.id
105 'gi|9629357|ref|NC_001802.1|'
106 >>> record2 = RetrieveSeqname('NC_001802.fna')
107 >>> record1.seq is record2.seq # should be same space in memory
108 1
109 """
110 __records = {}
111 _diagnostics = 0
112
125
127 """
128 strand may be:
129 +/0 = Watson
130 -/1 = Crick
131
132 I propose that we start calling these the Rosalind and Franklin strands
133
134 >>> RetrieveSeqname._dir = 'GFF'
135 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test
136 >>> feature.seq()
137 Seq('AATAAA', Alphabet())
138 >>> print feature.location()
139 NC_001802x.fna:73..78
140 >>> from Bio.SeqIO.FASTA import FastaWriter
141 >>> writer = FastaWriter(sys.stdout)
142 >>> writer.write(feature.record())
143 > NC_001802x.fna:73..78
144 AATAAA
145 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78"))
146 >>> writer.write(feature2.record())
147 > NC_001802x.fna:73..78
148 AATAAA
149 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)")
150 >>> feature3 = Feature(location=location3)
151 >>> writer.write(feature3.record())
152 > NC_001802x.fna:complement(73..78)
153 TTTATT
154 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631")
155 >>> feature4 = Feature(location=location4, frame=0)
156 >>> feature4.frame
157 0
158 >>> feature4.translate()[:7]
159 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
160 >>> feature4.frame = 6 # can't happen, but a useful demonstration
161 >>> feature4.translate()[:5]
162 Seq('ARASV', HasStopCodon(IUPACProtein(), '*'))
163 >>> feature4.frame = 1
164 >>> feature4.translate()[:5]
165 Seq('WVRER', HasStopCodon(IUPACProtein(), '*'))
166 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data
167 >>> feature5 = Feature(location=location5, frame=0)
168 >>> feature5.translate()[:7]
169 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
170 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351")
171 >>> feature6 = Feature(location=location6, frame=1)
172 >>> feature6.translate()
173 Seq('MGARA', HasStopCodon(IUPACProtein(), '*'))
174 """
175 - def __init__(self,
176 seqname=None,
177 start=None,
178 end=None,
179 strand="+",
180 frame=None,
181 location=None,
182 alphabet=DEFAULT_ALPHABET):
198
206
210
227
230
232 if self.target_start <= self.target_end:
233 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname)
234 else:
235
236 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
237
239 try:
240 return "%s:%s" % (self.gclass, self.gname)
241 except AttributeError:
242 return ""
243
246
249
251 return "Feature(%s)" % self.location()
252
254 """
255 row of FeatureQuery results
256 works like a Feature
257 """
267
269 """
270 SELECT fdata.fref AS seqname,
271 ftype.fsource AS source,
272 ftype.fmethod AS feature,
273 fdata.fstart AS start,
274 fdata.fstop AS end,
275 fdata.fscore AS score,
276 fdata.fstrand AS strand,
277 fdata.fphase AS frame,
278 fdata.ftarget_start AS target_start,
279 fdata.ftarget_stop AS target_end,
280 fdata.gid,
281 fgroup.gclass,
282 fgroup.gname
283 FROM fdata, ftype, fgroup
284 WHERE fdata.ftypeid = ftype.ftypeid
285 AND fdata.gid = fgroup.gid
286 """
287
288 row_class = FeatureQueryRow
289 - def __init__(self,
290 seqname=None,
291 coords=None,
292 complement=0,
293 method=None,
294 source=None,
295 object=None,
296 gid=None,
297 *args,
298 **keywds):
299
300 DocSQL.Query.__init__(self, *args, **keywds)
301
302 if seqname is not None:
303 self.statement += ' AND fref="%s"\n' % seqname
304 if coords is not None:
305 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1])
306 if coords[0] is not None:
307 self.statement += (" AND fstop >= %s\n" % coords[0])
308 if coords[1] is not None:
309 self.statement += (" AND fstart <= %s\n" % coords[1])
310 if method is not None:
311 self.statement += ' AND fmethod LIKE "%s"\n' % method
312 if source is not None:
313 self.statement += ' AND fsource LIKE "%s"\n' % source
314 if object is not None:
315 self.statement += ' AND %s\n' % object2fgroup_sql(object)
316 if gid is not None:
317 self.statement += ' AND fgroup.gid = %s\n' % gid
318
319 if complement:
320 self.statement += " ORDER BY 0-fstart, 0-fstop"
321 else:
322 self.statement += " ORDER BY fstart, fstop"
323
326
328 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
329
331 """
332 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol
333 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage
334 >>> aggregate = FeatureAggregate([feature1_1, feature1_2])
335 >>> print aggregate.location()
336 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642)
337 >>> xlate_str = aggregate.translate().tostring()
338 >>> xlate_str[:5], xlate_str[-5:]
339 ('MGARA', 'RQDED')
340
341 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)")
342 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)")
343 >>> feature2_1 = Feature(location=location1, frame=0)
344 >>> feature2_2 = Feature(location=location2, frame=0)
345 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2])
346 >>> print aggregate2.location()
347 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12))
348 >>> print aggregate2.translate()
349 Seq('TRET', HasStopCodon(IUPACProtein(), '*'))
350 >>> location1.reverse()
351 >>> location2.reverse()
352 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]])
353 >>> print aggregate3.location()
354 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)
355 >>> print aggregate3.translate()
356 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
357 >>> aggregate3[0].frame = 3
358 >>> print aggregate3.translate()
359 Seq('LSG', HasStopCodon(IUPACProtein(), '*'))
360
361 >>> aggregate4 = FeatureAggregate()
362 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0))
363 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2))
364 >>> aggregate4.seq()
365 Seq('GGTCTCTCTGGT', Alphabet())
366 >>> aggregate4.translate()
367 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
368 """
369 - def __init__(self, feature_query=None):
374
379
380 - def map(self, func):
385
388
400
402 """
403 >>> object_split("Sequence:F02E9.2a")
404 ('Sequence', 'F02E9.2a')
405 """
406 return tuple(object.split(":"))
407
408 -def _test(*args, **keywds):
409 import doctest, sys
410 doctest.testmod(sys.modules[__name__], *args, **keywds)
411
412 if __name__ == "__main__":
413 if __debug__:
414 _test()
415