1 """Utilities for working with FASTA-formatted sequences.
2
3 This module uses Martel-based parsing to speed up the parsing process.
4
5 Classes:
6 Record Holds FASTA sequence data.
7 Iterator Iterates over sequence data in a FASTA file.
8 Dictionary Accesses a FASTA file using a dictionary interface.
9 RecordParser Parses FASTA sequence data into a Record object.
10 SequenceParser Parses FASTA sequence data into a Sequence object.
11
12 Functions:
13 index_file Index a FASTA file for a Dictionary.
14 """
15 from Bio import Seq
16 from Bio import SeqRecord
17 from Bio import Alphabet
18
19
20 import cStringIO
21 from Bio import Mindy
22 from Bio.Mindy import SimpleSeqRecord
23
class Record:
    """Holds information from a FASTA record.

    Members:
    title       Title line ('>' character not included).
    sequence    The sequence.

    """
    def __init__(self, colwidth=60):
        """__init__(self, colwidth=60)

        Create a new Record.  colwidth specifies the number of residues
        to put on each line when generating FASTA format.

        """
        # Both fields start empty; they are filled in by whoever builds
        # the record.
        self.title = ''
        self.sequence = ''
        # Stored for later use when the record is written out as FASTA.
        self._colwidth = colwidth
42
53
class Iterator:
    """Returns one record at a time from a FASTA file.
    """
    def __init__(self, handle, parser = None, debug = 0):
        """Initialize a new iterator.

        Arguments:
        o handle - File-like object providing readline().  Any text before
        the first line starting with '>' is skipped.
        o parser - Optional object with a parse_string() method.  When
        given, next() returns parser.parse_string(record_text) instead of
        the raw record text.
        o debug - If true, print diagnostic messages while reading.
        """
        self.handle = handle
        self._parser = parser
        self._debug = debug

        # Skip any text before the first record (e.g. blank lines).
        # readline() returns '' at end of file; stop there instead of
        # indexing into the empty string, so that an empty file or a file
        # without any '>' line yields no records rather than IndexError.
        while True:
            line = handle.readline()
            if not line or line[0] == ">":
                break
            if debug:
                print("Skipping: " + line)
        # Either the title line of the first record, or '' at end of file.
        self._lookahead = line

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)

    def next(self):
        """Return the next record in the file, or None when exhausted.

        The record is the title line followed by its sequence lines, each
        stripped of trailing whitespace and joined with newlines.  Lines
        starting with '#' are dropped.  If a parser was supplied, the
        joined text is passed through its parse_string() method.
        """
        line = self._lookahead
        if not line:
            return None
        # Invariant: the lookahead is always '' or a title line.
        assert line[0] == ">", line
        lines = [line.rstrip()]
        line = self.handle.readline()
        while line:
            if line[0] == ">":
                break
            if line[0] == "#":
                if self._debug:
                    print("Ignoring comment line")
            else:
                lines.append(line.rstrip())
            line = self.handle.readline()
        # Remember the title line of the following record (or '' at EOF).
        self._lookahead = line
        if self._debug:
            # The title is the first collected line without its '>'; the
            # remaining lines are the sequence.
            print("Debug: '%s' and '%s'" % (lines[0][1:], "".join(lines[1:])))
        if self._parser is None:
            return "\n".join(lines)
        return self._parser.parse_string("\n".join(lines))
97
99 """Parses FASTA sequence data into a Fasta.Record object.
100 """
103
114
115 - def parse(self, handle):
117
119 """Parses FASTA sequence data into a SeqRecord object.
120 """
123 """Initialize a Scanner and Sequence Consumer.
124
125 Arguments:
126 o alphabet - The alphabet of the sequences to be parsed. If not
127 passed, this will be set as generic_alphabet.
128 o title2ids - A function that, when given the title of the FASTA
129 file (without the beginning >), will return the id, name and
130 description (in that order) for the record. If this is not given,
131 then the entire title line will be used as the description.
132 """
133 self.alphabet = alphabet
134 self.title2ids = title2ids
135
155
156 - def parse(self, handle):
158
class Dictionary(dict):
    """Accesses an indexed FASTA file using a dictionary interface. DEPRECATED
    """
    def __init__(self, indexname, parser=None, filename = None):
        """Open a Fasta Dictionary.  DEPRECATED

        indexname is the name of the index for the dictionary.  The index should
        have been created using the index_file function.

        parser is an optional Parser object to change the results into another
        form.  If set to None, then the raw contents of the file will be returned.

        filename specifies the name of the file that this index references.
        This is useful in cases where the file has been moved since indexing.
        If no filename is supplied (the default) the filename stored in the
        index will be used.  XXX This is no longer supported -- use symbolic
        links in the filesystem.

        Raises AttributeError if filename is anything other than None.
        """
        import warnings
        # stacklevel=2 attributes the warning to the code constructing the
        # Dictionary rather than to this module.
        warnings.warn("Bio.Fasta.index_file Bio.Fasta.Dictionary are deprecated."
                      " We hope an in memory dictionary, for example using the"
                      " Bio.SeqIO.to_dict() function, will be suitable for"
                      " most users.  Please get in touch on the mailing lists if"
                      " this (or its removal) causes any problems for you.",
                      DeprecationWarning, stacklevel=2)

        if filename is not None:
            raise AttributeError("Specifying filenames is no longer supported")

        self._index = Mindy.open(indexname)
        self._parser = parser

        # Register every primary key of the index with a None placeholder.
        # dict.__setitem__ is called directly so the base dict storage is
        # written without going through any override on this class.
        primary_key_retriever = self._index['id']
        for k in primary_key_retriever.keys():
            dict.__setitem__(self, k, None)
197
198
200 try:
201 seqs = self._index.lookup(id = key)
202
203 except KeyError:
204 seqs = self._index.lookup(aliases = key)
205
206 if len(seqs) == 1:
207 seq = seqs[0]
208 else:
209 raise KeyError("Multiple sequences found for %s" % key)
210
211 if self._parser:
212 handle = cStringIO.StringIO(seq.text)
213 self[key] = self._parser.parse(handle)
214 else:
215 self[key] = seq.text
216
217
222
223
def index_file(filename, indexname, rec2key=None, use_berkeley=0):
    """Index a FASTA file.  DEPRECATED

    filename is the name of the file to index.

    indexname is the name of the dictionary to be created.  This can be
    just the name of the index, in which case the index information will
    be created in a directory of the given index name in the current
    directory, or a full pathname to a directory to save the indexing
    information.

    rec2key is an optional callback function that takes a Fasta Record and
    generates a unique key (e.g. the accession number) for the record.
    Optionally, it can also return 3 items, to be used as the id (unique key)
    name, and aliases for the index.  If not specified, the sequence title
    will be used.

    use_berkeley specifies whether to use the BerkeleyDB indexer, which
    uses the bsddb3 wrappers around the embedded database Berkeley DB.  By
    default, the standard flat file (non-Berkeley) indexes are used.
    """
    import warnings
    # stacklevel=2 attributes the warning to the caller of index_file
    # rather than to this module.
    warnings.warn("Bio.Fasta.index_file Bio.Fasta.Dictionary are deprecated."
                  " We hope an in memory dictionary, for example using the"
                  " Bio.SeqIO.to_dict() function, will be suitable for"
                  " most users.  Please get in touch on the mailing lists if"
                  " this (or its removal) causes any problems for you.",
                  DeprecationWarning, stacklevel=2)

    # Choose how record keys are derived: from the user callback when one
    # is supplied, otherwise from the record title.
    if rec2key:
        indexer = _FastaFunctionIndexer(rec2key)
    else:
        indexer = _FastaTitleIndexer()

    # Build the index with the requested backend.
    if use_berkeley:
        SimpleSeqRecord.create_berkeleydb([filename], indexname, indexer)
    else:
        SimpleSeqRecord.create_flatdb([filename], indexname, indexer)
263
265 """Simple indexer to index by the title of a FASTA record.
266
267 This doesn't do anything fancy, just gets the title and uses that as the
268 identifier.
269 """
272
275
277 return ["name", "aliases"]
278
286
287
289 """Indexer to index based on values returned by a function.
290
291 This class is passed a function to parse description titles from a Fasta
292 title. It needs to return either one item, which is an id from the title,
293 or three items which are (in order), the id, a list of names, and a list
294 of aliases.
295
296 This indexer allows indexing to be completely flexible based on passed
297 functions.
298 """
302
305
307 return ["name", "aliases"]
308
331