1
2
3
4
5
6
7
8
9
10 from Bio.Alphabet import single_letter_alphabet
11 from Bio.Seq import Seq
12 from Bio.SeqRecord import SeqRecord
13 from Interfaces import SequentialSequenceWriter
14
15
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet (defaults to single_letter_alphabet)
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.
    A title line with no text at all (just ">") gives an empty
    id, name and description.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.

    Yields one SeqRecord per ">" title line.  Any lines before the first
    title line are skipped; spaces inside sequence lines are removed.
    """
    # Skip anything before the first record; an input without any
    # ">" line (including an empty one) yields no records.
    while True:
        line = handle.readline()
        if line == "":
            return
        if line[0] == ">":
            break

    while True:
        # Defensive check: at this point line should always be a title line.
        if line[0] != ">":
            raise SyntaxError(
                "Records in Fasta files should start with '>' character")
        title = line[1:].rstrip()
        if title2ids:
            record_id, name, descr = title2ids(title)
        else:
            descr = title
            words = descr.split()
            # A bare ">" line has no first word; fall back to an empty id
            # instead of failing with IndexError.
            record_id = ""
            if words:
                record_id = words[0]
            name = record_id

        # Collect sequence lines up to the next title line or end of file.
        lines = []
        line = handle.readline()
        while line and line[0] != ">":
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()

        yield SeqRecord(Seq("".join(lines), alphabet),
                        id=record_id, name=name, description=descr)

        if not line:
            return  # end of file reached, no more records
    assert False, "Should not reach this line"
63
class FastaWriter(SequentialSequenceWriter):
    """Class to write Fasta format files"""
    # NOTE(review): only __init__ is visible in this part of the file; the
    # record-writing method(s) are presumably defined elsewhere -- confirm.

    def __init__(self, handle, wrap=60, record2title=None):
        """Create a Fasta writer.

        handle - Handle to an output file, e.g. as returned
                 by open(filename, "w")
        wrap -   Optional line length used to wrap sequence lines.
                 Defaults to wrapping the sequence at 60 characters
                 Use zero (or None) for no wrapping, giving a single
                 long line for the sequence.
        record2title - Optional function to return the text to be
                 used for the title line of each record.  By default
                 a combination of the record.id and record.description
                 is used.  If the record.description starts with the
                 record.id, then just the record.description is used.

        Raises ValueError if wrap is non-zero and less than one.

        You can either use:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_file(myRecords)

        Or, follow the sequential file writer system, for example:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_header() # does nothing for Fasta files
        ...
        Multiple calls to myWriter.write_record() and/or
        myWriter.write_records()
        ...
        myWriter.write_footer() # does nothing for Fasta files
        myWriter.close()
        """
        SequentialSequenceWriter.__init__(self, handle)

        # None means "do not wrap"; falsy wrap values (0, None) keep it.
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError(
                    "wrap must be at least 1 (or zero/None for no wrapping), "
                    "got %r" % (wrap,))
            self.wrap = wrap
        self.record2title = record2title
104
137
if __name__ == "__main__":
    # Quick self test, executed only when the module is run as a script.
    # NOTE(review): genbank_name_function and print_record are used below
    # but are not defined in the visible part of this file -- confirm they
    # are defined before this point.
    print("Running quick self test")

    import os
    from Bio.Alphabet import generic_protein, generic_nucleotide

    # Optional example files; each test is skipped when its file is absent.
    fna_filename = "NC_005213.fna"
    faa_filename = "NC_005213.faa"

    if os.path.isfile(fna_filename):
        print("--------")
        print("FastaIterator (single sequence)")
        handle = open(fna_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_nucleotide,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
        finally:
            # Close the input file even if parsing fails.
            handle.close()
        assert count == 1
        print(str(record.__class__))

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        handle = open(faa_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_protein,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
                break  # only the first record is examined
        finally:
            handle.close()
        assert count > 0
        print(str(record.__class__))

    try:
        from cStringIO import StringIO
    except ImportError:
        # cStringIO does not exist on Python 3.
        from io import StringIO
    print("--------")
    print("FastaIterator (empty input file)")
    # An empty input must produce no records and raise no errors.
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count = count + 1
    assert count == 0

    print("Done")
202