Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  from Bio.Alphabet import single_letter_alphabet 
 11  from Bio.Seq import Seq 
 12  from Bio.SeqRecord import SeqRecord 
 13  from Interfaces import SequentialSequenceWriter 
 14   
 15  #This is a generator function! 
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.  A
    title line consisting of just ">" gives an empty id, name and
    description rather than raising an IndexError.

    Any text before the first line starting with ">" is skipped.  If
    the handle contains no such line, nothing is yielded.

    Trailing whitespace and internal spaces are removed from the
    sequence lines before they are joined into a single Seq.

    Raises SyntaxError if a record does not start with the ">"
    character (kept as a safeguard).

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.
    """
    # Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if not line:
            # Premature end of file, or just empty?
            return
        if line[0] == ">":
            break

    while True:
        if line[0] != ">":
            raise SyntaxError("Records in Fasta files should start with '>' character")

        title = line[1:].rstrip()
        if title2ids:
            record_id, name, descr = title2ids(title)
        else:
            descr = title
            # An empty title has no first word; fall back to an empty id
            # instead of failing on [0] of an empty list.
            words = descr.split(None, 1)
            if words:
                record_id = words[0]
            else:
                record_id = ""
            name = record_id

        # Collect sequence lines up to the next title line or end of file.
        lines = []
        line = handle.readline()
        while line and line[0] != ">":
            # Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()

        # Return the record and then continue...
        yield SeqRecord(Seq("".join(lines), alphabet),
                        id=record_id, name=name, description=descr)

        if not line:
            # End of file reached, so the generator finishes (StopIteration)
            return
    assert False, "Should not reach this line"
63
class FastaWriter(SequentialSequenceWriter):
    """Class to write Fasta format files"""

    def __init__(self, handle, wrap=60, record2title=None):
        """Create a Fasta writer.

        handle - Handle to an output file, e.g. as returned
                 by open(filename, "w")
        wrap -   Optional line length used to wrap sequence lines.
                 Defaults to wrapping the sequence at 60 characters
                 Use zero (or None) for no wrapping, giving a single
                 long line for the sequence.
        record2title - Optional function to return the text to be
                 used for the title line of each record.  By default
                 a combination of the record.id and record.description
                 is used.  If the record.description starts with the
                 record.id, then just the record.description is used.

        Raises ValueError if wrap is given but is less than one
        (i.e. negative).

        You can either use:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_file(myRecords)

        Or, follow the sequential file writer system, for example:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_header() # does nothing for Fasta files
        ...
        Multiple calls to myWriter.write_record() and/or myWriter.write_records()
        ...
        myWriter.write_footer() # does nothing for Fasta files
        myWriter.close()
        """
        SequentialSequenceWriter.__init__(self, handle)
        # None means "do not wrap"; zero and None are both treated that way.
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError("wrap should be a positive line length, "
                                 "or zero/None for no wrapping")
            self.wrap = wrap
        self.record2title = record2title

    def write_record(self, record):
        """Write a single Fasta record to the file.

        The title line is taken from self.record2title(record) if that
        function was supplied, otherwise it is built from record.id and
        record.description.  The sequence is written on one line, or
        split into lines of self.wrap characters when wrapping is on.
        """
        # State flags checked here are presumably maintained by the
        # SequentialSequenceWriter base class - confirm against Interfaces.
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        if self.record2title:
            # Must call the stored function; the bare name record2title
            # is not defined in this scope.
            title = self.clean(self.record2title(record))
        else:
            record_id = self.clean(record.id)
            description = self.clean(record.description)

            words = description.split(None, 1)
            if words and words[0] == record_id:
                # The description includes the id at the start
                title = description
            elif description:
                title = "%s %s" % (record_id, description)
            else:
                # No description, so avoid a trailing space after the id
                title = record_id

        assert "\n" not in title
        assert "\r" not in title
        self.handle.write(">%s\n" % title)

        data = record.seq.tostring()
        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            for start in range(0, len(data), self.wrap):
                self.handle.write(data[start:start + self.wrap] + "\n")
        else:
            self.handle.write(data + "\n")

if __name__ == "__main__":
    # Quick self test.  The two file based checks only run if the example
    # files are present in the current directory.
    print("Running quick self test")

    import os
    from Bio.Alphabet import generic_protein, generic_nucleotide

    # Download the files from here:
    # ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
    fna_filename = "NC_005213.fna"
    faa_filename = "NC_005213.faa"

    def genbank_name_function(text):
        """Split a title line into (id, name, description).

        The first word is split on "|" and its fourth field is used as
        the id; the name is the id up to its first "."; the rest of the
        title is the description.
        """
        text, descr = text.split(None, 1)
        record_id = text.split("|")[3]
        name = record_id.split(".", 1)[0]
        return record_id, name, descr

    # NOTE(review): this listing skips original lines 155-166 here.
    # print_record(), called below, is not defined in the visible source
    # and presumably lives in that collapsed span - confirm before running.

    if os.path.isfile(fna_filename):
        print("--------")
        print("FastaIterator (single sequence)")
        handle = open(fna_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_nucleotide,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
            assert count == 1
            print(str(record.__class__))
        finally:
            # Close the input file even if the test fails
            handle.close()

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        handle = open(faa_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_protein,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
                # Only the first record is checked
                break
            assert count > 0
            print(str(record.__class__))
        finally:
            # Close the input file even if the test fails
            handle.close()

    from cStringIO import StringIO
    print("--------")
    print("FastaIterator (empty input file)")
    # Just to make sure no errors happen
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count = count + 1
    assert count == 0

    print("Done")