Package Bio :: Package Fasta
[hide private]
[frames | no frames]

Source Code for Package Bio.Fasta

  1  """Utilities for working with FASTA-formatted sequences. 
  2   
  3  This module uses Martel-based parsing to speed up the parsing process. 
  4   
  5  Classes: 
  6  Record             Holds FASTA sequence data. 
  7  Iterator           Iterates over sequence data in a FASTA file. 
  8  Dictionary         Accesses a FASTA file using a dictionary interface. 
  9  RecordParser       Parses FASTA sequence data into a Record object. 
 10  SequenceParser     Parses FASTA sequence data into a Sequence object. 
 11   
 12  Functions: 
 13  index_file         Index a FASTA file for a Dictionary. 
 14  """ 
 15  from Bio import Seq 
 16  from Bio import SeqRecord 
 17  from Bio import Alphabet 
 18   
 19  #These imports are only used by the deprecated dictionary functions/classes 
 20  import cStringIO 
 21  from Bio import Mindy 
 22  from Bio.Mindy import SimpleSeqRecord 
 23   
class Record:
    """Container for a single FASTA entry.

    Attributes:
    title       Text of the header line, without the leading '>'.
    sequence    The residues of the entry as one string.

    """
    def __init__(self, colwidth=60):
        """Create an empty Record.

        colwidth is the number of residues placed on each line when the
        record is rendered as FASTA text by str().

        """
        self._colwidth = colwidth
        self.sequence = ''
        self.title = ''

    def __str__(self):
        """Return the record as FASTA text: '>' + title, then wrapped sequence."""
        width = self._colwidth
        residues = self.sequence
        lines = ['>%s' % self.title]
        start = 0
        while start < len(residues):
            lines.append(residues[start:start + width])
            start += width
        # Lines are always joined with "\n" (not os.linesep), so the text is
        # the same on every platform.
        return "\n".join(lines)
53
class Iterator:
    """Returns one record at a time from a FASTA file.

    Each record is handed back either as raw text (the title line followed
    by the sequence lines, newline-separated, trailing whitespace stripped)
    or, when a parser is supplied, as the result of parser.parse_string()
    on that text.
    """
    def __init__(self, handle, parser = None, debug = 0):
        """Initialize a new iterator.

        Arguments:
        o handle - Object with a readline() method to read records from.
        o parser - Optional object with a parse_string(text) method.  If
        None, next() returns the raw text of each record.
        o debug - If true, diagnostic messages are printed while reading.
        """
        self.handle = handle
        self._parser = parser
        self._debug = debug

        # Skip any text before the first record (e.g. blank lines).  Also
        # stop at end of file (readline() returns ""), so that an empty file
        # or one without any '>' line yields no records instead of raising
        # IndexError on line[0].
        while True:
            line = handle.readline()
            if not line or line[0] == ">":
                break
            if debug:
                print("Skipping: " + line)
        self._lookahead = line

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)

    def next(self):
        """Return the next record in the file, or None if there are no more."""
        line = self._lookahead
        if not line:
            return None
        assert line[0] == ">", line
        lines = [line.rstrip()]
        line = self.handle.readline()
        while line:
            if line[0] == ">":
                # Start of the following record; keep it as the lookahead.
                break
            if line[0] == "#":
                # Comment lines inside a record are dropped.
                if self._debug:
                    print("Ignoring comment line")
            else:
                lines.append(line.rstrip())
            line = self.handle.readline()
        self._lookahead = line
        if self._debug:
            # lines[0] is the '>' title line, the rest is the sequence.
            print("Debug: '%s' and '%s'" % (lines[0][1:], "".join(lines[1:])))
        text = "\n".join(lines)
        if self._parser is None:
            return text
        return self._parser.parse_string(text)
97
class RecordParser:
    """Parses FASTA sequence data into a Fasta.Record object.
    """
    def __init__(self, debug = 0):
        """Create a parser.  The debug argument is accepted but not used."""

    def parse_string(self, text):
        """Parse the first FASTA record in text and return it as a Record.

        text must start with '>'.  If it holds more than one record, only
        the first is parsed.  A record made of a title line only gives a
        Record with an empty sequence.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        # startswith() also rejects empty text via the assertion, rather
        # than failing with IndexError on text[0].
        assert text.startswith(">"), text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        # The title line may be the whole record (no sequence lines), so the
        # split can legitimately give a single field.
        fields = text.split("\n", 1)
        rec = Record()
        rec.title = fields[0][1:]
        if len(fields) > 1:
            rec.sequence = fields[1].replace("\n", "")
        else:
            rec.sequence = ""
        return rec

    def parse(self, handle):
        """Read everything from handle and parse the first record in it."""
        return self.parse_string(handle.read())
117
class SequenceParser:
    """Parses FASTA sequence data into a SeqRecord object.
    """
    def __init__(self, alphabet = Alphabet.generic_alphabet, title2ids = None,
                 debug = 0):
        """Initialize the parser.

        Arguments:
        o alphabet - The alphabet of the sequences to be parsed. If not
        passed, this will be set as generic_alphabet.
        o title2ids - A function that, when given the title of the FASTA
        file (without the beginning >), will return the id, name and
        description (in that order) for the record. If this is not given,
        then the entire title line will be used as the description.
        o debug - Accepted but not used.
        """
        self.alphabet = alphabet
        self.title2ids = title2ids

    def parse_string(self, text):
        """Parse the first FASTA record in text and return it as a SeqRecord.

        text must start with '>'.  If it holds more than one record, only
        the first is parsed.  A record made of a title line only gives a
        SeqRecord with an empty sequence.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        # startswith() also rejects empty text via the assertion, rather
        # than failing with IndexError on text[0].
        assert text.startswith(">"), text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        # The title line may be the whole record (no sequence lines), so the
        # split can legitimately give a single field.
        fields = text.split("\n", 1)
        title = fields[0][1:]
        if len(fields) > 1:
            sequence = fields[1].replace("\n", "")
        else:
            sequence = ""

        seq = Seq.Seq(sequence, self.alphabet)
        rec = SeqRecord.SeqRecord(seq)

        if self.title2ids:
            seq_id, name, descr = self.title2ids(title)
            rec.id = seq_id
            rec.name = name
            rec.description = descr
        else:
            rec.description = title

        return rec

    def parse(self, handle):
        """Read everything from handle and parse the first record in it."""
        return self.parse_string(handle.read())
158
class Dictionary(dict):
    """Accesses an indexed FASTA file using a dictionary interface. DEPRECATED

    Every primary key of the index is stored with the value None when the
    object is created; the real entry is fetched from the index the first
    time the key is looked up and then kept in the dictionary.
    """
    def __init__(self, indexname, parser=None, filename = None):
        """Open a Fasta Dictionary. DEPRECATED

        indexname is the name of the index for the dictionary. The index should
        have been created using the index_file function.

        parser is an optional Parser object to change the results into another
        form. If set to None, then the raw contents of the file will be returned.

        filename specifies the name of the file that this index references.
        This is useful in cases where the file has been moved since indexing.
        If no filename is supplied (the default) the filename stored in the
        index will be used. XXX This is no longer supported -- use symbolic
        links in the filesystem.  Passing anything other than None raises
        AttributeError.
        """

        import warnings
        warnings.warn("Bio.Fasta.index_file Bio.Fasta.Dictionary are deprecated." \
                      + " We hope an in memory dictionary, for example using the" \
                      + " Bio.SeqIO.to_dict() function, will be suitable for" \
                      + " most users. Please get in touch on the mailing lists if" \
                      + " this (or its removal) causes any problems for you.",
                      DeprecationWarning)

        # we can't support finding the index file name if we want to follow
        # standard open-bio fetching protocols.
        if filename is not None:
            raise AttributeError("Specifying filenames is no longer supported")

        self._index = Mindy.open(indexname)
        self._parser = parser

        # Register every primary key with a None placeholder; __getitem__
        # replaces the placeholder on first access.
        primary_key_retriever = self._index['id']
        for k in primary_key_retriever.keys():
            dict.__setitem__(self, k, None)

    def _load_seq(self, key):
        """Fetch the entry for key from the index and store it under key.

        The lookup is tried by id first and, if that raises KeyError, by
        alias.  Raises KeyError unless exactly one entry is found.
        """
        try:
            seqs = self._index.lookup(id = key)
        # if we can't do that, we have to try and fetch by alias
        except KeyError:
            seqs = self._index.lookup(aliases = key)

        # Report "nothing found" and "ambiguous" separately; both stay
        # KeyError so existing handlers keep working.
        if len(seqs) == 0:
            raise KeyError("No sequence found for %s" % key)
        if len(seqs) > 1:
            raise KeyError("Multiple sequences found for %s" % key)
        seq = seqs[0]

        if self._parser:
            handle = cStringIO.StringIO(seq.text)
            self[key] = self._parser.parse(handle)
        else:
            self[key] = seq.text

    def __getitem__(self, key):
        """Return the entry for key, loading it from the index on first use."""
        # A stored None marks a known key whose entry has not been read yet.
        if key in self and dict.__getitem__(self, key) is None:
            self._load_seq(key)
        return dict.__getitem__(self, key)
222 223
def index_file(filename, indexname, rec2key = None, use_berkeley = 0):
    """Build an index for a FASTA file. DEPRECATED

    filename is the name of the file to index.

    indexname is the name of the index to create. It may be a bare name,
    in which case the index information is written to a directory of that
    name in the current directory, or a full pathname of the directory in
    which to save the index information.

    rec2key is an optional callback function that takes a Fasta Record and
    returns a unique key (e.g. the accession number) for it. It may also
    return 3 items, used as the id (unique key), name, and aliases for the
    index. If it is not given, the sequence title is used as the key.

    use_berkeley selects the BerkeleyDB indexer, which uses the bsddb3
    wrappers around the embedded database Berkeley DB. By default the
    standard flat file (non-Berkeley) indexes are created.
    """

    import warnings
    warnings.warn("Bio.Fasta.index_file Bio.Fasta.Dictionary are deprecated."
                  " We hope an in memory dictionary, for example using the"
                  " Bio.SeqIO.to_dict() function, will be suitable for"
                  " most users. Please get in touch on the mailing lists if"
                  " this (or its removal) causes any problems for you.",
                  DeprecationWarning)

    # Choose how records are turned into index keys.
    if not rec2key:
        indexer = _FastaTitleIndexer()
    else:
        indexer = _FastaFunctionIndexer(rec2key)

    # Choose the index backend, then build the index for the one file.
    if use_berkeley:
        build_index = SimpleSeqRecord.create_berkeleydb
    else:
        build_index = SimpleSeqRecord.create_flatdb
    build_index([filename], indexname, indexer)
263
class _FastaTitleIndexer(SimpleSeqRecord.BaseSeqRecordIndexer):
    """Simple indexer to index by the title of a FASTA record.

    This doesn't do anything fancy, just gets the title and uses that as the
    identifier.
    """
    def __init__(self):
        # Initialise the base indexer, as _FastaFunctionIndexer does.
        SimpleSeqRecord.BaseSeqRecordIndexer.__init__(self)

    def primary_key_name(self):
        """Return the name of the primary key, "id"."""
        return "id"

    def secondary_key_names(self):
        """Return the names of the secondary keys."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the index keys for seq_record.

        The record's description is used as the only id; the name and
        aliases lists are left empty.
        """
        sequence_id = seq_record.description

        id_info = {"id" : [sequence_id],
                   "name" : [],
                   "aliases" : []}
        return id_info
286 287
class _FastaFunctionIndexer(SimpleSeqRecord.BaseSeqRecordIndexer):
    """Indexer to index based on values returned by a function.

    This class is passed a function to parse description titles from a Fasta
    title. It needs to return either one item, which is an id from the title,
    or three items which are (in order), the id, a list of names, and a list
    of aliases.

    This indexer allows indexing to be completely flexible based on passed
    functions.
    """
    def __init__(self, index_function):
        """Store index_function, which is called with a Fasta Record."""
        SimpleSeqRecord.BaseSeqRecordIndexer.__init__(self)
        self.index_function = index_function

    def primary_key_name(self):
        """Return the name of the primary key, "id"."""
        return "id"

    def secondary_key_names(self):
        """Return the names of the secondary keys."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the index keys for seq_record, as given by index_function.

        Raises ValueError if index_function returns a list or tuple whose
        length is neither 1 nor 3.
        """
        # make a FASTA record to make this compatible with previous Biopython
        # code
        tmp_rec = Record()
        tmp_rec.title = seq_record.description
        tmp_rec.sequence = seq_record.seq.data
        items = self.index_function(tmp_rec)
        # A single return value (anything that is not a list or tuple) is
        # treated as the id on its own.
        if not isinstance(items, (list, tuple)):
            items = [items]
        if len(items) == 1:
            seq_id = items[0]
            name = []
            aliases = []
        elif len(items) == 3:
            seq_id, name, aliases = items
        else:
            # Wrap items in a one-tuple: formatting with a bare tuple would
            # spread its elements over the format and raise TypeError.
            raise ValueError("Unexpected items from index function: %s" %
                             (items,))

        return {"id" : [seq_id],
                "name" : name,
                "aliases" : aliases}
331