Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  This module contains a parser for the pairwise alignments produced by Bill 
  8  Pearson's FASTA tool, for use from the Bio.AlignIO interface where it is 
  9  refered to as the "fasta-m10" file format (as we only support the machine 
 10  readable output format selected with the -m 10 command line option). 
 11   
 12  This module does NOT cover the generic "fasta" file format originally 
 13  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 14  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 15  which can also be used to store a multiple sequence alignments. 
 16  """ 
 17   
 18  from Bio.Align.Generic import Alignment 
 19  from Interfaces import AlignmentIterator 
 20   
21 -class FastaM10Iterator(AlignmentIterator) :
22 """Alignment iterator for the FASTA tool's pairwise alignment output. 23 24 This is for reading the pairwise alignments output by Bill Pearson's 25 FASTA program when called with the -m 10 command line option for machine 26 readable output. For more details about the FASTA tools, see the website 27 http://fasta.bioch.virginia.edu/ and the paper: 28 29 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 30 31 This class is intended to be used via the Bio.AlignIO.parse() function 32 by specifying the format as "fasta-m10" as shown in the following code: 33 34 from Bio import AlignIO 35 handle = ... 36 for a in AlignIO.parse(handle, "fasta-m10") : 37 assert len(a.get_all_seqs()) == 2, "Should be pairwise!" 38 print "Alignment length %i" % a.get_alignment_length() 39 for record in a : 40 print record.seq, record.name, record.id 41 42 Note that this is not a full blown parser for all the information 43 in the FASTA output - for example, most of the header and all of the 44 footer is ignored. Also, the alignments are not batched according to 45 the input queries. 46 47 Also note that there can be up to about 30 letters of flanking region 48 included in the raw FASTA output as contextual information. This is NOT 49 part of the alignment itself, and is not included in the resulting 50 Alignment objects returned. 51 """ 52
53 - def next(self) :
54 """Reads from the handle to construct and return the next alignment. 55 56 This returns the pairwise alignment of query and match/library 57 sequences as an Alignment object containing two rows.""" 58 handle = self.handle 59 60 try : 61 #Header we saved from when we were parsing 62 #the previous alignment. 63 line = self._header 64 del self._header 65 except AttributeError: 66 line = handle.readline() 67 if not line: 68 return None 69 70 if line.startswith("#") : 71 #Skip the file header before the alignments. e.g. 72 line = self._skip_file_header(line) 73 if ">>>" in line and not line.startswith(">>>") : 74 #Moved onto the next query sequence! 75 self._query_descr = "" 76 self._query_header_annotation = {} 77 #Read in the query header 78 line = self._parse_query_header(line) 79 if not line : 80 #End of file 81 return None 82 if ">>><<<" in line : 83 #Reached the end of the alignments, no need to read the footer... 84 return None 85 86 87 assert line.startswith(">>") and not line.startswith(">>>"), line 88 89 query_seq_parts, match_seq_parts = [], [] 90 query_annotation, match_annotation = {}, {} 91 match_descr = "" 92 alignment_annotation = {} 93 94 #This should be followed by the target match ID line, then more tags. 95 #e.g. 96 """ 97 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 98 ; fa_frame: f 99 ; fa_initn: 52 100 ; fa_init1: 52 101 ; fa_opt: 70 102 ; fa_z-score: 105.5 103 ; fa_bits: 27.5 104 ; fa_expect: 0.082 105 ; sw_score: 70 106 ; sw_ident: 0.279 107 ; sw_sim: 0.651 108 ; sw_overlap: 43 109 """ 110 if not line.startswith(">>") and not line.startswith(">>>") : 111 raise ValueError("Expected target line starting '>>'") 112 match_descr = line[2:].strip() 113 #Handle the following "alignment hit" tagged data, e.g. 114 line = handle.readline() 115 line = self._parse_tag_section(line, alignment_annotation) 116 assert not line.startswith("; ") 117 118 #Then we have the alignment numbers and sequence for the query 119 """ 120 >gi|10955265| .. 121 ; sq_len: 346 122 ; sq_offset: 1 123 ; sq_type: p 124 ; al_start: 197 125 ; al_stop: 238 126 ; al_display_start: 167 127 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 128 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 129 GEYFTENKPKYIIREIHQET 130 """ 131 if not (line.startswith(">") and line.strip().endswith("..")): 132 raise ValueError("Expected line starting '>' and ending '..'") 133 assert self._query_descr.startswith(line[1:].split()[0]) 134 135 #Handle the following "query alignment" tagged data 136 line = handle.readline() 137 line = self._parse_tag_section(line, query_annotation) 138 assert not line.startswith("; ") 139 140 #Now should have the aligned query sequence (with leading flanking region) 141 while not line.startswith(">") : 142 query_seq_parts.append(line.strip()) 143 line = handle.readline() 144 145 #Handle the following "match alignment" data 146 """ 147 >gi|152973545|ref|YP_001338596.1| .. 148 ; sq_len: 242 149 ; sq_type: p 150 ; al_start: 52 151 ; al_stop: 94 152 ; al_display_start: 22 153 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 154 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 155 QDFAFTRKMRREARQVEQSW 156 """ 157 #Match identifier 158 if not (line.startswith(">") and line.strip().endswith("..")): 159 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) 160 assert match_descr.startswith(line[1:].split()[0]) 161 162 #Tagged data, 163 line = handle.readline() 164 line = self._parse_tag_section(line, match_annotation) 165 assert not line.startswith("; ") 166 167 #Now should have the aligned query sequence with flanking region... 168 while not (line.startswith(">") or ">>>" in line): 169 match_seq_parts.append(line.strip()) 170 line = handle.readline() 171 self._header = line 172 173 #We built a list of strings and then joined them because 174 #its faster than appending to a string. 175 query_seq = "".join(query_seq_parts) 176 match_seq = "".join(match_seq_parts) 177 del query_seq_parts, match_seq_parts 178 #Note, query_seq and match_seq will usually be of different lengths, apparently 179 #because in the m10 format leading gaps are added but not trailing gaps! 180 181 #Remove the flanking regions, 182 query_align_seq = self._extract_alignment_region(query_seq, query_annotation) 183 match_align_seq = self._extract_alignment_region(match_seq, match_annotation) 184 185 #The "sq_offset" values can be specified with the -X command line option. 186 #The appear to just shift the origin used in the calculation of the coordinates. 187 188 if ("sq_offset" in query_annotation and query_annotation["sq_offset"]<>"1") \ 189 or ("sq_offset" in match_annotation and match_annotation["sq_offset"]<>"1") : 190 #Note that until some point in the v35 series, FASTA always recorded one 191 #for the query offset, and ommitted the match offset (even when these were 192 #query_seq the -X command line option). 193 #TODO - Work out how exactly the use of -X offsets changes things. 194 #raise ValueError("Offsets from the -X command line option are not (yet) supported") 195 pass 196 197 if len(query_align_seq) <> len(match_align_seq) : 198 raise ValueError("Problem parsing the alignment sequence coordinates") 199 if "sw_overlap" in alignment_annotation : 200 if int(alignment_annotation["sw_overlap"]) <> len(query_align_seq) : 201 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ 202 % (alignment_annotation["sw_overlap"], 203 len(query_align_seq))) 204 205 #TODO - Look at the "sq_type" to assign a sensible alphabet? 206 alignment = Alignment(self.alphabet) 207 208 #TODO - Introduce an annotated alignment class? 209 #For now, store the annotation a new private property: 210 alignment._annotations = {} 211 212 #Want to record both the query header tags, and the alignment tags. 213 for key, value in self._query_header_annotation.iteritems() : 214 alignment._annotations[key] = value 215 for key, value in alignment_annotation.iteritems() : 216 alignment._annotations[key] = value 217 218 219 #TODO - Once the alignment object gets an append method, use it. 220 #(i.e. an add SeqRecord method) 221 alignment.add_sequence(self._query_descr, query_align_seq) 222 record = alignment.get_all_seqs()[-1] 223 assert record.id == self._query_descr or record.description == self._query_descr 224 assert record.seq.tostring() == query_align_seq 225 record.id = self._query_descr.split()[0].strip(",") 226 record.name = "query" 227 record.annotations["original_length"] = int(query_annotation["sq_len"]) 228 229 alignment.add_sequence(match_descr, match_align_seq) 230 record = alignment.get_all_seqs()[-1] 231 assert record.id == match_descr or record.description == match_descr 232 assert record.seq.tostring() == match_align_seq 233 record.id = match_descr.split()[0].strip(",") 234 record.name = "match" 235 record.annotations["original_length"] = int(match_annotation["sq_len"]) 236 237 return alignment
238
239 - def _skip_file_header(self, line) :
240 """Helper function for the main parsing code. 241 242 Skips over the file header region. 243 """ 244 #e.g. This region: 245 """ 246 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa 247 FASTA searches a protein or DNA sequence data bank 248 version 35.03 Feb. 18, 2008 249 Please cite: 250 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 251 252 Query: NC_002127.faa 253 """ 254 #Note that there is no point recording the command line here 255 #from the # line, as it is included again in each alignment 256 #under the "pg_argv" tag. Likewise for the program version. 257 while ">>>" not in line : 258 line = self.handle.readline() 259 return line
260
261 - def _parse_query_header(self, line) :
262 """Helper function for the main parsing code. 263 264 Skips over the free format query header, extracting the tagged parameters. 265 266 If there are no hits for the current query, it is skipped entirely.""" 267 #e.g. this region (where there is often a histogram too): 268 """ 269 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 270 Library: NC_009649.faa 45119 residues in 180 sequences 271 272 45119 residues in 180 sequences 273 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 274 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 275 Lambda= 0.191631 276 Algorithm: FASTA (3.5 Sept 2006) [optimized] 277 Parameters: BL50 matrix (15:-5) ktup: 2 278 join: 36, opt: 24, open/ext: -10/-2, width: 16 279 Scan time: 0.040 280 281 The best scores are: opt bits E(180) 282 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22 283 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93 284 """ 285 #Sometime have queries with no matches, in which case we continue to the next query block: 286 """ 287 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa 288 vs NC_002127.faa library 289 290 579 residues in 3 sequences 291 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100 292 293 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 294 join: 36, opt: 24, open/ext: -10/-2, width: 16 295 Scan time: 0.000 296 !! No library sequences with E() < 0.5 297 """ 298 299 self._query_header_annotation = {} 300 self._query_descr = "" 301 302 assert ">>>" in line and not line.startswith(">>>") 303 #There is nothing useful in this line, the query description is truncated. 304 305 line = self.handle.readline() 306 #We ignore the free form text... 307 while not line.startswith(">>>") : 308 #print "Ignoring %s" % line.strip() 309 line = self.handle.readline() 310 if not line : 311 raise ValueError("Premature end of file!") 312 if ">>><<<" in line : 313 #End of alignments, looks like the last query 314 #or queries had no hits. 315 return line 316 317 #Now want to parse this section: 318 """ 319 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 320 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 321 ; pg_ver: 35.03 322 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa 323 ; pg_name: FASTA 324 ; pg_ver: 3.5 Sept 2006 325 ; pg_matrix: BL50 (15:-5) 326 ; pg_open-ext: -10 -2 327 ; pg_ktup: 2 328 ; pg_optcut: 24 329 ; pg_cgap: 36 330 ; mp_extrap: 60000 500 331 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631 332 ; mp_KS: -0.0000 (N=1066338402) at 20 333 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized] 334 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16 335 """ 336 337 assert line.startswith(">>>"), line 338 self._query_descr = line[3:].strip() 339 340 #Handle the following "program" tagged data, 341 line = self.handle.readline() 342 line = self._parse_tag_section(line, self._query_header_annotation) 343 assert not line.startswith("; "), line 344 assert line.startswith(">>"), line 345 return line
346 347
348 - def _extract_alignment_region(self, alignment_seq_with_flanking, annotation) :
349 """Helper function for the main parsing code. 350 351 To get the actual pairwise alignment sequences, we must first 352 translate the un-gapped sequence based coordinates into positions 353 in the gapped sequence (which may have a flanking region shown 354 using leading - characters). To date, I have never seen any 355 trailing flanking region shown in the m10 file, but the 356 following code should also cope with that. 357 358 Note that this code seems to work fine even when the "sq_offset" 359 entries are prsent as a result of using the -X command line option. 360 """ 361 align_stripped = alignment_seq_with_flanking.strip("-") 362 start = int(annotation['al_start']) \ 363 - int(annotation['al_display_start']) 364 end = int(annotation['al_stop']) \ 365 - int(annotation['al_display_start']) \ 366 + align_stripped.count("-") + 1 367 return align_stripped[start:end]
368 369
370 - def _parse_tag_section(self, line, dictionary) :
371 """Helper function for the main parsing code. 372 373 line - supply line just read from the handle containing the start of 374 the tagged section. 375 dictionary - where to record the tagged values 376 377 Returns a string, the first line following the tagged section.""" 378 if not line.startswith("; ") : 379 raise ValueError("Expected line starting '; '") 380 while line.startswith("; ") : 381 tag, value = line[2:].strip().split(": ",1) 382 #fasta34 and fasta35 will reuse the pg_name and pg_ver tags 383 #for the program executable and name, and the program version 384 #and the algorithm version, respectively. This may be a bug. 385 #if tag in dictionary : 386 # raise ValueError("Repeated tag '%s' in section" % tag) 387 dictionary[tag] = value 388 line = self.handle.readline() 389 return line
390 391 if __name__ == "__main__" : 392 print "Running a quick self-test" 393 394 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 395 simple_example = \ 396 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 397 FASTA searches a protein or DNA sequence data bank 398 version 34.26 January 12, 2007 399 Please cite: 400 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 401 402 Query library NC_002127.faa vs NC_009649.faa library 403 searching NC_009649.faa library 404 405 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 406 vs NC_009649.faa library 407 408 45119 residues in 180 sequences 409 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 410 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 411 Lambda= 0.175043 412 413 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 414 join: 36, opt: 24, open/ext: -10/-2, width: 16 415 Scan time: 0.000 416 The best scores are: opt bits E(180) 417 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 418 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 419 420 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 421 ; pg_name: /opt/fasta/fasta34 422 ; pg_ver: 34.26 423 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 424 ; pg_name: FASTA 425 ; pg_ver: 3.5 Sept 2006 426 ; pg_matrix: BL50 (15:-5) 427 ; pg_open-ext: -10 -2 428 ; pg_ktup: 2 429 ; pg_optcut: 24 430 ; pg_cgap: 36 431 ; mp_extrap: 60000 180 432 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 433 ; mp_KS: -0.0000 (N=0) at 8159228 434 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 435 ; fa_frame: f 436 ; fa_initn: 65 437 ; fa_init1: 43 438 ; fa_opt: 71 439 ; fa_z-score: 90.3 440 ; fa_bits: 24.9 441 ; fa_expect: 0.58 442 ; sw_score: 71 443 ; sw_ident: 0.250 444 ; sw_sim: 0.574 445 ; sw_overlap: 108 446 >gi|10955263| .. 447 ; sq_len: 107 448 ; sq_offset: 1 449 ; sq_type: p 450 ; al_start: 5 451 ; al_stop: 103 452 ; al_display_start: 1 453 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 454 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 455 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 456 >gi|152973457|ref|YP_001338508.1| .. 457 ; sq_len: 931 458 ; sq_type: p 459 ; al_start: 96 460 ; al_stop: 195 461 ; al_display_start: 66 462 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 463 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 464 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 465 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 466 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 467 ; fa_frame: f 468 ; fa_initn: 33 469 ; fa_init1: 33 470 ; fa_opt: 63 471 ; fa_z-score: 86.1 472 ; fa_bits: 23.1 473 ; fa_expect: 0.99 474 ; sw_score: 63 475 ; sw_ident: 0.266 476 ; sw_sim: 0.656 477 ; sw_overlap: 64 478 >gi|10955263| .. 479 ; sq_len: 107 480 ; sq_offset: 1 481 ; sq_type: p 482 ; al_start: 32 483 ; al_stop: 94 484 ; al_display_start: 2 485 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 486 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 487 LSRLMAD 488 >gi|152973588|ref|YP_001338639.1| .. 489 ; sq_len: 459 490 ; sq_type: p 491 ; al_start: 191 492 ; al_stop: 248 493 ; al_display_start: 161 494 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 495 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 496 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 497 SNKALKSQISALLSSIQNKAVADEKLTDQE 498 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 499 vs NC_009649.faa library 500 501 45119 residues in 180 sequences 502 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 503 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 504 Lambda= 0.179384 505 506 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 507 join: 36, opt: 24, open/ext: -10/-2, width: 16 508 Scan time: 0.000 509 The best scores are: opt bits E(180) 510 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 511 512 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 513 ; pg_name: /opt/fasta/fasta34 514 ; pg_ver: 34.26 515 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 516 ; pg_name: FASTA 517 ; pg_ver: 3.5 Sept 2006 518 ; pg_matrix: BL50 (15:-5) 519 ; pg_open-ext: -10 -2 520 ; pg_ktup: 2 521 ; pg_optcut: 24 522 ; pg_cgap: 36 523 ; mp_extrap: 60000 180 524 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 525 ; mp_KS: -0.0000 (N=0) at 8159228 526 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 527 ; fa_frame: f 528 ; fa_initn: 50 529 ; fa_init1: 50 530 ; fa_opt: 58 531 ; fa_z-score: 95.8 532 ; fa_bits: 22.9 533 ; fa_expect: 0.29 534 ; sw_score: 58 535 ; sw_ident: 0.289 536 ; sw_sim: 0.632 537 ; sw_overlap: 38 538 >gi|10955264| .. 539 ; sq_len: 126 540 ; sq_offset: 1 541 ; sq_type: p 542 ; al_start: 1 543 ; al_stop: 38 544 ; al_display_start: 1 545 ------------------------------MKKDKKYQIEAIKNKDKTLF 546 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 547 NGEKFSSYTLNKVTKTDEYN 548 >gi|152973462|ref|YP_001338513.1| .. 549 ; sq_len: 101 550 ; sq_type: p 551 ; al_start: 44 552 ; al_stop: 81 553 ; al_display_start: 14 554 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 555 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 556 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 557 vs NC_009649.faa library 558 559 45119 residues in 180 sequences 560 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 561 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 562 Lambda= 0.210386 563 564 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 565 join: 37, opt: 25, open/ext: -10/-2, width: 16 566 Scan time: 0.020 567 The best scores are: opt bits E(180) 568 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 569 570 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 571 ; pg_name: /opt/fasta/fasta34 572 ; pg_ver: 34.26 573 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 574 ; pg_name: FASTA 575 ; pg_ver: 3.5 Sept 2006 576 ; pg_matrix: BL50 (15:-5) 577 ; pg_open-ext: -10 -2 578 ; pg_ktup: 2 579 ; pg_optcut: 25 580 ; pg_cgap: 37 581 ; mp_extrap: 60000 180 582 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 583 ; mp_KS: -0.0000 (N=0) at 8159228 584 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 585 ; fa_frame: f 586 ; fa_initn: 52 587 ; fa_init1: 52 588 ; fa_opt: 70 589 ; fa_z-score: 105.5 590 ; fa_bits: 27.5 591 ; fa_expect: 0.082 592 ; sw_score: 70 593 ; sw_ident: 0.279 594 ; sw_sim: 0.651 595 ; sw_overlap: 43 596 >gi|10955265| .. 597 ; sq_len: 346 598 ; sq_offset: 1 599 ; sq_type: p 600 ; al_start: 197 601 ; al_stop: 238 602 ; al_display_start: 167 603 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 604 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 605 GEYFTENKPKYIIREIHQET 606 >gi|152973545|ref|YP_001338596.1| .. 607 ; sq_len: 242 608 ; sq_type: p 609 ; al_start: 52 610 ; al_stop: 94 611 ; al_display_start: 22 612 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 613 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 614 QDFAFTRKMRREARQVEQSW 615 >>><<< 616 617 618 579 residues in 3 query sequences 619 45119 residues in 180 library sequences 620 Scomplib [34.26] 621 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 622 Total Scan time: 0.020 Total Display time: 0.010 623 624 Function used was FASTA [version 34.26 January 12, 2007] 625 626 """ 627 628 629 from StringIO import StringIO 630 631 alignments = list(FastaM10Iterator(StringIO(simple_example))) 632 assert len(alignments) == 4, len(alignments) 633 assert len(alignments[0].get_all_seqs()) == 2 634 for a in alignments : 635 print "Alignment %i sequences of length %i" \ 636 % (len(a.get_all_seqs()), a.get_alignment_length()) 637 for r in a : 638 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 639 #print a.annotations 640 print "Done" 641 642 import os 643 path = "../../Tests/Fasta/" 644 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 645 files.sort() 646 for filename in files : 647 if os.path.splitext(filename)[-1] == ".m10" : 648 print 649 print filename 650 print "="*len(filename) 651 for a in FastaM10Iterator(open(os.path.join(path,filename))): 652 print a 653