1
2
3
4
5
6
7
8
9
10
11
12 """Sequence input/output designed to look similar to the bioperl design.
13
14 Input
15 =====
16 The main function is Bio.SeqIO.parse(...) which takes an input file handle,
17 and format string. This returns an iterator giving SeqRecord objects.
18
19 from Bio import SeqIO
20 handle = open("example.fasta", "rU")
21 for record in SeqIO.parse(handle, "fasta") :
22 print record
23
24 Note that the parse() function will invoke the relevant parser for
25 the format with its default settings. You may want more control, in which case
26 you need to create a format specific sequence iterator directly.
27
28 For non-interlaced files (e.g. Fasta, GenBank, EMBL) with multiple records
29 using a sequence iterator can save you a lot of memory (RAM). There is less
30 benefit for interlaced file formats (e.g. most multiple alignment file formats).
31 However, an iterator only lets you access the records one by one.
32
33 If you want random access to the records by number, turn this into a list:
34
35 from Bio import SeqIO
36 handle = open("example.fasta", "rU")
37 records = list(SeqIO.parse(handle, "fasta"))
38 print records[0]
39
40 If you want random access to the records by a key such as the record id, turn
41 the iterator into a dictionary:
42
43 from Bio import SeqIO
44 handle = open("example.fasta", "rU")
45 record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
46 print record_dict["gi:12345678"]
47
48
49 Input - Alignments
50 ==================
51 Currently an alignment class cannot be created from SeqRecord objects.
52 Instead, use the to_alignment(...) function, like so:
53
54 from Bio import SeqIO
55 handle = open("example.aln", "rU")
56 alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
57
58 This function may be removed in future once alignments can be created
59 directly from SeqRecord objects.
60
61 Output
62 ======
63 Use the function Bio.SeqIO.write(...), which takes a complete set of SeqRecord
64 objects (either as a list, or an iterator), an output file handle and of course
65 the file format.
66
67 from Bio import SeqIO
68 records = ...
69 handle = open("example.faa", "w")
70 SeqIO.write(records, handle, "fasta")
71 handle.close()
72
73 In general, you are expected to call this function once (with all your records)
74 and then close the file handle.
75
76 Output - Advanced
77 =================
78 The effect of calling write() multiple times on a single file will vary
79 depending on the file format, and is best avoided unless you have a strong reason
80 to do so.
81
82 Trying this for certain alignment formats (e.g. phylip, clustal, stockholm) would
83 have the effect of concatenating several multiple sequence alignments together.
84 Such files are created by the PHYLIP suite of programs for bootstrap analysis.
85
86 For sequential file formats (e.g. fasta, genbank) each "record block" holds a
87 single sequence. For these files it would probably be safe to call write()
88 multiple times.
89
90 File Formats
91 ============
92 When specifying formats, use lowercase strings.
93
94 Old Files
95 =========
96 The modules Bio.SeqIO.FASTA and Bio.SeqIO.generic are deprecated and may be
97 removed.
98 """
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118 """
119 FAO BioPython Developers
120 ========================
121 The way I envision this SeqIO system working is that for any sequence file format
122 we have an iterator that returns SeqRecord objects.
123
124 This also applies to interlaced file formats (like clustal) where the file cannot
125 be read record by record. You should still return an iterator!
126
127 These file format specific sequence iterators may be implemented as:
128 * Classes which take a handle for __init__ and provide the __iter__ method
129 * Functions that take a handle, and return an iterator object
130 * Generator functions that take a handle, and yield SeqRecord objects
131
132 It is then trivial to turn this iterator into a list of SeqRecord objects, an in
133 memory dictionary, or a multiple sequence alignment object.
134
135 For building the dictionary by default the id property of each SeqRecord is used
136 as the key. You should always populate the id property, and it should be unique.
137 For some file formats the accession number is a good choice.
138
139 When adding a new file format, please use the same lower case format name as
140 BioPerl, or if they have not defined one, try the names used by EMBOSS.
141 """
142
143 import os
144
145 from StringIO import StringIO
146 from Bio.Alphabet import generic_alphabet, generic_protein
147 from Bio.Seq import Seq
148 from Bio.SeqRecord import SeqRecord
149 from Bio.Align.Generic import Alignment
150
151 import FastaIO
152 import InsdcIO
153 import StockholmIO
154 import ClustalIO
155 import PhylipIO
156 import NexusIO
157 import SwissIO
158
159
160
161
162
163
164
165
# Registry of parsers, keyed by lower case format name.  parse() looks the
# format up here and calls the stored object with the input handle to get
# an iterator of SeqRecord objects.
_FormatToIterator = {
    "fasta": FastaIO.FastaIterator,
    "genbank": InsdcIO.GenBankIterator,
    "genbank-cds": InsdcIO.GenBankCdsFeatureIterator,
    "embl": InsdcIO.EmblIterator,
    "embl-cds": InsdcIO.EmblCdsFeatureIterator,
    "clustal": ClustalIO.ClustalIterator,
    "phylip": PhylipIO.PhylipIterator,
    "nexus": NexusIO.NexusIterator,
    "stockholm": StockholmIO.StockholmIterator,
    "swiss": SwissIO.SwissIterator,
}

# Registry of writers, keyed by lower case format name.  write() looks the
# format up here, instantiates the stored class with the output handle and
# calls its write_file() method with the records.
_FormatToWriter = {
    "fasta": FastaIO.FastaWriter,
    "phylip": PhylipIO.PhylipWriter,
    "stockholm": StockholmIO.StockholmWriter,
    "clustal": ClustalIO.ClustalWriter,
}
183
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

    sequences - A list (or iterator) of SeqRecord objects
    handle    - File handle object to write to
    format    - What format to use (lower case string).

    You should close the handle after calling this function.

    Raises TypeError if handle is a string (e.g. a filename) or if format
    is not a string.  Raises ValueError if format is empty, is not lower
    case, or has no registered writer, or if a single SeqRecord is given
    instead of a list/iterator of them.

    There is no return value.
    """
    # Catch the common argument mistakes early, with helpful messages.
    if isinstance(handle, basestring):
        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if isinstance(sequences, SeqRecord):
        raise ValueError("Use a SeqRecord list/iterator, not just a single SeqRecord")

    # Map the format name to its writer class.
    try:
        writer_class = _FormatToWriter[format]
    except KeyError:
        # A format we can parse but not write is not really "unknown";
        # say so explicitly rather than confusing the caller.
        if format in _FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        raise ValueError("Unknown format '%s'" % format)

    # The writer object takes the handle and is given all the records at once.
    writer_class(handle).write_file(sequences)
219
def parse(handle, format):
    """Turns a sequence file into an iterator returning SeqRecords.

    handle - handle to the file.
    format - string describing the file format (lower case).

    If you have the file name in a string 'filename', use:

        from Bio import SeqIO
        my_iterator = SeqIO.parse(open(filename,"rU"), format)

    If you have a string 'data' containing the file contents, use:

        from Bio import SeqIO
        from StringIO import StringIO
        my_iterator = SeqIO.parse(StringIO(data), format)

    Raises TypeError if handle is a string (e.g. a filename) or if format
    is not a string.  Raises ValueError if format is empty, is not lower
    case, or has no registered parser.

    Note that file will be parsed with default settings,
    which may result in a generic alphabet or other non-ideal
    settings.  For more control, you must use the format specific
    iterator directly...
    """
    # Catch the common argument mistakes early, with helpful messages.
    if isinstance(handle, basestring):
        raise TypeError("Need a file handle, not a string (i.e. not a filename)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Map the format name to the object that builds the iterator.
    try:
        iterator_generator = _FormatToIterator[format]
    except KeyError:
        raise ValueError("Unknown format '%s'" % format)

    # Calling it with the handle gives the SeqRecord iterator.
    return iterator_generator(handle)
261
def to_dict(sequences, key_function=None):
    """Turns a sequence iterator or list into a dictionary.

    sequences    - An iterator that returns SeqRecord objects,
                   or simply a list of SeqRecord objects.
    key_function - Optional function which when given a SeqRecord
                   returns a unique string for the dictionary key.

    e.g. key_function = lambda rec : rec.name
    or,  key_function = lambda rec : rec.description.split()[0]

    If key_function is omitted then record.id is used, on the
    assumption that the records objects returned are SeqRecords
    with a unique id field.

    Returns a dictionary mapping each key to its record.

    If there are duplicate keys, a ValueError is raised.

    Example usage:

        from Bio import SeqIO
        filename = "example.fasta"
        d = SeqIO.to_dict(SeqIO.parse(open(filename, "rU"), "fasta"),
                          key_function = lambda rec : rec.description.split()[0])
        print len(d)
        print d.keys()[0:10]
        key = d.keys()[0]
        print d[key]
    """
    if key_function is None:
        key_function = lambda rec: rec.id

    d = dict()
    for record in sequences:
        key = key_function(record)
        if key in d:
            # Wrap the key in a 1-tuple: a bare tuple key would be unpacked
            # by the % operator and raise TypeError instead of ValueError.
            raise ValueError("Duplicate key '%s'" % (key,))
        d[key] = record
    return d
300
302 """Returns a multiple sequence alignment
303
304 sequences -An iterator that returns SeqRecord objects,
305 or simply a list of SeqRecord objects.
306 All the record sequences must be the same length.
307 alphabet - Optional alphabet. Strongly recommended.
308 strict - Optional, defaults to True. Should error checking
309 be done?
310 """
311
312 alignment_length = None
313 alignment = Alignment(alphabet)
314 for record in sequences :
315 if strict :
316 if alignment_length is None :
317 alignment_length = len(record.seq)
318 elif alignment_length <> len(record.seq) :
319 raise ValueError("Sequences of different lengths")
320
321 if not isinstance(record.seq.alphabet, alphabet.__class__) :
322 raise ValueError("Incompatible sequence alphabet")
323
324
325
326
327
328
329
330 alignment._records.append(record)
331 return alignment
332
333 if __name__ == "__main__" :
334
335 from Bio.Alphabet import generic_nucleotide
336 from sets import Set
337
338
339
340 faa_example = \
341 """>V_Harveyi_PATH
342 mknwikvava aialsaatvq aatevkvgms gryfpftfvk qdklqgfevd mwdeigkrnd
343 ykieyvtanf sglfglletg ridtisnqit mtdarkakyl fadpyvvdga qitvrkgnds
344 iqgvedlagk tvavnlgsnf eqllrdydkd gkiniktydt giehdvalgr adafimdrls
345 alelikktgl plqlagepfe tiqnawpfvd nekgrklqae vnkalaemra dgtvekisvk
346 wfgaditk
347 >B_subtilis_YXEM
348 mkmkkwtvlv vaallavlsa cgngnssske ddnvlhvgat gqsypfayke ngkltgfdve
349 vmeavakkid mkldwkllef sglmgelqtg kldtisnqva vtderketyn ftkpyayagt
350 qivvkkdntd iksvddlkgk tvaavlgsnh aknleskdpd kkiniktyet qegtlkdvay
351 grvdayvnsr tvliaqikkt glplklagdp ivyeqvafpf akddahdklr kkvnkaldel
352 rkdgtlkkls ekyfneditv eqkh
353 >FLIY_ECOLI
354 mklahlgrqa lmgvmavalv agmsvksfad egllnkvker gtllvglegt yppfsfqgdd
355 gkltgfevef aqqlakhlgv easlkptkwd gmlasldskr idvvinqvti sderkkkydf
356 stpytisgiq alvkkgnegt iktaddlkgk kvgvglgtny eewlrqnvqg vdvrtydddp
357 tkyqdlrvgr idailvdrla aldlvkktnd tlavtgeafs rqesgvalrk gnedllkavn
358 daiaemqkdg tlqalsekwf gadvtk
359 >Deinococcus_radiodurans
360 mkksllslkl sgllvpsvla lslsacssps stlnqgtlki amegtyppft skneqgelvg
361 fdvdiakava qklnlkpefv ltewsgilag lqankydviv nqvgitperq nsigfsqpya
362 ysrpeiivak nntfnpqsla dlkgkrvgst lgsnyekqli dtgdikivty pgapeiladl
363 vagridaayn drlvvnyiin dqklpvrgag qigdaapvgi alkkgnsalk dqidkaltem
364 rsdgtfekis qkwfgqdvgq p
365 >B_subtilis_GlnH_homo_YCKK
366 mkkallalfm vvsiaalaac gagndnqskd nakdgdlwas ikkkgvltvg tegtyepfty
367 hdkdtdkltg ydveviteva krlglkvdfk etqwgsmfag lnskrfdvva nqvgktdred
368 kydfsdkytt sravvvtkkd nndikseadv kgktsaqslt snynklatna gakvegvegm
369 aqalqmiqqa rvdmtyndkl avlnylktsg nknvkiafet gepqstyftf rkgsgevvdq
370 vnkalkemke dgtlskiskk wfgedvsk
371 >YA80_HAEIN
372 mkkllfttal ltgaiafstf shageiadrv ektktllvgt egtyapftfh dksgkltgfd
373 vevirkvaek lglkvefket qwdamyagln akrfdvianq tnpsperlkk ysfttpynys
374 ggvivtkssd nsiksfedlk grksaqsats nwgkdakaag aqilvvdgla qslelikqgr
375 aeatindkla vldyfkqhpn sglkiaydrg dktptafafl qgedalitkf nqvlealrqd
376 gtlkqisiew fgyditq
377 >E_coli_GlnH
378 mksvlkvsla altlafavss haadkklvva tdtafvpfef kqgdkyvgfd vdlwaaiake
379 lkldyelkpm dfsgiipalq tknvdlalag ititderkka idfsdgyyks gllvmvkann
380 ndvksvkdld gkvvavksgt gsvdyakani ktkdlrqfpn idnaymelgt nradavlhdt
381 pnilyfikta gngqfkavgd sleaqqygia fpkgsdelrd kvngalktlr engtyneiyk
382 kwfgtepk
383 >HISJ_E_COLI
384 mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg fdidlakelc
385 krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly aadsrlvvak
386 nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys dltagridaa
387 fqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre alnkafaemr
388 adgtyeklak kyfdfdvygg"""
389
390
391 aln_example = \
392 """CLUSTAL X (1.83) multiple sequence alignment
393
394
395 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
396 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
397 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
398 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
399 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
400 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
401 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
402 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
403 : . : :.
404
405 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
406 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
407 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
408 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
409 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
410 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
411 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
412 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
413 ** .: *::::. : :. . ..:
414
415 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQITVRKGNDSIQGVE
416 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQIVVKKDNTDIKSVD
417 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVVVTKKDNNDIKSEA
418 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVIVTKSSDNSIKSFE
419 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTAD
420 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLVMVKANNNDVKSVK
421 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLA
422 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE
423 *.: . * . * *: : : .
424
425 V_Harveyi_PATH DLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT--GIEHDVALGRADA
426 B_subtilis_YXEM DLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDA
427 B_subtilis_GlnH_homo_YCKK DVKGKTSAQSLTSNYNKLATN----AGAKVEGVEGMAQALQMIQQARVDM
428 YA80_HAEIN DLKGRKSAQSATSNWGKDAKA----AGAQILVVDGLAQSLELIKQGRAEA
429 FLIY_ECOLI DLKGKKVGVGLGTNYEEWLRQNV--QGVDVRTYDDDPTKYQDLRVGRIDA
430 E_coli_GlnH DLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNIDNAYMELGTNRADA
431 Deinococcus_radiodurans DLKGKRVGSTLGSNYEKQLIDTG---DIKIVTYPGAPEILADLVAGRIDA
432 HISJ_E_COLI SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA
433 .: *: . : .: : * :
434
435 V_Harveyi_PATH FIMDRLSALE-LIKKT-GLPLQLAGEPFETI-----QNAWPFVDNEKGRK
436 B_subtilis_YXEM YVNSRTVLIA-QIKKT-GLPLKLAGDPIVYE-----QVAFPFAKDDAHDK
437 B_subtilis_GlnH_homo_YCKK TYNDKLAVLN-YLKTSGNKNVKIAFETGEPQ-----STYFTFRKGS--GE
438 YA80_HAEIN TINDKLAVLD-YFKQHPNSGLKIAYDRGDKT-----PTAFAFLQGE--DA
439 FLIY_ECOLI ILVDRLAALD-LVKKT-NDTLAVTGEAFSRQ-----ESGVALRKGN--ED
440 E_coli_GlnH VLHDTPNILY-FIKTAGNGQFKAVGDSLEAQ-----QYGIAFPKGS--DE
441 Deinococcus_radiodurans AYNDRLVVNY-IINDQ-KLPVRGAGQIGDAA-----PVGIALKKGN--SA
442 HISJ_E_COLI AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE
443 . .: : . .
444
445 V_Harveyi_PATH LQAEVNKALAEMRADGTVEKISVKWFGADITK----
446 B_subtilis_YXEM LRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH
447 B_subtilis_GlnH_homo_YCKK VVDQVNKALKEMKEDGTLSKISKKWFGEDVSK----
448 YA80_HAEIN LITKFNQVLEALRQDGTLKQISIEWFGYDITQ----
449 FLIY_ECOLI LLKAVNDAIAEMQKDGTLQALSEKWFGADVTK----
450 E_coli_GlnH LRDKVNGALKTLRENGTYNEIYKKWFGTEPK-----
451 Deinococcus_radiodurans LKDQIDKALTEMRSDGTFEKISQKWFGQDVGQP---
452 HISJ_E_COLI LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---
453 : .: .: :: :** . : ::*. :
454 """
455
456
457
458
459
460 phy_example = \
461 """ 8 286
462 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
463 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
464 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
465 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
466 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
467 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
468 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
469 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
470
471 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
472 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
473 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
474 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
475 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
476 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
477 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
478 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
479
480 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
481 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
482 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
483 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
484 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
485 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
486 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
487 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
488
489 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
490 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
491 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
492 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
493 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
494 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
495 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
496 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
497
498 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
499 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
500 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
501 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
502 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
503 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
504 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
505 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
506
507 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
508 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
509 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
510 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
511 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
512 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
513 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
514 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
515 """
516
517 nxs_example = \
518 """#NEXUS
519 BEGIN DATA;
520 dimensions ntax=8 nchar=286;
521 format missing=?
522 symbols="ABCDEFGHIKLMNPQRSTUVWXYZ"
523 interleave datatype=PROTEIN gap= -;
524
525 matrix
526 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
527 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
528 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
529 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
530 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
531 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
532 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
533 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
534
535 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
536 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
537 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
538 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
539 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
540 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
541 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
542 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
543
544 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQITVRKGNDSIQGVE
545 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQIVVKKDNTDIKSVD
546 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVVVTKKDNNDIKSEA
547 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVIVTKSSDNSIKSFE
548 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQALVKKGNEGTIKTAD
549 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLVMVKANNNDVKSVK
550 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEIIVAKNNTFNPQSLA
551 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE
552
553 V_Harveyi_PATH DLAGKTVAVNLGSNFEQLLRDYDKDGKINIKTYDT--GIEHDVALGRADA
554 B_subtilis_YXEM DLKGKTVAAVLGSNHAKNLESKDPDKKINIKTYETQEGTLKDVAYGRVDA
555 B_subtilis_GlnH_homo_YCKK DVKGKTSAQSLTSNYNKLATN----AGAKVEGVEGMAQALQMIQQARVDM
556 YA80_HAEIN DLKGRKSAQSATSNWGKDAKA----AGAQILVVDGLAQSLELIKQGRAEA
557 FLIY_ECOLI DLKGKKVGVGLGTNYEEWLRQNV--QGVDVRTYDDDPTKYQDLRVGRIDA
558 E_coli_GlnH DLDGKVVAVKSGTGSVDYAKAN--IKTKDLRQFPNIDNAYMELGTNRADA
559 Deinococcus_radiodurans DLKGKRVGSTLGSNYEKQLIDTG---DIKIVTYPGAPEILADLVAGRIDA
560 HISJ_E_COLI SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA
561
562 V_Harveyi_PATH FIMDRLSALE-LIKKT-GLPLQLAGEPFETI-----QNAWPFVDNEKGRK
563 B_subtilis_YXEM YVNSRTVLIA-QIKKT-GLPLKLAGDPIVYE-----QVAFPFAKDDAHDK
564 B_subtilis_GlnH_homo_YCKK TYNDKLAVLN-YLKTSGNKNVKIAFETGEPQ-----STYFTFRKGS--GE
565 YA80_HAEIN TINDKLAVLD-YFKQHPNSGLKIAYDRGDKT-----PTAFAFLQGE--DA
566 FLIY_ECOLI ILVDRLAALD-LVKKT-NDTLAVTGEAFSRQ-----ESGVALRKGN--ED
567 E_coli_GlnH VLHDTPNILY-FIKTAGNGQFKAVGDSLEAQ-----QYGIAFPKGS--DE
568 Deinococcus_radiodurans AYNDRLVVNY-IINDQ-KLPVRGAGQIGDAA-----PVGIALKKGN--SA
569 HISJ_E_COLI AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE
570
571 V_Harveyi_PATH LQAEVNKALAEMRADGTVEKISVKWFGADITK----
572 B_subtilis_YXEM LRKKVNKALDELRKDGTLKKLSEKYFNEDITVEQKH
573 B_subtilis_GlnH_homo_YCKK VVDQVNKALKEMKEDGTLSKISKKWFGEDVSK----
574 YA80_HAEIN LITKFNQVLEALRQDGTLKQISIEWFGYDITQ----
575 FLIY_ECOLI LLKAVNDAIAEMQKDGTLQALSEKWFGADVTK----
576 E_coli_GlnH LRDKVNGALKTLRENGTYNEIYKKWFGTEPK-----
577 Deinococcus_radiodurans LKDQIDKALTEMRSDGTFEKISQKWFGQDVGQP---
578 HISJ_E_COLI LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---
579 ;
580 end;
581 """
582
583
584
585 nxs_example2 = \
586 """#NEXUS
587
588 Begin data;
589 Dimensions ntax=10 nchar=705;
590 Format datatype=dna interleave=yes gap=- missing=?;
591 Matrix
592 Cow ATGGCATATCCCATACAACTAGGATTCCAAGATGCAACATCACCAATCATAGAAGAACTA
593 Carp ATGGCACACCCAACGCAACTAGGTTTCAAGGACGCGGCCATACCCGTTATAGAGGAACTT
594 Chicken ATGGCCAACCACTCCCAACTAGGCTTTCAAGACGCCTCATCCCCCATCATAGAAGAGCTC
595 Human ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTT
596 Loach ATGGCACATCCCACACAATTAGGATTCCAAGACGCGGCCTCACCCGTAATAGAAGAACTT
597 Mouse ATGGCCTACCCATTCCAACTTGGTCTACAAGACGCCACATCCCCTATTATAGAAGAGCTA
598 Rat ATGGCTTACCCATTTCAACTTGGCTTACAAGACGCTACATCACCTATCATAGAAGAACTT
599 Seal ATGGCATACCCCCTACAAATAGGCCTACAAGATGCAACCTCTCCCATTATAGAGGAGTTA
600 Whale ATGGCATATCCATTCCAACTAGGTTTCCAAGATGCAGCATCACCCATCATAGAAGAGCTC
601 Frog ATGGCACACCCATCACAATTAGGTTTTCAAGACGCAGCCTCTCCAATTATAGAAGAATTA
602
603 Cow CTTCACTTTCATGACCACACGCTAATAATTGTCTTCTTAATTAGCTCATTAGTACTTTAC
604 Carp CTTCACTTCCACGACCACGCATTAATAATTGTGCTCCTAATTAGCACTTTAGTTTTATAT
605 Chicken GTTGAATTCCACGACCACGCCCTGATAGTCGCACTAGCAATTTGCAGCTTAGTACTCTAC
606 Human ATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCCTGTAT
607 Loach CTTCACTTCCATGACCATGCCCTAATAATTGTATTTTTGATTAGCGCCCTAGTACTTTAT
608 Mouse ATAAATTTCCATGATCACACACTAATAATTGTTTTCCTAATTAGCTCCTTAGTCCTCTAT
609 Rat ACAAACTTTCATGACCACACCCTAATAATTGTATTCCTCATCAGCTCCCTAGTACTTTAT
610 Seal CTACACTTCCATGACCACACATTAATAATTGTGTTCCTAATTAGCTCATTAGTACTCTAC
611 Whale CTACACTTTCACGATCATACACTAATAATCGTTTTTCTAATTAGCTCTTTAGTTCTCTAC
612 Frog CTTCACTTCCACGACCATACCCTCATAGCCGTTTTTCTTATTAGTACGCTAGTTCTTTAC
613
614 Cow ATTATTTCACTAATACTAACGACAAAGCTGACCCATACAAGCACGATAGATGCACAAGAA
615 Carp ATTATTACTGCAATGGTATCAACTAAACTTACTAATAAATATATTCTAGACTCCCAAGAA
616 Chicken CTTCTAACTCTTATACTTATAGAAAAACTATCA---TCAAACACCGTAGATGCCCAAGAA
617 Human GCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAA
618 Loach GTTATTATTACAACCGTCTCAACAAAACTCACTAACATATATATTTTGGACTCACAAGAA
619 Mouse ATCATCTCGCTAATATTAACAACAAAACTAACACATACAAGCACAATAGATGCACAAGAA
620 Rat ATTATTTCACTAATACTAACAACAAAACTAACACACACAAGCACAATAGACGCCCAAGAA
621 Seal ATTATCTCACTTATACTAACCACGAAACTCACCCACACAAGTACAATAGACGCACAAGAA
622 Whale ATTATTACCCTAATGCTTACAACCAAATTAACACATACTAGTACAATAGACGCCCAAGAA
623 Frog ATTATTACTATTATAATAACTACTAAACTAACTAATACAAACCTAATGGACGCACAAGAG
624
625 Cow GTAGAGACAATCTGAACCATTCTGCCCGCCATCATCTTAATTCTAATTGCTCTTCCTTCT
626 Carp ATCGAAATCGTATGAACCATTCTACCAGCCGTCATTTTAGTACTAATCGCCCTGCCCTCC
627 Chicken GTTGAACTAATCTGAACCATCCTACCCGCTATTGTCCTAGTCCTGCTTGCCCTCCCCTCC
628 Human ATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCC
629 Loach ATTGAAATCGTATGAACTGTGCTCCCTGCCCTAATCCTCATTTTAATCGCCCTCCCCTCA
630 Mouse GTTGAAACCATTTGAACTATTCTACCAGCTGTAATCCTTATCATAATTGCTCTCCCCTCT
631 Rat GTAGAAACAATTTGAACAATTCTCCCAGCTGTCATTCTTATTCTAATTGCCCTTCCCTCC
632 Seal GTGGAAACGGTGTGAACGATCCTACCCGCTATCATTTTAATTCTCATTGCCCTACCATCA
633 Whale GTAGAAACTGTCTGAACTATCCTCCCAGCCATTATCTTAATTTTAATTGCCTTGCCTTCA
634 Frog ATCGAAATAGTGTGAACTATTATACCAGCTATTAGCCTCATCATAATTGCCCTTCCATCC
635
636 Cow TTACGAATTCTATACATAATAGATGAAATCAATAACCCATCTCTTACAGTAAAAACCATA
637 Carp CTACGCATCCTGTACCTTATAGACGAAATTAACGACCCTCACCTGACAATTAAAGCAATA
638 Chicken CTCCAAATCCTCTACATAATAGACGAAATCGACGAACCTGATCTCACCCTAAAAGCCATC
639 Human CTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATT
640 Loach CTACGAATTCTATATCTTATAGACGAGATTAATGACCCCCACCTAACAATTAAGGCCATG
641 Mouse CTACGCATTCTATATATAATAGACGAAATCAACAACCCCGTATTAACCGTTAAAACCATA
642 Rat CTACGAATTCTATACATAATAGACGAGATTAATAACCCAGTTCTAACAGTAAAAACTATA
643 Seal TTACGAATCCTCTACATAATGGACGAGATCAATAACCCTTCCTTGACCGTAAAAACTATA
644 Whale TTACGGATCCTTTACATAATAGACGAAGTCAATAACCCCTCCCTCACTGTAAAAACAATA
645 Frog CTTCGTATCCTATATTTAATAGATGAAGTTAATGATCCACACTTAACAATTAAAGCAATC
646
647 Cow GGACATCAGTGATACTGAAGCTATGAGTATACAGATTATGAGGACTTAAGCTTCGACTCC
648 Carp GGACACCAATGATACTGAAGTTACGAGTATACAGACTATGAAAATCTAGGATTCGACTCC
649 Chicken GGACACCAATGATACTGAACCTATGAATACACAGACTTCAAGGACCTCTCATTTGACTCC
650 Human GGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCC
651 Loach GGGCACCAATGATACTGAAGCTACGAGTATACTGATTATGAAAACTTAAGTTTTGACTCC
652 Mouse GGGCACCAATGATACTGAAGCTACGAATATACTGACTATGAAGACCTATGCTTTGATTCA
653 Rat GGACACCAATGATACTGAAGCTATGAATATACTGACTATGAAGACCTATGCTTTGACTCC
654 Seal GGACATCAGTGATACTGAAGCTATGAGTACACAGACTACGAAGACCTGAACTTTGACTCA
655 Whale GGTCACCAATGATATTGAAGCTATGAGTATACCGACTACGAAGACCTAAGCTTCGACTCC
656 Frog GGCCACCAATGATACTGAAGCTACGAATATACTAACTATGAGGATCTCTCATTTGACTCT
657
658 Cow TACATAATTCCAACATCAGAATTAAAGCCAGGGGAGCTACGACTATTAGAAGTCGATAAT
659 Carp TATATAGTACCAACCCAAGACCTTGCCCCCGGACAATTCCGACTTCTGGAAACAGACCAC
660 Chicken TACATAACCCCAACAACAGACCTCCCCCTAGGCCACTTCCGCCTACTAGAAGTCGACCAT
661 Human TACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTGACAAT
662 Loach TACATAATCCCCACCCAGGACCTAACCCCTGGACAATTCCGGCTACTAGAGACAGACCAC
663 Mouse TATATAATCCCAACAAACGACCTAAAACCTGGTGAACTACGACTGCTAGAAGTTGATAAC
664 Rat TACATAATCCCAACCAATGACCTAAAACCAGGTGAACTTCGTCTATTAGAAGTTGATAAT
665 Seal TATATGATCCCCACACAAGAACTAAAGCCCGGAGAACTACGACTGCTAGAAGTAGACAAT
666 Whale TATATAATCCCAACATCAGACCTAAAGCCAGGAGAACTACGATTATTAGAAGTAGATAAC
667 Frog TATATAATTCCAACTAATGACCTTACCCCTGGACAATTCCGGCTGCTAGAAGTTGATAAT
668
669 Cow CGAGTTGTACTACCAATAGAAATAACAATCCGAATGTTAGTCTCCTCTGAAGACGTATTA
670 Carp CGAATAGTTGTTCCAATAGAATCCCCAGTCCGTGTCCTAGTATCTGCTGAAGACGTGCTA
671 Chicken CGCATTGTAATCCCCATAGAATCCCCCATTCGAGTAATCATCACCGCTGATGACGTCCTC
672 Human CGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTG
673 Loach CGAATGGTTGTTCCCATAGAATCCCCTATTCGCATTCTTGTTTCCGCCGAAGATGTACTA
674 Mouse CGAGTCGTTCTGCCAATAGAACTTCCAATCCGTATATTAATTTCATCTGAAGACGTCCTC
675 Rat CGGGTAGTCTTACCAATAGAACTTCCAATTCGTATACTAATCTCATCCGAAGACGTCCTG
676 Seal CGAGTAGTCCTCCCAATAGAAATAACAATCCGCATACTAATCTCATCAGAAGATGTACTC
677 Whale CGAGTTGTCTTACCTATAGAAATAACAATCCGAATATTAGTCTCATCAGAAGACGTACTC
678 Frog CGAATAGTAGTCCCAATAGAATCTCCAACCCGACTTTTAGTTACAGCCGAAGACGTCCTC
679
680 Cow CACTCATGAGCTGTGCCCTCTCTAGGACTAAAAACAGACGCAATCCCAGGCCGTCTAAAC
681 Carp CATTCTTGAGCTGTTCCATCCCTTGGCGTAAAAATGGACGCAGTCCCAGGACGACTAAAT
682 Chicken CACTCATGAGCCGTACCCGCCCTCGGGGTAAAAACAGACGCAATCCCTGGACGACTAAAT
683 Human CACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAAC
684 Loach CACTCCTGGGCCCTTCCAGCCATGGGGGTAAAGATAGACGCGGTCCCAGGACGCCTTAAC
685 Mouse CACTCATGAGCAGTCCCCTCCCTAGGACTTAAAACTGATGCCATCCCAGGCCGACTAAAT
686 Rat CACTCATGAGCCATCCCTTCACTAGGGTTAAAAACCGACGCAATCCCCGGCCGCCTAAAC
687 Seal CACTCATGAGCCGTACCGTCCCTAGGACTAAAAACTGATGCTATCCCAGGACGACTAAAC
688 Whale CACTCATGGGCCGTACCCTCCTTGGGCCTAAAAACAGATGCAATCCCAGGACGCCTAAAC
689 Frog CACTCGTGAGCTGTACCCTCCTTGGGTGTCAAAACAGATGCAATCCCAGGACGACTTCAT
690
691 Cow CAAACAACCCTTATATCGTCCCGTCCAGGCTTATATTACGGTCAATGCTCAGAAATTTGC
692 Carp CAAGCCGCCTTTATTGCCTCACGCCCAGGGGTCTTTTACGGACAATGCTCTGAAATTTGT
693 Chicken CAAACCTCCTTCATCACCACTCGACCAGGAGTGTTTTACGGACAATGCTCAGAAATCTGC
694 Human CAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGT
695 Loach CAAACCGCCTTTATTGCCTCCCGCCCCGGGGTATTCTATGGGCAATGCTCAGAAATCTGT
696 Mouse CAAGCAACAGTAACATCAAACCGACCAGGGTTATTCTATGGCCAATGCTCTGAAATTTGT
697 Rat CAAGCTACAGTCACATCAAACCGACCAGGTCTATTCTATGGCCAATGCTCTGAAATTTGC
698 Seal CAAACAACCCTAATAACCATACGACCAGGACTGTACTACGGTCAATGCTCAGAAATCTGT
699 Whale CAAACAACCTTAATATCAACACGACCAGGCCTATTTTATGGACAATGCTCAGAGATCTGC
700 Frog CAAACATCATTTATTGCTACTCGTCCGGGAGTATTTTACGGACAATGTTCAGAAATTTGC
701
702 Cow GGGTCAAACCACAGTTTCATACCCATTGTCCTTGAGTTAGTCCCACTAAAGTACTTTGAA
703 Carp GGAGCTAATCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCTCTCGAACACTTCGAA
704 Chicken GGAGCTAACCACAGCTACATACCCATTGTAGTAGAGTCTACCCCCCTAAAACACTTTGAA
705 Human GGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAA
706 Loach GGAGCAAACCACAGCTTTATACCCATCGTAGTAGAAGCGGTCCCACTATCTCACTTCGAA
707 Mouse GGATCTAACCATAGCTTTATGCCCATTGTCCTAGAAATGGTTCCACTAAAATATTTCGAA
708 Rat GGCTCAAATCACAGCTTCATACCCATTGTACTAGAAATAGTGCCTCTAAAATATTTCGAA
709 Seal GGTTCAAACCACAGCTTCATACCTATTGTCCTCGAATTGGTCCCACTATCCCACTTCGAG
710 Whale GGCTCAAACCACAGTTTCATACCAATTGTCCTAGAACTAGTACCCCTAGAAGTCTTTGAA
711 Frog GGAGCAAACCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCGCTAACCGACTTTGAA
712
713 Cow AAATGATCTGCGTCAATATTA---------------------TAA
714 Carp AACTGATCCTCATTAATACTAGAAGACGCCTCGCTAGGAAGCTAA
715 Chicken GCCTGATCCTCACTA------------------CTGTCATCTTAA
716 Human ATA---------------------GGGCCCGTATTTACCCTATAG
717 Loach AACTGGTCCACCCTTATACTAAAAGACGCCTCACTAGGAAGCTAA
718 Mouse AACTGATCTGCTTCAATAATT---------------------TAA
719 Rat AACTGATCAGCTTCTATAATT---------------------TAA
720 Seal AAATGATCTACCTCAATGCTT---------------------TAA
721 Whale AAATGATCTGTATCAATACTA---------------------TAA
722 Frog AACTGATCTTCATCAATACTA---GAAGCATCACTA------AGA
723 ;
724 End;
725 """
726
727
728
729 nxs_example3 = \
730 """#NEXUS
731
732 Begin data;
733 Dimensions ntax=10 nchar=234;
734 Format datatype=protein gap=- interleave;
735 Matrix
736 Cow MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
737 Carp MAHPTQLGFKDAAMPVMEELLHFHDHALMIVLLISTLVLYIITAMVSTKLTNKYILDSQE
738 Chicken MANHSQLGFQDASSPIMEELVEFHDHALMVALAICSLVLYLLTLMLMEKLS-SNTVDAQE
739 Human MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTLTTKLTNTNISDAQE
740 Loach MAHPTQLGFQDAASPVMEELLHFHDHALMIVFLISALVLYVIITTVSTKLTNMYILDSQE
741 Mouse MAYPFQLGLQDATSPIMEELMNFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
742 Rat MAYPFQLGLQDATSPIMEELTNFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
743 Seal MAYPLQMGLQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQE
744 Whale MAYPFQLGFQDAASPIMEELLHFHDHTLMIVFLISSLVLYIITLMLTTKLTHTSTMDAQE
745 Frog MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQE
746
747 Cow VETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDS
748 Carp IEIVWTILPAVILVLIALPSLRILYLMDEINDPHLTIKAMGHQWYWSYEYTDYENLGFDS
749 Chicken VELIWTILPAIVLVLLALPSLQILYMMDEIDEPDLTLKAIGHQWYWTYEYTDFKDLSFDS
750 Human METVWTILPAIILVLIALPSLRILYMTDEVNDPSLTIKSIGHQWYWTYEYTDYGGLIFNS
751 Loach IEIVWTVLPALILILIALPSLRILYLMDEINDPHLTIKAMGHQWYWSYEYTDYENLSFDS
752 Mouse VETIWTILPAVILIMIALPSLRILYMMDEINNPVLTVKTMGHQWYWSYEYTDYEDLCFDS
753 Rat VETIWTILPAVILILIALPSLRILYMMDEINNPVLTVKTMGHQWYWSYEYTDYEDLCFDS
754 Seal VETVWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLNFDS
755 Whale VETVWTILPAIILILIALPSLRILYMMDEVNNPSLTVKTMGHQWYWSYEYTDYEDLSFDS
756 Frog IEMVWTIMPAISLIMIALPSLRILYLMDEVNDPHLTIKAIGHQWYWSYEYTNYEDLSFDS
757
758 Cow YMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN
759 Carp YMVPTQDLAPGQFRLLETDHRMVVPMESPVRVLVSAEDVLHSWAVPSLGVKMDAVPGRLN
760 Chicken YMTPTTDLPLGHFRLLEVDHRIVIPMESPIRVIITADDVLHSWAVPALGVKTDAIPGRLN
761 Human YMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVLHSWAVPTLGLKTDAIPGRLN
762 Loach YMIPTQDLTPGQFRLLETDHRMVVPMESPIRILVSAEDVLHSWALPAMGVKMDAVPGRLN
763 Mouse YMIPTNDLKPGELRLLEVDNRVVLPMELPIRMLISSEDVLHSWAVPSLGLKTDAIPGRLN
764 Rat YMIPTNDLKPGELRLLEVDNRVVLPMELPIRMLISSEDVLHSWAIPSLGLKTDAIPGRLN
765 Seal YMIPTQELKPGELRLLEVDNRVVLPMEMTIRMLISSEDVLHSWAVPSLGLKTDAIPGRLN
766 Whale YMIPTSDLKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLN
767 Frog YMIPTNDLTPGQFRLLEVDNRMVVPMESPTRLLVTAEDVLHSWAVPSLGVKTDAIPGRLH
768
769 Cow QTTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML-------
770 Carp QAAFIASRPGVFYGQCSEICGANHSFMPIVVEAVPLEHFENWSSLMLEDASLGS
771 Chicken QTSFITTRPGVFYGQCSEICGANHSYMPIVVESTPLKHFEAWSSL------LSS
772 Human QTTFTATRPGVYYGQCSEICGANHSFMPIVLELIPLKIFEM-------GPVFTL
773 Loach QTAFIASRPGVFYGQCSEICGANHSFMPIVVEAVPLSHFENWSTLMLKDASLGS
774 Mouse QATVTSNRPGLFYGQCSEICGSNHSFMPIVLEMVPLKYFENWSASMI-------
775 Rat QATVTSNRPGLFYGQCSEICGSNHSFMPIVLEMVPLKYFENWSASMI-------
776 Seal QTTLMTMRPGLYYGQCSEICGSNHSFMPIVLELVPLSHFEKWSTSML-------
777 Whale QTTLMSTRPGLFYGQCSEICGSNHSFMPIVLELVPLEVFEKWSVSML-------
778 Frog QTSFIATRPGVFYGQCSEICGANHSFMPIVVEAVPLTDFENWSSSML-EASL--
779 ;
780 End;
781 """
782
783
784
785 sth_example = \
786 """# STOCKHOLM 1.0
787 #=GF ID CBS
788 #=GF AC PF00571
789 #=GF DE CBS domain
790 #=GF AU Bateman A
791 #=GF CC CBS domains are small intracellular modules mostly found
792 #=GF CC in 2 or four copies within a protein.
793 #=GF SQ 67
794 #=GS O31698/18-71 AC O31698
795 #=GS O83071/192-246 AC O83071
796 #=GS O83071/259-312 AC O83071
797 #=GS O31698/88-139 AC O31698
798 #=GS O31698/88-139 OS Bacillus subtilis
799 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
800 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
801 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
802 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
803 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
804 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
805 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
806 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
807 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
808 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
809 #=GR O31699/88-139 AS ________________*__________________________
810 #=GR_O31699/88-139_IN ____________1______________2__________0____
811 //
812 """
813
814
815
816 sth_example2 = \
817 """# STOCKHOLM 1.0
818 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
819 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
820 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
821 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
822 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
823
824 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
825 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
826 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
827 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
828 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
829 //"""
830
831
832
833 gbk_example = \
834 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
835 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
836 (AXL2) and Rev7p (REV7) genes, complete cds.
837 ACCESSION U49845
838 VERSION U49845.1 GI:1293613
839 KEYWORDS .
840 SOURCE Saccharomyces cerevisiae (baker's yeast)
841 ORGANISM Saccharomyces cerevisiae
842 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
843 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
844 REFERENCE 1 (bases 1 to 5028)
845 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
846 TITLE Cloning and sequence of REV7, a gene whose function is required for
847 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
848 JOURNAL Yeast 10 (11), 1503-1509 (1994)
849 PUBMED 7871890
850 REFERENCE 2 (bases 1 to 5028)
851 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
852 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
853 plasma membrane glycoprotein
854 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
855 PUBMED 8846915
856 REFERENCE 3 (bases 1 to 5028)
857 AUTHORS Roemer,T.
858 TITLE Direct Submission
859 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
860 Haven, CT, USA
861 FEATURES Location/Qualifiers
862 source 1..5028
863 /organism="Saccharomyces cerevisiae"
864 /db_xref="taxon:4932"
865 /chromosome="IX"
866 /map="9"
867 CDS <1..206
868 /codon_start=3
869 /product="TCP1-beta"
870 /protein_id="AAA98665.1"
871 /db_xref="GI:1293614"
872 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
873 AEVLLRVDNIIRARPRTANRQHM"
874 gene 687..3158
875 /gene="AXL2"
876 CDS 687..3158
877 /gene="AXL2"
878 /note="plasma membrane glycoprotein"
879 /codon_start=1
880 /function="required for axial budding pattern of S.
881 cerevisiae"
882 /product="Axl2p"
883 /protein_id="AAA98666.1"
884 /db_xref="GI:1293615"
885 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
886 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
887 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
888 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
889 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
890 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
891 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
892 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
893 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
894 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
895 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
896 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
897 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
898 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
899 VDFSNKSNVNVGQVKDIHGRIPEML"
900 gene complement(3300..4037)
901 /gene="REV7"
902 CDS complement(3300..4037)
903 /gene="REV7"
904 /codon_start=1
905 /product="Rev7p"
906 /protein_id="AAA98667.1"
907 /db_xref="GI:1293616"
908 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
909 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
910 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
911 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
912 LISGDDKILNGVYSQYEEGESIFGSLF"
913 ORIGIN
914 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
915 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
916 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
917 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
918 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
919 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
920 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
921 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
922 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
923 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
924 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
925 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
926 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
927 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
928 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
929 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
930 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
931 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
932 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
933 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
934 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
935 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
936 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
937 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
938 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
939 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
940 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
941 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
942 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
943 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
944 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
945 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
946 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
947 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
948 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
949 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
950 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
951 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
952 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
953 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
954 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
955 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
956 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
957 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
958 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
959 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
960 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
961 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
962 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
963 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
964 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
965 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
966 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
967 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
968 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
969 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
970 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
971 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
972 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
973 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
974 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
975 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
976 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
977 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
978 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
979 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
980 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
981 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
982 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
983 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
984 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
985 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
986 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
987 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
988 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
989 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
990 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
991 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
992 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
993 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
994 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
995 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
996 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
997 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
998 //"""
999
1000
1001
1002 gbk_example2 = \
1003 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1004 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1005 ACCESSION AAD51968
1006 VERSION AAD51968.1 GI:5805369
1007 DBSOURCE locus AF171097 accession AF171097.1
1008 KEYWORDS .
1009 SOURCE Yersinia enterocolitica
1010 ORGANISM Yersinia enterocolitica
1011 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1012 Enterobacteriaceae; Yersinia.
1013 REFERENCE 1 (residues 1 to 143)
1014 AUTHORS Revell,P.A. and Miller,V.L.
1015 TITLE A chromosomally encoded regulator is required for expression of the
1016 Yersinia enterocolitica inv gene and for virulence
1017 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1018 MEDLINE 20138369
1019 PUBMED 10672189
1020 REFERENCE 2 (residues 1 to 143)
1021 AUTHORS Revell,P.A. and Miller,V.L.
1022 TITLE Direct Submission
1023 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1024 University School of Medicine, Campus Box 8230, 660 South Euclid,
1025 St. Louis, MO 63110, USA
1026 COMMENT Method: conceptual translation.
1027 FEATURES Location/Qualifiers
1028 source 1..143
1029 /organism="Yersinia enterocolitica"
1030 /mol_type="unassigned DNA"
1031 /strain="JB580v"
1032 /serotype="O:8"
1033 /db_xref="taxon:630"
1034 Protein 1..143
1035 /product="transcriptional regulator RovA"
1036 /name="regulates inv expression"
1037 CDS 1..143
1038 /gene="rovA"
1039 /coded_by="AF171097.1:380..811"
1040 /note="regulator of virulence"
1041 /transl_table=11
1042 ORIGIN
1043 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1044 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1045 121 deiellsgli dklerniiql qsk
1046 //"""
1047
1048
1049 swiss_example = \
1050 """ID 104K_THEAN Reviewed; 893 AA.
1051 AC Q4U9M9;
1052 DT 18-APR-2006, integrated into UniProtKB/Swiss-Prot.
1053 DT 05-JUL-2005, sequence version 1.
1054 DT 31-OCT-2006, entry version 8.
1055 DE 104 kDa microneme-rhoptry antigen precursor (p104).
1056 GN ORFNames=TA08425;
1057 OS Theileria annulata.
1058 OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;
1059 OC Theileria.
1060 OX NCBI_TaxID=5874;
1061 RN [1]
1062 RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
1063 RC STRAIN=Ankara;
1064 RX PubMed=15994557; DOI=10.1126/science.1110418;
1065 RA Pain A., Renauld H., Berriman M., Murphy L., Yeats C.A., Weir W.,
1066 RA Kerhornou A., Aslett M., Bishop R., Bouchier C., Cochet M.,
1067 RA Coulson R.M.R., Cronin A., de Villiers E.P., Fraser A., Fosker N.,
1068 RA Gardner M., Goble A., Griffiths-Jones S., Harris D.E., Katzer F.,
1069 RA Larke N., Lord A., Maser P., McKellar S., Mooney P., Morton F.,
1070 RA Nene V., O'Neil S., Price C., Quail M.A., Rabbinowitsch E.,
1071 RA Rawlings N.D., Rutter S., Saunders D., Seeger K., Shah T., Squares R.,
1072 RA Squares S., Tivey A., Walker A.R., Woodward J., Dobbelaere D.A.E.,
1073 RA Langsley G., Rajandream M.A., McKeever D., Shiels B., Tait A.,
1074 RA Barrell B.G., Hall N.;
1075 RT "Genome of the host-cell transforming parasite Theileria annulata
1076 RT compared with T. parva.";
1077 RL Science 309:131-133(2005).
1078 CC -!- SUBCELLULAR LOCATION: Cell membrane; lipid-anchor; GPI-anchor
1079 CC (Potential). In microneme/rhoptry complexes (By similarity).
1080 DR EMBL; CR940353; CAI76474.1; -; Genomic_DNA.
1081 DR InterPro; IPR007480; DUF529.
1082 DR Pfam; PF04385; FAINT; 4.
1083 KW Complete proteome; GPI-anchor; Lipoprotein; Membrane; Repeat; Signal;
1084 KW Sporozoite.
1085 FT SIGNAL 1 19 Potential.
1086 FT CHAIN 20 873 104 kDa microneme-rhoptry antigen.
1087 FT /FTId=PRO_0000232680.
1088 FT PROPEP 874 893 Removed in mature form (Potential).
1089 FT /FTId=PRO_0000232681.
1090 FT COMPBIAS 215 220 Poly-Leu.
1091 FT COMPBIAS 486 683 Lys-rich.
1092 FT COMPBIAS 854 859 Poly-Arg.
1093 FT LIPID 873 873 GPI-anchor amidated aspartate
1094 FT (Potential).
1095 SQ SEQUENCE 893 AA; 101921 MW; 2F67CEB3B02E7AC1 CRC64;
1096 MKFLVLLFNI LCLFPILGAD ELVMSPIPTT DVQPKVTFDI NSEVSSGPLY LNPVEMAGVK
1097 YLQLQRQPGV QVHKVVEGDI VIWENEEMPL YTCAIVTQNE VPYMAYVELL EDPDLIFFLK
1098 EGDQWAPIPE DQYLARLQQL RQQIHTESFF SLNLSFQHEN YKYEMVSSFQ HSIKMVVFTP
1099 KNGHICKMVY DKNIRIFKAL YNEYVTSVIG FFRGLKLLLL NIFVIDDRGM IGNKYFQLLD
1100 DKYAPISVQG YVATIPKLKD FAEPYHPIIL DISDIDYVNF YLGDATYHDP GFKIVPKTPQ
1101 CITKVVDGNE VIYESSNPSV ECVYKVTYYD KKNESMLRLD LNHSPPSYTS YYAKREGVWV
1102 TSTYIDLEEK IEELQDHRST ELDVMFMSDK DLNVVPLTNG NLEYFMVTPK PHRDIIIVFD
1103 GSEVLWYYEG LENHLVCTWI YVTEGAPRLV HLRVKDRIPQ NTDIYMVKFG EYWVRISKTQ
1104 YTQEIKKLIK KSKKKLPSIE EEDSDKHGGP PKGPEPPTGP GHSSSESKEH EDSKESKEPK
1105 EHGSPKETKE GEVTKKPGPA KEHKPSKIPV YTKRPEFPKK SKSPKRPESP KSPKRPVSPQ
1106 RPVSPKSPKR PESLDIPKSP KRPESPKSPK RPVSPQRPVS PRRPESPKSP KSPKSPKSPK
1107 VPFDPKFKEK LYDSYLDKAA KTKETVTLPP VLPTDESFTH TPIGEPTAEQ PDDIEPIEES
1108 VFIKETGILT EEVKTEDIHS ETGEPEEPKR PDSPTKHSPK PTGTHPSMPK KRRRSDGLAL
1109 STTDLESEAG RILRDPTGKI VTMKRSKSFD DLTTVREKEH MGAEIRKIVV DDDGTEADDE
1110 DTHPSKEKHL STVRRRRPRP KKSSKSSKPR KPDSAFVPSI IFIFLVSLIV GIL
1111 //
1112 ID 104K_THEPA Reviewed; 924 AA.
1113 AC P15711; Q4N2B5;
1114 DT 01-APR-1990, integrated into UniProtKB/Swiss-Prot.
1115 DT 01-APR-1990, sequence version 1.
1116 DT 31-OCT-2006, entry version 31.
1117 DE 104 kDa microneme-rhoptry antigen precursor (p104).
1118 GN OrderedLocusNames=TP04_0437;
1119 OS Theileria parva.
1120 OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae;
1121 OC Theileria.
1122 OX NCBI_TaxID=5875;
1123 RN [1]
1124 RP NUCLEOTIDE SEQUENCE [GENOMIC DNA].
1125 RC STRAIN=Muguga;
1126 RX MEDLINE=90158697; PubMed=1689460; DOI=10.1016/0166-6851(90)90007-9;
1127 RA Iams K.P., Young J.R., Nene V., Desai J., Webster P., Ole-Moiyoi O.K.,
1128 RA Musoke A.J.;
1129 RT "Characterisation of the gene encoding a 104-kilodalton microneme-
1130 RT rhoptry protein of Theileria parva.";
1131 RL Mol. Biochem. Parasitol. 39:47-60(1990).
1132 RN [2]
1133 RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
1134 RC STRAIN=Muguga;
1135 RX PubMed=15994558; DOI=10.1126/science.1110439;
1136 RA Gardner M.J., Bishop R., Shah T., de Villiers E.P., Carlton J.M.,
1137 RA Hall N., Ren Q., Paulsen I.T., Pain A., Berriman M., Wilson R.J.M.,
1138 RA Sato S., Ralph S.A., Mann D.J., Xiong Z., Shallom S.J., Weidman J.,
1139 RA Jiang L., Lynn J., Weaver B., Shoaibi A., Domingo A.R., Wasawo D.,
1140 RA Crabtree J., Wortman J.R., Haas B., Angiuoli S.V., Creasy T.H., Lu C.,
1141 RA Suh B., Silva J.C., Utterback T.R., Feldblyum T.V., Pertea M.,
1142 RA Allen J., Nierman W.C., Taracha E.L.N., Salzberg S.L., White O.R.,
1143 RA Fitzhugh H.A., Morzaria S., Venter J.C., Fraser C.M., Nene V.;
1144 RT "Genome sequence of Theileria parva, a bovine pathogen that transforms
1145 RT lymphocytes.";
1146 RL Science 309:134-137(2005).
1147 CC -!- SUBCELLULAR LOCATION: Cell membrane; lipid-anchor; GPI-anchor
1148 CC (Potential). In microneme/rhoptry complexes.
1149 CC -!- DEVELOPMENTAL STAGE: Sporozoite antigen.
1150 DR EMBL; M29954; AAA18217.1; -; Unassigned_DNA.
1151 DR EMBL; AAGK01000004; EAN31789.1; -; Genomic_DNA.
1152 DR PIR; A44945; A44945.
1153 DR InterPro; IPR007480; DUF529.
1154 DR Pfam; PF04385; FAINT; 4.
1155 KW Complete proteome; GPI-anchor; Lipoprotein; Membrane; Repeat; Signal;
1156 KW Sporozoite.
1157 FT SIGNAL 1 19 Potential.
1158 FT CHAIN 20 904 104 kDa microneme-rhoptry antigen.
1159 FT /FTId=PRO_0000046081.
1160 FT PROPEP 905 924 Removed in mature form (Potential).
1161 FT /FTId=PRO_0000232679.
1162 FT COMPBIAS 508 753 Pro-rich.
1163 FT COMPBIAS 880 883 Poly-Arg.
1164 FT LIPID 904 904 GPI-anchor amidated aspartate
1165 FT (Potential).
1166 SQ SEQUENCE 924 AA; 103626 MW; 289B4B554A61870E CRC64;
1167 MKFLILLFNI LCLFPVLAAD NHGVGPQGAS GVDPITFDIN SNQTGPAFLT AVEMAGVKYL
1168 QVQHGSNVNI HRLVEGNVVI WENASTPLYT GAIVTNNDGP YMAYVEVLGD PNLQFFIKSG
1169 DAWVTLSEHE YLAKLQEIRQ AVHIESVFSL NMAFQLENNK YEVETHAKNG ANMVTFIPRN
1170 GHICKMVYHK NVRIYKATGN DTVTSVVGFF RGLRLLLINV FSIDDNGMMS NRYFQHVDDK
1171 YVPISQKNYE TGIVKLKDYK HAYHPVDLDI KDIDYTMFHL ADATYHEPCF KIIPNTGFCI
1172 TKLFDGDQVL YESFNPLIHC INEVHIYDRN NGSIICLHLN YSPPSYKAYL VLKDTGWEAT
1173 THPLLEEKIE ELQDQRACEL DVNFISDKDL YVAALTNADL NYTMVTPRPH RDVIRVSDGS
1174 EVLWYYEGLD NFLVCAWIYV SDGVASLVHL RIKDRIPANN DIYVLKGDLY WTRITKIQFT
1175 QEIKRLVKKS KKKLAPITEE DSDKHDEPPE GPGASGLPPK APGDKEGSEG HKGPSKGSDS
1176 SKEGKKPGSG KKPGPAREHK PSKIPTLSKK PSGPKDPKHP RDPKEPRKSK SPRTASPTRR
1177 PSPKLPQLSK LPKSTSPRSP PPPTRPSSPE RPEGTKIIKT SKPPSPKPPF DPSFKEKFYD
1178 DYSKAASRSK ETKTTVVLDE SFESILKETL PETPGTPFTT PRPVPPKRPR TPESPFEPPK
1179 DPDSPSTSPS EFFTPPESKR TRFHETPADT PLPDVTAELF KEPDVTAETK SPDEAMKRPR
1180 SPSEYEDTSP GDYPSLPMKR HRLERLRLTT TEMETDPGRM AKDASGKPVK LKRSKSFDDL
1181 TTVELAPEPK ASRIVVDDEG TEADDEETHP PEERQKTEVR RRRPPKKPSK SPRPSKPKKP
1182 KKPDSAYIPS ILAILVVSLI VGIL
1183 //
1184 ID 108_SOLLC Reviewed; 102 AA.
1185 AC Q43495;
1186 DT 15-JUL-1999, integrated into UniProtKB/Swiss-Prot.
1187 DT 01-NOV-1996, sequence version 1.
1188 DT 31-OCT-2006, entry version 37.
1189 DE Protein 108 precursor.
1190 OS Solanum lycopersicum (Tomato) (Lycopersicon esculentum).
1191 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1192 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons;
1193 OC asterids; lamiids; Solanales; Solanaceae; Solanum; Lycopersicon.
1194 OX NCBI_TaxID=4081;
1195 RN [1]
1196 RP NUCLEOTIDE SEQUENCE [MRNA].
1197 RC STRAIN=cv. VF36; TISSUE=Anther;
1198 RX MEDLINE=94143497; PubMed=8310077; DOI=10.1104/pp.101.4.1413;
1199 RA Chen R., Smith A.G.;
1200 RT "Nucleotide sequence of a stamen- and tapetum-specific gene from
1201 RT Lycopersicon esculentum.";
1202 RL Plant Physiol. 101:1413-1413(1993).
1203 CC -!- TISSUE SPECIFICITY: Stamen- and tapetum-specific.
1204 CC -!- SIMILARITY: Belongs to the A9/FIL1 family.
1205 DR EMBL; Z14088; CAA78466.1; -; mRNA.
1206 DR PIR; S26409; S26409.
1207 DR InterPro; IPR013770; LPT_helical.
1208 DR InterPro; IPR003612; LTP/seed_store/tryp_amyl_inhib.
1209 DR Pfam; PF00234; Tryp_alpha_amyl; 1.
1210 DR SMART; SM00499; AAI; 1.
1211 KW Signal.
1212 FT SIGNAL 1 30 Potential.
1213 FT CHAIN 31 102 Protein 108.
1214 FT /FTId=PRO_0000000238.
1215 FT DISULFID 41 77 By similarity.
1216 FT DISULFID 51 66 By similarity.
1217 FT DISULFID 67 92 By similarity.
1218 FT DISULFID 79 99 By similarity.
1219 SQ SEQUENCE 102 AA; 10576 MW; CFBAA1231C3A5E92 CRC64;
1220 MASVKSSSSS SSSSFISLLL LILLVIVLQS QVIECQPQQS CTASLTGLNV CAPFLVPGSP
1221 TASTECCNAV QSINHDCMCN TMRIAAQIPA QCNLPPLSCS AN
1222 //
1223 """
1224
1225 print "#########################################################"
1226 print "# Sequence Input Tests #"
1227 print "#########################################################"
1228
1229
1230
# Table of parsing test cases.  Each entry is a tuple of:
#   (example data string, format name, expected number of records,
#    expected ID of the last record,
#    expected sequence of the last record (None skips that check),
#    flag consumed as "dict_check" by the input tests and as "unique_ids"
#    by the output tests)
# Long sequences are split over several adjacent string literals, which
# Python joins at compile time.
tests = [
    (aln_example, "clustal", 8, "HISJ_E_COLI",
     "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG"
     "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS"
     "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLVVAKNSDIQP-TVE"
     "SLKGKRVGVLQGTTQETFGNEHWAPKGIEIVSYQGQDNIYSDLTAGRIDA"
     "AFQDEVAASEGFLKQPVGKDYKFGGPSVKDEKLFGVGTGMGLRKED--NE"
     "LREALNKAFAEMRADGTYEKLAKKYFDFDVYGG---",
     True),
    (phy_example, "phylip", 8, "HISJ_E_COL", None, False),
    (nxs_example, "nexus", 8, "HISJ_E_COLI", None, True),
    (nxs_example2, "nexus", 10, "Frog",
     "ATGGCACACCCATCACAATTAGGTTTTCAAGACGCAGCCTCTCCAATTATAGAAGAATTA"
     "CTTCACTTCCACGACCATACCCTCATAGCCGTTTTTCTTATTAGTACGCTAGTTCTTTAC"
     "ATTATTACTATTATAATAACTACTAAACTAACTAATACAAACCTAATGGACGCACAAGAG"
     "ATCGAAATAGTGTGAACTATTATACCAGCTATTAGCCTCATCATAATTGCCCTTCCATCC"
     "CTTCGTATCCTATATTTAATAGATGAAGTTAATGATCCACACTTAACAATTAAAGCAATC"
     "GGCCACCAATGATACTGAAGCTACGAATATACTAACTATGAGGATCTCTCATTTGACTCT"
     "TATATAATTCCAACTAATGACCTTACCCCTGGACAATTCCGGCTGCTAGAAGTTGATAAT"
     "CGAATAGTAGTCCCAATAGAATCTCCAACCCGACTTTTAGTTACAGCCGAAGACGTCCTC"
     "CACTCGTGAGCTGTACCCTCCTTGGGTGTCAAAACAGATGCAATCCCAGGACGACTTCAT"
     "CAAACATCATTTATTGCTACTCGTCCGGGAGTATTTTACGGACAATGTTCAGAAATTTGC"
     "GGAGCAAACCACAGCTTTATACCAATTGTAGTTGAAGCAGTACCGCTAACCGACTTTGAA"
     "AACTGATCTTCATCAATACTA---GAAGCATCACTA------AGA",
     True),
    (nxs_example3, "nexus", 10, "Frog",
     'MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQE'
     'IEMVWTIMPAISLIMIALPSLRILYLMDEVNDPHLTIKAIGHQWYWSYEYTNYEDLSFDS'
     'YMIPTNDLTPGQFRLLEVDNRMVVPMESPTRLLVTAEDVLHSWAVPSLGVKTDAIPGRLH'
     'QTSFIATRPGVFYGQCSEICGANHSFMPIVVEAVPLTDFENWSSSML-EASL--',
     True),
    (faa_example, "fasta", 8, "HISJ_E_COLI",
     'mkklvlslslvlafssataafaaipqnirigtdptyapfesknsqgelvgfdidlakelc'
     'krintqctfvenpldalipslkakkidaimsslsitekrqqeiaftdklyaadsrlvvak'
     'nsdiqptveslkgkrvgvlqgttqetfgnehwapkgieivsyqgqdniysdltagridaa'
     'fqdevaasegflkqpvgkdykfggpsvkdeklfgvgtgmglrkednelrealnkafaemr'
     'adgtyeklakkyfdfdvygg',
     True),
    (sth_example, "stockholm", 5, "O31699/88-139",
     'EVMLTDIPRLHINDPIMK--GFGMVINN------GFVCVENDE',
     True),
    (sth_example2, "stockholm", 2, "AE007476.1",
     'AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU'
     'UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU',
     True),
    (gbk_example, "genbank", 1, "U49845.1", None, True),
    (gbk_example2, "genbank", 1, 'AAD51968.1',
     "MESTLGSDLARLVRVWRALIDHRLKPLELTQTHWVTLHNINRLPPEQSQIQLAKAIGIEQ"
     "PSLVRTLDQLEEKGLITRHTCANDRRAKRIKLTEQSSPIIEQVDGVICSTRKEILGGISP"
     "DEIELLSGLIDKLERNIIQLQSK",
     True),
    (gbk_example, "genbank-cds", 3, "AAA98667.1",
     'MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQFVPINRHPALIDYIEE'
     'LILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVDKDDQIITETEVFDEFRSS'
     'LNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNRRVDSLEEKAEIERDSNWVKC'
     'QEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEKLISGDDKILNGVYSQYEEGESI'
     'FGSLF',
     True),
    (swiss_example, "swiss", 3, "Q43495",
     "MASVKSSSSSSSSSFISLLLLILLVIVLQSQVIECQPQQSCTASLTGLNVCAPFLVPGSP"
     "TASTECCNAVQSINHDCMCNTMRIAAQIPAQCNLPPLSCSAN",
     True),
    ]
1285
1286 for (data, format, rec_count, last_id, last_seq, dict_check) in tests:
1287
1288 print "%s file with %i records" % (format, rec_count)
1289
1290 print "Bio.SeqIO.parse(handle)"
1291
1292
1293
1294 iterator = parse(StringIO(data), format=format)
1295 as_list = list(iterator)
1296 assert len(as_list) == rec_count, \
1297 "Expected %i records, found %i" \
1298 % (rec_count, len(as_list))
1299 assert as_list[-1].id == last_id, \
1300 "Expected '%s' as last record ID, found '%s'" \
1301 % (last_id, as_list[-1].id)
1302 if last_seq :
1303 assert as_list[-1].seq.tostring() == last_seq
1304
1305
1306 iterator = parse(StringIO(data), format=format)
1307 count = 1
1308 record = iterator.next()
1309 assert record is not None
1310 assert str(record.__class__) == "Bio.SeqRecord.SeqRecord"
1311
1312 for record in iterator :
1313 assert record.id == as_list[count].id
1314 assert record.seq.tostring() == as_list[count].seq.tostring()
1315 count = count + 1
1316 assert count == rec_count
1317 assert record is not None
1318 assert record.id == last_id
1319
1320
1321 iterator = parse(StringIO(data), format=format)
1322 count = 0
1323 while True :
1324 try :
1325 record = iterator.next()
1326 except StopIteration :
1327 break
1328 if record is None : break
1329 assert record.id == as_list[count].id
1330 assert record.seq.tostring() == as_list[count].seq.tostring()
1331 count=count+1
1332 assert count == rec_count
1333
1334 print "parse(handle)"
1335 iterator = parse(StringIO(data), format=format)
1336 for (i, record) in enumerate(iterator) :
1337 assert record.id == as_list[i].id
1338 assert record.seq.tostring() == as_list[i].seq.tostring()
1339 assert i+1 == rec_count
1340
1341 print "parse(handle to empty file)"
1342 iterator = parse(StringIO(""), format=format)
1343 assert len(list(iterator))==0
1344
1345 if dict_check :
1346 print "to_dict(parse(...))"
1347 seq_dict = to_dict(parse(StringIO(data), format=format))
1348 assert Set(seq_dict.keys()) == Set([r.id for r in as_list])
1349 assert last_id in seq_dict
1350 assert seq_dict[last_id].seq.tostring() == as_list[-1].seq.tostring()
1351
1352 if len(Set([len(r.seq) for r in as_list]))==1 :
1353
1354
1355 print "to_alignment(parse(handle))"
1356 alignment = to_alignment(parse(handle = StringIO(data), format=format))
1357 assert len(alignment._records)==rec_count
1358 assert alignment.get_alignment_length() == len(as_list[0].seq)
1359 for i in range(0, rec_count) :
1360 assert as_list[i].id == alignment._records[i].id
1361 assert as_list[i].id == alignment.get_all_seqs()[i].id
1362 assert as_list[i].seq.tostring() == alignment._records[i].seq.tostring()
1363 assert as_list[i].seq.tostring() == alignment.get_all_seqs()[i].seq.tostring()
1364
1365 print
1366
1367 print "Checking phy <-> aln examples agree using list(parse(...))"
1368
1369
1370
1371 aln_list = list(parse(StringIO(aln_example), format="clustal"))
1372 phy_list = list(parse(StringIO(phy_example), format="phylip"))
1373 assert len(aln_list) == len(phy_list)
1374 assert Set([r.id[0:10] for r in aln_list]) == Set([r.id for r in phy_list])
1375 for i in range(0, len(aln_list)) :
1376 assert aln_list[i].id[0:10] == phy_list[i].id
1377 assert aln_list[i].seq.tostring() == phy_list[i].seq.tostring()
1378
1379 print "Checking nxs <-> aln examples agree using parse"
1380
1381
1382
1383 aln_iter = parse(StringIO(aln_example), format="clustal")
1384 nxs_iter = parse(StringIO(nxs_example), format="nexus")
1385 while True :
1386 try :
1387 aln_record = aln_iter.next()
1388 except StopIteration :
1389 aln_record = None
1390 try :
1391 nxs_record = nxs_iter.next()
1392 except StopIteration :
1393 nxs_record = None
1394 if aln_record is None or nxs_record is None :
1395 assert aln_record is None
1396 assert nxs_record is None
1397 break
1398 assert aln_record.id == nxs_record.id
1399 assert aln_record.seq.tostring() == nxs_record.seq.tostring()
1400
1401 print "Checking faa <-> aln examples agree using to_dict(parse(...)"
1402
1403 aln_dict = to_dict(parse(StringIO(aln_example), format="clustal"))
1404 faa_dict = to_dict(parse(StringIO(faa_example), format="fasta"))
1405
1406 ids = Set(aln_dict.keys())
1407 assert ids == Set(faa_dict.keys())
1408
1409 for id in ids :
1410
1411 assert aln_dict[id].seq.tostring().upper().replace("-","") == \
1412 faa_dict[id].seq.tostring().upper()
1413
1414 print
1415 print "#########################################################"
1416 print "# Sequence Output Tests #"
1417 print "#########################################################"
1418 print
1419
1420 general_output_formats = ["fasta"]
1421 alignment_formats = ["phylip","stockholm","clustal"]
1422 for (in_data, in_format, rec_count, last_id, last_seq, unique_ids) in tests:
1423 if unique_ids :
1424 in_list = list(parse(StringIO(in_data), format=in_format))
1425 seq_lengths = [len(r.seq) for r in in_list]
1426 output_formats = general_output_formats[:]
1427 if min(seq_lengths)==max(seq_lengths) :
1428 output_formats.extend(alignment_formats)
1429 print "Checking conversion from %s (including to alignment formats)" % in_format
1430 else :
1431 print "Checking conversion from %s (excluding alignment formats)" % in_format
1432 for out_format in output_formats :
1433 print "Converting %s iterator -> %s" % (in_format, out_format)
1434 output = open("temp.txt","w")
1435 iterator = parse(StringIO(in_data), format=in_format)
1436
1437
1438
1439
1440 try :
1441 write(iterator, output, out_format)
1442 except ValueError, e:
1443 print "FAILED: %s" % str(e)
1444
1445 continue
1446
1447 output.close()
1448
1449 print "Checking %s <-> %s" % (in_format, out_format)
1450 out_list = list(parse(open("temp.txt","rU"), format=out_format))
1451
1452 assert rec_count == len(out_list)
1453 if last_seq :
1454 assert last_seq == out_list[-1].seq.tostring()
1455 if out_format=="phylip" :
1456 assert last_id[0:10] == out_list[-1].id
1457 else :
1458 assert last_id == out_list[-1].id
1459
1460 for i in range(0, rec_count) :
1461 assert in_list[-1].seq.tostring() == out_list[-1].seq.tostring()
1462 if out_format=="phylip" :
1463 assert in_list[i].id[0:10] == out_list[i].id
1464 else :
1465 assert in_list[i].id == out_list[i].id
1466 print
1467
1468 print "#########################################################"
1469 print "# SeqIO Tests finished #"
1470 print "#########################################################"
1471