1
2
3
4
5
6
7
8
9
10
11 """Nexus class. Parse the contents of a nexus file.
12 Based upon 'NEXUS: An extensible file format for systematic information'
13 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
14 """
15
16 import os,sys, math, random, copy
17 import sets
18
19 from Bio.Alphabet import IUPAC
20 from Bio.Data import IUPACData
21 from Bio.Seq import Seq
22
23 from Trees import Tree,NodeData
24
25 try:
26 import cnexus
27 except ImportError:
28 C=False
29 else:
30 C=True
31
32 INTERLEAVE=70
33 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
34 'matrix','tree', 'utree','translate']
35 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets']
36 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
37 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
38 WHITESPACE=' \t\n'
39
40 SPECIALCOMMENTS=['&']
41 CHARSET='chars'
42 TAXSET='taxa'
43
45
47 """Helps reading NEXUS-words and characters from a buffer."""
49 if string:
50 self.buffer=list(string)
51 else:
52 self.buffer=[]
53
55 if self.buffer:
56 return self.buffer[0]
57 else:
58 return None
59
61 b=''.join(self.buffer).strip()
62 if b:
63 return b[0]
64 else:
65 return None
66
68 if self.buffer:
69 return self.buffer.pop(0)
70 else:
71 return None
72
74 while True:
75 p=self.next()
76 if p is None:
77 break
78 if p not in WHITESPACE:
79 return p
80 return None
81
83 while self.buffer[0] in WHITESPACE:
84 self.buffer=self.buffer[1:]
85
87 for t in target:
88 try:
89 pos=self.buffer.index(t)
90 except ValueError:
91 pass
92 else:
93 found=''.join(self.buffer[:pos])
94 self.buffer=self.buffer[pos:]
95 return found
96 else:
97 return None
98
100 return ''.join(self.buffer[:len(word)])==word
101
103 """Return the next NEXUS word from a string, dealing with single and double quotes,
104 whitespace and punctuation.
105 """
106
107 word=[]
108 quoted=False
109 first=self.next_nonwhitespace()
110 if not first:
111 return None
112 word.append(first)
113 if first=="'":
114 quoted=True
115 elif first in PUNCTUATION:
116 return first
117 while True:
118 c=self.peek()
119 if c=="'":
120 word.append(self.next())
121 if self.peek()=="'":
122 skip=self.next()
123 elif quoted:
124 break
125 elif quoted:
126 word.append(self.next())
127 elif not c or c in PUNCTUATION or c in WHITESPACE:
128 break
129 else:
130 word.append(self.next())
131 return ''.join(word)
132
134 """Return the rest of the string without parsing."""
135 return ''.join(self.buffer)
136
138 """Calculate a stepmatrix for weighted parsimony.
139 See Wheeler (1990), Cladistics 6:269-275.
140 """
141
143 self.data={}
144 self.symbols=[s for s in symbols]
145 self.symbols.sort()
146 if gap:
147 self.symbols.append(gap)
148 for x in self.symbols:
149 for y in [s for s in self.symbols if s!=x]:
150 self.set(x,y,0)
151
152 - def set(self,x,y,value):
156
157 - def add(self,x,y,value):
161
164
171
173 for k in self.data:
174 if self.data[k]!=0:
175 self.data[k]=-math.log(self.data[k])
176 return self
177
178 - def smprint(self,name='your_name_here'):
179 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
180 matrix+=' %s\n' % ' '.join(self.symbols)
181 for x in self.symbols:
182 matrix+='[%s]'.ljust(8) % x
183 for y in self.symbols:
184 if x==y:
185 matrix+=' . '
186 else:
187 if x>y:
188 x1,y1=y,x
189 else:
190 x1,y1=x,y
191 if self.data[x1+y1]==0:
192 matrix+='inf. '
193 else:
194 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
195 matrix+='\n'
196 matrix+=';\n'
197 return matrix
198
200 """Return a taxon identifier according to NEXUS standard.
201 Wrap quotes around names with punctuation or whitespace, and double single quotes.
202 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes.
203 """
204 if mrbayes:
205 safe=name.replace(' ','_')
206 safe=''.join([c for c in safe if c in MRBAYESSAFE])
207 else:
208 safe=name.replace("'","''")
209 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)):
210 safe="'"+safe+"'"
211 return safe
212
214 """Remove quotes and/or double quotes around identifiers."""
215 if not word:
216 return None
217 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
218 word=word[1:-1]
219 return word
220
237
239 """Returns a sorted list of keys of p sorted by values of p."""
240 startpos=[(p[pn],pn) for pn in p if p[pn]]
241 startpos.sort()
242 return zip(*startpos)[1]
243
245 """Check that all values in list are unique and return a pruned and sorted list."""
246 l=list(sets.Set(l))
247 l.sort()
248 return l
249
251 """Converts a Seq-object matrix to a plain sequence-string matrix."""
252 return dict([(t,matrix[t].tostring()) for t in matrix])
253
255 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
256 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
257 """
258
259 if not orig_list:
260 return ''
261 orig_list=list(sets.Set(orig_list))
262 orig_list.sort()
263 shortlist=[]
264 clist=orig_list[:]
265 clist.append(clist[-1]+.5)
266 while len(clist)>1:
267 step=1
268 for i,x in enumerate(clist):
269 if x==clist[0]+i*step:
270 continue
271 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
272
273
274 step=x-clist[0]
275 else:
276 sub=clist[:i]
277 if len(sub)==1:
278 shortlist.append(str(sub[0]+1))
279 else:
280 if step==1:
281 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
282 else:
283 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
284 clist=clist[i:]
285 break
286 return ' '.join(shortlist)
287
289 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
290
291 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
292 Character sets, character partitions and taxon sets are prefixed, readjusted and present in
293 the combined matrix.
294 """
295
296 if not matrices:
297 return None
298 name=matrices[0][0]
299 combined=copy.deepcopy(matrices[0][1])
300 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1)
301 if mixed_datatypes:
302 combined.datatype='None'
303
304 combined.charlabels=None
305 combined.statelabels=None
306 combined.interleave=False
307 combined.translate=None
308
309
310 for cn,cs in combined.charsets.items():
311 combined.charsets['%s.%s' % (name,cn)]=cs
312 del combined.charsets[cn]
313 for tn,ts in combined.taxsets.items():
314 combined.taxsets['%s.%s' % (name,tn)]=ts
315 del combined.taxsets[tn]
316
317
318 combined.charpartitions={'combined':{name:range(combined.nchar)}}
319 for n,m in matrices[1:]:
320 both=[t for t in combined.taxlabels if t in m.taxlabels]
321 combined_only=[t for t in combined.taxlabels if t not in both]
322 m_only=[t for t in m.taxlabels if t not in both]
323 for t in both:
324
325 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
326
327 for t in combined_only:
328 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
329 for t in m_only:
330 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
331 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
332 combined.taxlabels.extend(m_only)
333 for cn,cs in m.charsets.items():
334 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
335 if m.taxsets:
336 if not combined.taxsets:
337 combined.taxsets={}
338 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()]))
339 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
340
341 if m.charlabels:
342 if not combined.charlabels:
343 combined.charlabels={}
344 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
345 combined.nchar+=m.nchar
346 combined.ntax+=len(m_only)
347
348 return combined
349
408
409
411 """Adjust linebreaks to match ';', strip leading/trailing whitespace
412
413 list_of_commandlines=_adjust_lines(input_text)
414 Lines are adjusted so that no linebreaks occur within a commandline
415 (except matrix command line)
416 """
417 formatted_lines=[]
418 for l in lines:
419
420 l=l.replace('\r\n','\n').replace('\r','\n').strip()
421 if l.lower().startswith('matrix'):
422 formatted_lines.append(l)
423 else:
424 l=l.replace('\n',' ')
425 if l:
426 formatted_lines.append(l)
427 return formatted_lines
428
430 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
431
432 opening=seq.find('(')
433 while opening>-1:
434 closing=seq.find(')')
435 if closing<0:
436 raise NexusError, 'Missing closing parenthesis in: '+seq
437 elif closing<opening:
438 raise NexusError, 'Missing opening parenthesis in: '+seq
439 ambig=[x for x in seq[opening+1:closing]]
440 ambig.sort()
441 ambig=''.join(ambig)
442 ambig_code=rev_ambig_values[ambig.upper()]
443 if ambig!=ambig.upper():
444 ambig_code=ambig_code.lower()
445 seq=seq[:opening]+ambig_code+seq[closing+1:]
446 opening=seq.find('(')
447 return seq
448
449
451 """Represent a commandline as command and options."""
452
454 self.options={}
455 options=[]
456 self.command=None
457 try:
458
459 self.command, options = line.strip().split('\n', 1)
460 except ValueError:
461
462 self.command=line.split()[0]
463 options=' '.join(line.split()[1:])
464 self.command = self.command.strip().lower()
465 if self.command in SPECIAL_COMMANDS:
466 self.options=options.strip()
467 else:
468 if len(options) > 0:
469 try:
470 options = options.replace('=', ' = ').split()
471 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
472 indices = []
473 for sl in valued_indices:
474 indices.extend(sl)
475 token_indices = [n for n in range(len(options)) if n not in indices]
476 for opt in valued_indices:
477
478 self.options[options[opt[0]].lower()] = options[opt[2]]
479 for token in token_indices:
480 self.options[options[token].lower()] = None
481 except ValueError:
482 raise NexusError, 'Incorrect formatting in line: %s' % line
483
485 """Represent a NEXUS block with block name and list of commandlines ."""
489
491
492 __slots__=['original_taxon_order','__dict__']
493
495 self.ntax=0
496 self.nchar=0
497 self.taxlabels=[]
498 self.charlabels=None
499 self.statelabels=None
500 self.datatype='dna'
501 self.respectcase=False
502 self.missing='?'
503 self.gap='-'
504 self.symbols=None
505 self.equate=None
506 self.matchchar=None
507 self.labels=None
508 self.transpose=False
509 self.interleave=False
510 self.tokens=False
511 self.eliminate=None
512 self.matrix=None
513 self.unknown_blocks=[]
514 self.taxsets={}
515 self.charsets={}
516 self.charpartitions={}
517 self.taxpartitions={}
518 self.trees=[]
519 self.translate=None
520 self.structured=[]
521 self.set={}
522 self.options={}
523
524
525 self.options['gapmode']='missing'
526
527 if input:
528 self.read(input)
529
531 """Included for backwards compatibility."""
532 return self.taxlabels
534 """Included for backwards compatibility."""
535 self.taxlabels=value
536 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
537
538 - def read(self,input):
539 """Read and parse NEXUS imput (filename, file-handle, string."""
540
541
542
543 try:
544 file_contents = open(os.path.expanduser(input),'rU').read()
545 self.filename=input
546 except (TypeError,IOError,AttributeError):
547
548 if isinstance(input, str):
549 file_contents = input
550 self.filename='input_string'
551
552 elif hasattr(input,'read'):
553 file_contents=input.read()
554 if hasattr(input,"name") and input.name:
555 self.filename=input.name
556 else:
557 self.filename='Unknown_nexus_file'
558 else:
559 print input.strip()[:50]
560 raise NexusError, 'Unrecognized input: %s ...' % input[:100]
561 file_contents=file_contents.strip()
562 if file_contents.startswith('#NEXUS'):
563 file_contents=file_contents[6:]
564 if C:
565 decommented=cnexus.scanfile(file_contents)
566
567 if decommented=='[' or decommented==']':
568 raise NexusError, 'Unmatched %s' % decommented
569
570
571 commandlines=_adjust_lines(decommented.split(chr(7)))
572 else:
573 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
574
575 try:
576 if commandlines[0][:6].upper()=='#NEXUS':
577 commandlines[0]=commandlines[0][6:].strip()
578 except:
579 pass
580
581 nexus_block_gen = self._get_nexus_block(commandlines)
582 while 1:
583 try:
584 title, contents = nexus_block_gen.next()
585 except StopIteration:
586 break
587 if title in KNOWN_NEXUS_BLOCKS:
588 self._parse_nexus_block(title, contents)
589 else:
590 self._unknown_nexus_block(title, contents)
591
593 """Generator for looping through Nexus blocks."""
594 inblock=False
595 blocklines=[]
596 while file_contents:
597 cl=file_contents.pop(0)
598 if cl.lower().startswith('begin'):
599 if not inblock:
600 inblock=True
601 title=cl.split()[1].lower()
602 else:
603 raise NexusError('Illegal block nesting in block %s' % title)
604 elif cl.lower().startswith('end'):
605 if inblock:
606 inblock=False
607 yield title,blocklines
608 blocklines=[]
609 else:
610 raise NexusError('Unmatched \'end\'.')
611 elif inblock:
612 blocklines.append(cl)
613
619
621 """Parse a known Nexus Block """
622
623 self._apply_block_structure(title, contents)
624
625
626 block=self.structured[-1]
627 for line in block.commandlines:
628 try:
629 getattr(self,'_'+line.command)(line.options)
630 except AttributeError:
631 raise
632 raise NexusError, 'Unknown command: %s ' % line.command
633
635 if options.has_key('ntax'):
636 self.ntax=eval(options['ntax'])
637 if options.has_key('nchar'):
638 self.nchar=eval(options['nchar'])
639
719
720
721 - def _set(self,options):
723
725 self.options=options;
726
728 self.eliminate=options
729
739
741 """Check for presence of taxon in self.taxlabels."""
742
743
744 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
745 nexid=taxon.replace(' ','_')
746 return nextaxa.get(nexid)
747
770
774
779
781 if not self.ntax or not self.nchar:
782 raise NexusError,'Dimensions must be specified before matrix!'
783 taxlabels_present=(self.taxlabels!=[])
784 self.matrix={}
785 taxcount=0
786 block_interleave=0
787
788 lines=[l.strip() for l in options.split('\n') if l.strip()<>'']
789 lineiter=iter(lines)
790 while 1:
791 try:
792 l=lineiter.next()
793 except StopIteration:
794 if taxcount<self.ntax:
795 raise NexusError, 'Not enough taxa in matrix.'
796 elif taxcount>self.ntax:
797 raise NexusError, 'Too many taxa in matrix.'
798 else:
799 break
800
801 taxcount+=1
802
803 if taxcount>self.ntax:
804 if not self.interleave:
805 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?'
806 else:
807 taxcount=1
808 block_interleave=1
809
810 linechars=CharBuffer(l)
811 id=quotestrip(linechars.next_word())
812 l=linechars.rest().strip()
813 if taxlabels_present and not self._check_taxlabels(id):
814 raise NexusError,'Taxon '+id+' not found in taxlabels.'
815 chars=''
816 if self.interleave:
817
818
819 if l:
820 chars=''.join(l.split())
821 else:
822 chars=''.join(lineiter.next().split())
823 else:
824
825 chars=''.join(l.split())
826 while len(chars)<self.nchar:
827 l=lineiter.next()
828 chars+=''.join(l.split())
829 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
830
831 if taxcount==1:
832 refseq=iupac_seq
833 else:
834 if self.matchchar:
835 while 1:
836 p=iupac_seq.tostring().find(self.matchchar)
837 if p==-1:
838 break
839 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
840
841 for i,c in enumerate(iupac_seq.tostring()):
842 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
843 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
844 % (id,c,l[i-10:i+10])
845
846 if block_interleave==0:
847 while self.matrix.has_key(id):
848 if id.split('.')[-1].startswith('copy'):
849 id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1)
850 else:
851 id+='.copy'
852
853 self.matrix[id]=iupac_seq
854
855 if not taxlabels_present:
856 self.taxlabels.append(id)
857 else:
858 taxon_present=self._check_taxlabels(id)
859 if taxon_present:
860 self.matrix[taxon_present]+=iupac_seq
861 else:
862 raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id
863
864 for taxon in self.matrix:
865 if len(self.matrix[taxon])!=self.nchar:
866 raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon
867
887
889 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
890 self._tree(options)
891
892 - def _tree(self,options):
922
929
933
937
939 taxpartition={}
940 quotelevel=False
941 opts=CharBuffer(options)
942 name=self._name_n_vector(opts)
943 if not name:
944 raise NexusError, 'Formatting error in taxpartition: %s ' % options
945
946
947
948 sub=''
949 while True:
950 w=opts.next()
951 if w is None or (w==',' and not quotelevel):
952 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
953 taxpartition[subname]=_make_unique(subindices)
954 sub=''
955 if w is None:
956 break
957 else:
958 if w=="'":
959 quotelevel=not quotelevel
960 sub+=w
961 self.taxpartitions[name]=taxpartition
962
964 charpartition={}
965 quotelevel=False
966 opts=CharBuffer(options)
967 name=self._name_n_vector(opts)
968 if not name:
969 raise NexusError, 'Formatting error in charpartition: %s ' % options
970
971
972 sub=''
973 while True:
974 w=opts.next()
975 if w is None or (w==',' and not quotelevel):
976 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
977 charpartition[subname]=_make_unique(subindices)
978 sub=''
979 if w is None:
980 break
981 else:
982 if w=="'":
983 quotelevel=not quotelevel
984 sub+=w
985 self.charpartitions[name]=charpartition
986
988 """Parse the taxset/charset specification
989 '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19]
990 """
991 opts=CharBuffer(options)
992 name=self._name_n_vector(opts,separator=separator)
993 indices=self._parse_list(opts,set_type=set_type)
994 if indices is None:
995 raise NexusError, 'Formatting error in line: %s ' % options
996 return name,indices
997
1016
1018 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21],
1019 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1020 """
1021 plain_list=[]
1022 if options_buffer.peek_nonwhitespace():
1023 try:
1024 while True:
1025 identifier=options_buffer.next_word()
1026 if not identifier:
1027 break
1028 start=self._resolve(identifier,set_type=set_type)
1029 if options_buffer.peek_nonwhitespace()=='-':
1030 end=start
1031 step=1
1032
1033 hyphen=options_buffer.next_nonwhitespace()
1034 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1035 if set_type==CHARSET:
1036 if options_buffer.peek_nonwhitespace()=='\\':
1037 backslash=options_buffer.next_nonwhitespace()
1038 step=int(options_buffer.next_word())
1039 plain_list.extend(range(start,end+1,step))
1040 else:
1041 if type(start)==list or type(end)==list:
1042 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier
1043 start=self.taxlabels.index(start)
1044 end=self.taxlabels.index(end)
1045 taxrange=self.taxlabels[start:end+1]
1046 plain_list.extend(taxrange)
1047 else:
1048 if type(start)==list:
1049 plain_list.extend(start)
1050 else:
1051 plain_list.append(start)
1052 except NexusError:
1053 raise
1054 except:
1055 return None
1056 return plain_list
1057
1058 - def _resolve(self,identifier,set_type=None):
1059 """Translate identifier in list into character/taxon index.
1060 Characters (which are referred to by their index in Nexus.py):
1061 Plain numbers are returned minus 1 (Nexus indices to python indices)
1062 Text identifiers are translaterd into their indices (if plain character indentifiers),
1063 the first hit in charlabels is returned (charlabels don't need to be unique)
1064 or the range of indices is returned (if names of character sets).
1065 Taxa (which are referred to by their unique name in Nexus.py):
1066 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1067 Names are returned unchanged (if plain taxon identifiers), or the names in
1068 the corresponding taxon set is returned
1069 """
1070 identifier=quotestrip(identifier)
1071 if not set_type:
1072 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1073 if set_type==CHARSET:
1074 try:
1075 n=int(identifier)
1076 except ValueError:
1077 if self.charlabels and identifier in self.charlabels.values():
1078 for k in self.charlabels:
1079 if self.charlabels[k]==identifier:
1080 return k
1081 elif self.charsets and identifier in self.charsets:
1082 return self.charsets[identifier]
1083 else:
1084 raise NexusError, 'Unknown character identifier: %s' % identifier
1085 else:
1086 if n<=self.nchar:
1087 return n-1
1088 else:
1089 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar)
1090 elif set_type==TAXSET:
1091 try:
1092 n=int(identifier)
1093 except ValueError:
1094 taxlabels_id=self._check_taxlabels(identifier)
1095 if taxlabels_id:
1096 return taxlabels_id
1097 elif self.taxsets and identifier in self.taxsets:
1098 return self.taxsets[identifier]
1099 else:
1100 raise NexusError, 'Unknown taxon identifier: %s' % identifier
1101 else:
1102 if n>0 and n<=self.ntax:
1103 return self.taxlabels[n-1]
1104 else:
1105 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax)
1106 else:
1107 raise NexusError('Unknown set specification: %s.'% set_type)
1108
1112
1116
1120
1124
1127 """Writes a nexus file for each partition in charpartition.
1128 Only non-excluded characters and non-deleted taxa are included, just the data block is written.
1129 """
1130
1131 if not matrix:
1132 matrix=self.matrix
1133 if not matrix:
1134 return
1135 if not filename:
1136 filename=self.filename
1137 if charpartition:
1138 pfilenames={}
1139 for p in charpartition:
1140 total_exclude=[]+exclude
1141 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1142 total_exclude=_make_unique(total_exclude)
1143 pcomment=comment+'\nPartition: '+p+'\n'
1144 dot=filename.rfind('.')
1145 if dot>0:
1146 pfilename=filename[:dot]+'_'+p+'.data'
1147 else:
1148 pfilename=filename+'_'+p
1149 pfilenames[p]=pfilename
1150 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1151 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1152 mrbayes=mrbayes)
1153 return pfilenames
1154 else:
1155 fn=self.filename+'.data'
1156 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1157 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1158 mrbayes=mrbayes)
1159 return fn
1160
1161 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1162 blocksize=None, interleave=False, interleave_by_partition=False,\
1163 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
1164 """ Writes a nexus file with data and sets block. Character sets and partitions
1165 are appended by default, and are adjusted according
1166 to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions),
1167 without including the deleted characters.
1168 """
1169 if not matrix:
1170 matrix=self.matrix
1171 if not matrix:
1172 return
1173 if not filename:
1174 filename=self.filename
1175 if [t for t in delete if not self._check_taxlabels(t)]:
1176 raise NexusError, 'Unknown taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels)))
1177 if interleave_by_partition:
1178 if not interleave_by_partition in self.charpartitions:
1179 raise NexusError, 'Unknown partition: '+interleave_by_partition
1180 else:
1181 partition=self.charpartitions[interleave_by_partition]
1182
1183 names=_sort_keys_by_values(partition)
1184 newpartition={}
1185 for p in partition:
1186 newpartition[p]=[c for c in partition[p] if c not in exclude]
1187
1188 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1189 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1190 ntax_adjusted=len(undelete)
1191 nchar_adjusted=len(cropped_matrix[undelete[0]])
1192 if not undelete or (undelete and undelete[0]==''):
1193 return
1194 if isinstance(filename,str):
1195 try:
1196 fh=open(filename,'w')
1197 except IOError:
1198 raise NexusError, 'Could not open %s for writing.' % filename
1199 elif isinstance(filename,file):
1200 fh=filename
1201 if not omit_NEXUS:
1202 fh.write('#NEXUS\n')
1203 if comment:
1204 fh.write('['+comment+']\n')
1205 fh.write('begin data;\n')
1206 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1207 fh.write('\tformat datatype='+self.datatype)
1208 if self.respectcase:
1209 fh.write(' respectcase')
1210 if self.missing:
1211 fh.write(' missing='+self.missing)
1212 if self.gap:
1213 fh.write(' gap='+self.gap)
1214 if self.matchchar:
1215 fh.write(' matchchar='+self.matchchar)
1216 if self.labels:
1217 fh.write(' labels='+self.labels)
1218 if self.equate:
1219 fh.write(' equate='+self.equate)
1220 if interleave or interleave_by_partition:
1221 fh.write(' interleave')
1222 fh.write(';\n')
1223
1224
1225 if self.charlabels:
1226 newcharlabels=self._adjust_charlabels(exclude=exclude)
1227 clkeys=newcharlabels.keys()
1228 clkeys.sort()
1229 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1230 fh.write('matrix\n')
1231 if not blocksize:
1232 if interleave:
1233 blocksize=70
1234 else:
1235 blocksize=self.nchar
1236
1237 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1238 if interleave_by_partition:
1239
1240 seek=0
1241 for p in names:
1242 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1243 if len(newpartition[p])>0:
1244 for taxon in undelete:
1245 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1246 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1247 fh.write('\n')
1248 else:
1249 fh.write('[empty]\n\n')
1250 seek+=len(newpartition[p])
1251 elif interleave:
1252 for seek in range(0,nchar_adjusted,blocksize):
1253 for taxon in undelete:
1254 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1255 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1256 fh.write('\n')
1257 else:
1258 for taxon in undelete:
1259 if blocksize<nchar_adjusted:
1260 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1261 else:
1262 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1263 for seek in range(0,nchar_adjusted,blocksize):
1264 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1265 fh.write(';\nend;\n')
1266 if append_sets:
1267 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1268 fh.close()
1269 return filename
1270
1272 """Appends a sets block to <filename>."""
1273 if not self.charsets and not self.taxsets and not self.charpartitions:
1274 return ''
1275 sets=['\nbegin sets']
1276
1277
1278
1279
1280 offset=0
1281 offlist=[]
1282 for c in range(self.nchar):
1283 if c in exclude:
1284 offset+=1
1285 offlist.append(-1)
1286 else:
1287 offlist.append(c-offset)
1288
1289 for n,ns in self.charsets.items():
1290 cset=[offlist[c] for c in ns if c not in exclude]
1291 if cset:
1292 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1293 for n,s in self.taxsets.items():
1294 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1295 if tset:
1296 sets.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1297 for n,p in self.charpartitions.items():
1298
1299
1300
1301 names=_sort_keys_by_values(p)
1302 newpartition={}
1303 for sn in names:
1304 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1305 if nsp:
1306 newpartition[sn]=nsp
1307 if newpartition:
1308 sets.append('charpartition %s = %s' % (safename(n),\
1309 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1310
1311 for n,p in self.taxpartitions.items():
1312 names=_sort_keys_by_values(p)
1313 newpartition={}
1314 for sn in names:
1315 nsp=[t for t in p[sn] if t not in delete]
1316 if nsp:
1317 newpartition[sn]=nsp
1318 if newpartition:
1319 sets.append('taxpartition %s = %s' % (safename(n),\
1320 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1321
1322 sets.append('end;\n')
1323 return ';\n'.join(sets)
1324 f.close()
1325
1327 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1328 if not filename:
1329 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1330 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1331 else:
1332 filename=self.filename+'.fas'
1333 fh=open(filename,'w')
1334 for taxon in self.taxlabels:
1335 fh.write('>'+safename(taxon)+'\n')
1336 for i in range(0, len(self.matrix[taxon].tostring()), width):
1337 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1338 fh.close()
1339
1340 - def constant(self,matrix=None,delete=[],exclude=[]):
1341 """Return a list with all constant characters."""
1342 if not matrix:
1343 matrix=self.matrix
1344 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1345 if not undelete:
1346 return None
1347 elif len(undelete)==1:
1348 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1349
1350 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1351 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1352 for taxon in undelete[1:]:
1353 newconstant=[]
1354 for site in constant:
1355
1356 seqsite=matrix[taxon][site[0]].upper()
1357
1358 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1359
1360 newconstant.append(site)
1361 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1362
1363 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1364 elif seqsite in self.ambiguous_values:
1365 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1]))
1366 if intersect:
1367 newconstant.append((site[0],''.join(intersect)))
1368
1369
1370
1371
1372
1373 constant=newconstant
1374 cpos=[s[0] for s in constant]
1375 return constant
1376
1377
1379 """Summarize character.
1380 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1381 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1382 """
1383 undelete=[t for t in self.taxlabels if t not in delete]
1384 if not undelete:
1385 return None
1386 cstatus=[]
1387 for t in undelete:
1388 c=self.matrix[t][site].upper()
1389 if self.options.get('gapmode')=='missing' and c==self.gap:
1390 c=self.missing
1391 if narrow and c==self.missing:
1392 if c not in cstatus:
1393 cstatus.append(c)
1394 else:
1395 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1396 if self.missing in cstatus and narrow and len(cstatus)>1:
1397 cstatus=[c for c in cstatus if c!=self.missing]
1398 cstatus.sort()
1399 return cstatus
1400
1402 """Calculates a stepmatrix for weighted parsimony.
1403 See Wheeler (1990), Cladistics 6:269-275 and
1404 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1405 """
1406 m=StepMatrix(self.unambiguous_letters,self.gap)
1407 for site in [s for s in range(self.nchar) if s not in exclude]:
1408 cstatus=self.cstatus(site,delete)
1409 for i,b1 in enumerate(cstatus[:-1]):
1410 for b2 in cstatus[i+1:]:
1411 m.add(b1.upper(),b2.upper(),1)
1412 return m.transformation().weighting().smprint(name=name)
1413
1414 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1415 """Return a matrix without deleted taxa and excluded characters."""
1416 if not matrix:
1417 matrix=self.matrix
1418 if [t for t in delete if not self._check_taxlabels(t)]:
1419 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels))
1420 if exclude!=[]:
1421 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1422 if not undelete:
1423 return {}
1424 m=[matrix[k].tostring() for k in undelete]
1425 zipped_m=zip(*m)
1426 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1427 if sitesm==[]:
1428 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1429 else:
1430 zipped_sitesm=zip(*sitesm)
1431 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1432 return dict(zip(undelete,m))
1433 else:
1434 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1435
1436 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1437 """Return a bootstrapped matrix."""
1438 if not matrix:
1439 matrix=self.matrix
1440 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1441 cm=self.crop_matrix(delete=delete,exclude=exclude)
1442 if not cm:
1443 return {}
1444 elif len(cm[cm.keys()[0]])==0:
1445 return cm
1446 undelete=[t for t in self.taxlabels if t in cm]
1447 if seqobjects:
1448 sitesm=zip(*[cm[t].tostring() for t in undelete])
1449 alphabet=matrix[matrix.keys()[0]].alphabet
1450 else:
1451 sitesm=zip(*[cm[t] for t in undelete])
1452 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1453 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1454 if seqobjects:
1455 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1456 return dict(zip(undelete,bootstrapseqs))
1457
1471
1472
1474 """Add a gap into the matrix and adjust charsets and partitions.
1475
1476 pos=0: first position
1477 pos=nchar: last position
1478 """
1479
1480 def _adjust(set,x,d,leftgreedy=False):
1481 """Adjusts chartacter sets if gaps are inserted, taking care of
1482 new gaps within a coherent character set."""
1483
1484
1485
1486 set.sort()
1487 addpos=0
1488 for i,c in enumerate(set):
1489 if c>=x:
1490 set[i]=c+d
1491
1492 if c==x:
1493 if leftgreedy or (i>0 and set[i-1]==c-1):
1494 addpos=i
1495 if addpos>0:
1496 set[addpos:addpos]=range(x,x+d)
1497 return set
1498
1499 if pos<0 or pos>self.nchar:
1500 raise NexusError('Illegal gap position: %d' % pos)
1501 if n==0:
1502 return
1503 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1504 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1505
1506
1507 zipped=zip(*sitesm)
1508 mapped=map(''.join,zipped)
1509 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1510 self.matrix=dict(listed)
1511 self.nchar+=n
1512
1513 for i,s in self.charsets.items():
1514 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1515 for p in self.charpartitions:
1516 for sp,s in self.charpartitions[p].items():
1517 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1518
1519 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1520 return self.charlabels
1521
1523 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1524 if exclude and insert:
1525 raise NexusError, 'Can\'t exclude and insert at the same time'
1526 if not self.charlabels:
1527 return None
1528 labels=self.charlabels.keys()
1529 labels.sort()
1530 newcharlabels={}
1531 if exclude:
1532 exclude.sort()
1533 exclude.append(sys.maxint)
1534 excount=0
1535 for c in labels:
1536 if not c in exclude:
1537 while c>exclude[excount]:
1538 excount+=1
1539 newcharlabels[c-excount]=self.charlabels[c]
1540 elif insert:
1541 insert.sort()
1542 insert.append(sys.maxint)
1543 icount=0
1544 for c in labels:
1545 while c>=insert[icount]:
1546 icount+=1
1547 newcharlabels[c+icount]=self.charlabels[c]
1548 else:
1549 return self.charlabels
1550 return newcharlabels
1551
1553 """Returns all character indices that are not in charlist."""
1554 return [c for c in range(self.nchar) if c not in charlist]
1555
1557 """Return gap-only sites."""
1558 gap=sets.Set(self.gap)
1559 if include_missing:
1560 gap.add(self.missing)
1561 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1562 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)]
1563 return gaponly
1564
1586