Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Nexus.py - a NEXUS parser 
   2  # 
   3  # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license. Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  #  
   8  # Bug reports welcome: fkauff@duke.edu 
   9  # 
  10   
  11  """Nexus class. Parse the contents of a nexus file. 
  12  Based upon 'NEXUS: An extensible file format for systematic information' 
  13  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  14  """ 
  15   
  16  import os,sys, math, random, copy 
  17  import sets 
  18   
  19  from Bio.Alphabet import IUPAC 
  20  from Bio.Data import IUPACData 
  21  from Bio.Seq import Seq 
  22   
  23  from Trees import Tree,NodeData 
  24   
  25  try: 
  26      import cnexus 
  27  except ImportError: 
  28      C=False 
  29  else: 
  30      C=True 
  31   
  32  INTERLEAVE=70 
  33  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  34          'matrix','tree', 'utree','translate'] 
  35  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets'] 
  36  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  37  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  38  WHITESPACE=' \t\n' 
  39  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  40  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  41  CHARSET='chars' 
  42  TAXSET='taxa' 
  43   
44 -class NexusError(Exception): pass
45
46 -class CharBuffer:
47 """Helps reading NEXUS-words and characters from a buffer."""
48 - def __init__(self,string):
49 if string: 50 self.buffer=list(string) 51 else: 52 self.buffer=[]
53
54 - def peek(self):
55 if self.buffer: 56 return self.buffer[0] 57 else: 58 return None
59
60 - def peek_nonwhitespace(self):
61 b=''.join(self.buffer).strip() 62 if b: 63 return b[0] 64 else: 65 return None
66
67 - def next(self):
68 if self.buffer: 69 return self.buffer.pop(0) 70 else: 71 return None
72
73 - def next_nonwhitespace(self):
74 while True: 75 p=self.next() 76 if p is None: 77 break 78 if p not in WHITESPACE: 79 return p 80 return None
81
82 - def skip_whitespace(self):
83 while self.buffer[0] in WHITESPACE: 84 self.buffer=self.buffer[1:]
85
86 - def next_until(self,target):
87 for t in target: 88 try: 89 pos=self.buffer.index(t) 90 except ValueError: 91 pass 92 else: 93 found=''.join(self.buffer[:pos]) 94 self.buffer=self.buffer[pos:] 95 return found 96 else: 97 return None
98
99 - def peek_word(self,word):
100 return ''.join(self.buffer[:len(word)])==word
101
102 - def next_word(self):
103 """Return the next NEXUS word from a string, dealing with single and double quotes, 104 whitespace and punctuation. 105 """ 106 107 word=[] 108 quoted=False 109 first=self.next_nonwhitespace() # get first character 110 if not first: # return empty if only whitespace left 111 return None 112 word.append(first) 113 if first=="'": # word starts with a quote 114 quoted=True 115 elif first in PUNCTUATION: # if it's punctuation, return immediately 116 return first 117 while True: 118 c=self.peek() 119 if c=="'": # a quote? 120 word.append(self.next()) # store quote 121 if self.peek()=="'": # double quote 122 skip=self.next() # skip second quote 123 elif quoted: # second single quote ends word 124 break 125 elif quoted: 126 word.append(self.next()) # if quoted, then add anything 127 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 128 break 129 else: 130 word.append(self.next()) # standard character 131 return ''.join(word)
132
133 - def rest(self):
134 """Return the rest of the string without parsing.""" 135 return ''.join(self.buffer)
136
137 -class StepMatrix:
138 """Calculate a stepmatrix for weighted parsimony. 139 See Wheeler (1990), Cladistics 6:269-275. 140 """ 141
142 - def __init__(self,symbols,gap):
143 self.data={} 144 self.symbols=[s for s in symbols] 145 self.symbols.sort() 146 if gap: 147 self.symbols.append(gap) 148 for x in self.symbols: 149 for y in [s for s in self.symbols if s!=x]: 150 self.set(x,y,0)
151
152 - def set(self,x,y,value):
153 if x>y: 154 x,y=y,x 155 self.data[x+y]=value
156
157 - def add(self,x,y,value):
158 if x>y: 159 x,y=y,x 160 self.data[x+y]+=value
161
162 - def sum(self):
163 return reduce(lambda x,y:x+y,self.data.values())
164
165 - def transformation(self):
166 total=self.sum() 167 if total!=0: 168 for k in self.data: 169 self.data[k]=self.data[k]/float(total) 170 return self
171
172 - def weighting(self):
173 for k in self.data: 174 if self.data[k]!=0: 175 self.data[k]=-math.log(self.data[k]) 176 return self
177
178 - def smprint(self,name='your_name_here'):
179 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 180 matrix+=' %s\n' % ' '.join(self.symbols) 181 for x in self.symbols: 182 matrix+='[%s]'.ljust(8) % x 183 for y in self.symbols: 184 if x==y: 185 matrix+=' . ' 186 else: 187 if x>y: 188 x1,y1=y,x 189 else: 190 x1,y1=x,y 191 if self.data[x1+y1]==0: 192 matrix+='inf. ' 193 else: 194 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 195 matrix+='\n' 196 matrix+=';\n' 197 return matrix
198
199 -def safename(name,mrbayes=False):
200 """Return a taxon identifier according to NEXUS standard. 201 Wrap quotes around names with punctuation or whitespace, and double single quotes. 202 mrbayes=True: write names without quotes, whitespace or punctuation for mrbayes. 203 """ 204 if mrbayes: 205 safe=name.replace(' ','_') 206 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 207 else: 208 safe=name.replace("'","''") 209 if sets.Set(safe).intersection(sets.Set(WHITESPACE+PUNCTUATION)): 210 safe="'"+safe+"'" 211 return safe
212
213 -def quotestrip(word):
214 """Remove quotes and/or double quotes around identifiers.""" 215 if not word: 216 return None 217 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 218 word=word[1:-1] 219 return word
220
221 -def get_start_end(sequence, skiplist=['-','?']):
222 """Return position of first and last character which is not in skiplist (defaults to ['-','?']).""" 223 224 length=len(sequence) 225 if length==0: 226 return None,None 227 end=length-1 228 while end>=0 and (sequence[end] in skiplist): 229 end-=1 230 start=0 231 while start<length and (sequence[start] in skiplist): 232 start+=1 233 if start==length and end==-1: # empty sequence 234 return -1,-1 235 else: 236 return start,end
237
238 -def _sort_keys_by_values(p):
239 """Returns a sorted list of keys of p sorted by values of p.""" 240 startpos=[(p[pn],pn) for pn in p if p[pn]] 241 startpos.sort() 242 return zip(*startpos)[1]
243
244 -def _make_unique(l):
245 """Check that all values in list are unique and return a pruned and sorted list.""" 246 l=list(sets.Set(l)) 247 l.sort() 248 return l
249
250 -def _seqmatrix2strmatrix(matrix):
251 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 252 return dict([(t,matrix[t].tostring()) for t in matrix])
253
254 -def _compact4nexus(orig_list):
255 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 256 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 257 """ 258 259 if not orig_list: 260 return '' 261 orig_list=list(sets.Set(orig_list)) 262 orig_list.sort() 263 shortlist=[] 264 clist=orig_list[:] 265 clist.append(clist[-1]+.5) # dummy value makes it easier 266 while len(clist)>1: 267 step=1 268 for i,x in enumerate(clist): 269 if x==clist[0]+i*step: # are we still in the right step? 270 continue 271 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 272 # second element, and possibly at least 3 elements to link, 273 # and the next one is in the right step 274 step=x-clist[0] 275 else: # pattern broke, add all values before current position to new list 276 sub=clist[:i] 277 if len(sub)==1: 278 shortlist.append(str(sub[0]+1)) 279 else: 280 if step==1: 281 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 282 else: 283 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 284 clist=clist[i:] 285 break 286 return ' '.join(shortlist)
287
288 -def combine(matrices):
289 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 290 291 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 292 Character sets, character partitions and taxon sets are prefixed, readjusted and present in 293 the combined matrix. 294 """ 295 296 if not matrices: 297 return None 298 name=matrices[0][0] 299 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 300 mixed_datatypes=(len(sets.Set([n[1].datatype for n in matrices]))>1) 301 if mixed_datatypes: 302 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 303 # raise NexusError, 'Matrices must be of same datatype' 304 combined.charlabels=None 305 combined.statelabels=None 306 combined.interleave=False 307 combined.translate=None 308 309 # rename taxon sets and character sets and name them with prefix 310 for cn,cs in combined.charsets.items(): 311 combined.charsets['%s.%s' % (name,cn)]=cs 312 del combined.charsets[cn] 313 for tn,ts in combined.taxsets.items(): 314 combined.taxsets['%s.%s' % (name,tn)]=ts 315 del combined.taxsets[tn] 316 # previous partitions usually don't make much sense in combined matrix 317 # just initiate one new partition parted by single matrices 318 combined.charpartitions={'combined':{name:range(combined.nchar)}} 319 for n,m in matrices[1:]: # add all other matrices 320 both=[t for t in combined.taxlabels if t in m.taxlabels] 321 combined_only=[t for t in combined.taxlabels if t not in both] 322 m_only=[t for t in m.taxlabels if t not in both] 323 for t in both: 324 # concatenate sequences and unify gap and missing character symbols 325 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 326 # replace date of missing taxa with symbol for missing data 327 for t in combined_only: 328 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 329 for t in m_only: 330 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 331 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 332 combined.taxlabels.extend(m_only) # new taxon list 333 for cn,cs in m.charsets.items(): # adjust character sets for new matrix 334 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 335 if m.taxsets: 336 if not combined.taxsets: 337 combined.taxsets={} 338 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets 339 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition 340 # update charlabels 341 if m.charlabels: 342 if not combined.charlabels: 343 combined.charlabels={} 344 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) 345 combined.nchar+=m.nchar # update nchar and ntax 346 combined.ntax+=len(m_only) 347 348 return combined
349
350 -def _kill_comments_and_break_lines(text):
351 """Delete []-delimited comments out of a file and break into lines separated by ';'. 352 353 stripped_text=_kill_comments_and_break_lines(text): 354 Nested and multiline comments are allowed. [ and ] symbols within single 355 or double quotes are ignored, newline ends a quote, all symbols with quotes are 356 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 357 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 358 Quotes inside special [& and [\ are treated as normal characters, 359 but no nesting inside these special comments allowed (like [& [\ ]]). 360 ';' ist deleted from end of line. 361 362 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 363 """ 364 contents=CharBuffer(text) 365 newtext=[] 366 newline=[] 367 quotelevel='' 368 speciallevel=False 369 commlevel=0 370 while True: 371 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 372 #if not plain: 373 # newline.append(contents.rest) # not found, just add the rest 374 # break 375 #newline.append(plain) # add intermediate text 376 t=contents.next() # and get special character 377 if t is None: 378 break 379 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 380 quotelevel='' 381 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 382 quotelevel=t 383 elif not quotelevel and t=='[': # opening bracket outside a quote 384 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 385 speciallevel=True 386 else: 387 commlevel+=1 388 elif not quotelevel and t==']': # closing bracket ioutside a quote 389 if speciallevel: 390 speciallevel=False 391 else: 392 commlevel-=1 393 if commlevel<0: 394 raise NexusError, 'Nexus formatting error: unmatched ]' 395 continue 396 if commlevel==0: # copy if we're not in comment 397 if t==';' and not quotelevel: 398 newtext.append(''.join(newline)) 399 newline=[] 400 else: 401 newline.append(t) 402 #level of comments should be 0 at the end of the file 403 if newline: 404 newtext.append('\n'.join(newline)) 405 if commlevel>0: 406 raise NexusError, 'Nexus formatting error: unmatched [' 407 return newtext
408 409
410 -def _adjust_lines(lines):
411 """Adjust linebreaks to match ';', strip leading/trailing whitespace 412 413 list_of_commandlines=_adjust_lines(input_text) 414 Lines are adjusted so that no linebreaks occur within a commandline 415 (except matrix command line) 416 """ 417 formatted_lines=[] 418 for l in lines: 419 #Convert line endings 420 l=l.replace('\r\n','\n').replace('\r','\n').strip() 421 if l.lower().startswith('matrix'): 422 formatted_lines.append(l) 423 else: 424 l=l.replace('\n',' ') 425 if l: 426 formatted_lines.append(l) 427 return formatted_lines
428
429 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
430 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 431 432 opening=seq.find('(') 433 while opening>-1: 434 closing=seq.find(')') 435 if closing<0: 436 raise NexusError, 'Missing closing parenthesis in: '+seq 437 elif closing<opening: 438 raise NexusError, 'Missing opening parenthesis in: '+seq 439 ambig=[x for x in seq[opening+1:closing]] 440 ambig.sort() 441 ambig=''.join(ambig) 442 ambig_code=rev_ambig_values[ambig.upper()] 443 if ambig!=ambig.upper(): 444 ambig_code=ambig_code.lower() 445 seq=seq[:opening]+ambig_code+seq[closing+1:] 446 opening=seq.find('(') 447 return seq
448 449
450 -class Commandline:
451 """Represent a commandline as command and options.""" 452
453 - def __init__(self, line, title):
454 self.options={} 455 options=[] 456 self.command=None 457 try: 458 #Assume matrix (all other command lines have been stripped of \n) 459 self.command, options = line.strip().split('\n', 1) 460 except ValueError: #Not matrix 461 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 462 self.command=line.split()[0] 463 options=' '.join(line.split()[1:]) 464 self.command = self.command.strip().lower() 465 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 466 self.options=options.strip() 467 else: 468 if len(options) > 0: 469 try: 470 options = options.replace('=', ' = ').split() 471 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 472 indices = [] 473 for sl in valued_indices: 474 indices.extend(sl) 475 token_indices = [n for n in range(len(options)) if n not in indices] 476 for opt in valued_indices: 477 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 478 self.options[options[opt[0]].lower()] = options[opt[2]] 479 for token in token_indices: 480 self.options[options[token].lower()] = None 481 except ValueError: 482 raise NexusError, 'Incorrect formatting in line: %s' % line
483
484 -class Block:
485 """Represent a NEXUS block with block name and list of commandlines ."""
486 - def __init__(self,title=None):
487 self.title=title 488 self.commandlines=[]
489
490 -class Nexus(object):
491 492 __slots__=['original_taxon_order','__dict__'] 493
494 - def __init__(self, input=None):
495 self.ntax=0 # number of taxa 496 self.nchar=0 # number of characters 497 self.taxlabels=[] # labels for taxa, ordered by their id 498 self.charlabels=None # ... and for characters 499 self.statelabels=None # ... and for states 500 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 501 self.respectcase=False # case sensitivity 502 self.missing='?' # symbol for missing characters 503 self.gap='-' # symbol for gap 504 self.symbols=None # set of symbols 505 self.equate=None # set of symbol synonyms 506 self.matchchar=None # matching char for matrix representation 507 self.labels=None # left, right, no 508 self.transpose=False # whether matrix is transposed 509 self.interleave=False # whether matrix is interleaved 510 self.tokens=False # unsupported 511 self.eliminate=None # unsupported 512 self.matrix=None # ... 513 self.unknown_blocks=[] # blocks we don't care about 514 self.taxsets={} 515 self.charsets={} 516 self.charpartitions={} 517 self.taxpartitions={} 518 self.trees=[] # list of Trees (instances of tree class) 519 self.translate=None # Dict to translate taxon <-> taxon numbers 520 self.structured=[] # structured input representation 521 self.set={} # dict of the set command to set various options 522 self.options={} # dict of the options command in the data block 523 524 # some defaults 525 self.options['gapmode']='missing' 526 527 if input: 528 self.read(input)
529
530 - def get_original_taxon_order(self):
531 """Included for backwards compatibility.""" 532 return self.taxlabels
533 - def set_original_taxon_order(self,value):
534 """Included for backwards compatibility.""" 535 self.taxlabels=value
536 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 537
538 - def read(self,input):
539 """Read and parse NEXUS imput (filename, file-handle, string.""" 540 541 # 1. Assume we have the name of a file in the execution dir 542 # Note we need to add parsing of the path to dir/filename 543 try: 544 file_contents = open(os.path.expanduser(input),'rU').read() 545 self.filename=input 546 except (TypeError,IOError,AttributeError): 547 #2 Assume we have a string from a fh.read() 548 if isinstance(input, str): 549 file_contents = input 550 self.filename='input_string' 551 #3 Assume we have a file object 552 elif hasattr(input,'read'): # file objects or StringIO objects 553 file_contents=input.read() 554 if hasattr(input,"name") and input.name: 555 self.filename=input.name 556 else: 557 self.filename='Unknown_nexus_file' 558 else: 559 print input.strip()[:50] 560 raise NexusError, 'Unrecognized input: %s ...' % input[:100] 561 file_contents=file_contents.strip() 562 if file_contents.startswith('#NEXUS'): 563 file_contents=file_contents[6:] 564 if C: 565 decommented=cnexus.scanfile(file_contents) 566 #check for unmatched parentheses 567 if decommented=='[' or decommented==']': 568 raise NexusError, 'Unmatched %s' % decommented 569 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) 570 # (a character that shoudn't be part of a nexus file under normal circumstances) 571 commandlines=_adjust_lines(decommented.split(chr(7))) 572 else: 573 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) 574 # get rid of stupid 'NEXUS token' 575 try: 576 if commandlines[0][:6].upper()=='#NEXUS': 577 commandlines[0]=commandlines[0][6:].strip() 578 except: 579 pass 580 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 581 nexus_block_gen = self._get_nexus_block(commandlines) 582 while 1: 583 try: 584 title, contents = nexus_block_gen.next() 585 except StopIteration: 586 break 587 if title in KNOWN_NEXUS_BLOCKS: 588 self._parse_nexus_block(title, contents) 589 else: 590 self._unknown_nexus_block(title, contents)
591
592 - def _get_nexus_block(self,file_contents):
593 """Generator for looping through Nexus blocks.""" 594 inblock=False 595 blocklines=[] 596 while file_contents: 597 cl=file_contents.pop(0) 598 if cl.lower().startswith('begin'): 599 if not inblock: 600 inblock=True 601 title=cl.split()[1].lower() 602 else: 603 raise NexusError('Illegal block nesting in block %s' % title) 604 elif cl.lower().startswith('end'): 605 if inblock: 606 inblock=False 607 yield title,blocklines 608 blocklines=[] 609 else: 610 raise NexusError('Unmatched \'end\'.') 611 elif inblock: 612 blocklines.append(cl)
613
614 - def _unknown_nexus_block(self,title, contents):
615 block = Block() 616 block.commandlines.append(contents) 617 block.title = title 618 self.unknown_blocks.append(block)
619
620 - def _parse_nexus_block(self,title, contents):
621 """Parse a known Nexus Block """ 622 # attached the structered block representation 623 self._apply_block_structure(title, contents) 624 #now check for taxa,characters,data blocks. If this stuff is defined more than once 625 #the later occurences will override the previous ones. 626 block=self.structured[-1] 627 for line in block.commandlines: 628 try: 629 getattr(self,'_'+line.command)(line.options) 630 except AttributeError: 631 raise 632 raise NexusError, 'Unknown command: %s ' % line.command
633
634 - def _dimensions(self,options):
635 if options.has_key('ntax'): 636 self.ntax=eval(options['ntax']) 637 if options.has_key('nchar'): 638 self.nchar=eval(options['nchar'])
639
640 - def _format(self,options):
641 # print options 642 # we first need to test respectcase, then symbols (which depends on respectcase) 643 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 644 # dicts for ambiguous values and alphabet 645 if options.has_key('respectcase'): 646 self.respectcase=True 647 # adjust symbols to for respectcase 648 if options.has_key('symbols'): 649 self.symbols=options['symbols'] 650 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 651 (self.symbold.startswith("'") and self.symbols.endswith("'")): 652 self.symbols=self.symbols[1:-1].replace(' ','') 653 if not self.respectcase: 654 self.symbols=self.symbols.lower()+self.symbols.upper() 655 self.symbols=list(sets.Set(self.symbols)) 656 if options.has_key('datatype'): 657 self.datatype=options['datatype'].lower() 658 if self.datatype=='dna' or self.datatype=='nucleotide': 659 self.alphabet=IUPAC.ambiguous_dna 660 self.ambiguous_values=IUPACData.ambiguous_dna_values 661 self.unambiguous_letters=IUPACData.unambiguous_dna_letters 662 elif self.datatype=='rna': 663 self.alphabet=IUPAC.ambiguous_rna 664 self.ambiguous_values=IUPACData.ambiguous_rna_values 665 self.unambiguous_letters=IUPACData.unambiguous_rna_letters 666 elif self.datatype=='protein': 667 self.alphabet=IUPAC.protein 668 self.ambiguous_values={'B':'DN','Z':'EQ','X':IUPACData.protein_letters} # that's how PAUP handles it 669 self.unambiguous_letters=IUPACData.protein_letters+'*' # stop-codon 670 elif self.datatype=='standard': 671 raise NexusError('Datatype standard is not yet supported.') 672 #self.alphabet=None 673 #self.ambiguous_values={} 674 #if not self.symbols: 675 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 676 #self.unambiguous_letters=self.symbols 677 else: 678 raise NexusError, 'Unsupported datatype: '+self.datatype 679 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters 680 if not self.respectcase: 681 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 682 #we have to sort the reverse ambig coding dict key characters: 683 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 684 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) 685 self.rev_ambiguous_values={} 686 for (k,v) in rev.items(): 687 key=[c for c in k] 688 key.sort() 689 self.rev_ambiguous_values[''.join(key)]=v 690 #overwrite symbols for datype rna,dna,nucleotide 691 if self.datatype in ['dna','rna','nucleotide']: 692 self.symbols=self.alphabet.letters 693 if self.missing not in self.ambiguous_values: 694 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 695 self.ambiguous_values[self.gap]=self.gap 696 elif self.datatype=='standard': 697 if not self.symbols: 698 self.symbols=['1','0'] 699 if options.has_key('missing'): 700 self.missing=options['missing'][0] 701 if options.has_key('gap'): 702 self.gap=options['gap'][0] 703 if options.has_key('equate'): 704 self.equate=options['equate'] 705 if options.has_key('matchchar'): 706 self.matchchar=options['matchchar'][0] 707 if options.has_key('labels'): 708 self.labels=options['labels'] 709 if options.has_key('transpose'): 710 raise NexusError, 'TRANSPOSE is not supported!' 711 self.transpose=True 712 if options.has_key('interleave'): 713 if options['interleave']==None or options['interleave'].lower()=='yes': 714 self.interleave=True 715 if options.has_key('tokens'): 716 self.tokens=True 717 if options.has_key('notokens'): 718 self.tokens=False
719 720
721 - def _set(self,options):
722 self.set=options;
723
724 - def _options(self,options):
725 self.options=options;
726
727 - def _eliminate(self,options):
728 self.eliminate=options
729
730 - def _taxlabels(self,options):
731 """Get taxon labels.""" 732 self.taxlabels=[] 733 opts=CharBuffer(options) 734 while True: 735 taxon=quotestrip(opts.next_word()) 736 if not taxon: 737 break 738 self.taxlabels.append(taxon)
739
740 - def _check_taxlabels(self,taxon):
741 """Check for presence of taxon in self.taxlabels.""" 742 # According to NEXUS standard, underscores shall be treated as spaces..., 743 # so checking for identity is more difficult 744 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 745 nexid=taxon.replace(' ','_') 746 return nextaxa.get(nexid)
747
748 - def _charlabels(self,options):
749 self.charlabels={} 750 opts=CharBuffer(options) 751 while True: 752 try: 753 # get id and state 754 w=opts.next_word() 755 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 756 break 757 identifier=self._resolve(w,set_type=CHARSET) 758 state=quotestrip(opts.next_word()) 759 self.charlabels[identifier]=state 760 # check for comma or end of command 761 c=opts.next_nonwhitespace() 762 if c is None: 763 break 764 elif c!=',': 765 raise NexusError,'Missing \',\' in line %s.' % options 766 except NexusError: 767 raise 768 except: 769 raise NexusError,'Format error in line %s.' % options
770
771 - def _charstatelabels(self,options):
772 # warning: charstatelabels supports only charlabels-syntax! 773 self._charlabels(options)
774
775 - def _statelabels(self,options):
776 #self.charlabels=options 777 #print 'Command statelabels is not supported and will be ignored.' 778 pass
779
780 - def _matrix(self,options):
781 if not self.ntax or not self.nchar: 782 raise NexusError,'Dimensions must be specified before matrix!' 783 taxlabels_present=(self.taxlabels!=[]) 784 self.matrix={} 785 taxcount=0 786 block_interleave=0 787 #eliminate empty lines and leading/trailing whitespace 788 lines=[l.strip() for l in options.split('\n') if l.strip()<>''] 789 lineiter=iter(lines) 790 while 1: 791 try: 792 l=lineiter.next() 793 except StopIteration: 794 if taxcount<self.ntax: 795 raise NexusError, 'Not enough taxa in matrix.' 796 elif taxcount>self.ntax: 797 raise NexusError, 'Too many taxa in matrix.' 798 else: 799 break 800 # count the taxa and check for interleaved matrix 801 taxcount+=1 802 ##print taxcount 803 if taxcount>self.ntax: 804 if not self.interleave: 805 raise NexusError, 'Too many taxa in matrix - should matrix be interleaved?' 806 else: 807 taxcount=1 808 block_interleave=1 809 #get taxon name and sequence 810 linechars=CharBuffer(l) 811 id=quotestrip(linechars.next_word()) 812 l=linechars.rest().strip() 813 if taxlabels_present and not self._check_taxlabels(id): 814 raise NexusError,'Taxon '+id+' not found in taxlabels.' 815 chars='' 816 if self.interleave: 817 #interleaved matrix 818 #print 'In interleave' 819 if l: 820 chars=''.join(l.split()) 821 else: 822 chars=''.join(lineiter.next().split()) 823 else: 824 #non-interleaved matrix 825 chars=''.join(l.split()) 826 while len(chars)<self.nchar: 827 l=lineiter.next() 828 chars+=''.join(l.split()) 829 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 830 #first taxon has the reference sequence if matchhar is used 831 if taxcount==1: 832 refseq=iupac_seq 833 else: 834 if self.matchchar: 835 while 1: 836 p=iupac_seq.tostring().find(self.matchchar) 837 if p==-1: 838 break 839 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 840 #check for invalid characters 841 for i,c in enumerate(iupac_seq.tostring()): 842 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 843 raise NexusError, 'Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ 844 % (id,c,l[i-10:i+10]) 845 #add sequence to matrix 846 if block_interleave==0: 847 while self.matrix.has_key(id): 848 if id.split('.')[-1].startswith('copy'): 849 id='.'.join(id.split('.')[:-1])+'.copy'+str(eval('0'+id.split('.')[-1][4:])+1) 850 else: 851 id+='.copy' 852 #raise NexusError, id+' already in matrix!\nError in: '+l 853 self.matrix[id]=iupac_seq 854 # add taxon name only if taxlabels is not alredy present 855 if not taxlabels_present: 856 self.taxlabels.append(id) 857 else: 858 taxon_present=self._check_taxlabels(id) 859 if taxon_present: 860 self.matrix[taxon_present]+=iupac_seq 861 else: 862 raise NexusError, 'Taxon %s not in first block of interleaved matrix.' % id 863 #check all sequences for length according to nchar 864 for taxon in self.matrix: 865 if len(self.matrix[taxon])!=self.nchar: 866 raise NexusError,'Nchar ('+str(self.nchar)+') does not match data for taxon '+taxon
867
868 - def _translate(self,options):
869 self.translate={} 870 opts=CharBuffer(options) 871 while True: 872 try: 873 # get id and state 874 identifier=int(opts.next_word()) 875 label=quotestrip(opts.next_word()) 876 self.translate[identifier]=label 877 # check for comma or end of command 878 c=opts.next_nonwhitespace() 879 if c is None: 880 break 881 elif c!=',': 882 raise NexusError,'Missing \',\' in line %s.' % options 883 except NexusError: 884 raise 885 except: 886 raise NexusError,'Format error in line %s.' % options
887
888 - def _utree(self,options):
889 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 890 self._tree(options)
891
892 - def _tree(self,options):
893 opts=CharBuffer(options) 894 name=opts.next_word() 895 if opts.next_nonwhitespace()!='=': 896 raise NexusError,'Syntax error in tree description: %s' % options[:50] 897 rooted=False 898 weight=1.0 899 while opts.peek_nonwhitespace()=='[': 900 open=opts.next_nonwhitespace() 901 symbol=opts.next() 902 if symbol!='&': 903 raise NexusError,'Illegal special comment [%s...] in tree description: %s' % (symbol, options[:50]) 904 special=opts.next() 905 value=opts.next_until(']') 906 closing=opts.next() 907 if special=='R': 908 rooted=True 909 elif special=='U': 910 rooted=False 911 elif special=='W': 912 weight=float(value) 913 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 914 # if there's an active translation table, translate 915 if self.translate: 916 for n in tree.get_terminals(): 917 try: 918 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 919 except (ValueError,KeyError): 920 raise NexusError,'Unable to substitue %s using \'translate\' data.' % tree.node(n).data.taxon 921 self.trees.append(tree)
922
923 - def _apply_block_structure(self,title,lines):
924 block=Block('') 925 block.title = title 926 for line in lines: 927 block.commandlines.append(Commandline(line, title)) 928 self.structured.append(block)
929
930 - def _taxset(self, options):
931 name,taxa=self._get_indices(options,set_type=TAXSET) 932 self.taxsets[name]=_make_unique(taxa)
933
934 - def _charset(self, options):
935 name,sites=self._get_indices(options,set_type=CHARSET) 936 self.charsets[name]=_make_unique(sites)
937
938 - def _taxpartition(self, options):
939 taxpartition={} 940 quotelevel=False 941 opts=CharBuffer(options) 942 name=self._name_n_vector(opts) 943 if not name: 944 raise NexusError, 'Formatting error in taxpartition: %s ' % options 945 # now collect thesubbpartitions and parse them 946 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 947 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 948 sub='' 949 while True: 950 w=opts.next() 951 if w is None or (w==',' and not quotelevel): 952 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 953 taxpartition[subname]=_make_unique(subindices) 954 sub='' 955 if w is None: 956 break 957 else: 958 if w=="'": 959 quotelevel=not quotelevel 960 sub+=w 961 self.taxpartitions[name]=taxpartition
962
963 - def _charpartition(self, options):
964 charpartition={} 965 quotelevel=False 966 opts=CharBuffer(options) 967 name=self._name_n_vector(opts) 968 if not name: 969 raise NexusError, 'Formatting error in charpartition: %s ' % options 970 # now collect thesubbpartitions and parse them 971 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 972 sub='' 973 while True: 974 w=opts.next() 975 if w is None or (w==',' and not quotelevel): 976 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 977 charpartition[subname]=_make_unique(subindices) 978 sub='' 979 if w is None: 980 break 981 else: 982 if w=="'": 983 quotelevel=not quotelevel 984 sub+=w 985 self.charpartitions[name]=charpartition
986
987 - def _get_indices(self,options,set_type=CHARSET,separator='='):
988 """Parse the taxset/charset specification 989 '1 2 3 - 5 dog cat 10- 20 \\ 3' --> [0,1,2,3,4,'dog','cat',10,13,16,19] 990 """ 991 opts=CharBuffer(options) 992 name=self._name_n_vector(opts,separator=separator) 993 indices=self._parse_list(opts,set_type=set_type) 994 if indices is None: 995 raise NexusError, 'Formatting error in line: %s ' % options 996 return name,indices
997
998 - def _name_n_vector(self,opts,separator='='):
999 """Extract name and check that it's not in vector format.""" 1000 rest=opts.rest() 1001 name=opts.next_word() 1002 if not name: 1003 raise NexusError, 'Formatting error in line: %s ' % rest 1004 name=quotestrip(name) 1005 if opts.peek_nonwhitespace=='(': 1006 open=opts.next_nonwhitespace() 1007 qualifier=open.next_word() 1008 close=opts.next_nonwhitespace() 1009 if qualifier.lower()=='vector': 1010 raise NexusError, 'Unsupported VECTOR format in line %s' % (options) 1011 elif qualifier.lower()!='standard': 1012 raise NexusError, 'Unknown qualifier %s in line %s' % (qualifier,options) 1013 if opts.next_nonwhitespace()!=separator: 1014 raise NexusError, 'Formatting error in line: %s ' % rest 1015 return name
1016
1017 - def _parse_list(self,options_buffer,set_type):
1018 """Parse a NEXUS list: [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17-21], 1019 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1020 """ 1021 plain_list=[] 1022 if options_buffer.peek_nonwhitespace(): 1023 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1024 while True: 1025 identifier=options_buffer.next_word() # next list element 1026 if not identifier: # end of list? 1027 break 1028 start=self._resolve(identifier,set_type=set_type) 1029 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1030 end=start 1031 step=1 1032 # get hyphen and end of range 1033 hyphen=options_buffer.next_nonwhitespace() 1034 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1035 if set_type==CHARSET: 1036 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1037 backslash=options_buffer.next_nonwhitespace() 1038 step=int(options_buffer.next_word()) # get backslash and step 1039 plain_list.extend(range(start,end+1,step)) 1040 else: 1041 if type(start)==list or type(end)==list: 1042 raise NexusError, 'Name if character sets not allowed in range definition: %s' % identifier 1043 start=self.taxlabels.index(start) 1044 end=self.taxlabels.index(end) 1045 taxrange=self.taxlabels[start:end+1] 1046 plain_list.extend(taxrange) 1047 else: 1048 if type(start)==list: # start was the name of charset or taxset 1049 plain_list.extend(start) 1050 else: # start was an ordinary identifier 1051 plain_list.append(start) 1052 except NexusError: 1053 raise 1054 except: 1055 return None 1056 return plain_list
1057
1058 - def _resolve(self,identifier,set_type=None):
1059 """Translate identifier in list into character/taxon index. 1060 Characters (which are referred to by their index in Nexus.py): 1061 Plain numbers are returned minus 1 (Nexus indices to python indices) 1062 Text identifiers are translaterd into their indices (if plain character indentifiers), 1063 the first hit in charlabels is returned (charlabels don't need to be unique) 1064 or the range of indices is returned (if names of character sets). 1065 Taxa (which are referred to by their unique name in Nexus.py): 1066 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1067 Names are returned unchanged (if plain taxon identifiers), or the names in 1068 the corresponding taxon set is returned 1069 """ 1070 identifier=quotestrip(identifier) 1071 if not set_type: 1072 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1073 if set_type==CHARSET: 1074 try: 1075 n=int(identifier) 1076 except ValueError: 1077 if self.charlabels and identifier in self.charlabels.values(): 1078 for k in self.charlabels: 1079 if self.charlabels[k]==identifier: 1080 return k 1081 elif self.charsets and identifier in self.charsets: 1082 return self.charsets[identifier] 1083 else: 1084 raise NexusError, 'Unknown character identifier: %s' % identifier 1085 else: 1086 if n<=self.nchar: 1087 return n-1 1088 else: 1089 raise NexusError, 'Illegal character identifier: %d>nchar (=%d).' % (identifier,self.nchar) 1090 elif set_type==TAXSET: 1091 try: 1092 n=int(identifier) 1093 except ValueError: 1094 taxlabels_id=self._check_taxlabels(identifier) 1095 if taxlabels_id: 1096 return taxlabels_id 1097 elif self.taxsets and identifier in self.taxsets: 1098 return self.taxsets[identifier] 1099 else: 1100 raise NexusError, 'Unknown taxon identifier: %s' % identifier 1101 else: 1102 if n>0 and n<=self.ntax: 1103 return self.taxlabels[n-1] 1104 else: 1105 raise NexusError, 'Illegal taxon identifier: %d>ntax (=%d).' % (identifier,self.ntax) 1106 else: 1107 raise NexusError('Unknown set specification: %s.'% set_type)
1108
1109 - def _stateset(self, options):
1110 #Not implemented 1111 pass
1112
1113 - def _changeset(self, options):
1114 #Not implemented 1115 pass
1116
1117 - def _treeset(self, options):
1118 #Not implemented 1119 pass
1120
1121 - def _treepartition(self, options):
1122 #Not implemented 1123 pass
1124
1125 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1126 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1127 """Writes a nexus file for each partition in charpartition. 1128 Only non-excluded characters and non-deleted taxa are included, just the data block is written. 1129 """ 1130 1131 if not matrix: 1132 matrix=self.matrix 1133 if not matrix: 1134 return 1135 if not filename: 1136 filename=self.filename 1137 if charpartition: 1138 pfilenames={} 1139 for p in charpartition: 1140 total_exclude=[]+exclude 1141 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1142 total_exclude=_make_unique(total_exclude) 1143 pcomment=comment+'\nPartition: '+p+'\n' 1144 dot=filename.rfind('.') 1145 if dot>0: 1146 pfilename=filename[:dot]+'_'+p+'.data' 1147 else: 1148 pfilename=filename+'_'+p 1149 pfilenames[p]=pfilename 1150 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1151 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1152 mrbayes=mrbayes) 1153 return pfilenames 1154 else: 1155 fn=self.filename+'.data' 1156 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1157 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1158 mrbayes=mrbayes) 1159 return fn
1160
1161 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1162 blocksize=None, interleave=False, interleave_by_partition=False,\ 1163 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False):
1164 """ Writes a nexus file with data and sets block. Character sets and partitions 1165 are appended by default, and are adjusted according 1166 to excluded characters (i.e. character sets still point to the same sites (not necessarily same positions), 1167 without including the deleted characters. 1168 """ 1169 if not matrix: 1170 matrix=self.matrix 1171 if not matrix: 1172 return 1173 if not filename: 1174 filename=self.filename 1175 if [t for t in delete if not self._check_taxlabels(t)]: 1176 raise NexusError, 'Unknown taxa: %s' % ', '.join(sets.Set(delete).difference(sets.Set(self.taxlabels))) 1177 if interleave_by_partition: 1178 if not interleave_by_partition in self.charpartitions: 1179 raise NexusError, 'Unknown partition: '+interleave_by_partition 1180 else: 1181 partition=self.charpartitions[interleave_by_partition] 1182 # we need to sort the partition names by starting position before we exclude characters 1183 names=_sort_keys_by_values(partition) 1184 newpartition={} 1185 for p in partition: 1186 newpartition[p]=[c for c in partition[p] if c not in exclude] 1187 # how many taxa and how many characters are left? 1188 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1189 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1190 ntax_adjusted=len(undelete) 1191 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1192 if not undelete or (undelete and undelete[0]==''): 1193 return 1194 if isinstance(filename,str): 1195 try: 1196 fh=open(filename,'w') 1197 except IOError: 1198 raise NexusError, 'Could not open %s for writing.' % filename 1199 elif isinstance(filename,file): 1200 fh=filename 1201 if not omit_NEXUS: 1202 fh.write('#NEXUS\n') 1203 if comment: 1204 fh.write('['+comment+']\n') 1205 fh.write('begin data;\n') 1206 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1207 fh.write('\tformat datatype='+self.datatype) 1208 if self.respectcase: 1209 fh.write(' respectcase') 1210 if self.missing: 1211 fh.write(' missing='+self.missing) 1212 if self.gap: 1213 fh.write(' gap='+self.gap) 1214 if self.matchchar: 1215 fh.write(' matchchar='+self.matchchar) 1216 if self.labels: 1217 fh.write(' labels='+self.labels) 1218 if self.equate: 1219 fh.write(' equate='+self.equate) 1220 if interleave or interleave_by_partition: 1221 fh.write(' interleave') 1222 fh.write(';\n') 1223 #if self.taxlabels: 1224 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1225 if self.charlabels: 1226 newcharlabels=self._adjust_charlabels(exclude=exclude) 1227 clkeys=newcharlabels.keys() 1228 clkeys.sort() 1229 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1230 fh.write('matrix\n') 1231 if not blocksize: 1232 if interleave: 1233 blocksize=70 1234 else: 1235 blocksize=self.nchar 1236 # delete deleted taxa and ecxclude excluded characters... 1237 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1238 if interleave_by_partition: 1239 # interleave by partitions, but adjust partitions with regard to excluded characters 1240 seek=0 1241 for p in names: 1242 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1243 if len(newpartition[p])>0: 1244 for taxon in undelete: 1245 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1246 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1247 fh.write('\n') 1248 else: 1249 fh.write('[empty]\n\n') 1250 seek+=len(newpartition[p]) 1251 elif interleave: 1252 for seek in range(0,nchar_adjusted,blocksize): 1253 for taxon in undelete: 1254 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1255 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1256 fh.write('\n') 1257 else: 1258 for taxon in undelete: 1259 if blocksize<nchar_adjusted: 1260 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1261 else: 1262 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1263 for seek in range(0,nchar_adjusted,blocksize): 1264 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1265 fh.write(';\nend;\n') 1266 if append_sets: 1267 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1268 fh.close() 1269 return filename
1270
1271 - def append_sets(self,exclude=[],delete=[],mrbayes=False):
1272 """Appends a sets block to <filename>.""" 1273 if not self.charsets and not self.taxsets and not self.charpartitions: 1274 return '' 1275 sets=['\nbegin sets'] 1276 # - now if characters have been excluded, the character sets need to be adjusted, 1277 # so that they still point to the right character positions 1278 # calculate a list of offsets: for each deleted character, the following character position 1279 # in the new file will have an additional offset of -1 1280 offset=0 1281 offlist=[] 1282 for c in range(self.nchar): 1283 if c in exclude: 1284 offset+=1 1285 offlist.append(-1) # dummy value as these character positions are excluded 1286 else: 1287 offlist.append(c-offset) 1288 # now adjust each of the character sets 1289 for n,ns in self.charsets.items(): 1290 cset=[offlist[c] for c in ns if c not in exclude] 1291 if cset: 1292 sets.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1293 for n,s in self.taxsets.items(): 1294 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1295 if tset: 1296 sets.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1297 for n,p in self.charpartitions.items(): 1298 # as characters have been excluded, the partitions must be adjusted 1299 # if a partition is empty, it will be omitted from the charpartition command 1300 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1301 names=_sort_keys_by_values(p) 1302 newpartition={} 1303 for sn in names: 1304 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1305 if nsp: 1306 newpartition[sn]=nsp 1307 if newpartition: 1308 sets.append('charpartition %s = %s' % (safename(n),\ 1309 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1310 # now write charpartititions, much easier than charpartitions 1311 for n,p in self.taxpartitions.items(): 1312 names=_sort_keys_by_values(p) 1313 newpartition={} 1314 for sn in names: 1315 nsp=[t for t in p[sn] if t not in delete] 1316 if nsp: 1317 newpartition[sn]=nsp 1318 if newpartition: 1319 sets.append('taxpartition %s = %s' % (safename(n),\ 1320 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1321 # add 'end' and return everything 1322 sets.append('end;\n') 1323 return ';\n'.join(sets) 1324 f.close()
1325
1326 - def export_fasta(self, filename=None, width=70):
1327 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1328 if not filename: 1329 if '.' in filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1330 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1331 else: 1332 filename=self.filename+'.fas' 1333 fh=open(filename,'w') 1334 for taxon in self.taxlabels: 1335 fh.write('>'+safename(taxon)+'\n') 1336 for i in range(0, len(self.matrix[taxon].tostring()), width): 1337 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1338 fh.close()
1339
1340 - def constant(self,matrix=None,delete=[],exclude=[]):
1341 """Return a list with all constant characters.""" 1342 if not matrix: 1343 matrix=self.matrix 1344 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1345 if not undelete: 1346 return None 1347 elif len(undelete)==1: 1348 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1349 # get the first sequence and expand all ambiguous values 1350 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1351 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1352 for taxon in undelete[1:]: 1353 newconstant=[] 1354 for site in constant: 1355 #print '%d (paup=%d)' % (site[0],site[0]+1), 1356 seqsite=matrix[taxon][site[0]].upper() 1357 #print seqsite,'checked against',site[1],'\t', 1358 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1359 # missing or same as before -> ok 1360 newconstant.append(site) 1361 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1362 # subset of an ambig or only missing in previous -> take subset 1363 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1364 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1365 intersect=sets.Set(self.ambiguous_values[seqsite]).intersection(sets.Set(site[1])) 1366 if intersect: 1367 newconstant.append((site[0],''.join(intersect))) 1368 # print 'ok' 1369 #else: 1370 # print 'failed' 1371 #else: 1372 # print 'failed' 1373 constant=newconstant 1374 cpos=[s[0] for s in constant] 1375 return constant
1376 # return [x[0] for x in constant] 1377
1378 - def cstatus(self,site,delete=[],narrow=True):
1379 """Summarize character. 1380 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1381 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1382 """ 1383 undelete=[t for t in self.taxlabels if t not in delete] 1384 if not undelete: 1385 return None 1386 cstatus=[] 1387 for t in undelete: 1388 c=self.matrix[t][site].upper() 1389 if self.options.get('gapmode')=='missing' and c==self.gap: 1390 c=self.missing 1391 if narrow and c==self.missing: 1392 if c not in cstatus: 1393 cstatus.append(c) 1394 else: 1395 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1396 if self.missing in cstatus and narrow and len(cstatus)>1: 1397 cstatus=[c for c in cstatus if c!=self.missing] 1398 cstatus.sort() 1399 return cstatus
1400
1401 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1402 """Calculates a stepmatrix for weighted parsimony. 1403 See Wheeler (1990), Cladistics 6:269-275 and 1404 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1405 """ 1406 m=StepMatrix(self.unambiguous_letters,self.gap) 1407 for site in [s for s in range(self.nchar) if s not in exclude]: 1408 cstatus=self.cstatus(site,delete) 1409 for i,b1 in enumerate(cstatus[:-1]): 1410 for b2 in cstatus[i+1:]: 1411 m.add(b1.upper(),b2.upper(),1) 1412 return m.transformation().weighting().smprint(name=name)
1413
1414 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1415 """Return a matrix without deleted taxa and excluded characters.""" 1416 if not matrix: 1417 matrix=self.matrix 1418 if [t for t in delete if not self._check_taxlabels(t)]: 1419 raise NexusError, 'Unknwon taxa: %s' % ', '.join(sets.Set(delete).difference(self.taxlabels)) 1420 if exclude!=[]: 1421 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1422 if not undelete: 1423 return {} 1424 m=[matrix[k].tostring() for k in undelete] 1425 zipped_m=zip(*m) 1426 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1427 if sitesm==[]: 1428 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1429 else: 1430 zipped_sitesm=zip(*sitesm) 1431 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1432 return dict(zip(undelete,m)) 1433 else: 1434 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1435
1436 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1437 """Return a bootstrapped matrix.""" 1438 if not matrix: 1439 matrix=self.matrix 1440 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1441 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1442 if not cm: # everything deleted? 1443 return {} 1444 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1445 return cm 1446 undelete=[t for t in self.taxlabels if t in cm] 1447 if seqobjects: 1448 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1449 alphabet=matrix[matrix.keys()[0]].alphabet 1450 else: 1451 sitesm=zip(*[cm[t] for t in undelete]) 1452 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1453 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1454 if seqobjects: 1455 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1456 return dict(zip(undelete,bootstrapseqs))
1457
1458 - def add_sequence(self,name,sequence):
1459 """Adds a sequence to the matrix.""" 1460 if not name: 1461 raise NexusError, 'New sequence must have a name' 1462 diff=self.nchar-len(sequence) 1463 if diff<0: 1464 self.insert_gap(self.nchar,-diff) 1465 elif diff>0: 1466 sequence+=self.missing*diff 1467 1468 self.matrix[name]=Seq(sequence,self.alphabet) 1469 self.ntax+=1 1470 self.taxlabels.append(name)
1471 #taxlabels? 1472
1473 - def insert_gap(self,pos,n=1,leftgreedy=False):
1474 """Add a gap into the matrix and adjust charsets and partitions. 1475 1476 pos=0: first position 1477 pos=nchar: last position 1478 """ 1479 1480 def _adjust(set,x,d,leftgreedy=False): 1481 """Adjusts chartacter sets if gaps are inserted, taking care of 1482 new gaps within a coherent character set.""" 1483 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1484 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1485 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1486 set.sort() 1487 addpos=0 1488 for i,c in enumerate(set): 1489 if c>=x: 1490 set[i]=c+d 1491 # if we add gaps within a group of characters, we want the gap position included in this group 1492 if c==x: 1493 if leftgreedy or (i>0 and set[i-1]==c-1): 1494 addpos=i 1495 if addpos>0: 1496 set[addpos:addpos]=range(x,x+d) 1497 return set
1498 1499 if pos<0 or pos>self.nchar: 1500 raise NexusError('Illegal gap position: %d' % pos) 1501 if n==0: 1502 return 1503 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1504 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1505 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1506 # i,taxon in enumerate(self.taxlabels)]) 1507 zipped=zip(*sitesm) 1508 mapped=map(''.join,zipped) 1509 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1510 self.matrix=dict(listed) 1511 self.nchar+=n 1512 # now adjust character sets 1513 for i,s in self.charsets.items(): 1514 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1515 for p in self.charpartitions: 1516 for sp,s in self.charpartitions[p].items(): 1517 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1518 # now adjust character state labels 1519 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1520 return self.charlabels 1521
1522 - def _adjust_charlabels(self,exclude=None,insert=None):
1523 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1524 if exclude and insert: 1525 raise NexusError, 'Can\'t exclude and insert at the same time' 1526 if not self.charlabels: 1527 return None 1528 labels=self.charlabels.keys() 1529 labels.sort() 1530 newcharlabels={} 1531 if exclude: 1532 exclude.sort() 1533 exclude.append(sys.maxint) 1534 excount=0 1535 for c in labels: 1536 if not c in exclude: 1537 while c>exclude[excount]: 1538 excount+=1 1539 newcharlabels[c-excount]=self.charlabels[c] 1540 elif insert: 1541 insert.sort() 1542 insert.append(sys.maxint) 1543 icount=0 1544 for c in labels: 1545 while c>=insert[icount]: 1546 icount+=1 1547 newcharlabels[c+icount]=self.charlabels[c] 1548 else: 1549 return self.charlabels 1550 return newcharlabels
1551
1552 - def invert(self,charlist):
1553 """Returns all character indices that are not in charlist.""" 1554 return [c for c in range(self.nchar) if c not in charlist]
1555
1556 - def gaponly(self,include_missing=False):
1557 """Return gap-only sites.""" 1558 gap=sets.Set(self.gap) 1559 if include_missing: 1560 gap.add(self.missing) 1561 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1562 gaponly=[i for i,site in enumerate(sitesm) if sets.Set(site).issubset(gap)] 1563 return gaponly
1564
1565 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1566 """Replaces all terminal gaps with missing character. 1567 1568 Mixtures like ???------??------- are properly resolved.""" 1569 1570 if not missing: 1571 missing=self.missing 1572 replace=[self.missing,self.gap] 1573 if not skip_n: 1574 replace.extend(['n','N']) 1575 for taxon in self.taxlabels: 1576 sequence=self.matrix[taxon].tostring() 1577 length=len(sequence) 1578 start,end=get_start_end(sequence,skiplist=replace) 1579 if start==-1 and end==-1: 1580 sequence=missing*length 1581 else: 1582 sequence=sequence[:end+1]+missing*(length-end-1) 1583 sequence=start*missing+sequence[start:] 1584 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1585 self.matrix[taxon]=Seq(sequence,self.alphabet)
1586