Package nltk_lite :: Package contrib :: Module fsa
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.fsa

  1  # Natural Language Toolkit: Finite State Automata 
  2  # 
  3  # Copyright (C) 2001-2006 University of Pennsylvania 
  4  # Authors: Steven Bird <sb@ldc.upenn.edu> 
  5  #          Rob Speer <rspeer@mit.edu> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A module for finite state automata.  
 11  Operations are based on Aho, Sethi & Ullman (1986) Chapter 3. 
 12  """ 
 13   
 14  from nltk_lite import tokenize 
 15  from nltk_lite.parse.tree import Tree 
 16  from nltk_lite.parse import cfg, pcfg, pchart 
 17  import yaml 
 18   
 19  epsilon = None 
 20   
 21  # some helper functions 
 22   
 23  # TODO - check that parse was complete, and report error otherwise 
 24   
25 -class FSA(yaml.YAMLObject):
26 """ 27 A class for finite state automata. In general, it represents 28 nondetermnistic finite state automata, with DFAs being a special case. 29 """ 30 yaml_tag = '!FSA'
31 - def __init__(self, sigma='', transitions=None, start=0, finals=None):
32 """Set up the FSA. 33 34 @param sigma: the alphabet of the FSA 35 @type sigma: sequence 36 @param transitions: A dictionary representing the states and 37 transitions in the FSA. The keys are state identifiers (any hashable 38 object), and the values are dictionaries that map input symbols to the 39 sets of states they lead to. 40 @type transitions: dict 41 @param start: The identifier of the start state 42 @type start: hashable object 43 @param finals: The identifiers of the accept states 44 @type finals: sequence 45 """ 46 self._transitions = transitions or {0: {}} 47 self._start = start 48 self._reverse = {} 49 self._build_reverse_transitions() 50 if finals: self._finals = set(finals) 51 else: self._finals = set([0]) 52 self._sigma = set(sigma) 53 assert isinstance(self._transitions, dict) 54 self._next_state_num = 0
55
57 for state in self._transitions: 58 self._reverse.setdefault(state, {}) 59 for (state, symbol, target) in self.generate_transitions(): 60 self._add_transition(self._reverse, target, symbol, state)
61
62 - def generate_transitions(self):
63 """ 64 A generator that yields each transition arrow in the FSA in the form 65 (source, label, target). 66 """ 67 for (state, map) in self._transitions.items(): 68 for (symbol, targets) in map.items(): 69 for target in targets: 70 yield (state, symbol, target)
71
72 - def labels(self, s1, s2):
73 """ 74 A generator for all possible labels taking state s1 to state s2. 75 """ 76 map = self._transitions.get(s1, {}) 77 for (symbol, targets) in map.items(): 78 if s2 in targets: yield symbol
79
80 - def sigma(self):
81 "The alphabet of the FSA." 82 return self._sigma
83 alphabet = sigma 84
85 - def check_in_sigma(self, label):
86 "Check whether a given object is in the alphabet." 87 if label and label not in self._sigma: 88 raise ValueError('Label "%s" not in alphabet: %s' % (label, str(self._sigma)))
89
90 - def __len__(self):
91 "The number of states in the FSA." 92 return len(self._transitions)
93
94 - def new_state(self):
95 """ 96 Add a new state to the FSA. 97 @returns: the ID of the new state (a sequentially-assigned number). 98 @rtype: int 99 """ 100 while self._next_state_num in self._transitions: 101 self._next_state_num += 1 102 self._transitions[self._next_state_num] = {} 103 self._reverse[self._next_state_num] = {} 104 return self._next_state_num
105
106 - def add_state(self, name):
107 self._transitions[name] = {} 108 self._reverse[name] = {} 109 return name
110
111 - def start(self):
112 """ 113 @returns: the ID of the FSA's start state. 114 """ 115 return self._start
116
117 - def finals(self):
118 """ 119 @returns: the IDs of all accept states. 120 @rtype: set 121 """ 122 # was a tuple before 123 return self._finals
124
125 - def states(self):
126 """ 127 @returns: a list of all states in the FSA. 128 @rtype: list 129 """ 130 return self._transitions.keys()
131
132 - def add_final(self, state):
133 """ 134 Make a state into an accept state. 135 """ 136 self._finals.add(state)
137
138 - def delete_final(self, state):
139 """ 140 Make an accept state no longer be an accept state. 141 """ 142 self._finals = self._finals.difference(set([state]))
143 # del self._finals[state] 144
145 - def set_final(self, states):
146 """ 147 Set the list of accept states. 148 """ 149 self._finals = set(states)
150
151 - def set_start(self, start):
152 """ 153 Set the start state of the FSA. 154 """ 155 self._start = start
156
157 - def in_finals(self, list):
158 """ 159 Check whether a sequence contains any final states. 160 """ 161 return [state for state in list 162 if state in self.finals()] != []
163
164 - def insert_safe(self, s1, label, s2):
165 if s1 not in self.states(): 166 self.add_state(s1) 167 if s2 not in self.states(): 168 self.add_state(s2) 169 self.insert(s1, label, s2)
170
171 - def insert(self, s1, label, s2):
172 """ 173 Add a new transition to the FSA. 174 175 @param s1: the source of the transition 176 @param label: the element of the alphabet that labels the transition 177 @param s2: the destination of the transition 178 """ 179 if s1 not in self.states(): 180 raise ValueError, "State %s does not exist" % s1 181 if s2 not in self.states(): 182 raise ValueError, "State %s does not exist" % s1 183 self._add_transition(self._transitions, s1, label, s2) 184 self._add_transition(self._reverse, s2, label, s1)
185
186 - def _add_transition(self, map, s1, label, s2):
187 mapping = map[s1] 188 targets = mapping.setdefault(label, set()) 189 targets.add(s2)
190
191 - def _del_transition(self, map, s1, label, s2):
192 mapping = map[s1] 193 targets = mapping.setdefault(label, set()) 194 targets.remove(s2) 195 if len(targets) == 0: del mapping[label]
196
197 - def delete(self, s1, label, s2):
198 """ 199 Removes a transition from the FSA. 200 201 @param s1: the source of the transition 202 @param label: the element of the alphabet that labels the transition 203 @param s2: the destination of the transition 204 """ 205 if s1 not in self.states(): 206 raise ValueError, "State %s does not exist" % s1 207 if s2 not in self.states(): 208 raise ValueError, "State %s does not exist" % s1 209 self._del_transition(self._transitions, s1, label, s2) 210 self._del_transition(self._reverse, s2, label, s1)
211
212 - def delete_state(self, state):
213 "Removes a state and all its transitions from the FSA." 214 if state not in self.states(): 215 raise ValueError, "State %s does not exist" % state 216 for (s1, label, s2) in self.incident_transitions(state): 217 self.delete(s1, label, s2) 218 del self._transitions[state] 219 del self._reverse[state]
220
221 - def incident_transitions(self, state):
222 """ 223 @returns: a set of transitions into or out of a state. 224 @rtype: set 225 """ 226 result = set() 227 forward = self._transitions[state] 228 backward = self._reverse[state] 229 for label, targets in forward.items(): 230 for target in targets: 231 result.add((state, label, target)) 232 for label, targets in backward.items(): 233 for target in targets: 234 result.add((target, label, state)) 235 return result
236
237 - def relabel_state(self, old, new):
238 """ 239 Assigns a state a new identifier. 240 """ 241 if old not in self.states(): 242 raise ValueError, "State %s does not exist" % old 243 if new in self.states(): 244 raise ValueError, "State %s already exists" % new 245 changes = [] 246 for (s1, symbol, s2) in self.generate_transitions(): 247 if s1 == old and s2 == old: 248 changes.append((s1, symbol, s2, new, symbol, new)) 249 elif s1 == old: 250 changes.append((s1, symbol, s2, new, symbol, s2)) 251 elif s2 == old: 252 changes.append((s1, symbol, s2, s1, symbol, new)) 253 for (leftstate, symbol, rightstate, newleft, newsym, newright)\ 254 in changes: 255 self.remove(leftstate, symbol, rightstate) 256 self.insert(newleft, newsym, newright) 257 del self._transitions[old] 258 del self._reverse[old]
259
260 - def next(self, state, symbol):
261 "The set of states reached from a certain state via a given symbol." 262 return self.e_closure(self._transitions[state].get(symbol, set()))
263 nextStates = next 264
265 - def move(self, states, symbol):
266 "The set of states reached from a set of states via a given symbol." 267 result = set() 268 for state in states: 269 result = result.union(self.next(state, symbol)) 270 return self.e_closure(result)
271
272 - def is_deterministic(self):
273 """ 274 Return whether this is a DFA 275 (every symbol leads from a state to at most one target state). 276 """ 277 for map in self._transitions.values(): 278 for targets in map.values(): 279 if len(targets) > 1: return False 280 return True
281
282 - def nextState(self, state, symbol):
283 """ 284 The single state reached from a state via a given symbol. 285 If there is more than one such state, raises a ValueError. 286 If there is no such state, returns None. 287 """ 288 next = self.next(state, symbol) 289 if len(next) > 1: 290 raise ValueError, "This FSA is nondeterministic -- use nextStates instead." 291 elif len(next) == 1: return list(next)[0] 292 else: return None
293
294 - def forward_traverse(self, state):
295 "All states reachable by following transitions from a given state." 296 result = set() 297 for (symbol, targets) in self._transitions[state].items(): 298 result = result.union(targets) 299 return result
300
301 - def reverse_traverse(self, state):
302 """All states from which a given state is reachable by following 303 transitions.""" 304 result = set() 305 for (symbol, targets) in self._reverse[state].items(): 306 result = result.union(targets) 307 return result
308
309 - def _forward_accessible(self, s1, visited):
310 for s2 in self.forward_traverse(s1): 311 if not s2 in visited: 312 visited.add(s2) 313 self._forward_accessible(s2, visited) 314 return visited
315
316 - def _reverse_accessible(self, s1, visited):
317 for s2 in self.reverse_traverse(s1): 318 if not s2 in visited: 319 visited.add(s2) 320 self._reverse_accessible(s2, visited) 321 return visited
322 323 # delete inaccessible nodes and unused transitions
324 - def prune(self):
325 """ 326 Modifies an FSA to remove inaccessible states and unused transitions. 327 """ 328 acc = self.accessible() 329 for state in self.states(): 330 if state not in acc: 331 self.delete_state(state) 332 else: 333 self._clean_map(self._transitions[state]) 334 self._clean_map(self._reverse[state])
335
336 - def _clean_map(self, map):
337 for (key, value) in map.items(): 338 if len(value) == 0: 339 del map[key]
340 341 # mark accessible nodes
342 - def accessible(self):
343 acc = set() 344 for final in self.finals(): 345 reverse_acc = set([final]) 346 self._reverse_accessible(final, reverse_acc) 347 acc = acc.union(reverse_acc) 348 349 forward_acc = set([self.start()]) 350 self._forward_accessible(self.start(), forward_acc) 351 352 acc = acc.intersection(forward_acc) 353 return acc
354
355 - def e_closure(self, states):
356 """ 357 Given a set of states, return the set of states reachable from 358 those states by following epsilon transitions. 359 360 @param states: the initial set of states 361 @type states: sequence 362 @returns: a superset of the given states, reachable by epsilon 363 transitions 364 @rtype: set 365 """ 366 stack = list(states) 367 closure = list(states) 368 while stack: 369 s1 = stack.pop() 370 for s2 in self.next(s1, epsilon): 371 if s2 not in closure: 372 closure.append(s2) 373 stack.append(s2) 374 return set(closure)
375 376 # return the corresponding DFA using subset construction (ASU p118) 377 # NB representation of (a*) still isn't minimal; should have 1 state not 2
378 - def dfa(self):
379 "Return a DFA that is equivalent to this FSA." 380 dfa = FSA(self.sigma()) 381 dfa_initial = dfa.start() 382 nfa_initial = tuple(self.e_closure((self.start(),))) 383 map = {} 384 map[dfa_initial] = nfa_initial 385 map[nfa_initial] = dfa_initial 386 if nfa_initial in self.finals(): 387 dfa.add_final(dfa_initial) 388 unmarked = [dfa_initial] 389 marked = [] 390 while unmarked: 391 dfa_state = unmarked.pop() 392 marked.append(dfa_state) 393 # is a final state accessible via epsilon transitions? 394 if self.in_finals(self.e_closure(map[dfa_state])): 395 dfa.add_final(dfa_state) 396 for label in self.sigma(): 397 nfa_next = tuple(self.e_closure(self.move(map[dfa_state], 398 label))) 399 if map.has_key(nfa_next): 400 dfa_next = map[nfa_next] 401 else: 402 dfa_next = dfa.new_state() 403 map[dfa_next] = nfa_next 404 map[nfa_next] = dfa_next 405 if self.in_finals(nfa_next): 406 dfa.add_final(dfa_next) 407 unmarked.append(dfa_next) 408 dfa.insert(dfa_state, label, dfa_next) 409 return dfa
410
411 - def generate(self, maxlen, state=0, prefix=""):
412 "Generate all accepting sequences of length at most maxlen." 413 if maxlen > 0: 414 if state in self._finals: 415 print prefix 416 for (s1, labels, s2) in self.outgoing_transitions(state): 417 for label in labels(): 418 self.generate(maxlen-1, s2, prefix+label)
419
420 - def pp(self):
421 """ 422 Print a representation of this FSA (in human-readable YAML format). 423 """ 424 print yaml.dump(self)
425 426 @classmethod
427 - def from_yaml(cls, loader, node):
428 map = loader.construct_mapping(node) 429 result = cls(map.get('sigma', []), {}, map.get('finals', [])) 430 for (s1, map1) in map['transitions'].items(): 431 for (symbol, targets) in map1.items(): 432 for s2 in targets: 433 result.insert(s1, symbol, s2) 434 return result
435 436 @classmethod
437 - def to_yaml(cls, dumper, data):
438 sigma = data.sigma() 439 transitions = {} 440 for (s1, symbol, s2) in data.generate_transitions(): 441 map1 = transitions.setdefault(s1, {}) 442 map2 = map1.setdefault(symbol, []) 443 map2.append(s2) 444 try: sigma = "".join(sigma) 445 except: sigma = list(sigma) 446 node = dumper.represent_mapping(cls.yaml_tag, dict( 447 sigma = sigma, 448 finals = list(data.finals()), 449 start = data._start, 450 transitions = transitions)) 451 return node
452
453 - def __str__(self):
454 return yaml.dump(self)
455 456 ### FUNCTIONS TO BUILD FSA FROM REGEXP 457 458 # the grammar of regular expressions 459 # (probabilities ensure that unary operators 460 # have stronger associativity than juxtaposition) 461
462 -def grammar(terminals):
463 (S, Expr, Star, Plus, Qmk, Paren) = [cfg.Nonterminal(s) for s in 'SE*+?('] 464 rules = [pcfg.WeightedProduction(Expr, [Star], prob=0.2), 465 pcfg.WeightedProduction(Expr, [Plus], prob=0.2), 466 pcfg.WeightedProduction(Expr, [Qmk], prob=0.2), 467 pcfg.WeightedProduction(Expr, [Paren], prob=0.2), 468 pcfg.WeightedProduction(S, [Expr], prob=0.5), 469 pcfg.WeightedProduction(S, [S, Expr], prob=0.5), 470 pcfg.WeightedProduction(Star, [Expr, '*'], prob=1), 471 pcfg.WeightedProduction(Plus, [Expr, '+'], prob=1), 472 pcfg.WeightedProduction(Qmk, [Expr, '?'], prob=1), 473 pcfg.WeightedProduction(Paren, ['(', S, ')'], prob=1)] 474 475 prob_term = 0.2/len(terminals) # divide remaining pr. mass 476 for terminal in terminals: 477 rules.append(pcfg.WeightedProduction(Expr, [terminal], prob=prob_term)) 478 479 return pcfg.WeightedGrammar(S, rules)
480 481 _parser = pchart.InsideParse(grammar('abcde')) 482 483 # create NFA from regexp (Thompson's construction) 484 # assumes unique start and final states 485
486 -def re2nfa(fsa, re):
487 tokens = tokenize.regexp(re, pattern=r'.') 488 tree = _parser.parse(tokens) 489 if tree is None: raise ValueError('Bad Regexp') 490 state = re2nfa_build(fsa, fsa.start(), tree) 491 fsa.set_final([state])
492 # fsa.minimize() 493
494 -def re2nfa_build(fsa, node, tree):
495 # Terminals. 496 if not isinstance(tree, Tree): 497 return re2nfa_char(fsa, node, tree) 498 elif len(tree) == 1: 499 return re2nfa_build(fsa, node, tree[0]) 500 elif tree.node == '(': 501 return re2nfa_build(fsa, node, tree[1]) 502 elif tree.node == '*': return re2nfa_star(fsa, node, tree[0]) 503 elif tree.node == '+': return re2nfa_plus(fsa, node, tree[0]) 504 elif tree.node == '?': return re2nfa_qmk(fsa, node, tree[0]) 505 else: 506 node = re2nfa_build(fsa, node, tree[0]) 507 return re2nfa_build(fsa, node, tree[1])
508
509 -def re2nfa_char(fsa, node, char):
510 new = fsa.new_state() 511 fsa.insert(node, char, new) 512 return new
513
514 -def re2nfa_qmk(fsa, node, tree):
515 node1 = fsa.new_state() 516 node2 = re2nfa_build(fsa, node1, tree) 517 node3 = fsa.new_state() 518 fsa.insert(node, epsilon, node1) 519 fsa.insert(node, epsilon, node3) 520 fsa.insert(node2, epsilon, node3) 521 return node3
522
523 -def re2nfa_plus(fsa, node, tree):
524 node1 = re2nfa_build(fsa, node, tree[0]) 525 fsa.insert(node1, epsilon, node) 526 return node1
527
528 -def re2nfa_star(fsa, node, tree):
529 node1 = fsa.new_state() 530 node2 = re2nfa_build(fsa, node1, tree) 531 node3 = fsa.new_state() 532 fsa.insert(node, epsilon, node1) 533 fsa.insert(node, epsilon, node3) 534 fsa.insert(node2, epsilon, node1) 535 fsa.insert(node2, epsilon, node3) 536 return node3
537 538 ################################################################# 539 # Demonstration 540 ################################################################# 541
542 -def demo():
543 """ 544 A demonstration showing how FSAs can be created and used. 545 """ 546 # Define an alphabet. 547 alphabet = "abcd" 548 549 # Create a new FSA. 550 fsa = FSA(alphabet) 551 552 # Use a regular expression to initialize the FSA. 553 re = 'abcd' 554 print 'Regular Expression:', re 555 re2nfa(fsa, re) 556 print "NFA:" 557 fsa.pp() 558 559 # Convert the (nondeterministic) FSA to a deterministic FSA. 560 dfa = fsa.dfa() 561 print "DFA:" 562 dfa.pp() 563 564 # Prune the DFA 565 dfa.prune() 566 print "PRUNED DFA:" 567 dfa.pp()
568 569 # Use the FSA to generate all strings of length less than 3 570 # (broken) 571 #fsa.generate(3) 572 573 if __name__ == '__main__': demo() 574