1
2
3
4
5
6
7
8
9 """
10 A module for finite state automata.
11 Operations are based on Aho, Sethi & Ullman (1986) Chapter 3.
12 """
13
14 from nltk_lite import tokenize
15 from nltk_lite.parse.tree import Tree
16 from nltk_lite.parse import cfg, pcfg, pchart
17 import yaml
18
19 epsilon = None
20
21
22
23
24
25 -class FSA(yaml.YAMLObject):
26 """
27 A class for finite state automata. In general, it represents
28 nondetermnistic finite state automata, with DFAs being a special case.
29 """
30 yaml_tag = '!FSA'
31 - def __init__(self, sigma='', transitions=None, start=0, finals=None):
32 """Set up the FSA.
33
34 @param sigma: the alphabet of the FSA
35 @type sigma: sequence
36 @param transitions: A dictionary representing the states and
37 transitions in the FSA. The keys are state identifiers (any hashable
38 object), and the values are dictionaries that map input symbols to the
39 sets of states they lead to.
40 @type transitions: dict
41 @param start: The identifier of the start state
42 @type start: hashable object
43 @param finals: The identifiers of the accept states
44 @type finals: sequence
45 """
46 self._transitions = transitions or {}
47 self._start = start
48 self._reverse = {}
49 self._build_reverse_transitions()
50 if finals: self._finals = set(finals)
51 else: self._finals = set([0])
52 self._sigma = set(sigma)
53 assert isinstance(self._transitions, dict)
54 self._next_state_num = 0
55
61
63 """
64 A generator that yields each transition arrow in the FSA in the form
65 (source, label, target).
66 """
67 for (state, map) in self._transitions.items():
68 for (symbol, targets) in map.items():
69 for target in targets:
70 yield (state, symbol, target)
71
73 """
74 A generator for all possible labels taking state s1 to state s2.
75 """
76 map = self._transitions.get(s1, {})
77 for (symbol, targets) in map.items():
78 if s2 in targets: yield symbol
79
81 "The alphabet of the FSA."
82 return self._sigma
83 alphabet = sigma
84
86 "Check whether a given object is in the alphabet."
87 if label and label not in self._sigma:
88 raise ValueError('Label "%s" not in alphabet: %s' % (label, str(self._sigma)))
89
91 "The number of states in the FSA."
92 return len(self._transitions)
93
95 """
96 Add a new state to the FSA.
97 @returns: the ID of the new state (a sequentially-assigned number).
98 @rtype: int
99 """
100 while self._next_state_num in self._transitions:
101 self._next_state_num += 1
102 self._transitions[self._next_state_num] = {}
103 self._reverse[self._next_state_num] = {}
104 return self._next_state_num
105
107 self._transitions[name] = {}
108 self._reverse[name] = {}
109 return name
110
112 """
113 @returns: the ID of the FSA's start state.
114 """
115 return self._start
116
118 """
119 @returns: the IDs of all accept states.
120 @rtype: set
121 """
122
123 return self._finals
124
126 """
127 @returns: the IDs of all accept states.
128 @rtype: set
129 """
130
131 return set(self.states()).difference(self._finals)
132
134 """
135 @returns: a list of all states in the FSA.
136 @rtype: list
137 """
138 return self._transitions.keys()
139
141 """
142 Make a state into an accept state.
143 """
144 self._finals.add(state)
145
147 """
148 Make an accept state no longer be an accept state.
149 """
150 self._finals = self._finals.difference(set([state]))
151
152
154 """
155 Set the list of accept states.
156 """
157 self._finals = set(states)
158
160 """
161 Set the start state of the FSA.
162 """
163 self._start = start
164
166 """
167 Check whether a sequence contains any final states.
168 """
169 return [state for state in list
170 if state in self.finals()] != []
171
178
179 - def insert(self, s1, label, s2):
180 """
181 Add a new transition to the FSA.
182
183 @param s1: the source of the transition
184 @param label: the element of the alphabet that labels the transition
185 @param s2: the destination of the transition
186 """
187 if s1 not in self.states():
188 raise ValueError, "State %s does not exist in %s" % (s1,
189 self.states())
190 if s2 not in self.states():
191 raise ValueError, "State %s does not exist in %s" % (s2,
192 self.states())
193 self._add_transition(self._transitions, s1, label, s2)
194 self._add_transition(self._reverse, s2, label, s1)
195
200
206
207 - def delete(self, s1, label, s2):
208 """
209 Removes a transition from the FSA.
210
211 @param s1: the source of the transition
212 @param label: the element of the alphabet that labels the transition
213 @param s2: the destination of the transition
214 """
215 if s1 not in self.states():
216 raise ValueError, "State %s does not exist" % s1
217 if s2 not in self.states():
218 raise ValueError, "State %s does not exist" % s1
219 self._del_transition(self._transitions, s1, label, s2)
220 self._del_transition(self._reverse, s2, label, s1)
221
223 "Removes a state and all its transitions from the FSA."
224 if state not in self.states():
225 raise ValueError, "State %s does not exist" % state
226 for (s1, label, s2) in self.incident_transitions(state):
227 self.delete(s1, label, s2)
228 del self._transitions[state]
229 del self._reverse[state]
230
232 """
233 @returns: a set of transitions into or out of a state.
234 @rtype: set
235 """
236 result = set()
237 forward = self._transitions[state]
238 backward = self._reverse[state]
239 for label, targets in forward.items():
240 for target in targets:
241 result.add((state, label, target))
242 for label, targets in backward.items():
243 for target in targets:
244 result.add((target, label, state))
245 return result
246
248 """
249 Assigns a state a new identifier.
250 """
251 if old not in self.states():
252 raise ValueError, "State %s does not exist" % old
253 if new in self.states():
254 raise ValueError, "State %s already exists" % new
255 changes = []
256 for (s1, symbol, s2) in self.generate_transitions():
257 if s1 == old and s2 == old:
258 changes.append((s1, symbol, s2, new, symbol, new))
259 elif s1 == old:
260 changes.append((s1, symbol, s2, new, symbol, s2))
261 elif s2 == old:
262 changes.append((s1, symbol, s2, s1, symbol, new))
263 for (leftstate, symbol, rightstate, newleft, newsym, newright)\
264 in changes:
265 print leftstate, symbol, rightstate, newleft, newsym, newright
266 self.delete(leftstate, symbol, rightstate)
267 self.insert_safe(newleft, newsym, newright)
268 del self._transitions[old]
269 del self._reverse[old]
270
271 - def next(self, state, symbol):
272 "The set of states reached from a certain state via a given symbol."
273 return self.e_closure(self._transitions[state].get(symbol, set()))
274 nextStates = next
275
276 - def move(self, states, symbol):
277 "The set of states reached from a set of states via a given symbol."
278 result = set()
279 for state in states:
280 result = result.union(self.next(state, symbol))
281 return self.e_closure(result)
282
284 """
285 Return whether this is a DFA
286 (every symbol leads from a state to at most one target state).
287 """
288 for map in self._transitions.values():
289 for targets in map.values():
290 if len(targets) > 1: return False
291 return True
292
294 """
295 The single state reached from a state via a given symbol.
296 If there is more than one such state, raises a ValueError.
297 If there is no such state, returns None.
298 """
299 next = self.next(state, symbol)
300 if len(next) > 1:
301 raise ValueError, "This FSA is nondeterministic -- use nextStates instead."
302 elif len(next) == 1: return list(next)[0]
303 else: return None
304
306 "All states reachable by following transitions from a given state."
307 result = set()
308 for (symbol, targets) in self._transitions[state].items():
309 result = result.union(targets)
310 return result
311
313 """All states from which a given state is reachable by following
314 transitions."""
315 result = set()
316 for (symbol, targets) in self._reverse[state].items():
317 result = result.union(targets)
318 return result
319
326
333
334
336 """
337 Modifies an FSA to remove inaccessible states and unused transitions.
338 """
339 acc = self.accessible()
340 for state in self.states():
341 if state not in acc:
342 self.delete_state(state)
343 else:
344 self._clean_map(self._transitions[state])
345 self._clean_map(self._reverse[state])
346
351
352
354 acc = set()
355 for final in self.finals():
356 reverse_acc = set([final])
357 self._reverse_accessible(final, reverse_acc)
358 acc = acc.union(reverse_acc)
359
360 forward_acc = set([self.start()])
361 self._forward_accessible(self.start(), forward_acc)
362
363 acc = acc.intersection(forward_acc)
364 return acc
365
367 """
368 Given a set of states, return the set of states reachable from
369 those states by following epsilon transitions.
370
371 @param states: the initial set of states
372 @type states: sequence
373 @returns: a superset of the given states, reachable by epsilon
374 transitions
375 @rtype: set
376 """
377 stack = list(states)
378 closure = list(states)
379 while stack:
380 s1 = stack.pop()
381 for s2 in self.next(s1, epsilon):
382 if s2 not in closure:
383 closure.append(s2)
384 stack.append(s2)
385 return set(closure)
386
387
388
390 "Return a DFA that is equivalent to this FSA."
391 dfa = FSA(self.sigma())
392 dfa_initial = dfa.start()
393 nfa_initial = tuple(self.e_closure((self.start(),)))
394 map = {}
395 map[dfa_initial] = nfa_initial
396 map[nfa_initial] = dfa_initial
397 if nfa_initial in self.finals():
398 dfa.add_final(dfa_initial)
399 unmarked = [dfa_initial]
400 marked = []
401 while unmarked:
402 dfa_state = unmarked.pop()
403 marked.append(dfa_state)
404
405 if self.in_finals(self.e_closure(map[dfa_state])):
406 dfa.add_final(dfa_state)
407 for label in self.sigma():
408 nfa_next = tuple(self.e_closure(self.move(map[dfa_state],
409 label)))
410 if map.has_key(nfa_next):
411 dfa_next = map[nfa_next]
412 else:
413 dfa_next = dfa.new_state()
414 map[dfa_next] = nfa_next
415 map[nfa_next] = dfa_next
416 if self.in_finals(nfa_next):
417 dfa.add_final(dfa_next)
418 unmarked.append(dfa_next)
419 dfa.insert(dfa_state, label, dfa_next)
420 return dfa
421
422 - def generate(self, maxlen, state=0, prefix=""):
423 "Generate all accepting sequences of length at most maxlen."
424 if maxlen > 0:
425 if state in self._finals:
426 print prefix
427 for (s1, labels, s2) in self.outgoing_transitions(state):
428 for label in labels():
429 self.generate(maxlen-1, s2, prefix+label)
430
432 """
433 Print a representation of this FSA (in human-readable YAML format).
434 """
435 print yaml.dump(self)
436
437 @classmethod
439 map = loader.construct_mapping(node)
440 result = cls(map.get('sigma', []), {}, map.get('finals', []))
441 for (s1, map1) in map['transitions'].items():
442 for (symbol, targets) in map1.items():
443 for s2 in targets:
444 result.insert(s1, symbol, s2)
445 return result
446
447 @classmethod
463
464 - def show_pygraph(self, title='FSA', outfile=None, labels=True, root=None):
465 from pygraph import pygraph, tkgraphview
466 graph = pygraph.Grapher('directed')
467
468 for state in self.states():
469 color = '#eee'
470 if state in self.finals():
471 shape = 'oval'
472 else:
473 shape = 'rect'
474 if state == self.start():
475 color = '#afa'
476 term = ''
477 if state == self.start(): term = 'start'
478 elif state == 'End': term = 'end'
479 if state in [0, '0', 'reject', 'Reject']: color='#e99'
480
481 graph.addNode(state, state, color, shape, term)
482
483
484 for source, label, target in self.generate_transitions():
485 if not labels: label = ''
486 graph.addEdge(source, target, label, color='black', dup=False)
487
488 if outfile is None: outfile = title
489
490 return tkgraphview.tkGraphView(graph, title, outfile, root=root,
491 startTk=(not root))
492
494 return yaml.dump(self)
495
496
497
498
499
500
501
503 (S, Expr, Star, Plus, Qmk, Paren) = [cfg.Nonterminal(s) for s in 'SE*+?(']
504 rules = [pcfg.WeightedProduction(Expr, [Star], prob=0.2),
505 pcfg.WeightedProduction(Expr, [Plus], prob=0.2),
506 pcfg.WeightedProduction(Expr, [Qmk], prob=0.2),
507 pcfg.WeightedProduction(Expr, [Paren], prob=0.2),
508 pcfg.WeightedProduction(S, [Expr], prob=0.5),
509 pcfg.WeightedProduction(S, [S, Expr], prob=0.5),
510 pcfg.WeightedProduction(Star, [Expr, '*'], prob=1),
511 pcfg.WeightedProduction(Plus, [Expr, '+'], prob=1),
512 pcfg.WeightedProduction(Qmk, [Expr, '?'], prob=1),
513 pcfg.WeightedProduction(Paren, ['(', S, ')'], prob=1)]
514
515 prob_term = 0.2/len(terminals)
516 for terminal in terminals:
517 rules.append(pcfg.WeightedProduction(Expr, [terminal], prob=prob_term))
518
519 return pcfg.WeightedGrammar(S, rules)
520
521 _parser = pchart.InsideParse(grammar('abcde'))
522
523
524
525
532
533
535
536 if not isinstance(tree, Tree):
537 return re2nfa_char(fsa, node, tree)
538 elif len(tree) == 1:
539 return re2nfa_build(fsa, node, tree[0])
540 elif tree.node == '(':
541 return re2nfa_build(fsa, node, tree[1])
542 elif tree.node == '*': return re2nfa_star(fsa, node, tree[0])
543 elif tree.node == '+': return re2nfa_plus(fsa, node, tree[0])
544 elif tree.node == '?': return re2nfa_qmk(fsa, node, tree[0])
545 else:
546 node = re2nfa_build(fsa, node, tree[0])
547 return re2nfa_build(fsa, node, tree[1])
548
553
562
567
569 node1 = fsa.new_state()
570 node2 = re2nfa_build(fsa, node1, tree)
571 node3 = fsa.new_state()
572 fsa.insert(node, epsilon, node1)
573 fsa.insert(node, epsilon, node3)
574 fsa.insert(node2, epsilon, node1)
575 fsa.insert(node2, epsilon, node3)
576 return node3
577
578
579
580
581
583 """
584 A demonstration showing how FSAs can be created and used.
585 """
586
587 alphabet = "abcd"
588
589
590 fsa = FSA(alphabet)
591
592
593 re = 'abcd'
594 print 'Regular Expression:', re
595 re2nfa(fsa, re)
596 print "NFA:"
597 fsa.pp()
598
599
600 dfa = fsa.dfa()
601 print "DFA:"
602 dfa.pp()
603
604
605 dfa.prune()
606 print "PRUNED DFA:"
607 dfa.pp()
608
609
610
611
612
613 if __name__ == '__main__': demo()
614