1
2
3
4
5
6
7
8
9 """
10 A module for finite state automata.
11 Operations are based on Aho, Sethi & Ullman (1986) Chapter 3.
12 """
13
14 from nltk_lite import tokenize
15 from nltk_lite.parse.tree import Tree
16 from nltk_lite.parse import cfg, pcfg, pchart
17 import yaml
18
19 epsilon = None
20
21
22
23
24
25 -class FSA(yaml.YAMLObject):
26 """
27 A class for finite state automata. In general, it represents
28 nondetermnistic finite state automata, with DFAs being a special case.
29 """
30 yaml_tag = '!FSA'
31 - def __init__(self, sigma='', transitions=None, start=0, finals=None):
32 """Set up the FSA.
33
34 @param sigma: the alphabet of the FSA
35 @type sigma: sequence
36 @param transitions: A dictionary representing the states and
37 transitions in the FSA. The keys are state identifiers (any hashable
38 object), and the values are dictionaries that map input symbols to the
39 sets of states they lead to.
40 @type transitions: dict
41 @param start: The identifier of the start state
42 @type start: hashable object
43 @param finals: The identifiers of the accept states
44 @type finals: sequence
45 """
46 self._transitions = transitions or {0: {}}
47 self._start = start
48 self._reverse = {}
49 self._build_reverse_transitions()
50 if finals: self._finals = set(finals)
51 else: self._finals = set([0])
52 self._sigma = set(sigma)
53 assert isinstance(self._transitions, dict)
54 self._next_state_num = 0
55
61
63 """
64 A generator that yields each transition arrow in the FSA in the form
65 (source, label, target).
66 """
67 for (state, map) in self._transitions.items():
68 for (symbol, targets) in map.items():
69 for target in targets:
70 yield (state, symbol, target)
71
73 """
74 A generator for all possible labels taking state s1 to state s2.
75 """
76 map = self._transitions.get(s1, {})
77 for (symbol, targets) in map.items():
78 if s2 in targets: yield symbol
79
81 "The alphabet of the FSA."
82 return self._sigma
83 alphabet = sigma
84
86 "Check whether a given object is in the alphabet."
87 if label and label not in self._sigma:
88 raise ValueError('Label "%s" not in alphabet: %s' % (label, str(self._sigma)))
89
91 "The number of states in the FSA."
92 return len(self._transitions)
93
95 """
96 Add a new state to the FSA.
97 @returns: the ID of the new state (a sequentially-assigned number).
98 @rtype: int
99 """
100 while self._next_state_num in self._transitions:
101 self._next_state_num += 1
102 self._transitions[self._next_state_num] = {}
103 self._reverse[self._next_state_num] = {}
104 return self._next_state_num
105
107 self._transitions[name] = {}
108 self._reverse[name] = {}
109 return name
110
112 """
113 @returns: the ID of the FSA's start state.
114 """
115 return self._start
116
118 """
119 @returns: the IDs of all accept states.
120 @rtype: set
121 """
122
123 return self._finals
124
126 """
127 @returns: a list of all states in the FSA.
128 @rtype: list
129 """
130 return self._transitions.keys()
131
133 """
134 Make a state into an accept state.
135 """
136 self._finals.add(state)
137
139 """
140 Make an accept state no longer be an accept state.
141 """
142 self._finals = self._finals.difference(set([state]))
143
144
146 """
147 Set the list of accept states.
148 """
149 self._finals = set(states)
150
152 """
153 Set the start state of the FSA.
154 """
155 self._start = start
156
158 """
159 Check whether a sequence contains any final states.
160 """
161 return [state for state in list
162 if state in self.finals()] != []
163
170
171 - def insert(self, s1, label, s2):
172 """
173 Add a new transition to the FSA.
174
175 @param s1: the source of the transition
176 @param label: the element of the alphabet that labels the transition
177 @param s2: the destination of the transition
178 """
179 if s1 not in self.states():
180 raise ValueError, "State %s does not exist" % s1
181 if s2 not in self.states():
182 raise ValueError, "State %s does not exist" % s1
183 self._add_transition(self._transitions, s1, label, s2)
184 self._add_transition(self._reverse, s2, label, s1)
185
190
196
197 - def delete(self, s1, label, s2):
198 """
199 Removes a transition from the FSA.
200
201 @param s1: the source of the transition
202 @param label: the element of the alphabet that labels the transition
203 @param s2: the destination of the transition
204 """
205 if s1 not in self.states():
206 raise ValueError, "State %s does not exist" % s1
207 if s2 not in self.states():
208 raise ValueError, "State %s does not exist" % s1
209 self._del_transition(self._transitions, s1, label, s2)
210 self._del_transition(self._reverse, s2, label, s1)
211
213 "Removes a state and all its transitions from the FSA."
214 if state not in self.states():
215 raise ValueError, "State %s does not exist" % state
216 for (s1, label, s2) in self.incident_transitions(state):
217 self.delete(s1, label, s2)
218 del self._transitions[state]
219 del self._reverse[state]
220
222 """
223 @returns: a set of transitions into or out of a state.
224 @rtype: set
225 """
226 result = set()
227 forward = self._transitions[state]
228 backward = self._reverse[state]
229 for label, targets in forward.items():
230 for target in targets:
231 result.add((state, label, target))
232 for label, targets in backward.items():
233 for target in targets:
234 result.add((target, label, state))
235 return result
236
238 """
239 Assigns a state a new identifier.
240 """
241 if old not in self.states():
242 raise ValueError, "State %s does not exist" % old
243 if new in self.states():
244 raise ValueError, "State %s already exists" % new
245 changes = []
246 for (s1, symbol, s2) in self.generate_transitions():
247 if s1 == old and s2 == old:
248 changes.append((s1, symbol, s2, new, symbol, new))
249 elif s1 == old:
250 changes.append((s1, symbol, s2, new, symbol, s2))
251 elif s2 == old:
252 changes.append((s1, symbol, s2, s1, symbol, new))
253 for (leftstate, symbol, rightstate, newleft, newsym, newright)\
254 in changes:
255 self.remove(leftstate, symbol, rightstate)
256 self.insert(newleft, newsym, newright)
257 del self._transitions[old]
258 del self._reverse[old]
259
260 - def next(self, state, symbol):
261 "The set of states reached from a certain state via a given symbol."
262 return self.e_closure(self._transitions[state].get(symbol, set()))
263 nextStates = next
264
265 - def move(self, states, symbol):
266 "The set of states reached from a set of states via a given symbol."
267 result = set()
268 for state in states:
269 result = result.union(self.next(state, symbol))
270 return self.e_closure(result)
271
273 """
274 Return whether this is a DFA
275 (every symbol leads from a state to at most one target state).
276 """
277 for map in self._transitions.values():
278 for targets in map.values():
279 if len(targets) > 1: return False
280 return True
281
283 """
284 The single state reached from a state via a given symbol.
285 If there is more than one such state, raises a ValueError.
286 If there is no such state, returns None.
287 """
288 next = self.next(state, symbol)
289 if len(next) > 1:
290 raise ValueError, "This FSA is nondeterministic -- use nextStates instead."
291 elif len(next) == 1: return list(next)[0]
292 else: return None
293
295 "All states reachable by following transitions from a given state."
296 result = set()
297 for (symbol, targets) in self._transitions[state].items():
298 result = result.union(targets)
299 return result
300
302 """All states from which a given state is reachable by following
303 transitions."""
304 result = set()
305 for (symbol, targets) in self._reverse[state].items():
306 result = result.union(targets)
307 return result
308
315
322
323
325 """
326 Modifies an FSA to remove inaccessible states and unused transitions.
327 """
328 acc = self.accessible()
329 for state in self.states():
330 if state not in acc:
331 self.delete_state(state)
332 else:
333 self._clean_map(self._transitions[state])
334 self._clean_map(self._reverse[state])
335
340
341
343 acc = set()
344 for final in self.finals():
345 reverse_acc = set([final])
346 self._reverse_accessible(final, reverse_acc)
347 acc = acc.union(reverse_acc)
348
349 forward_acc = set([self.start()])
350 self._forward_accessible(self.start(), forward_acc)
351
352 acc = acc.intersection(forward_acc)
353 return acc
354
356 """
357 Given a set of states, return the set of states reachable from
358 those states by following epsilon transitions.
359
360 @param states: the initial set of states
361 @type states: sequence
362 @returns: a superset of the given states, reachable by epsilon
363 transitions
364 @rtype: set
365 """
366 stack = list(states)
367 closure = list(states)
368 while stack:
369 s1 = stack.pop()
370 for s2 in self.next(s1, epsilon):
371 if s2 not in closure:
372 closure.append(s2)
373 stack.append(s2)
374 return set(closure)
375
376
377
379 "Return a DFA that is equivalent to this FSA."
380 dfa = FSA(self.sigma())
381 dfa_initial = dfa.start()
382 nfa_initial = tuple(self.e_closure((self.start(),)))
383 map = {}
384 map[dfa_initial] = nfa_initial
385 map[nfa_initial] = dfa_initial
386 if nfa_initial in self.finals():
387 dfa.add_final(dfa_initial)
388 unmarked = [dfa_initial]
389 marked = []
390 while unmarked:
391 dfa_state = unmarked.pop()
392 marked.append(dfa_state)
393
394 if self.in_finals(self.e_closure(map[dfa_state])):
395 dfa.add_final(dfa_state)
396 for label in self.sigma():
397 nfa_next = tuple(self.e_closure(self.move(map[dfa_state],
398 label)))
399 if map.has_key(nfa_next):
400 dfa_next = map[nfa_next]
401 else:
402 dfa_next = dfa.new_state()
403 map[dfa_next] = nfa_next
404 map[nfa_next] = dfa_next
405 if self.in_finals(nfa_next):
406 dfa.add_final(dfa_next)
407 unmarked.append(dfa_next)
408 dfa.insert(dfa_state, label, dfa_next)
409 return dfa
410
411 - def generate(self, maxlen, state=0, prefix=""):
412 "Generate all accepting sequences of length at most maxlen."
413 if maxlen > 0:
414 if state in self._finals:
415 print prefix
416 for (s1, labels, s2) in self.outgoing_transitions(state):
417 for label in labels():
418 self.generate(maxlen-1, s2, prefix+label)
419
421 """
422 Print a representation of this FSA (in human-readable YAML format).
423 """
424 print yaml.dump(self)
425
426 @classmethod
428 map = loader.construct_mapping(node)
429 result = cls(map.get('sigma', []), {}, map.get('finals', []))
430 for (s1, map1) in map['transitions'].items():
431 for (symbol, targets) in map1.items():
432 for s2 in targets:
433 result.insert(s1, symbol, s2)
434 return result
435
436 @classmethod
452
454 return yaml.dump(self)
455
456
457
458
459
460
461
463 (S, Expr, Star, Plus, Qmk, Paren) = [cfg.Nonterminal(s) for s in 'SE*+?(']
464 rules = [pcfg.WeightedProduction(Expr, [Star], prob=0.2),
465 pcfg.WeightedProduction(Expr, [Plus], prob=0.2),
466 pcfg.WeightedProduction(Expr, [Qmk], prob=0.2),
467 pcfg.WeightedProduction(Expr, [Paren], prob=0.2),
468 pcfg.WeightedProduction(S, [Expr], prob=0.5),
469 pcfg.WeightedProduction(S, [S, Expr], prob=0.5),
470 pcfg.WeightedProduction(Star, [Expr, '*'], prob=1),
471 pcfg.WeightedProduction(Plus, [Expr, '+'], prob=1),
472 pcfg.WeightedProduction(Qmk, [Expr, '?'], prob=1),
473 pcfg.WeightedProduction(Paren, ['(', S, ')'], prob=1)]
474
475 prob_term = 0.2/len(terminals)
476 for terminal in terminals:
477 rules.append(pcfg.WeightedProduction(Expr, [terminal], prob=prob_term))
478
479 return pcfg.WeightedGrammar(S, rules)
480
481 _parser = pchart.InsideParse(grammar('abcde'))
482
483
484
485
492
493
495
496 if not isinstance(tree, Tree):
497 return re2nfa_char(fsa, node, tree)
498 elif len(tree) == 1:
499 return re2nfa_build(fsa, node, tree[0])
500 elif tree.node == '(':
501 return re2nfa_build(fsa, node, tree[1])
502 elif tree.node == '*': return re2nfa_star(fsa, node, tree[0])
503 elif tree.node == '+': return re2nfa_plus(fsa, node, tree[0])
504 elif tree.node == '?': return re2nfa_qmk(fsa, node, tree[0])
505 else:
506 node = re2nfa_build(fsa, node, tree[0])
507 return re2nfa_build(fsa, node, tree[1])
508
513
522
527
529 node1 = fsa.new_state()
530 node2 = re2nfa_build(fsa, node1, tree)
531 node3 = fsa.new_state()
532 fsa.insert(node, epsilon, node1)
533 fsa.insert(node, epsilon, node3)
534 fsa.insert(node2, epsilon, node1)
535 fsa.insert(node2, epsilon, node3)
536 return node3
537
538
539
540
541
543 """
544 A demonstration showing how FSAs can be created and used.
545 """
546
547 alphabet = "abcd"
548
549
550 fsa = FSA(alphabet)
551
552
553 re = 'abcd'
554 print 'Regular Expression:', re
555 re2nfa(fsa, re)
556 print "NFA:"
557 fsa.pp()
558
559
560 dfa = fsa.dfa()
561 print "DFA:"
562 dfa.pp()
563
564
565 dfa.prune()
566 print "PRUNED DFA:"
567 dfa.pp()
568
569
570
571
572
573 if __name__ == '__main__': demo()
574