1
2
3
4
5
6
7
8
9
10
11
12 from __init__ import *
13 from tree import Tree
14 import cfg
15
16 """
17 Data classes and parser implementations for \"chart parsers\", which
18 use dynamic programming to efficiently parse a text. A X{chart
19 parser} derives parse trees for a text by iteratively adding \"edges\"
20 to a \"chart.\" Each X{edge} represents a hypothesis about the tree
21 structure for a subsequence of the text. The X{chart} is a
22 \"blackboard\" for composing and combining these hypotheses.
23
24 When a chart parser begins parsing a text, it creates a new (empty)
25 chart, spanning the text. It then incrementally adds new edges to the
26 chart. A set of X{chart rules} specifies the conditions under which
27 new edges should be added to the chart. Once the chart reaches a
28 stage where none of the chart rules adds any new edges, parsing is
29 complete.
30
31 Charts are encoded with the L{Chart} class, and edges are encoded with
32 the L{TreeEdge} and L{LeafEdge} classes. The chart parser module
33 defines three chart parsers:
34
35 - C{ChartParse} is a simple and flexible chart parser. Given a
36 set of chart rules, it will apply those rules to the chart until
37 no more edges are added.
38
39 - C{SteppingChartParse} is a subclass of C{ChartParse} that can
40 be used to step through the parsing process.
41
42 - C{EarleyChartParse} is an implementation of the Earley chart parsing
43 algorithm. It makes a single left-to-right pass through the
44 chart, and applies one of three rules (predictor, scanner, and
45 completer) to each edge it encounters.
46 """
47
48 import re
49
50
51
52
53
55 """
56 A hypothesis about the structure of part of a sentence.
57 Each edge records the fact that a structure is (partially)
58 consistent with the sentence. An edge contains:
59
60 - A X{span}, indicating what part of the sentence is
61 consistent with the hypothesized structure.
62
63 - A X{left-hand side}, specifying what kind of structure is
64 hypothesized.
65
66 - A X{right-hand side}, specifying the contents of the
67 hypothesized structure.
68
69 - A X{dot position}, indicating how much of the hypothesized
70 structure is consistent with the sentence.
71
72 Every edge is either X{complete} or X{incomplete}:
73
74 - An edge is X{complete} if its structure is fully consistent
75 with the sentence.
76
77 - An edge is X{incomplete} if its structure is partially
78 consistent with the sentence. For every incomplete edge, the
79 span specifies a possible prefix for the edge's structure.
80
81 There are two kinds of edge:
82
83 - C{TreeEdges<TreeEdge>} record which trees have been found to
84 be (partially) consistent with the text.
85
86 - C{LeafEdges<leafEdge>} record the tokens occur in the text.
87
88 The C{EdgeI} interface provides a common interface to both types
89 of edge, allowing chart parsers to treat them in a uniform manner.
90 """
92 if self.__class__ == EdgeI:
93 raise TypeError('Edge is an abstract interface')
94
95
96
97
98
100 """
101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
102 portion of the sentence that is consistent with this
103 edge's structure.
104 @rtype: C{(int, int)}
105 """
106 raise AssertionError('EdgeI is an abstract interface')
107
109 """
110 @return: The start index of this edge's span.
111 @rtype: C{int}
112 """
113 raise AssertionError('EdgeI is an abstract interface')
114
116 """
117 @return: The end index of this edge's span.
118 @rtype: C{int}
119 """
120 raise AssertionError('EdgeI is an abstract interface')
121
123 """
124 @return: The length of this edge's span.
125 @rtype: C{int}
126 """
127 raise AssertionError('EdgeI is an abstract interface')
128
129
130
131
132
134 """
135 @return: This edge's left-hand side, which specifies what kind
136 of structure is hypothesized by this edge.
137 @see: L{TreeEdge} and L{LeafEdge} for a description of
138 the left-hand side values for each edge type.
139 """
140 raise AssertionError('EdgeI is an abstract interface')
141
142
143
144
145
147 """
148 @return: This edge's right-hand side, which specifies
149 the content of the structure hypothesized by this
150 edge.
151 @see: L{TreeEdge} and L{LeafEdge} for a description of
152 the right-hand side values for each edge type.
153 """
154 raise AssertionError('EdgeI is an abstract interface')
155
157 """
158 @return: This edge's dot position, which indicates how much of
159 the hypothesized structure is consistent with the
160 sentence. In particular, C{self.rhs[:dot]} is consistent
161 with C{subtoks[self.start():self.end()]}.
162 @rtype: C{int}
163 """
164 raise AssertionError('EdgeI is an abstract interface')
165
167 """
168 @return: The element of this edge's right-hand side that
169 immediately follows its dot.
170 @rtype: C{Nonterminal} or X{terminal} or C{None}
171 """
172 raise AssertionError('EdgeI is an abstract interface')
173
175 """
176 @return: True if this edge's structure is fully consistent
177 with the text.
178 @rtype: C{boolean}
179 """
180 raise AssertionError('EdgeI is an abstract interface')
181
183 """
184 @return: True if this edge's structure is partially consistent
185 with the text.
186 @rtype: C{boolean}
187 """
188 raise AssertionError('EdgeI is an abstract interface')
189
190
191
192
195
198
200 """
201 An edge that records the fact that a tree is (partially)
202 consistent with the sentence. A tree edge consists of:
203
204 - A X{span}, indicating what part of the sentence is
205 consistent with the hypothesized tree.
206
207 - A X{left-hand side}, specifying the hypothesized tree's node
208 value.
209
210 - A X{right-hand side}, specifying the hypothesized tree's
211 children. Each element of the right-hand side is either a
212 terminal, specifying a token with that terminal as its leaf
213 value; or a nonterminal, specifying a subtree with that
214 nonterminal's symbol as its node value.
215
216 - A X{dot position}, indicating which children are consistent
217 with part of the sentence. In particular, if C{dot} is the
218 dot position, C{rhs} is the right-hand size, C{(start,end)}
219 is the span, and C{sentence} is the list of subtokens in the
220 sentence, then C{subtokens[start:end]} can be spanned by the
221 children specified by C{rhs[:dot]}.
222
223 For more information about edges, see the L{EdgeI} interface.
224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """
227 Construct a new C{TreeEdge}.
228
229 @type span: C{(int, int)}
230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
231 portion of the sentence that is consistent with the new
232 edge's structure.
233 @type lhs: L{Nonterminal}
234 @param lhs: The new edge's left-hand side, specifying the
235 hypothesized tree's node value.
236 @type rhs: C{list} of (L{Nonterminal} and C{string})
237 @param rhs: The new edge's right-hand side, specifying the
238 hypothesized tree's children.
239 @type dot: C{int}
240 @param dot: The position of the new edge's dot. This position
241 specifies what prefix of the production's right hand side
242 is consistent with the text. In particular, if
243 C{sentence} is the list of subtokens in the sentence, then
244 C{subtokens[span[0]:span[1]]} can be spanned by the
245 children specified by C{rhs[:dot]}.
246 """
247 self._lhs = lhs
248 self._rhs = tuple(rhs)
249 self._span = span
250 self._dot = dot
251
252
254 """
255 @return: A new C{TreeEdge} formed from the given production.
256 The new edge's left-hand side and right-hand side will
257 be taken from C{production}; its span will be C{(index,
258 index)}; and its dot position will be C{0}.
259 @rtype: L{TreeEdge}
260 """
261 return TreeEdge(span=(index, index), lhs=production.lhs(),
262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production)
264
265
266 - def lhs(self): return self._lhs
267 - def span(self): return self._span
268 - def start(self): return self._span[0]
269 - def end(self): return self._span[1]
270 - def length(self): return self._span[1] - self._span[0]
271 - def rhs(self): return self._rhs
272 - def dot(self): return self._dot
273 - def is_complete(self): return self._dot == len(self._rhs)
276 if self._dot >= len(self._rhs): return None
277 else: return self._rhs[self._dot]
278
279
281 if self.__class__ != other.__class__: return -1
282 return cmp((self._span, self.lhs(), self.rhs(), self._dot),
283 (other._span, other.lhs(), other.rhs(), other._dot))
285 return hash((self.lhs(), self.rhs(), self._span, self._dot))
286
287
289 str = '[%s:%s] ' % (self._span[0], self._span[1])
290 str += '%-2s ->' % (self._lhs.symbol(),)
291
292 for i in range(len(self._rhs)):
293 if i == self._dot: str += ' *'
294 if isinstance(self._rhs[i], cfg.Nonterminal):
295 str += ' %s' % (self._rhs[i].symbol(),)
296 else:
297 str += ' %r' % (self._rhs[i],)
298 if len(self._rhs) == self._dot: str += ' *'
299 return str
300
302 return '[Edge: %s]' % self
303
305 """
306 An edge that records the fact that a leaf value is consistent with
307 a word in the sentence. A leaf edge consists of:
308
309 - An X{index}, indicating the position of the word.
310 - A X{leaf}, specifying the word's content.
311
312 A leaf edge's left-hand side is its leaf value, and its right hand
313 side is C{()}. Its span is C{[index, index+1]}, and its dot
314 position is C{0}.
315 """
317 """
318 Construct a new C{LeafEdge}.
319
320 @param leaf: The new edge's leaf value, specifying the word
321 that is recorded by this edge.
322 @param index: The new edge's index, specifying the position of
323 the word that is recorded by this edge.
324 """
325 self._leaf = leaf
326 self._index = index
327
328
329 - def lhs(self): return self._leaf
334 - def rhs(self): return ()
335 - def dot(self): return 0
338 - def next(self): return None
339
340
342 if not isinstance(other, LeafEdge): return -1
343 return cmp((self._index, self._leaf), (other._index, other._leaf))
345 return hash((self._index, self._leaf))
346
347
350 return '[Edge: %s]' % (self)
351
352
353
354
355
357 """
358 A blackboard for hypotheses about the syntactic constituents of a
359 sentence. A chart contains a set of edges, and each edge encodes
360 a single hypothesis about the structure of some portion of the
361 sentence.
362
363 The L{select} method can be used to select a specific collection
364 of edges. For example C{chart.select(is_complete=True, start=0)}
365 yields all complete edges whose start indices are 0. To ensure
366 the efficiency of these selection operations, C{Chart} dynamically
367 creates and maintains an index for each set of attributes that
368 have been selected on.
369
370 In order to reconstruct the trees that are represented by an edge,
371 the chart associates each edge with a set of child pointer lists.
372 A X{child pointer list} is a list of the edges that license an
373 edge's right-hand side.
374
375 @ivar _tokens: The sentence that the chart covers.
376 @ivar _num_leaves: The number of tokens.
377 @ivar _edges: A list of the edges in the chart
378 @ivar _edge_to_cpls: A dictionary mapping each edge to a set
379 of child pointer lists that are associated with that edge.
380 @ivar _indexes: A dictionary mapping tuples of edge attributes
381 to indices, where each index maps the corresponding edge
382 attribute values to lists of edges.
383 """
385 """
386 Construct a new empty chart.
387
388 @type tokens: L{list}
389 @param tokens: The sentence that this chart will be used to parse.
390 """
391
392 self._tokens = list(tokens)
393 self._num_leaves = len(self._tokens)
394
395
396 self._edges = []
397
398
399 self._edge_to_cpls = {}
400
401
402
403 self._indexes = {}
404
405
406
407
408
410 """
411 @return: The number of words in this chart's sentence.
412 @rtype: C{int}
413 """
414 return self._num_leaves
415
416 - def leaf(self, index):
417 """
418 @return: The leaf value of the word at the given index.
419 @rtype: C{string}
420 """
421 return self._tokens[index]
422
424 """
425 @return: A list of the leaf values of each word in the
426 chart's sentence.
427 @rtype: C{list} of C{string}
428 """
429 return self._tokens
430
431
432
433
434
436 """
437 @return: A list of all edges in this chart. New edges
438 that are added to the chart after the call to edges()
439 will I{not} be contained in this list.
440 @rtype: C{list} of L{EdgeI}
441 @see: L{iteredges}, L{select}
442 """
443 return self._edges[:]
444
446 """
447 @return: An iterator over the edges in this chart. Any
448 new edges that are added to the chart before the iterator
449 is exahusted will also be generated.
450 @rtype: C{iter} of L{EdgeI}
451 @see: L{edges}, L{select}
452 """
453 return iter(self._edges)
454
455
456 __iter__ = iteredges
457
459 """
460 @return: The number of edges contained in this chart.
461 @rtype: C{int}
462 """
463 return len(self._edge_to_cpls)
464
465 - def select(self, **restrictions):
466 """
467 @return: An iterator over the edges in this chart. Any
468 new edges that are added to the chart before the iterator
469 is exahusted will also be generated. C{restrictions}
470 can be used to restrict the set of edges that will be
471 generated.
472 @rtype: C{iter} of L{EdgeI}
473
474 @kwarg span: Only generate edges C{e} where C{e.span()==span}
475 @kwarg start: Only generate edges C{e} where C{e.start()==start}
476 @kwarg end: Only generate edges C{e} where C{e.end()==end}
477 @kwarg length: Only generate edges C{e} where C{e.length()==length}
478 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs}
479 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs}
480 @kwarg next: Only generate edges C{e} where C{e.next()==next}
481 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot}
482 @kwarg is_complete: Only generate edges C{e} where
483 C{e.is_complete()==is_complete}
484 @kwarg is_incomplete: Only generate edges C{e} where
485 C{e.is_incomplete()==is_incomplete}
486 """
487
488 if restrictions=={}: return iter(self._edges)
489
490
491 restr_keys = restrictions.keys()
492 restr_keys.sort()
493 restr_keys = tuple(restr_keys)
494
495
496 if not self._indexes.has_key(restr_keys):
497 self._add_index(restr_keys)
498
499 vals = [restrictions[k] for k in restr_keys]
500 return iter(self._indexes[restr_keys].get(tuple(vals), []))
501
503 """
504 A helper function for L{select}, which creates a new index for
505 a given set of attributes (aka restriction keys).
506 """
507
508 for k in restr_keys:
509 if not hasattr(EdgeI, k):
510 raise ValueError, 'Bad restriction: %s' % k
511
512
513 self._indexes[restr_keys] = {}
514
515
516 for edge in self._edges:
517 vals = [getattr(edge, k)() for k in restr_keys]
518 index = self._indexes[restr_keys]
519 index.setdefault(tuple(vals),[]).append(edge)
520
521
522
523
524
525 - def insert(self, edge, child_pointer_list):
526 """
527 Add a new edge to the chart.
528
529 @type edge: L{Edge}
530 @param edge: The new edge
531 @type child_pointer_list: C{tuple} of L{Edge}
532 @param child_pointer_list: A list of the edges that were used to
533 form this edge. This list is used to reconstruct the trees
534 (or partial trees) that are associated with C{edge}.
535 @rtype: C{bool}
536 @return: True if this operation modified the chart. In
537 particular, return true iff the chart did not already
538 contain C{edge}, or if it did not already associate
539 C{child_pointer_list} with C{edge}.
540 """
541
542 if not self._edge_to_cpls.has_key(edge):
543
544 self._edges.append(edge)
545
546
547 for (restr_keys, index) in self._indexes.items():
548 vals = [getattr(edge, k)() for k in restr_keys]
549 index = self._indexes[restr_keys]
550 index.setdefault(tuple(vals),[]).append(edge)
551
552
553 cpls = self._edge_to_cpls.setdefault(edge,{})
554 child_pointer_list = tuple(child_pointer_list)
555
556 if cpls.has_key(child_pointer_list):
557
558 return False
559 else:
560
561 cpls[child_pointer_list] = True
562 return True
563
564
565
566
567
569 """
570 @return: A list of the complete tree structures that span
571 the entire chart, and whose root node is C{root}.
572 """
573 trees = []
574 for edge in self.select(span=(0,self._num_leaves), lhs=root):
575 trees += self.trees(edge, tree_class=tree_class, complete=True)
576 return trees
577
578 - def trees(self, edge, tree_class=Tree, complete=False):
579 """
580 @return: A list of the tree structures that are associated
581 with C{edge}.
582
583 If C{edge} is incomplete, then the unexpanded children will be
584 encoded as childless subtrees, whose node value is the
585 corresponding terminal or nonterminal.
586
587 @rtype: C{list} of L{Tree}
588 @note: If two trees share a common subtree, then the same
589 C{Tree} may be used to encode that subtree in
590 both trees. If you need to eliminate this subtree
591 sharing, then create a deep copy of each tree.
592 """
593 return self._trees(edge, complete, memo={}, tree_class=tree_class)
594
595 - def _trees(self, edge, complete, memo, tree_class):
596 """
597 A helper function for L{trees}.
598 @param memo: A dictionary used to record the trees that we've
599 generated for each edge, so that when we see an edge more
600 than once, we can reuse the same trees.
601 """
602
603 if memo.has_key(edge): return memo[edge]
604
605 trees = []
606
607
608 if complete and edge.is_incomplete():
609 return trees
610
611
612
613
614
615
616
617 memo[edge] = []
618
619
620 if isinstance(edge, LeafEdge):
621 leaf = self._tokens[edge.start()]
622 memo[edge] = leaf
623 return [leaf]
624
625
626 for cpl in self.child_pointer_lists(edge):
627
628
629
630 child_choices = [self._trees(cp, complete, memo, tree_class)
631 for cp in cpl]
632
633
634 if len(child_choices) > 0 and type(child_choices[0]) == type(""):
635 child_choices = [child_choices]
636
637
638 for children in self._choose_children(child_choices):
639 lhs = edge.lhs().symbol()
640 trees.append(tree_class(lhs, children))
641
642
643 if edge.is_incomplete():
644 unexpanded = [tree_class(elt,[])
645 for elt in edge.rhs()[edge.dot():]]
646 for tree in trees:
647 tree.extend(unexpanded)
648
649
650 memo[edge] = trees
651
652
653 return trees
654
656 """
657 A helper function for L{_trees} that finds the possible sets
658 of subtrees for a new tree.
659
660 @param child_choices: A list that specifies the options for
661 each child. In particular, C{child_choices[i]} is a list of
662 tokens and subtrees that can be used as the C{i}th child.
663 """
664 children_lists = [[]]
665 for child_choice in child_choices:
666 children_lists = [child_list+[child]
667 for child in child_choice
668 for child_list in children_lists]
669 return children_lists
670
672 """
673 @rtype: C{list} of C{list} of C{Edge}
674 @return: The set of child pointer lists for the given edge.
675 Each child pointer list is a list of edges that have
676 been used to form this edge.
677 """
678
679 return self._edge_to_cpls.get(edge, {}).keys()
680
681
682
683
684 - def pp_edge(self, edge, width=None):
713
715 """
716 @return: A pretty-printed string representation of this
717 chart's leaves. This string can be used as a header
718 for calls to L{pp_edge}.
719 """
720 if width is None: width = 50/(self.num_leaves()+1)
721
722 if self._tokens is not None and width>1:
723 header = '|.'
724 for tok in self._tokens:
725 header += tok[:width-1].center(width-1)+'.'
726 header += '|'
727 else:
728 header = ''
729
730 return header
731
732 - def pp(self, width=None):
733 """
734 @return: A pretty-printed string representation of this chart.
735 @rtype: C{string}
736 @param width: The number of characters allotted to each
737 index in the sentence.
738 """
739 if width is None: width = 50/(self.num_leaves()+1)
740
741
742 edges = [(e.length(), e.start(), e) for e in self]
743 edges.sort()
744 edges = [e for (_,_,e) in edges]
745
746 return (self.pp_leaves(width) + '\n' +
747 '\n'.join(self.pp_edge(edge, width) for edge in edges))
748
749
750
751
752
754
755 s = 'digraph nltk_chart {\n'
756
757 s += ' rankdir=LR;\n'
758 s += ' node [height=0.1,width=0.1];\n'
759 s += ' node [style=filled, color="lightgray"];\n'
760
761
762 for y in range(self.num_edges(), -1, -1):
763 if y == 0:
764 s += ' node [style=filled, color="black"];\n'
765 for x in range(self.num_leaves()+1):
766 if y == 0 or (x <= self._edges[y-1].start() or
767 x >= self._edges[y-1].end()):
768 s += ' %04d.%04d [label=""];\n' % (x,y)
769
770
771 s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
772
773
774 for x in range(self.num_leaves()+1):
775 s += ' {rank=same;'
776 for y in range(self.num_edges()+1):
777 if y == 0 or (x <= self._edges[y-1].start() or
778 x >= self._edges[y-1].end()):
779 s += ' %04d.%04d' % (x,y)
780 s += '}\n'
781
782
783 s += ' edge [style=invis, weight=100];\n'
784 s += ' node [shape=plaintext]\n'
785 s += ' 0000.0000'
786 for x in range(self.num_leaves()):
787 s += '->%s->%04d.0000' % (self.leaf(x), x+1)
788 s += ';\n\n'
789
790
791 s += ' edge [style=solid, weight=1];\n'
792 for y, edge in enumerate(self):
793 for x in range(edge.start()):
794 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
795 (x, y+1, x+1, y+1))
796 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
797 (edge.start(), y+1, edge.end(), y+1, edge))
798 for x in range(edge.end(), self.num_leaves()):
799 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
800 (x, y+1, x+1, y+1))
801 s += '}\n'
802 return s
803
804
805
806
807
809 """
810 A rule that specifies what new edges are licensed by any given set
811 of existing edges. Each chart rule expects a fixed number of
812 edges, as indicated by the class variable L{NUM_EDGES}. In
813 particular:
814
815 - A chart rule with C{NUM_EDGES=0} specifies what new edges are
816 licensed, regardless of existing edges.
817
818 - A chart rule with C{NUM_EDGES=1} specifies what new edges are
819 licensed by a single existing edge.
820
821 - A chart rule with C{NUM_EDGES=2} specifies what new edges are
822 licensed by a pair of existing edges.
823
824 @type NUM_EDGES: C{int}
825 @cvar NUM_EDGES: The number of existing edges that this rule uses
826 to license new edges. Typically, this number ranges from zero
827 to two.
828 """
829 - def apply(self, chart, grammar, *edges):
830 """
831 Add the edges licensed by this rule and the given edges to the
832 chart.
833
834 @type edges: C{list} of L{EdgeI}
835 @param edges: A set of existing edges. The number of edges
836 that should be passed to C{apply} is specified by the
837 L{NUM_EDGES} class variable.
838 @rtype: C{list} of L{EdgeI}
839 @return: A list of the edges that were added.
840 """
841 raise AssertionError, 'ChartRuleI is an abstract interface'
842
844 """
845 @return: A generator that will add edges licensed by this rule
846 and the given edges to the chart, one at a time. Each
847 time the generator is resumed, it will either add a new
848 edge and yield that edge; or return.
849 @rtype: C{iter} of L{EdgeI}
850
851 @type edges: C{list} of L{EdgeI}
852 @param edges: A set of existing edges. The number of edges
853 that should be passed to C{apply} is specified by the
854 L{NUM_EDGES} class variable.
855 """
856 raise AssertionError, 'ChartRuleI is an abstract interface'
857
859 """
860 Add all the edges licensed by this rule and the edges in the
861 chart to the chart.
862
863 @rtype: C{list} of L{EdgeI}
864 @return: A list of the edges that were added.
865 """
866 raise AssertionError, 'ChartRuleI is an abstract interface'
867
869 """
870 @return: A generator that will add all edges licensed by
871 this rule, given the edges that are currently in the
872 chart, one at a time. Each time the generator is resumed,
873 it will either add a new edge and yield that edge; or
874 return.
875 @rtype: C{iter} of L{EdgeI}
876 """
877 raise AssertionError, 'ChartRuleI is an abstract interface'
878
880 """
881 An abstract base class for chart rules. C{AbstractChartRule}
882 provides:
883 - A default implementation for C{apply}, based on C{apply_iter}.
884 - A default implementation for C{apply_everywhere_iter},
885 based on C{apply_iter}.
886 - A default implementation for C{apply_everywhere}, based on
887 C{apply_everywhere_iter}. Currently, this implementation
888 assumes that C{NUM_EDGES}<=3.
889 - A default implementation for C{__str__}, which returns a
890 name basd on the rule's class name.
891 """
892
893
896
897
898
900 if self.NUM_EDGES == 0:
901 for new_edge in self.apply_iter(chart, grammar):
902 yield new_edge
903
904 elif self.NUM_EDGES == 1:
905 for e1 in chart:
906 for new_edge in self.apply_iter(chart, grammar, e1):
907 yield new_edge
908
909 elif self.NUM_EDGES == 2:
910 for e1 in chart:
911 for e2 in chart:
912 for new_edge in self.apply_iter(chart, grammar, e1, e2):
913 yield new_edge
914
915 elif self.NUM_EDGES == 3:
916 for e1 in chart:
917 for e2 in chart:
918 for e3 in chart:
919 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3):
920 yield new_edge
921
922 else:
923 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
924
925
926 - def apply(self, chart, grammar, *edges):
928
929
932
933
935
936 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
937
938
939
940
942 """
943 A rule that joins two adjacent edges to form a single combined
944 edge. In particular, this rule specifies that any pair of edges:
945
946 - [AS{->}S{alpha}*BS{beta}][i:j]
947 - [BS{->}S{gamma}*][j:k]
948 licenses the edge:
949 - [AS{->}S{alpha}B*S{beta}][i:j]
950 """
951 NUM_EDGES = 2
952 - def apply_iter(self, chart, grammar, left_edge, right_edge):
972
974 """
975 A rule that joins a given edge with adjacent edges in the chart,
976 to form combined edges. In particular, this rule specifies that
977 either of the edges:
978 - [AS{->}S{alpha}*BS{beta}][i:j]
979 - [BS{->}S{gamma}*][j:k]
980 licenses the edge:
981 - [AS{->}S{alpha}B*S{beta}][i:j]
982 if the other edge is already in the chart.
983 @note: This is basically L{FundamentalRule}, with one edge is left
984 unspecified.
985 """
986 NUM_EDGES = 1
987
988 _fundamental_rule = FundamentalRule()
989
991 fr = self._fundamental_rule
992 if edge1.is_incomplete():
993
994 for edge2 in chart.select(start=edge1.end(), is_complete=True,
995 lhs=edge1.next()):
996 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
997 yield new_edge
998 else:
999
1000 for edge2 in chart.select(end=edge1.start(), is_complete=False,
1001 next=edge1.lhs()):
1002 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
1003 yield new_edge
1004
1005 - def __str__(self): return 'Fundamental Rule'
1006
1008 """
1009 A rule licensing any edges corresponding to terminals in the
1010 text. In particular, this rule licenses the leaf edge:
1011 - [wS{->}*][i:i+1]
1012 for C{w} is a word in the text, where C{i} is C{w}'s index.
1013 """
1014 NUM_EDGES = 0
1020
1021
1022
1023
1024
1025
1027 """
1028 A rule licensing edges corresponding to the grammar productions for
1029 the grammar's start symbol. In particular, this rule specifies that:
1030 - [SS{->}*S{alpha}][0:i]
1031 is licensed for each grammar production C{SS{->}S{alpha}}, where
1032 C{S} is the grammar's start symbol.
1033 """
1034 NUM_EDGES = 0
1040
1042 """
1043 A rule licensing edges corresponding to the grammar productions
1044 for the nonterminal following an incomplete edge's dot. In
1045 particular, this rule specifies that:
1046 - [AS{->}S{alpha}*BS{beta}][i:j]
1047 licenses the edge:
1048 - [BS{->}*S{gamma}][j:j]
1049 for each grammar production C{BS{->}S{gamma}}.
1050 """
1051 NUM_EDGES = 1
1058
1060 """
1061 A rule licensing an edge corresponding to a terminal following an
1062 incomplete edge's dot. In particular, this rule specifies that:
1063 - [AS{->}S{alpha}*w{beta}][i:j]
1064 licenses the leaf edge:
1065 - [wS{->}*][j:j+1]
1066 if the C{j}th word in the text is C{w}.
1067 """
1068 NUM_EDGES = 1
1077
1078
1080 """
1081 A cached version of L{TopDownInitRule}. After the first time this
1082 rule is applied, it will not generate any more edges.
1083
1084 If C{chart} or C{grammar} are changed, then the cache is flushed.
1085 """
1089
1101
1102 - def __str__(self): return 'Top Down Init Rule'
1103
1105 """
1106 A cached version of L{TopDownExpandRule}. After the first time
1107 this rule is applied to an edge with a given C{end} and C{next},
1108 it will not generate any more edges for edges with that C{end} and
1109 C{next}.
1110
1111 If C{chart} or C{grammar} are changed, then the cache is flushed.
1112 """
1116
1130
1131 - def __str__(self): return 'Top Down Expand Rule'
1132
1133
1134
1135
1136
1138 """
1139 A rule licensing any edges corresponding to terminals in the
1140 text. In particular, this rule licenses the leaf edge:
1141 - [wS{->}*][i:i+1]
1142 for C{w} is a word in the text, where C{i} is C{w}'s index.
1143 """
1144 NUM_EDGES = 0
1150
1152 """
1153 A rule licensing any edge corresponding to a production whose
1154 right-hand side begins with a complete edge's left-hand side. In
1155 particular, this rule specifies that:
1156 - [AS{->}S{alpha}*]
1157 licenses the edge:
1158 - [BS{->}*AS{beta}]
1159 for each grammar production C{BS{->}AS{beta}}
1160 """
1161 NUM_EDGES = 1
1168
1169
1170
1171
1172
1174 """
1175 A rule that joins a given complete edge with adjacent incomplete
1176 edges in the chart, to form combined edges. In particular, this
1177 rule specifies that:
1178 - [BS{->}S{gamma}*][j:k]
1179 licenses the edge:
1180 - [AS{->}S{alpha}B*S{beta}][i:j]
1181 given that the chart contains:
1182 - [AS{->}S{alpha}*BS{beta}][i:j]
1183 @note: This is basically L{FundamentalRule}, with the left edge
1184 left unspecified.
1185 """
1186 NUM_EDGES = 1
1187
1188 _fundamental_rule = FundamentalRule()
1189
1190 - def apply_iter(self, chart, grammar, right_edge):
1198
1199 - def __str__(self): return 'Completer Rule'
1200
1202 """
1203 A rule licensing a leaf edge corresponding to a part-of-speech
1204 terminal following an incomplete edge's dot. In particular, this
1205 rule specifies that:
1206 - [AS{->}S{alpha}*PS{beta}][i:j]
1207 licenses the edges:
1208 - [PS{->}w*][j:j+1]
1209 - [wS{->}*][j:j+1]
1210 if the C{j}th word in the text is C{w}; and C{P} is a valid part
1211 of speech for C{w}.
1212 """
1213 NUM_EDGES = 1
1214 - def __init__(self, word_to_pos_lexicon):
1215 self._word_to_pos = word_to_pos_lexicon
1216
1229
1230
1232
1233
1234
1235
1236
1238 """
1239 A chart parser implementing the Earley parsing algorithm:
1240
1241 - For each index I{end} in [0, 1, ..., N]:
1242 - For each I{edge} s.t. I{edge}.end = I{end}:
1243 - If I{edge} is incomplete, and I{edge}.next is not a part
1244 of speech:
1245 - Apply PredictorRule to I{edge}
1246 - If I{edge} is incomplete, and I{edge}.next is a part of
1247 speech:
1248 - Apply ScannerRule to I{edge}
1249 - If I{edge} is complete:
1250 - Apply CompleterRule to I{edge}
1251 - Return any complete parses in the chart
1252
1253 C{EarleyChartParse} uses a X{lexicon} to decide whether a leaf
1254 has a given part of speech. This lexicon is encoded as a
1255 dictionary that maps each word to a list of parts of speech that
1256 word can have.
1257 """
1258 - def __init__(self, grammar, lexicon, trace=0):
1259 """
1260 Create a new Earley chart parser, that uses C{grammar} to
1261 parse texts.
1262
1263 @type grammar: C{cfg.Grammar}
1264 @param grammar: The grammar used to parse texts.
1265 @type lexicon: C{dict} from C{string} to (C{list} of C{string})
1266 @param lexicon: A lexicon of words that records the parts of
1267 speech that each word can have. Each key is a word, and
1268 the corresponding value is a list of parts of speech.
1269 @type trace: C{int}
1270 @param trace: The level of tracing that should be used when
1271 parsing a text. C{0} will generate no tracing output;
1272 and higher numbers will produce more verbose tracing
1273 output.
1274 """
1275 self._grammar = grammar
1276 self._lexicon = lexicon
1277 self._trace = trace
1278 AbstractParse.__init__(self)
1279
1281 chart = Chart(list(tokens))
1282 grammar = self._grammar
1283
1284
1285 w = 50/(chart.num_leaves()+1)
1286 if self._trace > 0: print ' ', chart.pp_leaves(w)
1287
1288
1289 root = cfg.Nonterminal('[INIT]')
1290 edge = TreeEdge((0,0), root, (grammar.start(),))
1291 chart.insert(edge, ())
1292
1293
1294 predictor = PredictorRule()
1295 completer = CompleterRule()
1296 scanner = ScannerRule(self._lexicon)
1297
1298 for end in range(chart.num_leaves()+1):
1299 if self._trace > 1: print 'Processing queue %d' % end
1300 for edge in chart.select(end=end):
1301 if edge.is_incomplete():
1302 for e in predictor.apply(chart, grammar, edge):
1303 if self._trace > 0:
1304 print 'Predictor', chart.pp_edge(e,w)
1305 if edge.is_incomplete():
1306 for e in scanner.apply(chart, grammar, edge):
1307 if self._trace > 0:
1308 print 'Scanner ', chart.pp_edge(e,w)
1309 if edge.is_complete():
1310 for e in completer.apply(chart, grammar, edge):
1311 if self._trace > 0:
1312 print 'Completer', chart.pp_edge(e,w)
1313
1314
1315 return chart.parses(grammar.start(), tree_class=tree_class)
1316
1317
1318
1319
1320
1321 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(),
1322 TopDownMatchRule(), SingleEdgeFundamentalRule()]
1323 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(),
1324 SingleEdgeFundamentalRule()]
1325
1327 """
1328 A generic chart parser. A X{strategy}, or list of
1329 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to
1330 the chart. In particular, C{ChartParse} uses the following
1331 algorithm to parse texts:
1332
1333 - Until no new edges are added:
1334 - For each I{rule} in I{strategy}:
1335 - Apply I{rule} to any applicable edges in the chart.
1336 - Return any complete parses in the chart
1337 """
1338 - def __init__(self, grammar, strategy, trace=0):
1339 """
1340 Create a new chart parser, that uses C{grammar} to parse
1341 texts.
1342
1343 @type grammar: L{cfg.Grammar}
1344 @param grammar: The grammar used to parse texts.
1345 @type strategy: C{list} of L{ChartRuleI}
1346 @param strategy: A list of rules that should be used to decide
1347 what edges to add to the chart.
1348 @type trace: C{int}
1349 @param trace: The level of tracing that should be used when
1350 parsing a text. C{0} will generate no tracing output;
1351 and higher numbers will produce more verbose tracing
1352 output.
1353 """
1354 self._grammar = grammar
1355 self._strategy = strategy
1356 self._trace = trace
1357 AbstractParse.__init__(self)
1358
1360 chart = Chart(list(tokens))
1361 grammar = self._grammar
1362
1363
1364 w = 50/(chart.num_leaves()+1)
1365 if self._trace > 0: print chart.pp_leaves(w)
1366
1367 edges_added = 1
1368 while edges_added > 0:
1369 edges_added = 0
1370 for rule in self._strategy:
1371 edges_added_by_rule = 0
1372 for e in rule.apply_everywhere(chart, grammar):
1373 if self._trace > 0 and edges_added_by_rule == 0:
1374 print '%s:' % rule
1375 edges_added_by_rule += 1
1376 if self._trace > 1: print chart.pp_edge(e,w)
1377 if self._trace == 1 and edges_added_by_rule > 0:
1378 print ' - Added %d edges' % edges_added_by_rule
1379 edges_added += edges_added_by_rule
1380
1381
1382 return chart.parses(grammar.start(), tree_class=tree_class)
1383
1384
1385
1386
1387
1389 """
1390 A C{ChartParse} that allows you to step through the parsing
1391 process, adding a single edge at a time. It also allows you to
1392 change the parser's strategy or grammar midway through parsing a
1393 text.
1394
1395 The C{initialize} method is used to start parsing a text. C{step}
1396 adds a single edge to the chart. C{set_strategy} changes the
1397 strategy used by the chart parser. C{parses} returns the set of
1398 parses that has been found by the chart parser.
1399
1400 @ivar _restart: Records whether the parser's strategy, grammar,
1401 or chart has been changed. If so, then L{step} must restart
1402 the parsing algorithm.
1403 """
1404 - def __init__(self, grammar, strategy=None, trace=0):
1409
1410
1411
1412
1413
1415 "Begin parsing the given tokens."
1416 self._chart = Chart(list(tokens))
1417 self._restart = True
1418
1419
1420
1421
1422
1424 """
1425 @return: A generator that adds edges to the chart, one at a
1426 time. Each time the generator is resumed, it adds a single
1427 edge and yields that edge. If no more edges can be added,
1428 then it yields C{None}.
1429
1430 If the parser's strategy, grammar, or chart is changed, then
1431 the generator will continue adding edges using the new
1432 strategy, grammar, or chart.
1433
1434 Note that this generator never terminates, since the grammar
1435 or strategy might be changed to values that would add new
1436 edges. Instead, it yields C{None} when no more edges can be
1437 added with the current strategy and grammar.
1438 """
1439 if self._chart is None:
1440 raise ValueError, 'Parser must be initialized first'
1441 while 1:
1442 self._restart = False
1443 w = 50/(self._chart.num_leaves()+1)
1444
1445 for e in self._parse():
1446 if self._trace > 1: print self._current_chartrule
1447 if self._trace > 0: print self._chart.pp_edge(e,w)
1448 yield e
1449 if self._restart: break
1450 else:
1451 yield None
1452
1454 """
1455 A generator that implements the actual parsing algorithm.
1456 L{step} iterates through this generator, and restarts it
1457 whenever the parser's strategy, grammar, or chart is modified.
1458 """
1459 chart = self._chart
1460 grammar = self._grammar
1461 edges_added = 1
1462 while edges_added > 0:
1463 edges_added = 0
1464 for rule in self._strategy:
1465 self._current_chartrule = rule
1466 for e in rule.apply_everywhere_iter(chart, grammar):
1467 edges_added += 1
1468 yield e
1469
1470
1471
1472
1473
1475 "@return: The strategy used by this parser."
1476 return self._strategy
1477
1479 "@return: The grammar used by this parser."
1480 return self._grammar
1481
1483 "@return: The chart that is used by this parser."
1484 return self._chart
1485
1487 "@return: The chart rule used to generate the most recent edge."
1488 return self._current_chartrule
1489
1491 "@return: The parse trees currently contained in the chart."
1492 return self._chart.parses(self._grammar.start(), tree_class)
1493
1494
1495
1496
1497
1499 """
1500 Change the startegy that the parser uses to decide which edges
1501 to add to the chart.
1502 @type strategy: C{list} of L{ChartRuleI}
1503 @param strategy: A list of rules that should be used to decide
1504 what edges to add to the chart.
1505 """
1506 if strategy == self._strategy: return
1507 self._strategy = strategy[:]
1508 self._restart = True
1509
1511 "Change the grammar used by the parser."
1512 if grammar is self._grammar: return
1513 self._grammar = grammar
1514 self._restart = True
1515
1517 "Load a given chart into the chart parser."
1518 if chart is self._chart: return
1519 self._chart = chart
1520 self._restart = True
1521
1522
1523
1524
1525
1527
1528 self.initialize(token)
1529
1530
1531 for e in self.step():
1532 if e is None: break
1533
1534
1535 return self.parses(tree_class=tree_class)
1536
1537
1538
1539
1540
1542 """
1543 A demonstration of the chart parsers.
1544 """
1545 import sys, time
1546
1547
1548 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
1549 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')
1550
1551
1552 grammatical_productions = [
1553 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]),
1554 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]),
1555 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]),
1556 cfg.Production(VP, [V]),]
1557
1558
1559 lexical_productions = [
1560 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']),
1561 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']),
1562 cfg.Production(Det, ['a']),
1563 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']),
1564 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']),
1565 cfg.Production(P, ['with']), cfg.Production(P, ['under']),
1566 ]
1567
1568
1569 earley_lexicon = {}
1570 for prod in lexical_productions:
1571 earley_lexicon.setdefault(prod.rhs()[0], []).append(prod.lhs())
1572
1573
1574 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions)
1575
1576
1577 earley_grammar = cfg.Grammar(S, grammatical_productions)
1578
1579
1580 sent = 'I saw John with a dog with my cookie'
1581 print "Sentence:\n", sent
1582 from nltk_lite import tokenize
1583 tokens = list(tokenize.whitespace(sent))
1584
1585 print tokens
1586
1587
1588 print ' 1: Top-down chart parser'
1589 print ' 2: Bottom-up chart parser'
1590 print ' 3: Earley parser'
1591 print ' 4: Stepping chart parser (alternating top-down & bottom-up)'
1592 print ' 5: All parsers'
1593 print '\nWhich parser (1-5)? ',
1594 choice = sys.stdin.readline().strip()
1595 print
1596 if choice not in '12345':
1597 print 'Bad parser number'
1598 return
1599
1600
1601 times = {}
1602
1603
1604 if choice in ('1', '5'):
1605 cp = ChartParse(grammar, TD_STRATEGY, trace=2)
1606 t = time.time()
1607 parses = cp.get_parse_list(tokens)
1608 times['top down'] = time.time()-t
1609 assert len(parses)==5, 'Not all parses found'
1610 for tree in parses: print tree
1611
1612
1613 if choice in ('2', '5'):
1614 cp = ChartParse(grammar, BU_STRATEGY, trace=2)
1615 t = time.time()
1616 parses = cp.get_parse_list(tokens)
1617 times['bottom up'] = time.time()-t
1618 assert len(parses)==5, 'Not all parses found'
1619 for tree in parses: print tree
1620
1621
1622 if choice in ('3', '5'):
1623 cp = EarleyChartParse(earley_grammar, earley_lexicon, trace=1)
1624 t = time.time()
1625 parses = cp.get_parse_list(tokens)
1626 times['Earley parser'] = time.time()-t
1627 assert len(parses)==5, 'Not all parses found'
1628 for tree in parses: print tree
1629
1630
1631 if choice in ('4', '5'):
1632 t = time.time()
1633 cp = SteppingChartParse(grammar, trace=1)
1634 cp.initialize(tokens)
1635 for i in range(5):
1636 print '*** SWITCH TO TOP DOWN'
1637 cp.set_strategy(TD_STRATEGY)
1638 for j, e in enumerate(cp.step()):
1639 if j>20 or e is None: break
1640 print '*** SWITCH TO BOTTOM UP'
1641 cp.set_strategy(BU_STRATEGY)
1642 for j, e in enumerate(cp.step()):
1643 if j>20 or e is None: break
1644 times['stepping'] = time.time()-t
1645 assert len(cp.parses())==5, 'Not all parses found'
1646 for parse in cp.parses(): print parse
1647
1648
1649 maxlen = max(len(key) for key in times.keys())
1650 format = '%' + `maxlen` + 's parser: %6.3fsec'
1651 times_items = times.items()
1652 times_items.sort(lambda a,b:cmp(a[1], b[1]))
1653 for (parser, t) in times_items:
1654 print format % (parser, t)
1655
1656 if __name__ == '__main__': demo()
1657