
Source Code for Module nltk_lite.parse.viterbi

# Natural Language Toolkit: Viterbi Probabilistic Parser
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

import types
from nltk_lite.parse import *

##//////////////////////////////////////////////////////
##  Viterbi PCFG Parser
##//////////////////////////////////////////////////////

class ViterbiParse(AbstractParse):
    """
    A bottom-up C{PCFG} parser that uses dynamic programming to find
    the single most likely parse for a text.  The C{ViterbiParse} parser
    parses texts by filling in a X{most likely constituent table}.
    This table records the most probable tree representation for any
    given span and node value.  In particular, it has an entry for
    every start index, end index, and node value, recording the most
    likely subtree that spans from the start index to the end index,
    and has the given node value.

    The C{ViterbiParse} parser fills in this table incrementally.  It starts
    by filling in all entries for constituents that span one element
    of text (i.e., entries where the end index is one greater than the
    start index).  After it has filled in all table entries for
    constituents that span one element of text, it fills in the
    entries for constituents that span two elements of text.  It
    continues filling in the entries for constituents spanning larger
    and larger portions of the text, until the entire table has been
    filled.  Finally, it returns the table entry for a constituent
    spanning the entire text, whose node value is the grammar's start
    symbol.

    In order to find the most likely constituent with a given span and
    node value, the C{ViterbiParse} parser considers all productions that
    could produce that node value.  For each production, it finds all
    children that collectively cover the span and have the node values
    specified by the production's right hand side.  If the probability
    of the tree formed by applying the production to the children is
    greater than the probability of the current entry in the table,
    then the table is updated with this new tree.

    A pseudo-code description of the algorithm used by
    C{ViterbiParse} is:

      - Create an empty most likely constituent table, M{MLC}.
      - For M{width} in 1...len(M{text}):
        - For M{start} in 0...len(M{text})-M{width}:
          - For M{prod} in grammar.productions:
            - For each sequence of subtrees [M{t[1]}, M{t[2]}, ...,
              M{t[n]}] in M{MLC}, where M{t[i]}.node==M{prod}.rhs[i],
              and the sequence covers [M{start}:M{start}+M{width}]:
                - M{old_p} = M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
                - M{new_p} = P(M{t[1]})*P(M{t[2]})*...*P(M{t[n]})*P(M{prod})
                - if M{new_p} > M{old_p}:
                  - M{new_tree} = Tree(M{prod}.lhs, M{t[1]}, M{t[2]},
                    ..., M{t[n]})
                  - M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
                    = M{new_tree}
      - Return M{MLC}[0, len(M{text}), M{start_symbol}]

    @type _grammar: C{pcfg.Grammar}
    @ivar _grammar: The grammar used to parse sentences.
    @type _trace: C{int}
    @ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """
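    # Worked example of the table update described above (hypothetical
    # numbers, for illustration only): given a production VP -> V NP [0.5]
    # and child trees with P(t1)=0.2 and P(t2)=0.1, the candidate tree's
    # probability is
    #     new_p = 0.2 * 0.1 * 0.5 = 0.01
    # and it replaces the MLC entry for this span and VP only if 0.01
    # exceeds the probability already recorded there.
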
    def __init__(self, grammar, trace=0):
        """
        Create a new C{ViterbiParse} parser that uses C{grammar} to
        parse texts.

        @type grammar: C{pcfg.Grammar}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            higher numbers will produce more verbose tracing output.
        """
        self._grammar = grammar
        self._trace = trace
        AbstractParse.__init__(self)

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        @type trace: C{int}
        @param trace: The trace level.  A trace level of C{0} will
            generate no tracing output; higher trace levels will
            produce more verbose tracing output.
        @rtype: C{None}
        """
        self._trace = trace

    def get_parse_list(self, tokens):
        # Inherit docs from ParseI

        # The most likely constituent table.  This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or Tokens.  For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value.  For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace: print ('Inserting tokens into the most likely'+
                               ' constituents table...')
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index+1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(tokens)+1):
            if self._trace:
                print ('Finding the most likely constituents'+
                       ' spanning %d text elements...' % length)
            for start in range(len(tokens)-length+1):
                span = (start, start+length)
                self._add_constituents_spanning(span, constituents,
                                                tokens)

        # Return the most likely constituent that spans the entire text
        # and whose node value is the grammar's start symbol (if any).
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is None:
            return []
        return [tree]

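    # For intuition (hypothetical values, assuming a 3-token text
    # 'I saw John'): after parsing, `constituents` might contain
    #     constituents[(0, 1, 'I')]               = 'I'    # a token
    #     constituents[(1, 3, Nonterminal('VP'))] = <VP tree, p=0.01>
    #     constituents[(0, 3, Nonterminal('S'))]  = <S tree, p=0.002>
    # and get_parse_list returns the (0, 3, S) entry as a one-element
    # list.
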
    def _add_constituents_spanning(self, span, constituents, tokens):
        """
        Find any constituents that might cover C{span}, and add them
        to the most likely constituents table.

        @rtype: C{None}
        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find possible constituents.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent.  I.e., the constituent should cover
            C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text
            that we are parsing.

        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  In particular,
            C{constituents(M{s},M{e},M{nv})} is the most likely
            C{ProbabilisticTree} that covers C{M{text}[M{s}:M{e}]}
            and has a node value C{M{nv}.symbol()}, where C{M{text}}
            is the text that we are parsing.  When
            C{_add_constituents_spanning} is called, C{constituents}
            should contain all possible constituents that are shorter
            than C{span}.

        @type tokens: C{list} of tokens
        @param tokens: The text we are parsing.  This is only used for
            trace output.
        """
        # Since some of the grammar productions may be unary, we need to
        # repeatedly try all of the productions until none of them add any
        # new constituents.
        changed = 1
        while changed:
            changed = 0

            # Find all instantiations of the grammar productions that
            # cover the span.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, add a new
            # ProbabilisticTree whose probability is the product
            # of the children's probabilities and the production's
            # probability.
            for (production, children) in instantiations:
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr,t:pr*t.prob(),
                           subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If it's a new constituent, then add it to the
                # constituents dictionary.
                c = constituents.get((span[0], span[1], production.lhs()),
                                     None)
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print '   Insert:',
                        else:
                            print '  Discard:',
                        self._trace_production(production, p, span, len(tokens))
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = 1

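    # Worked mini-trace of the while-loop above (hypothetical grammar
    # fragment): on the first pass, V -> 'saw' [1.0] inserts the entry
    # (1, 2, V) with p=1.0; on the second pass the unary production
    # VP -> V [0.1] can now match, inserting (1, 2, VP) with
    # p = 1.0 * 0.1 = 0.1; on the third pass no entry improves, so
    # `changed` stays 0 and the loop exits.
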
    def _find_instantiations(self, span, constituents):
        """
        @return: a list of the production instantiations that cover a
            given span of the text.  A X{production instantiation} is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover C{span}.
        @rtype: C{list} of C{pair} of C{Production} and (C{list} of
            (C{ProbabilisticTree} or token))

        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find production instantiations.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            childlists = self._match_rhs(production.rhs(), span, constituents)

            for childlist in childlists:
                rv.append( (production, childlist) )
        return rv

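    # Example return value (hypothetical): for span (3, 6) and a
    # production PP -> P NP [0.4], if `constituents` holds a P tree
    # over (3, 4) and an NP tree over (4, 6), the returned list
    # includes the instantiation (PP -> P NP, [<P tree>, <NP tree>]).
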
    def _match_rhs(self, rhs, span, constituents):
        """
        @return: a set of all the lists of children that cover C{span}
            and that match C{rhs}.
        @rtype: C{list} of (C{list} of C{ProbabilisticTree} or
            C{Token})

        @type rhs: C{list} of C{Nonterminal} or (any)
        @param rhs: The list specifying what kinds of children need to
            cover C{span}.  Each nonterminal in C{rhs} specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol.  Each terminal in C{rhs}
            specifies that the corresponding child should be a token
            whose type is that terminal.
        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find child lists.  The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        (start, end) = span

        # Base case
        if start >= end and rhs == (): return [[]]
        if start >= end or rhs == (): return []

        # Find everything that matches the 1st symbol of the RHS
        childlists = []
        for split in range(start, end+1):
            l = constituents.get((start, split, rhs[0]))
            if l is not None:
                rights = self._match_rhs(rhs[1:], (split, end), constituents)
                childlists += [[l]+r for r in rights]

        return childlists

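    # Recursion sketch (hypothetical): matching rhs (P, NP) against
    # span (3, 6) tries each split point from 3 to 6 for the first
    # symbol; a P tree stored under (3, 4, P) triggers the recursive
    # call _match_rhs((NP,), (4, 6), constituents), and each child
    # list returned there is prepended with the P tree.
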
    def _trace_production(self, production, p, span, width):
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        @param production: The production that has been applied
        @type production: C{Production}
        @param p: The probability of the tree produced by the production.
        @type p: C{float}
        @param span: The span of the production
        @type span: C{tuple}
        @param width: The total number of tokens in the text being
            parsed (used to pad the trace line).
        @type width: C{int}
        @rtype: C{None}
        """
        line = '|' + '.' * span[0]
        line += '=' * (span[1] - span[0])
        line += '.' * (width - span[1]) + '| '
        line += '%s' % production
        if self._trace > 2: line = '%-40s %12.10f ' % (line, p)

        print line

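    # Sample trace line (illustrative): for a 7-token text and span
    # (2, 6), the output looks like
    #     |..====.| PP -> P NP
    # where '=' marks the covered tokens; a probability column is
    # appended when self._trace > 2.
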
    def _trace_lexical_insertion(self, token, index, width):
        # Print a trace line showing a token inserted at the given index.
        line = '   Insert: |' + '.' * index + '=' + '.' * (width-index-1) + '| '
        line += '%s' % (token,)
        print line

    def __repr__(self):
        return '<ViterbiParse for %r>' % self._grammar


##//////////////////////////////////////////////////////
##  Test Code
##//////////////////////////////////////////////////////

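# A minimal non-interactive usage sketch (illustrative; demo() below is
# the module's own interactive version).  It assumes the toy grammars
# shipped with nltk_lite.parse.pcfg:
#
#     >>> from nltk_lite import tokenize
#     >>> from nltk_lite.parse import pcfg
#     >>> parser = ViterbiParse(pcfg.toy1)
#     >>> tokens = list(tokenize.whitespace('I saw John with my cookie'))
#     >>> for tree in parser.get_parse_list(tokens):
#     ...     print tree.prob(), tree
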
def demo():
    """
    A demonstration of the Viterbi probabilistic parser.  The user is
    prompted to select which demo to run; the parser is then run on
    the chosen demo, and a summary of the results is displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    elapsed = time.time()-t
    if parses:
        average = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print 'Time (secs)   # Parses   Average P(parse)'
    print '-----------------------------------------'
    print '%11.4f%11d%19.14f' % (elapsed, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '-----------------------------------------'
    print '%11s%11d%19.14f' % ('n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse

if __name__ == '__main__':
    demo()