Package nltk_lite :: Package chunk :: Module regexp
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.chunk.regexp

  1  # Natural Language Toolkit: Regular Expression Chunkers 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (minor additions) 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from nltk_lite.chunk import * 
 10  from nltk_lite.parse import AbstractParse 
 11   
class RegexpChunkRule(object):
    """
    A rule specifying how to modify the chunking in a C{ChunkString},
    using a transformational regular expression.  The
    C{RegexpChunkRule} class itself can be used to implement any
    transformational rule based on regular expressions.  There are
    also a number of subclasses, which can be used to implement
    simpler types of rules, based on matching regular expressions.

    Each C{RegexpChunkRule} has a regular expression and a
    replacement expression.  When a C{RegexpChunkRule} is X{applied}
    to a C{ChunkString}, it searches the C{ChunkString} for any
    substring that matches the regular expression, and replaces it
    using the replacement expression.  This search/replace operation
    has the same semantics as C{re.sub}.

    Each C{RegexpChunkRule} also has a description string, which
    gives a short (typically less than 75 characters) description of
    the purpose of the rule.

    The transformation defined by a C{RegexpChunkRule} should only
    add and remove braces; it should I{not} modify the sequence of
    angle-bracket delimited tags.  Furthermore, the transformation
    may not result in nested or mismatched bracketing.
    """
    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        @type regexp: C{regexp} or C{string}
        @param regexp: This C{RegexpChunkRule}'s regular expression.
            When this rule is applied to a C{ChunkString}, any
            substring that matches C{regexp} will be replaced using
            the replacement string C{repl}.  Note that this must be a
            normal regular expression, not a tag pattern.  A compiled
            pattern is stored as given (flags included); anything
            else is compiled with C{re.compile}.
        @type repl: C{string}
        @param repl: This C{RegexpChunkRule}'s replacement
            expression.  When this rule is applied to a
            C{ChunkString}, any substring that matches C{regexp} will
            be replaced using C{repl}.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._repl = repl
        self._descr = descr
        # Recognise compiled patterns by their ``pattern`` attribute
        # rather than by type name: the type name differs between
        # Python versions, and an exact ``str`` type check would leave
        # unicode patterns uncompiled.
        if hasattr(regexp, 'pattern'):
            self._regexp = regexp
        else:
            self._regexp = re.compile(regexp)

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given C{ChunkString}.  See the
        class reference documentation for a description of what it
        means to apply a rule.

        @type chunkstr: C{ChunkString}
        @param chunkstr: The chunkstring to which this rule is
            applied.
        @rtype: C{None}
        @raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        @rtype: C{string}
        @return: a short description of the purpose and/or effect of
            this rule.
        """
        return self._descr

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return ('<RegexpChunkRule: ' + repr(self._regexp.pattern) +
                '->' + repr(self._repl) + '>')
102
class ChunkRule(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a C{ChunkString}, using a
    matching tag pattern.  When applied to a C{ChunkString}, it will
    find any substring that matches this tag pattern and that is not
    already part of a chunk, and create a new chunk containing that
    substring.
    """
    def __init__(self, tag_pattern, descr):
        """
        Construct a new C{ChunkRule}.

        @type tag_pattern: C{string}
        @param tag_pattern: This rule's tag pattern.  When
            applied to a C{ChunkString}, this rule will
            chunk any substring that matches this tag pattern and that
            is not already part of a chunk.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        regexp = re.compile('(?P<chunk>%s)%s' %
                            (tag_pattern2re_pattern(tag_pattern),
                             ChunkString.IN_CHINK_PATTERN))
        # Raw string: ``\g`` is a regexp group reference, not a string
        # escape.  The match is wrapped in braces to mark it as a chunk.
        RegexpChunkRule.__init__(self, regexp, r'{\g<chunk>}', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <ChunkRule: '<IN|VB.*>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return '<ChunkRule: ' + repr(self._pattern) + '>'
144
class ChinkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chinks from the chunks of a
    C{ChunkString}, using a matching tag pattern.  When applied to a
    C{ChunkString}, it will find any substring that matches this
    tag pattern and that is contained in a chunk, and remove it
    from that chunk, thus creating two new chunks.
    """
    def __init__(self, tag_pattern, descr):
        """
        Construct a new C{ChinkRule}.

        @type tag_pattern: C{string}
        @param tag_pattern: This rule's tag pattern.  When
            applied to a C{ChunkString}, this rule will
            find any substring that matches this tag pattern and that
            is contained in a chunk, and remove it from that chunk,
            thus creating two new chunks.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        regexp = re.compile('(?P<chink>%s)%s' %
                            (tag_pattern2re_pattern(tag_pattern),
                             ChunkString.IN_CHUNK_PATTERN))
        # Raw string: ``\g`` is a regexp group reference.  The match is
        # surrounded by a closing and a reopening brace, which cuts it
        # out of the enclosing chunk.
        RegexpChunkRule.__init__(self, regexp, r'}\g<chink>{', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <ChinkRule: '<IN|VB.*>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return '<ChinkRule: ' + repr(self._pattern) + '>'
186
class UnChunkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chunks from a C{ChunkString},
    using a matching tag pattern.  When applied to a
    C{ChunkString}, it will find any complete chunk that matches this
    tag pattern, and un-chunk it.
    """
    def __init__(self, tag_pattern, descr):
        """
        Construct a new C{UnChunkRule}.

        @type tag_pattern: C{string}
        @param tag_pattern: This rule's tag pattern.  When
            applied to a C{ChunkString}, this rule will
            find any complete chunk that matches this tag pattern,
            and un-chunk it.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Raw strings keep the regexp escapes (``\{``, ``\}``, ``\g``)
        # out of the string-literal escape machinery.
        regexp = re.compile(r'\{(?P<chunk>%s)\}' %
                            tag_pattern2re_pattern(tag_pattern))
        # Replacing the match with its contents drops the braces.
        RegexpChunkRule.__init__(self, regexp, r'\g<chunk>', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <UnChunkRule: '<IN|VB.*>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return '<UnChunkRule: ' + repr(self._pattern) + '>'
225
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a C{ChunkString}, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a C{ChunkString}, it will find any chunk whose end
    matches the left pattern, and that is immediately followed by a
    chunk whose beginning matches the right pattern.  It will then
    merge those two chunks into a single chunk.
    """
    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new C{MergeRule}.

        @type left_tag_pattern: C{string}
        @param left_tag_pattern: This rule's left tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose end matches
            this pattern, and that is immediately followed by a chunk
            whose beginning matches C{right_tag_pattern}.  It will
            then merge those two chunks into a single chunk.
        @type right_tag_pattern: C{string}
        @param right_tag_pattern: This rule's right tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose end matches
            C{left_tag_pattern}, and that is immediately followed by
            a chunk whose beginning matches this pattern.  It will
            then merge those two chunks into a single chunk.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right pattern is only a lookahead, so it is not consumed
        # and can serve as the left context of the next match.
        regexp = re.compile('(?P<left>%s)}{(?=%s)' %
                            (tag_pattern2re_pattern(left_tag_pattern),
                             tag_pattern2re_pattern(right_tag_pattern)))
        # Keeping only the left group deletes the '}{' boundary.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return ('<MergeRule: ' + repr(self._left_tag_pattern) + ', ' +
                repr(self._right_tag_pattern) + '>')
279
class SplitRule(RegexpChunkRule):
    """
    A rule specifying how to split chunks in a C{ChunkString}, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a C{ChunkString}, it will find any chunk that
    matches the left pattern followed by the right pattern.  It will
    then split the chunk into two new chunks, at the point between the
    two pattern matches.
    """
    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new C{SplitRule}.

        @type left_tag_pattern: C{string}
        @param left_tag_pattern: This rule's left tag
            pattern.  When applied to a C{ChunkString}, this rule will
            find any chunk containing a substring that matches this
            pattern followed by C{right_tag_pattern}.  It will then
            split the chunk into two new chunks at the point between
            these two matching patterns.
        @type right_tag_pattern: C{string}
        @param right_tag_pattern: This rule's right tag
            pattern.  When applied to a C{ChunkString}, this rule will
            find any chunk containing a substring that matches
            C{left_tag_pattern} followed by this pattern.  It will
            then split the chunk into two new chunks at the point
            between these two matching patterns.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right pattern is matched as a lookahead only, so the
        # replacement inserts '}{' directly after the left match.
        regexp = re.compile('(?P<left>%s)(?=%s)' %
                            (tag_pattern2re_pattern(left_tag_pattern),
                             tag_pattern2re_pattern(right_tag_pattern)))
        RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <SplitRule: '<NN>', '<DT>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return ('<SplitRule: ' + repr(self._left_tag_pattern) + ', ' +
                repr(self._right_tag_pattern) + '>')
332
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a C{ChunkString} to the
    left, using two matching tag patterns: a left pattern, and a right
    pattern.  When applied to a C{ChunkString}, it will find any chunk
    whose beginning matches the right pattern, and that is immediately
    preceded by a chink whose end matches the left pattern.  It will
    then expand the chunk to incorporate the new material on the left.
    """
    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new C{ExpandLeftRule}.

        @type left_tag_pattern: C{string}
        @param left_tag_pattern: This rule's left tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose beginning matches
            C{right_tag_pattern}, and that is immediately preceded by
            a chink whose end matches this pattern.  It will
            then expand the chunk to incorporate the new material on
            the left.
        @type right_tag_pattern: C{string}
        @param right_tag_pattern: This rule's right tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose beginning matches
            this pattern, and that is immediately preceded by a chink
            whose end matches C{left_tag_pattern}.  It will
            then expand the chunk to incorporate the new material on
            the left.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings keep the regexp escapes (``\{``, ``\g``) intact.
        regexp = re.compile(r'(?P<left>%s)\{(?P<right>%s)' %
                            (tag_pattern2re_pattern(left_tag_pattern),
                             tag_pattern2re_pattern(right_tag_pattern)))
        # The opening brace moves from after the left match to before it.
        RegexpChunkRule.__init__(self, regexp, r'{\g<left>\g<right>', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return ('<ExpandLeftRule: ' + repr(self._left_tag_pattern) + ', ' +
                repr(self._right_tag_pattern) + '>')
386
class ExpandRightRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a C{ChunkString} to the
    right, using two matching tag patterns: a left pattern, and a
    right pattern.  When applied to a C{ChunkString}, it will find any
    chunk whose end matches the left pattern, and that is immediately
    followed by a chink whose beginning matches the right pattern.  It
    will then expand the chunk to incorporate the new material on the
    right.
    """
    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new C{ExpandRightRule}.

        @type left_tag_pattern: C{string}
        @param left_tag_pattern: This rule's left tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose end matches
            this pattern, and that is immediately followed by a chink
            whose beginning matches C{right_tag_pattern}.  It will
            then expand the chunk to incorporate the new material on
            the right.
        @type right_tag_pattern: C{string}
        @param right_tag_pattern: This rule's right tag
            pattern.  When applied to a C{ChunkString}, this
            rule will find any chunk whose end matches
            C{left_tag_pattern}, and that is immediately followed by
            a chink whose beginning matches this pattern.  It will
            then expand the chunk to incorporate the new material on
            the right.
        @type descr: C{string}
        @param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings keep the regexp escapes (``\}``, ``\g``) intact.
        regexp = re.compile(r'(?P<left>%s)\}(?P<right>%s)' %
                            (tag_pattern2re_pattern(left_tag_pattern),
                             tag_pattern2re_pattern(right_tag_pattern)))
        # The closing brace moves from before the right match to after it.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>\g<right>}', descr)

    def __repr__(self):
        """
        @rtype: C{string}
        @return: A string representation of this rule.  This
            string representation has the form::

                <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

            Note that this representation does not include the
            description string; that string can be accessed
            separately with the C{descr} method.
        """
        return ('<ExpandRightRule: ' + repr(self._left_tag_pattern) + ', ' +
                repr(self._right_tag_pattern) + '>')
# Matches a converted tag pattern: any sequence of brace-free,
# bracket-free runs and '<...>' groups.  Braces and nested or
# mismatched angle brackets make the match fail.  Raw literals are
# used so that '\{' and '\}' are regexp escapes rather than (invalid)
# string escapes; the compiled pattern is unchanged.
CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
                               (r'[^\{\}<>]+',
                                r'[^\{\}<>]+'))

##//////////////////////////////////////////////////////
##  Tag Pattern Format Conversion
##//////////////////////////////////////////////////////
def tag_pattern2re_pattern(tag_pattern):
    """
    Convert a tag pattern to a regular expression pattern.  A X{tag
    pattern} is a modified version of a regular expression, designed
    for matching sequences of tags.  The differences between regular
    expression patterns and tag patterns are:

        - In tag patterns, C{'<'} and C{'>'} act as parentheses; so
          C{'<NN>+'} matches one or more repetitions of C{'<NN>'}, not
          C{'<NN'} followed by one or more repetitions of C{'>'}.
        - Whitespace in tag patterns is ignored.  So
          C{'<DT> | <NN>'} is equivalent to C{'<DT>|<NN>'}
        - In tag patterns, C{'.'} is equivalent to C{'[^{}<>]'}; so
          C{'<NN.*>'} matches any single tag starting with C{'NN'}.

    In particular, C{tag_pattern2re_pattern} performs the following
    transformations on the given pattern:

        - Remove any whitespace
        - Add extra parens around '<' and '>', to make '<' and '>' act
          like parentheses.  E.g., so that in '<NN>+', the '+' has scope
          over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
          scope over 'NN' and 'IN', but not '<' or '>'.
        - Check to make sure the resulting pattern is valid.
        - Replace each unescaped '.' with C{CHUNK_TAG_CHAR}

    @type tag_pattern: C{string}
    @param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    @raise ValueError: If C{tag_pattern} is not a valid tag pattern.
        In particular, C{tag_pattern} should not include braces; and it
        should not contain nested or mismatched angle-brackets.
    @rtype: C{string}
    @return: A regular expression pattern corresponding to
        C{tag_pattern}.
    """
    # Strip whitespace, then parenthesise the angle brackets.
    cleaned = re.sub(r'\s', '', tag_pattern)
    cleaned = cleaned.replace('<', '(<(')
    cleaned = cleaned.replace('>', ')>)')

    # Validate before substituting '.', because the substitution adds
    # {}[]<> characters that would confuse CHUNK_TAG_PATTERN.
    if not CHUNK_TAG_PATTERN.match(cleaned):
        raise ValueError('Bad tag pattern: %s' % cleaned)

    # Only a '.' that is not backslash-escaped is replaced.  That test
    # needs to look backwards, so the pattern is reversed, matched with
    # a lookahead assertion, and reversed again.
    flipped = cleaned[::-1]
    flipped = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))',
                     CHUNK_TAG_CHAR[::-1], flipped)
    return flipped[::-1]


##//////////////////////////////////////////////////////
##  RegexpChunk
##//////////////////////////////////////////////////////
class RegexpChunk(ChunkParseI, AbstractParse):
    """
    A regular expression based chunk parser.  C{RegexpChunk} uses a
    sequence of X{rules} to find chunks of a single type within a
    text.  The chunking of the text is encoded using a C{ChunkString},
    and each rule acts by modifying the chunking in the
    C{ChunkString}.  The rules are all implemented using regular
    expression matching and substitution.

    The C{RegexpChunkRule} class and its subclasses (C{ChunkRule},
    C{ChinkRule}, C{UnChunkRule}, C{MergeRule}, and C{SplitRule})
    define the rules that are used by C{RegexpChunk}.  Each rule
    defines an C{apply} method, which modifies the chunking encoded
    by a given C{ChunkString}.

    @type _rules: C{list} of C{RegexpChunkRule}
    @ivar _rules: The list of rules that should be applied to a text.
    @type _trace: C{int}
    @ivar _trace: The default level of tracing.
    """
    def __init__(self, rules, chunk_node='NP', top_node='S', trace=0):
        """
        Construct a new C{RegexpChunk}.

        @type rules: C{list} of C{RegexpChunkRule}
        @param rules: The sequence of rules that should be used to
            generate the chunking for a tagged text.
        @type chunk_node: C{string}
        @param chunk_node: The node value that should be used for
            chunk subtrees.  This is typically a short string
            describing the type of information contained by the chunk,
            such as C{"NP"} for base noun phrases.
        @type top_node: C{string}
        @param top_node: The node value that should be used for the
            top node of the chunk structure.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.
        """
        self._rules = rules
        self._trace = trace
        self._chunk_node = chunk_node
        self._top_node = top_node
        AbstractParse.__init__(self)

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply each of this C{RegexpChunk}'s rules to C{chunkstr}, in
        turn.  Generate trace output between each rule.  If C{verbose}
        is true, then generate verbose output.

        @type chunkstr: C{ChunkString}
        @param chunkstr: The chunk string to which each rule should be
            applied.
        @type verbose: C{boolean}
        @param verbose: Whether output should be verbose.
        @rtype: C{None}
        """
        # Every print call takes one argument, so it behaves the same
        # as a Python 2 print statement and as the Python 3 function.
        print('# Input:')
        print(chunkstr)
        for rule in self._rules:
            rule.apply(chunkstr)
            if verbose:
                print('# ' + rule.descr() + ' (' + repr(rule) + '):')
            else:
                print('# ' + rule.descr() + ':')
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply each of this C{RegexpChunk}'s rules to C{chunkstr}, in
        turn.

        @param chunkstr: The chunk string to which each rule should be
            applied.
        @type chunkstr: C{ChunkString}
        @rtype: C{None}
        """
        for rule in self._rules:
            rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        @type chunk_struct: C{Tree}
        @param chunk_struct: the chunk structure to be (further)
            chunked.  An object without a C{node} attribute is first
            wrapped in a C{Tree} whose node is this parser's top node.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        @rtype: C{Tree}
        @return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  The set of chunks
            identified in the chunk structure depends on the rules
            used to define this C{RegexpChunk}.
        """
        # Imported after the docstring so that the string above really
        # is this method's docstring.
        from nltk_lite.parse.tree import Tree

        if len(chunk_struct) == 0:
            print('Warning: parsing empty text')
            return Tree(self._top_node, [])

        # Only the existence of the attribute matters here.
        try:
            chunk_struct.node
        except AttributeError:
            chunk_struct = Tree(self._top_node, chunk_struct)

        # Use the default trace value?
        if trace is None:
            trace = self._trace

        chunkstr = ChunkString(chunk_struct)

        # Apply the sequence of rules to the chunkstring.
        if trace:
            self._trace_apply(chunkstr, trace > 1)
        else:
            self._notrace_apply(chunkstr)

        # Use the chunkstring to create a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_node)

    def rules(self):
        """
        @return: the sequence of rules used by C{RegexpChunk}.
        @rtype: C{list} of C{RegexpChunkRule}
        """
        return self._rules

    def __repr__(self):
        """
        @return: a concise string representation of this
            C{RegexpChunk}.
        @rtype: C{string}
        """
        return "<RegexpChunk with %d rules>" % len(self._rules)

    def __str__(self):
        """
        @return: a verbose string representation of this C{RegexpChunk}:
            a header line followed by one entry per rule, giving the
            rule's description and its C{repr}.
        @rtype: C{string}
        """
        lines = ["RegexpChunk with %d rules:" % len(self._rules)]
        margin = 0
        for rule in self._rules:
            margin = max(margin, len(rule.descr()))
        if margin < 35:
            # Short descriptions: left-justify them in one column.
            template = " %" + repr(-(margin + 3)) + "s%s"
        else:
            # Long descriptions: put the rule on a line of its own.
            template = " %s\n %s"
        for rule in self._rules:
            lines.append(template % (rule.descr(), repr(rule)))
        return "\n".join(lines)
676 677 ##////////////////////////////////////////////////////// 678 ## Chunk Grammar 679 ##////////////////////////////////////////////////////// 680
class Regexp(ChunkParseI, AbstractParse):
    r"""
    A grammar based chunk parser.  C{chunk.Regexp} uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a C{ChunkString}, and
    each rule acts by modifying the chunking in the C{ChunkString}.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

        NP:
          {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for
    matches after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is
    displayed each time the corresponding pattern is applied.

    @type _grammar: C{string}
    @ivar _grammar: The grammar text this parser was built from.
    @type _stages: C{list} of C{RegexpChunk}
    @ivar _stages: The parsing stages, one per non-empty clause of
        the grammar, in grammar order.
    @type _loop: C{int}
    @ivar _loop: The number of times to run through the stages.
    @type _trace: C{int}
    @ivar _trace: The default level of tracing.
    """
    def __init__(self, grammar, top_node='S', loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        @param grammar: The grammar text: clauses of the form
            C{LABEL: pattern}, one pattern per line, each optionally
            followed by a C{#} comment.  A literal hash inside a
            pattern must be written as a backslash followed by C{#}.
        @type grammar: C{string}
        @param top_node: The top node of the tree being created
        @type top_node: C{string} or C{Nonterminal}
        @param loop: The number of times to run through the patterns
        @type loop: C{int}
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.
        @raise ValueError: If a line is not a recognised chunk
            pattern, or a pattern appears before any stage label.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        lhs = None      # label of the stage currently being collected
        rules = []      # rules collected for that stage so far
        for line in grammar.split('\n'):
            # Split off the comment at the first unescaped hash.
            # Escaped hashes are hidden behind a placeholder while
            # splitting and restored in the pattern part afterwards.
            line = re.sub(r'\\#', r'_HASH_', line)
            if '#' in line:
                line, comment = line.split('#', 1)
            else:
                comment = ''
            line = re.sub(r'_HASH_', r'\\#', line)
            comment = comment.strip()

            # A colon starts a new stage; flush the previous one.
            if ':' in line:
                if rules:
                    self._add_stage(rules, lhs, top_node, trace)
                # Split on the first colon only, so that the pattern
                # part may itself contain colons.
                lhs, line = line.split(':', 1)
                lhs = lhs.strip()
                rules = []

            line = line.strip()
            if not line:
                continue

            if lhs is None:
                raise ValueError('Chunk pattern outside of any stage: %s'
                                 % line)

            # Pattern bodies: chunk, chink, split, merge
            if line[0] == '{' and line[-1] == '}':
                rules.append(ChunkRule(line[1:-1], comment))
            elif line[0] == '}' and line[-1] == '{':
                rules.append(ChinkRule(line[1:-1], comment))
            elif '}{' in line:
                left, right = line.split('}{')
                rules.append(SplitRule(left, right, comment))
            elif '{}' in line:
                left, right = line.split('{}')
                rules.append(MergeRule(left, right, comment))
            else:
                raise ValueError('Illegal chunk pattern: %s' % line)

        if rules:
            self._add_stage(rules, lhs, top_node, trace)

    def _add_stage(self, rules, lhs, top_node, trace):
        """
        Append a C{RegexpChunk} stage for C{rules} to this parser.

        Every stage receives C{top_node}, so that whichever stage
        first wraps a plain token list in a tree uses the requested
        root label.
        """
        self._stages.append(RegexpChunk(rules, chunk_node=lhs,
                                        top_node=top_node, trace=trace))

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.

        @type chunk_struct: C{Tree}
        @param chunk_struct: the chunk structure to be (further)
            chunked.  It is passed through every stage in order, and
            the whole sequence is repeated C{loop} times.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            C{1} will generate normal tracing output; and C{2} or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        @return: the chunked output.
        @rtype: C{Tree}
        """
        if trace is None:
            trace = self._trace
        for i in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        @return: a concise string representation of this C{chunk.Regexp}.
        @rtype: C{string}
        """
        return "<chunk.Regexp with %d stages>" % len(self._stages)

    def __str__(self):
        """
        @return: a verbose string representation of this
            C{chunk.Regexp}: a header line followed by the string
            form of each stage.
        @rtype: C{string}
        """
        parts = ["chunk.Regexp with %d stages:" % len(self._stages)]
        for parser in self._stages:
            parts.append(parser.__str__())
        return "\n".join(parts)
823 824 ##////////////////////////////////////////////////////// 825 ## Demonstration code 826 ##////////////////////////////////////////////////////// 827
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    C{ChunkScore}.  This function assumes that C{text} contains one
    sentence per line, and that each sentence has the form expected by
    C{chunk.tagstr2tree}.  It runs the given chunk parser on each
    sentence in the text, and scores the result.  It prints the final
    score (precision, recall, and f-measure); and reports the set of
    chunks that were missed and the set of chunks that were incorrect.
    (At most 10 missing chunks and 10 incorrect chunks are reported).

    @param chunkparser: The chunkparser to be tested
    @type chunkparser: C{ChunkParseI}
    @param text: The chunked tagged text that should be used for
        evaluation.
    @type text: C{string}
    """
    from nltk_lite import chunk
    from nltk_lite.parse.tree import Tree

    # Every print call below takes exactly one argument, so it behaves
    # the same as a Python 2 print statement and as the Python 3
    # function; print('') stands in for a bare ``print``.

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split('\n'):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree('S', tokens), trace=1)
        chunkscore.score(gold, test)
        print('')

    print('/' + ('=' * 75) + '\\')
    print('Scoring %s' % (chunkparser,))
    print('-' * 77)
    # Joined with single spaces, which is how the comma-separated
    # print statements laid the three scores out on one line.
    print(' '.join([
        'Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4,
        'Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6,
        'F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100),
    ]))

    # Missed chunks.  The loop variable is deliberately not called
    # ``chunk``, which would rebind the module imported above.
    missed = chunkscore.missed()
    if missed:
        print('Missed:')
        for item in missed[:10]:
            print('  %s' % (item,))
        if len(missed) > 10:
            print(' ...')

    # Incorrect chunks.
    incorrect = chunkscore.incorrect()
    if incorrect:
        print('Incorrect:')
        for item in incorrect[:10]:
            print('  %s' % (item,))
        if len(incorrect) > 10:
            print(' ...')

    print('\\' + ('=' * 75) + '/')
    print('')
891
def demo():
    """
    A demonstration for the C{RegexpChunk} class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.  The demonstration then scores two grammars
    against the CoNLL-2000 corpus and parses a list of tagged tokens.
    """
    from itertools import islice

    from nltk_lite import chunk
    from nltk_lite.corpora import conll2000

    # Every print call takes one argument so that it behaves the same
    # as a Python 2 print statement and as the Python 3 function.

    # One sentence per line, as demo_eval() expects.
    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print('')

    # Grammars are written one pattern per line: chunk.Regexp splits
    # the grammar on newlines and treats the rest of a line after '#'
    # as that pattern's comment.
    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.Regexp(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.Regexp(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """
    cp = chunk.Regexp(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.Regexp(grammar)
    chunk.demo_eval(cp, text)

    # Evaluation

    print('')
    print("Demonstration of empty grammar:")

    cp = chunk.Regexp("")
    print(chunk.accuracy(cp, conll2000.chunked(files='test',
                                               chunk_types=('NP',))))

    print('')
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.Regexp(grammar)
    # Only the first five chunked sentences are scored.
    print(chunk.accuracy(
        cp, islice(conll2000.chunked(chunk_types=('NP', 'PP', 'VP')), 0, 5)))

    print('')
    print("Demonstration of tagged token input")

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.Regexp(grammar)
    print(cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
                    ("mat", "NN"), (".", ".")]))

# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()