Package nltk_lite :: Package stem :: Module regexp
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.stem.regexp

 1  # Natural Language Toolkit: Stemmers 
 2  # 
 3  # Copyright (C) 2001-2007 University of Melbourne 
 4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  #         Steven Bird <sb@csse.unimelb.edu.au> 
 7  # URL: <http://nltk.sf.net> 
 8  # For license information, see LICENSE.TXT 
 9   
10  from nltk_lite.stem import * 
11   
class Regexp(StemI):
    """
    A stemmer that relies on a regular expression to recognise
    morphological affixes.  Every substring of a word that matches
    the regular expression is removed from it.
    """

    def __init__(self, regexp, min=0):
        """
        Create a new regexp stemmer.

        @type regexp: C{string} or C{regexp}
        @param regexp: The regular expression used to identify
            morphological affixes.  A value that has no C{pattern}
            attribute is compiled with C{re.compile}; any other value
            is stored unchanged.
        @type min: int
        @param min: The minimum length of string to stem.  Words
            shorter than this are returned unchanged by L{stem}.
        """
        # Only compile when handed something that does not already
        # look like a compiled pattern object.
        if hasattr(regexp, 'pattern'):
            compiled = regexp
        else:
            compiled = re.compile(regexp)
        self._regexp = compiled
        self._min = min

    def stem(self, word):
        """
        Strip every match of the stemmer's regular expression from
        C{word}.

        @param word: The word to stem.
        @return: C{word} itself when it is shorter than the configured
            minimum length; otherwise C{word} with each match of the
            regular expression replaced by the empty string.
        """
        too_short = len(word) < self._min
        if too_short:
            return word
        return self._regexp.sub('', word)

    def __repr__(self):
        """
        @return: A string representation that shows the pattern of
            the underlying regular expression.
        """
        pattern = self._regexp.pattern
        return '<Regexp Stemmer: %r>' % pattern
42
43 -def demo():
44 from nltk_lite import tokenize, stem 45 46 # Create a simple regular expression based stemmer 47 stemmer = stem.Regexp('ing$|s$|e$', min=4) 48 text = "John was eating icecream" 49 tokens = tokenize.whitespace(text) 50 51 # Print the results. 52 print stemmer 53 for word in tokens: 54 print '%20s => %s' % (word, stemmer.stem(word)) 55 print
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()