Package nltk_lite :: Package contrib :: Module didyoumean
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.didyoumean

 1  # Spelling corrector by Maxime Biais http://www.biais.org/blog/ 
 2  # http://snippets.dzone.com/posts/show/3395 
 3   
 4  from nltk_lite.stem.porter import Porter 
 5  from nltk_lite.corpora import brown 
 6  from nltk_lite import tokenize 
 7    
 8  import sys 
 9  from collections import defaultdict 
10  import operator 
11    
def sortby(nlist, n, reverse=0):
    """Sort ``nlist`` in place by the ``n``-th item of each element.

    ``nlist`` must be a list whose elements are indexable (e.g. tuples).
    ``reverse`` is passed straight through to ``list.sort``; a true value
    sorts in descending order.  Nothing is returned.
    """
    pick_nth = operator.itemgetter(n)
    nlist.sort(key=pick_nth, reverse=reverse)
14
class mydict(dict):
    """A ``dict`` whose missing keys read as ``0``.

    Looking up an absent key returns ``0`` without inserting the key, so
    ``d[key] += 1`` works as a counter while plain reads leave the
    dictionary untouched.
    """

    def __missing__(self, key):
        """Return the default count (``0``) for any key not yet stored."""
        return 0
18
class DidYouMean:
    """Naive "did you mean ...?" spelling suggester.

    Words are bucketed by a lossy hash (see ``specialhash``).  ``learn``
    counts how often each lower-cased word occurs in every bucket, and
    ``test`` looks a token up in its bucket to decide whether it is a known
    spelling or to propose the more frequent known spellings instead.
    """

    def __init__(self):
        self.stemmer = Porter()
        # Start with an empty database so that test() can be called before
        # learn() without raising AttributeError; it simply finds nothing.
        self.learned = defaultdict(mydict)

    def specialhash(self, s):
        """Return the bucket key for the word ``s``.

        The word is lower-cased, every "z" becomes "s", every "h" is
        dropped, each doubled letter pair is replaced once by the single
        letter, and the result is passed through the stemmer.
        """
        s = s.lower()
        s = s.replace("z", "s")
        s = s.replace("h", "")
        # One replace pass per letter a..z, in alphabetical order.
        for code in range(ord("a"), ord("z") + 1):
            letter = chr(code)
            s = s.replace(letter + letter, letter)
        return self.stemmer.stem(s)

    def test(self, token):
        """Return a human-readable verdict string for ``token``.

        The verdict is one of: the word seems OK, a single suggestion, a
        main suggestion followed by alternatives (most frequent first), or
        a message saying nothing similar is known.
        """
        not_found = "I can't found similar word in my learned db"
        hashed = self.specialhash(token)
        if hashed not in self.learned:
            return not_found

        # sorted() instead of list.sort(): on Python 3 dict.items() is a
        # view without a sort() method.  Most frequent spelling comes first.
        words = sorted(self.learned[hashed].items(),
                       key=operator.itemgetter(1), reverse=True)
        if not words:
            # An empty bucket holds no candidates to suggest.
            return not_found

        # learn() stores lower-cased words, so compare case-insensitively;
        # otherwise a correctly spelled "Bird" would be reported as wrong.
        if token.lower() in [word for word, _count in words]:
            return 'This word seems OK'
        if len(words) == 1:
            return 'Did you mean "%s" ?' % words[0][0]
        alternatives = ", ".join(['"' + word + '"'
                                  for word, _count in words[1:]])
        return 'Did you mean "%s" ? (or %s)' % (words[0][0], alternatives)

    def learn(self, listofsentences=None, n=2000):
        """(Re)build the word database from at most ``n`` sentences.

        ``listofsentences`` is an iterable of sentences, each an iterable
        of word strings.  When it is omitted (or an empty list) the
        sentences come from ``brown.raw()`` -- presumably the tokenised
        Brown corpus; confirm against nltk_lite.  Any previously learned
        data is discarded.
        """
        self.learned = defaultdict(mydict)
        if listofsentences is None or listofsentences == []:
            listofsentences = brown.raw()
        for i, sent in enumerate(listofsentences):
            if i >= n:  # Limit to the first nth sentences of the corpus
                break
            for word in sent:
                self.learned[self.specialhash(word)][word.lower()] += 1
57
def demo():
    """Train on the default corpus and print a verdict for sample words.

    Each output line has the form ``<word> - <verdict>``.
    """
    d = DidYouMean()
    d.learn()
    # choice of words to be relevant related to the brown corpus
    for word in "birdd, oklaoma, emphasise, bird, carot".split(", "):
        # A single pre-formatted argument prints identically whether print
        # is the Python 2 statement or the Python 3 function.
        print("%s - %s" % (word, d.test(word)))
# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    demo()