1 import math
2 import os
3
4
5 from nltk_lite import tag
6 from nltk_lite.tag import SequentialBackoff
7
8 from nltk_lite.contrib.marshal import MarshalDefault ; Default = MarshalDefault
9 from nltk_lite.contrib.marshal import MarshalUnigram ; Unigram = MarshalUnigram
10 from nltk_lite.contrib.marshal import MarshalAffix ; Affix = MarshalAffix
11 from nltk_lite.contrib.marshal import MarshalNgram ; Ngram = MarshalNgram
12 from nltk_lite.contrib.marshalbrill import *
13
# Reset the tagger state (the enclosing def line is not visible here).
# _tagger holds the chain of taggers in the order they were added;
# _brill holds the Brill tagger, or None when there is none yet.
self._tagger, self._brill = [], None
18
21
def _append_affix(self, a_len, w_len, train_sents, verbose=False):
    """Train an affix tagger and add it to the end of the tagger chain.

    The new tagger is created as ``Affix(a_len, w_len, ...)`` with the
    tagger currently last in ``self._tagger`` as its backoff, so the
    chain must already contain at least one tagger (IndexError otherwise).

    a_len, w_len -- passed unchanged to ``Affix``.
    train_sents  -- training data; wrapped in a one-element list before
                    being handed to ``train``.
    verbose      -- forwarded to ``train``.
    """
    affix_tagger = Affix(a_len, w_len, backoff=self._tagger[-1])
    self._tagger.append(affix_tagger)
    affix_tagger.train([train_sents], verbose)
25
# Build a unigram tagger backing off to the current last tagger, add it
# to the chain and train it.  Unlike the affix and n-gram variants in this
# file, train_sents is passed to train() as-is (not wrapped in a list).
# (The enclosing def line is not visible here.)
unigram_tagger = Unigram(backoff=self._tagger[-1])
self._tagger.append(unigram_tagger)
unigram_tagger.train(train_sents, verbose)
29
def _append_ngram(self, size, train_sents, verbose=False, cutoff_value=0.001):
    """Train an n-gram tagger and add it to the end of the tagger chain.

    The tagger is created as ``Ngram(size, cutoff=..., backoff=...)``
    where the cutoff is ``floor(len(train_sents) * cutoff_value)`` and the
    backoff is the tagger currently last in ``self._tagger`` (IndexError
    if the chain is empty).

    size         -- passed unchanged to ``Ngram``.
    train_sents  -- training data; must support ``len()``.  It is wrapped
                    in a one-element list before being handed to ``train``.
    verbose      -- forwarded to ``train``.
    cutoff_value -- fraction of ``len(train_sents)`` used as the cutoff.
    """
    cutoff = math.floor(len(train_sents) * cutoff_value)
    previous = self._tagger[-1]
    ngram_tagger = Ngram(size, cutoff=cutoff, backoff=previous)
    self._tagger.append(ngram_tagger)
    ngram_tagger.train([train_sents], verbose)
34
def _append_brill(self, train_sents, max_rules, min_score=2, trace=0):
    """Train a Brill tagger on top of the last tagger in the chain.

    A ``BrillTrainer`` is created with ``self._tagger[-1]``, a fixed list
    of ten rule templates and ``trace``; the result of its ``train`` call
    is stored in ``self._brill``.  Nothing is appended to ``self._tagger``.

    train_sents, max_rules, min_score -- forwarded to ``BrillTrainer.train``.
    trace -- forwarded to the ``BrillTrainer`` constructor.
    """
    # Symmetric templates: every window for tag rules first, then the same
    # windows for word rules (this ordering matches the original list).
    windows = [(1, 1), (2, 2), (1, 2), (1, 3)]
    templates = [
        SymmetricProximateTokensTemplate(rule, window)
        for rule in (ProximateTagsRule, ProximateWordsRule)
        for window in windows
    ]
    # Two-sided templates looking at both the previous and the next token.
    templates.append(ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1, 1)))

    base_tagger = self._tagger[-1]
    trainer = BrillTrainer(base_tagger, templates, trace)
    self._brill = trainer.train(train_sents, max_rules, min_score)
51
53
# Write the manifest "model.mrs" under basepath: one "<classname> <file>"
# line per tagger, in chain order, with each tagger marshalled to its own
# "taggerNN.mod" file.  (The enclosing def line is not visible here;
# `basepath` comes from it.)
handler = open(os.path.join(basepath, "model.mrs"), "w")
try:
    for index, tagger in enumerate(self._tagger):
        filename = os.path.join(basepath, "tagger%02d.mod" % index)
        handler.write("%s %s\n" % (tagger._classname, filename))
        tagger.marshal(filename)

    # The Brill tagger takes the slot after the last chain tagger.  Using
    # len() instead of the leftover loop index keeps this defined when the
    # chain is empty; the entry is skipped when no Brill tagger exists.
    if self._brill is not None:
        filename = os.path.join(basepath,
                                "tagger%02d.mod" % len(self._tagger))
        handler.write("%s %s\n" % (self._brill._classname, filename))
        self._brill.marshal(filename)
finally:
    # Close the manifest even if a write or marshal call fails.
    handler.close()
66
68
# Rebuild the tagger chain from the manifest "model.mrs" under basepath.
# (The enclosing def line is not visible here; `basepath` comes from it.)

# Drop any previously loaded state.
self._tagger = []
self._brill = None

handler = open(os.path.join(basepath, "model.mrs"), "r")
try:
    model = handler.readlines()
finally:
    handler.close()

for line in model:
    # Strip only the line terminator, so a final line without a trailing
    # newline does not lose its last character.
    entry = line.rstrip("\r\n")
    if not entry:
        continue

    # Each entry is "<tagger type> <file>"; split once so that file paths
    # containing spaces stay intact.
    tagger_type, tagger_file = entry.split(" ", 1)

    if tagger_type == "BrillTagger":
        # The Brill tagger is kept apart from the chain.
        self._brill = Brill(self._tagger[-1], [])
        self._brill.unmarshal(tagger_file)
        continue

    # The constructor arguments below are fixed values; presumably
    # unmarshal() replaces them with the stored ones -- TODO confirm.
    if tagger_type == "DefaultTagger":
        tagger = Default("")
    elif tagger_type == "AffixTagger":
        tagger = Affix(1, 2, backoff=self._tagger[-1])
    elif tagger_type == "UnigramTagger":
        tagger = Unigram(backoff=self._tagger[-1])
    elif tagger_type == "NgramTagger":
        tagger = Ngram(2, backoff=self._tagger[-1])
    else:
        # Unknown entries are reported and skipped; loading continues.
        print("error, tagger type not recognized.")
        continue

    self._tagger.append(tagger)
    tagger.unmarshal(tagger_file)
100
# Build the full chain in order: default tagger, four affix taggers,
# unigram, bigram, then Brill training on top of the result.
# (The enclosing def line is not visible here; `train_sents` and
# `verbose` come from it.)
self._append_default("N")

# (a_len, w_len) pairs handed to _append_affix, in this exact order.
for a_len, w_len in ((-2, 6), (-3, 7), (-4, 8), (-5, 9)):
    self._append_affix(a_len, w_len, train_sents, verbose)

self._append_unigram(train_sents, verbose)
self._append_ngram(2, train_sents, verbose)

# max_rules=1, min_score=2, with trace level 3.
self._append_brill(train_sents, 1, 2, trace=3)
114
117
def tag(self, tokens, verbose=False):
    """Tag ``tokens`` with the last tagger in the chain.

    Returns whatever ``self._tagger[-1].tag(tokens, verbose)`` returns.
    Raises IndexError when no tagger has been added or loaded yet.

    NOTE(review): ``self._brill`` is not consulted here -- confirm that
    this is intended.
    """
    top_tagger = self._tagger[-1]
    return top_tagger.tag(tokens, verbose)
120
# Demo: load a previously marshalled model from the "tresoldi" directory
# and tag a sample sentence.  (The enclosing def line is not visible here.)
ct = CombinedTagger()
ct.unmarshal("tresoldi")

tokens = "Mauro viu o livro sobre a mesa".split()
tagged = ct.tag(tokens)
print(list(tagged))

# NOTE(review): `train_sents` is not assigned anywhere in the visible
# code -- confirm it is defined before this point.
acc = tag.accuracy(ct, [train_sents])
print('Accuracy = %4.2f%%' % (acc * 100))
132