1
2
3
4
5
6
7
8
9 """
10 Read from the Senseval 2 Corpus.
11
12 SENSEVAL [http://www.senseval.org/]
13 Evaluation exercises for Word Sense Disambiguation.
14 Organized by ACL-SIGLEX [http://www.siglex.org/]
15
16 Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
17 http://www.d.umn.edu/~tpederse/data.html
18 Distributed with permission.
19
20 The NLTK version of the Senseval 2 files uses well-formed XML.
21 Each instance of the ambiguous words "hard", "interest", "line", and "serve"
22 is tagged with a sense identifier, and supplied with context.
23 """
24
25 from nltk_lite.corpora import get_basedir
26 from nltk_lite import tokenize
27 import os, re, xml.sax
28
# The ambiguous words covered by the corpus (see the module docstring).
items = ["hard", "interest", "line", "serve"]
30
32
class SensevalParser(xml.sax.ContentHandler):
    """
    SAX content handler that converts Senseval 2 XML into instance tuples.

    Each completed C{<context>} element produces one instance of the form
    C{(senses, head, tokens)} where C{senses} is a tuple of sense
    identifiers taken from the preceding C{<answer>} elements, C{head} is
    the zero-based index in C{tokens} of the word wrapped by C{<head>}
    (or C{None} if no C{<head>} was seen), and C{tokens} is a list of
    C{(text, pos)} pairs, one per C{<wf>} element.
    """

    def __init__(self, buffer_size=1024):
        """
        @param buffer_size: Number of characters handed to the XML parser
            per step while parsing; must be positive.
        @type buffer_size: L{int}
        """
        xml.sax.ContentHandler.__init__(self)
        self._lemma = ''
        self._buffer_size = buffer_size
        self.reset()

    def parse(self, text):
        """
        Parse Senseval XML incrementally, yielding instances as soon as
        they are complete.

        @param text: The XML document, or an iterator over pieces of it
        @type text: L{string} or iterator over L{string}
        @rtype: iterator over L{tuple}
        """
        # An iterator of string fragments is joined into a single string.
        # Both the Python 2 ('next') and Python 3 ('__next__') iterator
        # protocols are recognised.
        if hasattr(text, '__iter__') and (
                hasattr(text, 'next') or hasattr(text, '__next__')):
            text = ''.join(text)

        # Discard anything left behind by an earlier parse that was
        # abandoned before its generator was exhausted.
        self.reset()

        parser = xml.sax.make_parser()
        parser.setContentHandler(self)
        current = 0
        while current < len(text):
            chunk = text[current : current + self._buffer_size]
            parser.feed(chunk)
            for instance in self._instances:
                yield instance
            # Drop the instances just yielded but keep the element state:
            # an instance may straddle two chunks.
            self.reset(True, False)
            current += self._buffer_size
        parser.close()

        # Closing the parser may flush buffered input; hand over anything
        # that was completed by it.
        for instance in self._instances:
            yield instance
        self.reset(True, False)

    def characters(self, ch):
        """
        Accumulate character data; it is consumed when the enclosing
        C{<wf>} element ends.
        """
        self._data += _to_ascii(ch)

    def startElement(self, tag, attr):
        """
        Record the attributes of interest carried by an opening tag.
        """
        if tag == 'wf':
            self._pos = _to_ascii(attr.getValueByQName('pos'))
        elif tag == 'answer':
            self._iloc = _to_ascii(attr.getValueByQName('instance'))
            self._senses.append(_to_ascii(attr.getValueByQName('senseid')))
        elif tag == 'context':
            self._data = ''
        elif tag == 'lexelt':
            self._lemma = _to_ascii(attr.getValueByQName('item'))
        elif tag == 'head':
            # _wnum is one more than the number of tokens closed so far,
            # so this is the index the next token will have in _tokens.
            self._head = self._wnum - 1

    def endElement(self, tag):
        """
        Finish a token on C{</wf>} and a whole instance on C{</context>}.
        """
        if tag == 'wf':
            self._tokens.append((self._data.strip(), self._pos))
            self._wnum += 1
            self._data = ''
        elif tag == 'context':
            self._instances.append(
                (tuple(self._senses), self._head, self._tokens))
            # Start the next instance with fresh state; the finished
            # instances stay queued until parse() yields them.
            self.reset(False)

    def instances(self):
        """
        @return: The instances completed but not yet cleared.
        @rtype: L{list} of L{tuple}
        """
        return self._instances

    def reset(self, instances=True, state=True):
        """
        Clear the handler's accumulated data.

        @param instances: If true, discard the queued instances.
        @param state: If true, discard the per-instance element state
            (senses, head position, tokens, pending text, part of speech).
        """
        if instances:
            self._instances = []
        if state:
            self._senses = []
            self._head = None
            self._data = ''
            self._wnum = 1
            self._iloc = None
            self._tokens = []
            self._pos = None
97
99 return text.encode('Latin-1')
100
def raw(files = items):
    """
    Iterate over the sense-tagged instances of one or more Senseval files.

    Each name is resolved to C{<basedir>/senseval/<name>.pos}, read in
    full, and passed through a single shared L{SensevalParser}.

    @param files: One or more Senseval files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """

    # A lone name, byte string or unicode, is treated as a one-item tuple.
    if isinstance(files, (str, type(u''))):
        files = (files,)
    parser = SensevalParser()
    for name in files:
        path = os.path.join(get_basedir(), "senseval", name + ".pos")
        # Close the handle explicitly rather than leaving it to the
        # garbage collector; the read happens before any yield so the
        # file is never held open across a suspended generator.
        stream = open(path)
        try:
            contents = stream.read()
        finally:
            stream.close()
        for entry in parser.parse(contents):
            yield entry
116
def demo():
    """
    Print the first instance found for each distinct tuple of senses
    of the word "line".
    """
    from nltk_lite.corpora import senseval

    seen = set()
    for (senses, position, context) in senseval.raw('line'):
        if senses in seen:
            continue
        seen.add(senses)
        # Single-argument print calls produce the same output whether
        # print is a statement or a function.
        print("senses: %s" % (senses,))
        print("position: %s" % (position,))
        print("context: %s" % ' '.join('%s/%s' % ttok for ttok in context))
        print("")
132
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()
135