"""
Wordnet interface, based on Oliver Steele's Pywordnet, together
with an implementation of Ted Pedersen's Wordnet::Similarity package.

Usage
-----

>>> from nltk_lite.wordnet import *

Retrieve words from the database:

>>> N['dog']
dog(n.)
>>> V['dog']
dog(v.)
>>> ADJ['clear']
clear(adj.)
>>> ADV['clearly']
clearly(adv.)

Examine a word's senses and pointers:

>>> N['dog'].getSenses()
('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})

Extract the first sense:

>>> N['dog'][0] # aka N['dog'].getSenses()[0]
'dog' in {noun: dog, domestic dog, Canis familiaris}

Get the first five pointers (relationships) from dog to other synsets:

>>> N['dog'][0].getPointers()[:5]
(hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})

Get those synsets of which 'dog' is a member meronym:

>>> N['dog'][0].getPointerTargets(MEMBER_MERONYM)
[{noun: Canis, genus Canis}, {noun: pack}]

"""

import os
import string
from os import environ
from nltk_lite.corpora import get_basedir
from types import IntType, StringType

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'

FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain regional'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class regional'

INSTANCE_HYPERNYM = 'hypernym (instance)'
INSTANCE_HYPONYM = 'hyponym (instance)'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,

    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,

    INSTANCE_HYPERNYM,
    INSTANCE_HYPONYM,
    )

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
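
# Illustrative sketch: each verb frame string above contains a single '%s'
# slot that is filled with a verb form when a frame is displayed, e.g.
#
#     >>> VERB_FRAME_STRINGS[8] % 'sees'
#     'Somebody sees something'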


FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'

def indexFilePathname(filenameroot):
    """
    @type filenameroot: {string}
    @param filenameroot: base form of the index file's filename.
    @return: the full path to the index file.
    """

    if os.name in ('dos', 'nt'):
        path = os.path.join(get_basedir(), "wordnet", filenameroot + ".idx")
        if os.path.exists(path):
            return path

    return os.path.join(get_basedir(), "wordnet", "index." + filenameroot)

def dataFilePathname(filenameroot):
    """
    @type filenameroot: {string}
    @param filenameroot: base form of the data file's filename.
    @return: the full path to the data file.
    """

    if os.name in ('dos', 'nt'):
        path = os.path.join(get_basedir(), "wordnet", filenameroot + ".dat")

        if os.path.exists(path):
            return path

    return os.path.join(get_basedir(), "wordnet", "data." + filenameroot)
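
# For orientation only (the exact directory depends on get_basedir()): on
# Unix-like systems the two helpers above return paths of the form
#
#     >>> indexFilePathname('noun')   # -> .../wordnet/index.noun
#     >>> dataFilePathname('noun')    # -> .../wordnet/data.noun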

def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    """
    Searches through a sorted file using the binary search algorithm.

    @type file: file
    @param file: the file to be searched through.
    @type key: {string}
    @param key: the identifier we are searching for.
    @return: The line from the file with first word key.
    """
    from stat import ST_SIZE

    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE]
    currentDepth = 0

    while start < end:
        lastState = start, end
        middle = (start + end) / 2

        if cache.get(middle):
            offset, line = cache[middle]

        else:
            file.seek(max(0, middle - 1))

            if middle > 0:
                file.readline()

            offset, line = file.tell(), file.readline()

            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)

        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1

        elif line[:keylen] == key:
            return line

        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1

        elif line < key:
            start = offset + len(line) - 1

        currentDepth = currentDepth + 1
        thisState = start, end

        if lastState == thisState:
            # The search has stopped making progress (e.g. the key sorts past
            # the end of the file), so the key is not present.
            return None

    return None

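# A minimal usage sketch (assuming the WordNet data files are installed under
# get_basedir()/wordnet): any file whose lines are sorted on their first,
# space-terminated word can be searched this way, e.g. the noun index:
#
#     >>> f = open(indexFilePathname('noun'), FILE_OPEN_MODE)
#     >>> binarySearchFile(f, 'dog')        # returns the full index line
#     >>> binarySearchFile(f, 'zzzznotaword')   # returns None when absent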


class _IndexFile:
    """
    An IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file.
    """

    def __init__(self, pos, filenameroot):
        """
        @type pos: {string}
        @param pos: The part of speech of this index file e.g. 'noun'
        @type filenameroot: {string}
        @param filenameroot: The base filename of the index file.
        """
        self.pos = pos
        self.file = open(indexFilePathname(filenameroot), FILE_OPEN_MODE)

        # Cache of (offset, line) pairs, keyed on file position, used by
        # binarySearchFile().
        self.offsetLineCache = {}

        self.rewind()

    def rewind(self):
        """
        Rewind to the beginning of the file. Place the file pointer at the
        beginning of the first line whose first character is not whitespace.
        """
        self.file.seek(0)

        while 1:
            offset = self.file.tell()
            line = self.file.readline()

            if (line[0] != ' '):
                break

        self.nextIndex = 0
        self.nextOffset = offset

    def __len__(self):

        if hasattr(self, 'indexCache'):
            return len(self.indexCache)

        self.rewind()
        lines = 0

        while 1:
            line = self.file.readline()

            if line == "":
                break

            lines = lines + 1

        return lines

    def __getitem__(self, index):

        if isinstance(index, StringType):

            if hasattr(self, 'indexCache'):
                return self.indexCache[index]

            return binarySearchFile(self.file, index, self.offsetLineCache, 8)

        elif isinstance(index, IntType):

            if hasattr(self, 'indexCache'):
                return self.get(self.keys()[index])

            if index < self.nextIndex:
                self.rewind()

            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()

                if line == "":
                    raise IndexError, "index out of range"

                self.nextIndex = self.nextIndex + 1
                self.nextOffset = self.file.tell()

            return line

        else: raise TypeError, "%s is not a String or Int" % `index`

    def get(self, key, default=None):
        """
        @type key: {string}
        @param key: first word of a line from an index file.
        @param default: Return this if no entry exists for 'key'.
        """
        try:
            return self[key]

        except LookupError:
            return default

    def keys(self):
        """
        @return: a list of the keys of this index file.
        """

        if hasattr(self, 'indexCache'):
            keys = self.indexCache.keys()
            keys.sort()
            return keys

        else:
            keys = []
            self.rewind()

            while 1:
                line = self.file.readline()

                if not line: break

                key = line.split(' ', 1)[0]
                keys.append(key.replace('_', ' '))

            return keys

    def has_key(self, key):
        """
        @type key: {string}
        @param key: the first word of a line in this index file.
        @return: True/false if this key is a valid index into the file.
        """
        key = key.replace(' ', '_')

        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)

        return self.get(key) is not None

    def _buildIndexCacheFile(self):

        import shelve
        import os

        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"

        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0

            while 1:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count = count + 1
            indexCache.close()
            os.rename(tempname, self.shelfname)

        finally:
            try: os.remove(tempname)
            except: pass

        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')
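
# A minimal usage sketch for _IndexFile (assumes the WordNet files are
# installed under get_basedir()/wordnet; 'noun' is both the part of speech
# and the index filename root, as in index.noun):
#
#     >>> idx = _IndexFile('noun', 'noun')
#     >>> idx.has_key('dog')      # dictionary-style membership test
#     >>> idx['dog']              # raw index line, located by binary search
#     >>> idx[0]                  # sequence-style access to the first line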


from pos import *
from nltk_lite.wordnet import *
import os


GET_INDEX_SUBSTITUTIONS = ((' ', '-'), ('-', ' '), ('-', ''), (' ', ''), ('.', ''))

def getIndex(form, pos='noun'):
    """Search for _form_ in the index file corresponding to
    _pos_. getIndex applies to _form_ an algorithm that replaces
    spaces with hyphens, hyphens with spaces, removes
    hyphens and spaces, and removes periods in an attempt to find
    a form of the string that is an exact match for an entry in the
    index file corresponding to _pos_. getWord() is called on each
    transformed string until a match is found or all the different
    strings have been tried. It returns a Word or None."""

    def trySubstitutions(trySubstitutions, form, substitutions, lookup=1, dictionary=dictionaryFor(pos)):
        if lookup and dictionary.has_key(form):
            return dictionary[form]
        elif substitutions:
            (old, new) = substitutions[0]
            # Apply the substitution; treat the result as a candidate only
            # if it actually changed the form.
            substitute = string.replace(form, old, new)
            if substitute == form:
                substitute = None
            if substitute and dictionary.has_key(substitute):
                return dictionary[substitute]
            return trySubstitutions(trySubstitutions, form, substitutions[1:], lookup=0) or \
                   (substitute and trySubstitutions(trySubstitutions, substitute, substitutions[1:]))

    return trySubstitutions(trySubstitutions, form, GET_INDEX_SUBSTITUTIONS)

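# Illustrative only: getIndex tries the form as given, then each variant
# produced by GET_INDEX_SUBSTITUTIONS (space<->hyphen, hyphen/space/period
# removed), so the following calls should resolve to the same noun entry,
# assuming the standard WordNet data is installed:
#
#     >>> getIndex('hot dog', 'noun')
#     >>> getIndex('hot-dog', 'noun')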

MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN:
        [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'), ('zes', 'z'),
         ('ches', 'ch'), ('shes', 'sh'), ('men', 'man'), ('ies', 'y')],
    VERB:
        [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
         ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
    ADJECTIVE:
        [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
    ADVERB:
        []}
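
# Each (old, new) pair above is a suffix rewrite that morphy() tries in turn:
# if the form ends in 'old', that ending is replaced by 'new' and the result
# is looked up, e.g. ('ches', 'ch') turns 'churches' into 'church' and
# ('ies', 'y') turns 'flies' into 'fly'.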

def morphy(form, pos='noun', collect=0):
    """Recursively uninflect _form_, and return the first form found
    in the dictionary. If _collect_ is true, a sequence of all forms
    is returned, instead of just the first one.

    >>> morphy('dogs')
    'dog'
    >>> morphy('churches')
    'church'
    >>> morphy('aardwolves')
    'aardwolf'
    >>> morphy('abaci')
    'abacus'
    >>> morphy('hardrock', ADVERB)
    """
    pos = normalizePOS(pos)
    fname = os.path.join(get_basedir(), "wordnet", {NOUN: NOUN, VERB: VERB, ADJECTIVE: ADJECTIVE, ADVERB: ADVERB}[pos] + '.exc')
    excfile = open(fname)
    substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]

    def trySubstitutions(trySubstitutions,
                         form,
                         substitutions,
                         lookup=1,
                         dictionary=dictionaryFor(pos),
                         excfile=excfile,
                         collect=collect,
                         collection=[]):
        import string
        exceptions = binarySearchFile(excfile, form)
        if exceptions:
            form = exceptions[string.find(exceptions, ' ')+1:-1]
        if lookup and dictionary.has_key(form):
            if collect:
                collection.append(form)
            else:
                return form
        elif substitutions:
            old, new = substitutions[0]
            substitutions = substitutions[1:]
            substitute = None
            if form.endswith(old):
                substitute = form[:-len(old)] + new

            form = trySubstitutions(trySubstitutions, form, substitutions) or \
                   (substitute and trySubstitutions(trySubstitutions, substitute, substitutions))
            return (collect and collection) or form
        elif collect:
            return collection

    return trySubstitutions(trySubstitutions, form, substitutions)
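
# Illustrative only: per the docstring above, collect=1 asks morphy() to
# gather every base form it finds (via the exception lists and the suffix
# rules) rather than returning just the first one, e.g.
#
#     >>> morphy('dogs', 'noun', collect=1)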


from cache import *
from lexname import *
from similarity import *
from wordnet import *