
Source Code for Module nltk_lite.wordnet.wordnet

# Natural Language Toolkit: Wordnet Interface: Wordnet Module
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Oliver Steele <steele@osteele.com>
#         David Ormiston Smith <daosmith@csse.unimelb.edu.au>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

import math, pickle, string, re

from pos import *
from nltk_lite.wordnet import *

class Word(object):

    def __init__(self, line):
        """
        Extract a word from a line of a WordNet POS file.

        @type line: C{string}
        @param line: The appropriate line taken from the Wordnet data files.
        """

        tokens = line.split()
        ints = map(int, tokens[int(tokens[3]) + 4:])

        self.form = tokens[0].replace('_', ' ')   # orthography
        self.pos = normalizePOS(tokens[1])        # NOUN, VERB, ADJECTIVE, ADVERB
        self.taggedSenseCount = ints[1]           # Number of senses tagged
        self._synsetOffsets = ints[2:ints[0]+2]   # Offsets of this word's synsets
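
    # A sketch of the index-file layout this constructor expects, using a
    # hypothetical (shortened) line rather than a real WordNet entry.
    # tokens[3] is the pointer count, so the trailing integers begin at
    # tokens[int(tokens[3]) + 4]:
    #
    #   lemma pos synset_cnt p_cnt [ptr...] sense_cnt tagsense_cnt offset...
    #
    # >>> w = Word('dog n 2 1 @ 2 1 02084071 10114209')
    # >>> w.form, w.taggedSenseCount, w._synsetOffsets
    # ('dog', 1, [2084071, 10114209])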

    def synsets(self):
        """
        Get a sequence of the L{Synset}s of this word.

        >>> from nltk_lite.wordnet import *
        >>> N['dog'].synsets()
        [{noun: dog, domestic dog, Canis familiaris}, {noun: frump, dog}, {noun: dog}, {noun: cad, bounder, blackguard, dog, hound, heel}, {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, {noun: pawl, detent, click, dog}, {noun: andiron, firedog, dog, dog-iron}]

        @return: A list of this L{Word}'s L{Synset}s
        """

        try:
            return self._synsets
        except AttributeError:
            self._synsets = [getSynset(self.pos, offset)
                             for offset in self._synsetOffsets]
            del self._synsetOffsets
            return self._synsets

    def isTagged(self):
        """
        >>> from nltk_lite.wordnet import *
        >>> N['dog'].isTagged()
        1

        @return: True/false (1/0) if one of this L{Word}'s senses is tagged.
        """
        return self.taggedSenseCount > 0

    def getAdjectivePositions(self):
        """
        >>> from nltk_lite.wordnet import *
        >>> ADJ['clear'].getAdjectivePositions()
        [None, 'predicative']

        @return: A list of adjective positions that this word can
            appear in. These are elements of ADJECTIVE_POSITIONS.
        """

        return list(set(synset.position for synset in self))

    def __getitem__(self, idx):
        return self.synsets()[idx]

    def __iter__(self):
        return iter(self.synsets())

    def __contains__(self, item):
        return item in self.synsets()

    def __getslice__(self, i, j):
        return self.synsets()[i:j]

    def __len__(self):
        return len(self.synsets())

    def __repr__(self):
        # return "<Word:" + self.form + '/' + self.pos + ">"
        return self.__str__()

    def __str__(self):
        return self.form + ' (' + self.pos + ")"

class Synset(object):
    """
    A set of synonyms.

    Each synset contains one or more Senses, which represent a
    specific sense of a specific word. Senses can be retrieved via
    synset.getSenses() or through the index notations synset[0],
    synset[string], or synset[word]. Synsets also originate zero or
    more typed pointers, which can be accessed via
    synset.getPointers() or synset.getPointers(pointerType). The
    targets of a synset pointer can be retrieved via
    synset.getPointerTargets() or
    synset.getPointerTargets(pointerType), which are equivalent to
    map(Pointer.getTarget(), synset.getPointers(...)).

    >>> from nltk_lite.wordnet import *
    >>> V['think'][0].synset.verbFrames
    (5, 9)

    @type pos: C{string}
    @param pos: The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.

    @type offset: C{int}
    @param offset: An integer offset into the part-of-speech file. Together
        with pos, this can be used as a unique id.

    @type gloss: C{string}
    @param gloss: A gloss (dictionary definition) for the sense.

    @type verbFrames: C{list} of C{integer}
    @param verbFrames: A sequence of integers that index into
        VERB_FRAME_STRINGS. These list the verb frames that any
        Sense in this synset participates in. (See also
        Sense.verbFrames.) Defined only for verbs.
    """

    def __init__(self, pos, offset, line):
        """Initialize the Synset from a line in a WordNet synset file."""

        # Part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
        self.pos = pos

        # Integer offset into the part-of-speech file. Together with pos,
        # this can be used as a unique id.
        self.offset = offset

        # The synset entry can be broadly divided into two parts: the
        # synset and relational data, and its human readable description, or
        # gloss. The '|' character separates these.

        dividerIndex = line.index('|')
        tokens = line[:dividerIndex].split()
        self.ssType = tokens[2]
        self.gloss = line[dividerIndex + 1:].strip()
        self.lexname = Lexname.lexnames[int(tokens[1])]

        # TODO: This next code is dense and confusing. Clean up at some point.
        # line is of the form:
        # synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss

        synset_cnt = int(tokens[3], 16)  # hex integer giving the number of words in the synset; same as w_cnt above

        # Extract all pairs of the form (word, lex_id), plus a remainder.
        (senseTuples, remainder1) = _partition(tokens[4:], 2, synset_cnt)
        self.words = [form for form, i in senseTuples]

        # Extract all pointer quadruples, plus a remainder.
        (self._pointerTuples, remainder2) = _partition(remainder1[1:], 4, int(remainder1[0]))

        # frames: In data.verb only, a list of numbers corresponding to the
        # generic verb sentence frames for words in the synset. frames is of
        # the form:
        #     f_cnt + f_num w_num [+ f_num w_num...]
        # where f_cnt is a two digit decimal integer indicating the number of
        # generic frames listed, f_num is a two digit decimal integer frame
        # number, and w_num is a two digit hexadecimal integer indicating the
        # word in the synset that the frame applies to. As with pointers, if
        # this number is 00, f_num applies to all words in the synset. If
        # non-zero, it is applicable only to the word indicated. Word numbers
        # are assigned as described for pointers.

        if pos == VERB:
            (vfTuples, remainder3) = _partition(remainder2[1:], 3, int(remainder2[0]))

            # Now only used for senseVerbFrames.
            def extractVerbFrames(index, vfTuples):
                return tuple(map(lambda t: int(t[1]),
                                 filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))

            senseVerbFrames = []
            for index in range(1, len(self.words) + 1):
                senseVerbFrames.append(extractVerbFrames(index, vfTuples))
            self._senseVerbFrames = senseVerbFrames

            # A sequence of integers that index into VERB_FRAME_STRINGS. These
            # list the verb frames that any Sense in this synset participates
            # in (see also Sense.verbFrames). Defined only for verbs.

            self.verbFrames = tuple(extractVerbFrames(None, vfTuples))

            # A list of verb frame strings for this synset.
            self.verbFrameStrings = self.extractVerbFrameStrings(vfTuples)

    def extractVerbFrameStrings(self, vfTuples):
        """
        Return a list of verb frame strings for this synset.
        """
        # Extract a frame index if the 3rd item is 00.
        frame_indices = [int(t[1], 16) for t in vfTuples if int(t[2], 16) == 0]
        try:
            verbFrames = [VERB_FRAME_STRINGS[i] for i in frame_indices]
        except IndexError:
            return []
        # Ideally we should build 3rd person morphology for this form.
        form = self[0]
        verbFrameStrings = [vf % form for vf in verbFrames]
        return verbFrameStrings
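
    # For example, VERB_FRAME_STRINGS holds templates with a single '%s'
    # slot; with a hypothetical frame string 'Somebody %s something' and
    # form 'think', the interpolation above would render
    # 'Somebody think something' (hence the note about missing 3rd-person
    # morphology).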

    # def words(self):
    #     """
    #     Return a sequence of Words.
    #
    #     >>> from nltk_lite.wordnet import *
    #     >>> N['dog'].words
    #     ['dog', 'domestic dog', 'Canis familiaris']
    #     @return: A list of the L{Word}s in this L{Synset}.
    #     """
    #
    #     # Load the senses from the Wordnet files if necessary.
    #     if not hasattr(self, '_senses'):
    #         self._senses = []
    #         senseVerbFrames = None
    #
    #         if self.pos == VERB:
    #             senseVerbFrames = self._senseVerbFrames
    #
    #         for word in self.words:
    #             position = None
    #             m = re.match(r'.*(\(.*\))$', word)
    #             if m:
    #                 if m.group(1) == 'a': position = ATTRIBUTIVE
    #                 elif m.group(1) == 'p': position = PREDICATIVE
    #                 elif m.group(1) == 'ip': position = IMMEDIATE_POSTNOMINAL
    #                 else: raise "Unknown attribute '%s'" % (key)
    #             self._senses.append(position)
    #
    #         if self.pos == VERB:
    #             del self._senseVerbFrames
    #
    #         del self.words
    #
    #     return self._senses

    def relations(self):
        """
        Return a dictionary of relations defined on this L{Synset},
        mapping each relation type (an element of POINTER_TYPES) to a
        list of (pos, offset, index) synset references.
        """

        # Load the pointers from the Wordnet files if necessary.
        if not hasattr(self, '_relations'):
            self._relations = {}

            for (type, offset, pos, indices) in self._pointerTuples:
                key = _RELATION_TABLE[type]
                if key not in self._relations:
                    self._relations[key] = []
                idx = int(indices, 16) & 255
                synset_ref = normalizePOS(pos), int(offset), idx
                self._relations[key].append(synset_ref)
            del self._pointerTuples
        return self._relations
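
    # Shape sketch (hypothetical offset values): the cached mapping looks
    # like {HYPERNYM: [('noun', 2083346, 0)], PART_MERONYM: [...]}, where
    # an index of 0 means the pointer applies to the whole target synset
    # rather than to one of its words (see relation() below).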

    def relation(self, rel):
        synsets = []
        for synset_ref in self.relations().get(rel, []):
            pos, offset, idx = synset_ref
            synset = getSynset(pos, offset)
            if idx:
                synsets.append(synset[idx-1])
            else:
                synsets.append(synset)
        return synsets

    def isTagged(self):
        """
        >>> from nltk_lite.wordnet import *
        >>> N['dog'][0].isTagged()
        1

        >>> N['dog'][1].isTagged()
        0

        @return: True/false (1/0) if one of this L{Synset}'s senses is tagged.
        """
        # self.words holds plain strings, so look each form up as a Word
        # before testing its tag count.
        return len([form for form in self.words
                    if getWord(form, self.pos).isTagged()]) > 0

    def __str__(self):
        """
        Return a human-readable representation.

        >>> from nltk_lite.wordnet import *
        >>> str(N['dog'][0].synset)
        '{noun: dog, domestic dog, Canis familiaris}'
        """
        return "{" + self.pos + ": " + string.join(self.words, ", ") + "}"

    def __repr__(self):
        return "{" + self.pos + ": " + string.join(self.words, ", ") + "}"

    def __cmp__(self, other):
        return _compareInstances(self, other, ('pos', 'offset'))

    def __eq__(self, other):
        return _compareInstances(self, other, ('pos', 'offset')) == 0

    def __getitem__(self, idx):
        try:
            return self.words[idx]        # integer key
        except TypeError:
            return self.relation(idx)     # string key

    def __iter__(self):
        return iter(self.words)

    def __contains__(self, item):
        return item in self.words

    def __getslice__(self, i, j):
        return self.words[i:j]

    def __nonzero__(self):
        return 1

    def __len__(self):
        """
        >>> from nltk_lite.wordnet import *
        >>> len(N['dog'][0].synset)
        3
        """
        return len(self.words)

    def max_depth(self):
        """
        @return: The length of the longest hypernym path from this synset to the root.
        """

        if self[HYPERNYM] == []:
            return 0

        deepest = 0
        for hypernym in self[HYPERNYM]:
            depth = hypernym.max_depth()
            if depth > deepest:
                deepest = depth
        return deepest + 1

    def min_depth(self):
        """
        @return: The length of the shortest hypernym path from this synset to the root.
        """

        if self[HYPERNYM] == []:
            return 0

        shallowest = 1000
        for hypernym in self[HYPERNYM]:
            depth = hypernym.min_depth()
            if depth < shallowest:
                shallowest = depth
        return shallowest + 1
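
    # For example, a synset with two hypernym chains of lengths 2 and 5
    # (counting HYPERNYM edges to the root) has min_depth() == 2 and
    # max_depth() == 5; a root synset with no hypernyms has depth 0.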

    def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel relationship, breadth-first

        >>> dog = N['dog'][0]
        >>> dog.closure(HYPERNYM)
        [{noun: dog, domestic dog, Canis familiaris}, {noun: canine, canid}, {noun: carnivore}, {noun: placental, placental mammal, eutherian, eutherian mammal}, {noun: mammal, mammalian}, {noun: vertebrate, craniate}, {noun: chordate}, {noun: animal, animate being, beast, brute, creature, fauna}, {noun: organism, being}, {noun: living thing, animate thing}, {noun: object, physical object}, {noun: physical entity}, {noun: entity}]
        """
        from nltk_lite.utilities import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, lambda s: s[rel], depth):
            if synset.offset != self.offset and synset.offset not in synset_offsets:
                synset_offsets.append(synset.offset)
                yield synset

    def hypernym_paths(self):
        """
        Get the path(s) from this synset to the root, where each path is a
        list of the synset nodes traversed on the way to the root.

        @return: A list of lists, where each list gives the node sequence
            connecting the initial L{Synset} node and a root node.
        """
        paths = []

        hypernyms = self[HYPERNYM]
        if len(hypernyms) == 0:
            paths = [[self]]

        for hypernym in hypernyms:
            for ancestor_list in hypernym.hypernym_paths():
                ancestor_list.append(self)
                paths.append(ancestor_list)
        return paths
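
    # Shape sketch: each inner list runs root-first and ends with this
    # synset itself (self is appended after the recursive call), so a
    # synset with two distinct hypernym chains yields
    # [[root, ..., self], [root, ..., self]].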

    def hypernym_distances(self, distance, verbose=False):
        """
        Get the path(s) from this synset to the root, counting the distance
        of each node from the initial node on the way. A set of
        (synset, distance) tuples is returned.

        @type distance: C{int}
        @param distance: the distance (number of edges) from this hypernym to
            the original hypernym L{Synset} on which this method was called.
        @return: A set of (L{Synset}, int) tuples where each L{Synset} is
            a hypernym of the first L{Synset}.
        """
        distances = set([(self, distance)])

        for hypernym in self[HYPERNYM]:
            distances |= hypernym.hypernym_distances(distance+1, verbose=False)
        if verbose:
            print "> Hypernym Distances:", self, string.join(synset.__str__() + ":" + `dist` for synset, dist in distances)
        return distances

    def shortest_path_distance(self, other):
        """
        Returns the distance of the shortest path linking the two synsets (if
        one exists). For each synset, all the ancestor nodes and their distances
        are recorded and compared. The ancestor node common to both synsets that
        can be reached with the minimum number of traversals is used. If no
        ancestor nodes are common, -1 is returned. If a node is compared with
        itself 0 is returned.

        @type other: L{Synset}
        @param other: The Synset to which the shortest path will be found.
        @return: The number of edges in the shortest path connecting the two
            nodes, or -1 if no path exists.
        """

        if self == other: return 0

        path_distance = -1

        dist_list1 = self.hypernym_distances(0)
        dist_dict1 = {}

        dist_list2 = other.hypernym_distances(0)
        dist_dict2 = {}

        # Transform each distance list into a dictionary. In cases where
        # there are duplicate nodes in the list (due to there being multiple
        # paths to the root) the duplicate with the shortest distance from
        # the original node is entered.

        for (l, d) in [(dist_list1, dist_dict1), (dist_list2, dist_dict2)]:
            for (key, value) in l:
                if key in d:
                    if value < d[key]:
                        d[key] = value
                else:
                    d[key] = value

        # For each ancestor synset common to both subject synsets, find the
        # connecting path length. Return the shortest of these.

        for synset1 in dist_dict1.keys():
            for synset2 in dist_dict2.keys():
                if synset1 == synset2:
                    new_distance = dist_dict1[synset1] + dist_dict2[synset2]
                    if path_distance < 0 or new_distance < path_distance:
                        path_distance = new_distance

        return path_distance
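
    # Worked sketch: if dog[0] reaches a shared ancestor {carnivore} in 2
    # HYPERNYM edges and cat[0] reaches it in 2 edges, and no shared
    # ancestor is nearer, the value returned above is 2 + 2 = 4.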

    def information_content(self, freq_data):
        """
        Get the Information Content value of this L{Synset}, using
        the supplied dict 'freq_data'.

        @type freq_data: C{dict}
        @param freq_data: Dictionary mapping synset identifiers (offsets) to
            a tuple containing the frequency count of the synset, and the
            frequency count of the root synset.
        @return: The IC value of this L{Synset}, or -1 if no IC value can be
            computed.
        """
        key = self.offset

        if freq_data.has_key(key):
            prob = float(freq_data[key][0]) / freq_data[key][1]
            return -math.log(prob)

        else: return -1
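
    # Worked sketch with hypothetical counts: freq_data[key] == (10, 1000)
    # gives prob = 10/1000.0 = 0.01, so the IC value is
    # -math.log(0.01) ~= 4.605.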

    def tree(self, rel, depth=-1):
        """
        >>> dog = N['dog'][0]
        >>> from pprint import pprint
        >>> pprint(dog.tree(HYPERNYM))
        ['dog' in {noun: dog, domestic dog, Canis familiaris},
         [{noun: canine, canid},
          [{noun: carnivore},
           [{noun: placental, placental mammal, eutherian, eutherian mammal},
            [{noun: mammal, mammalian},
             [{noun: vertebrate, craniate},
              [{noun: chordate},
               [{noun: animal, animate being, beast, brute, creature, fauna},
                [{noun: organism, being},
                 [{noun: living thing, animate thing},
                  [{noun: object, physical object},
                   [{noun: physical entity}, [{noun: entity}]]]]]]]]]]]]]
        """
        if depth == 0:
            return [self]
        else:
            return [self] + map(lambda s, rel=rel: s.tree(rel, depth-1), self[rel])

    # Interface to similarity methods

    def path_similarity(self, other, verbose=False):
        return path_similarity(self, other, verbose)

    def lch_similarity(self, other, verbose=False):
        return lch_similarity(self, other, verbose)

    def wup_similarity(self, other, verbose=False):
        return wup_similarity(self, other, verbose)

    def res_similarity(self, other, datafile="", verbose=False):
        return res_similarity(self, other, datafile, verbose)

    def jcn_similarity(self, other, datafile="", verbose=False):
        return jcn_similarity(self, other, datafile, verbose)

    def lin_similarity(self, other, datafile="", verbose=False):
        return lin_similarity(self, other, datafile, verbose)


#############################################################################
# Dictionary classes, which allow users to access
# Wordnet data via a handy dict notation (see below).
#############################################################################

from types import IntType, StringType

class Dictionary(object):
    """
    A Dictionary contains all the Words in a given part of speech. Four
    dictionaries, bound to N, V, ADJ, and ADV, are bound by default in
    __init__.py.

    Indexing a dictionary by a string retrieves the word named by that
    string, e.g. dict['dog']. Indexing by an integer n retrieves the
    nth word, e.g. dict[0]. Access by an arbitrary integer is very
    slow except in the special case where the words are accessed
    sequentially; this is to support the use of dictionaries as the
    range of a for statement and as the sequence argument to map and filter.

    >>> N['dog']
    dog(n.)
    """

    def __init__(self, pos, filenameroot):
        """
        @type pos: C{string}
        @param pos: This L{Dictionary}'s part of speech ('noun', 'verb' etc.)
        @type filenameroot: C{string}
        @param filenameroot: filename of the relevant Wordnet dictionary file
        """
        self.pos = pos
        self.indexFile = IndexFile(pos, filenameroot)
        self.dataFile = open(dataFilePathname(filenameroot), FILE_OPEN_MODE)

    def __repr__(self):
        dictionaryVariables = {}

        if dictionaryVariables.get(self):
            return self.__module__ + "." + dictionaryVariables[self]

        return "<%s.%s instance for %s>" % \
            (self.__module__, "Dictionary", self.pos)

    def getWord(self, form, line=None):
        """
        @type form: C{string}
        @param form: word string e.g. 'dog'
        @type line: C{string}
        @param line: appropriate line sourced from the index file (optional)
        @return: The L{Word} object with the supplied form, if present.
        """
        key = form.lower().replace(' ', '_')
        pos = self.pos

        def loader(key=key, line=line, indexFile=self.indexFile):
            line = line or indexFile.get(key)
            return line and Word(line)

        word = entityCache.get((pos, key), loader)

        if word: return word
        else: raise KeyError, "%s is not in the %s database" % (`form`, `pos`)

    def getSynset(self, offset):
        """
        @type offset: C{int}
        @param offset: integer offset into a Wordnet file, at which the
            desired L{Synset} can be found.

        @return: The relevant L{Synset}, if present.
        """

        def loader(pos=self.pos, offset=offset, dataFile=self.dataFile):
            dataFile.seek(offset)
            line = dataFile.readline()
            return Synset(pos, offset, line)

        return entityCache.get((self.pos, offset), loader)
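
    # Both getWord and getSynset defer the actual file read to a loader
    # closure: entityCache.get((pos, key), loader) only invokes the loader
    # on a cache miss, so repeated lookups of the same key reuse one parsed
    # object instead of re-reading and re-parsing the data file line.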

    def _buildIndexCacheFile(self):
        self.indexFile._buildIndexCacheFile()

    def __nonzero__(self):
        """
        >>> N and 'true'
        'true'
        """
        return 1

    def __len__(self):
        """
        Return the number of index entries.

        >>> len(ADJ)
        21435
        """
        if not hasattr(self, 'length'):
            self.length = len(self.indexFile)

        return self.length

    def __getslice__(self, a, b):
        results = []

        if type(a) == type('') and type(b) == type(''):
            raise NotImplementedError("string slices are not supported")

        elif type(a) == type(1) and type(b) == type(1):
            for i in range(a, b):
                results.append(self[i])

        else:
            raise TypeError

        return results

    def __getitem__(self, index):
        """
        If index is a String, return the Word whose form is
        index. If index is an integer n, return the Word
        indexed by the n'th Word in the Index file.

        >>> N['dog']
        dog(n.)
        >>> N[0]
        'hood(n.)
        """
        if isinstance(index, StringType):
            return self.getWord(index)

        elif isinstance(index, IntType):
            line = self.indexFile[index]
            return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line)

        else:
            raise TypeError, "%s is not a String or Int" % `index`

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, item):
        return self.has_key(item)

    def get(self, key, default=None):
        """
        Return the Word whose form is key, or default.

        >>> N.get('dog')
        dog(n.)

        @type key: C{string}
        @param key: the string form of a L{Word} e.g. 'dog'
        @type default: L{Word}
        @param default: An optional L{Word} to return if no entry can be found
            with the supplied key.
        @return: The L{Word} whose form is given by 'key'
        """
        try:
            return self[key]

        except LookupError:
            return default

    def keys(self):
        """
        @return: A sorted list of strings that index words in this
            dictionary.
        """
        return self.indexFile.keys()

    def has_key(self, form):
        """
        Checks if the supplied argument is an index into this dictionary.

        >>> N.has_key('dog')
        1
        >>> N.has_key('inu')
        0

        @type form: C{string}
        @param form: a word string e.g. 'dog'
        @return: true iff the argument indexes a word in this dictionary.
        """
        return self.indexFile.has_key(form)

    # Testing

    def _testKeys(self):
        # Verify that index lookup can find each word in the index file.
        print "Testing: ", self
        file = open(self.indexFile.file.name, FILE_OPEN_MODE)
        counter = 0

        while 1:
            line = file.readline()

            if line == '': break

            if line[0] != ' ':
                key = string.replace(line[:string.find(line, ' ')], '_', ' ')

                if (counter % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()

                counter = counter + 1
                self[key]

        file.close()
        print "done."

# Dictionaries

N = Dictionary(NOUN, NOUN)
V = Dictionary(VERB, VERB)
ADJ = Dictionary(ADJECTIVE, ADJECTIVE)
ADV = Dictionary(ADVERB, ADVERB)

Dictionaries = {NOUN: N, VERB: V, ADJECTIVE: ADJ, ADVERB: ADV}

def dictionaryFor(pos):
    """
    Return the dictionary for the supplied part of speech.

    @type pos: C{string}
    @param pos: The part of speech of the desired dictionary.

    @return: The desired dictionary.
    """
    pos = normalizePOS(pos)
    try:
        d = Dictionaries[pos]
    except KeyError:
        raise RuntimeError, "The " + `pos` + " dictionary has not been created"

    return d
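
# For example (assuming normalizePOS accepts the POS spellings used
# elsewhere in this module):
# >>> dictionaryFor('noun') is N
# True
# >>> dictionaryFor(VERB) is V
# True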

# Lexical Relations

_RELATION_TABLE = {
    '!': ANTONYM,            '@': HYPERNYM,           '~': HYPONYM,       '=': ATTRIBUTE,
    '^': ALSO_SEE,           '*': ENTAILMENT,         '>': CAUSE,         '$': VERB_GROUP,
    '#m': MEMBER_MERONYM,    '#s': SUBSTANCE_MERONYM, '#p': PART_MERONYM,
    '%m': MEMBER_HOLONYM,    '%s': SUBSTANCE_HOLONYM, '%p': PART_HOLONYM,
    '&': SIMILAR,            '<': PARTICIPLE_OF,      '\\': PERTAINYM,    '+': FRAMES,
    ';c': CLASSIF_CATEGORY,  ';u': CLASSIF_USAGE,     ';r': CLASSIF_REGIONAL,
    '-c': CLASS_CATEGORY,    '-u': CLASS_USAGE,       '-r': CLASS_REGIONAL,
    '@i': INSTANCE_HYPERNYM, '~i': INSTANCE_HYPONYM,
    }

# Lookup functions

def getWord(form, pos=NOUN):
    """
    Return a word with the given lexical form and pos.

    @type form: C{string}
    @param form: the sought-after word string e.g. 'dog'

    @type pos: C{string}
    @param pos: the desired part of speech. Defaults to 'noun'.

    @return: the L{Word} object corresponding to form and pos, if it exists.
    """
    return dictionaryFor(pos).getWord(form)

def getSense(form, pos=NOUN, senseno=0):
    """
    Lookup a sense by its sense number. Used by repr(sense).

    @type form: C{string}
    @param form: the sought-after word string e.g. 'dog'
    @type pos: C{string}
    @param pos: the desired part of speech. Defaults to 'noun'.
    @type senseno: C{int}
    @param senseno: the id of the desired word sense. Defaults to 0.
    @return: the L{Sense} object corresponding to form, pos and senseno, if it exists.
    """
    return getWord(form, pos)[senseno]

def getSynset(pos, offset):
    """
    Lookup a synset by its offset.

    @type pos: C{string}
    @param pos: the desired part of speech.
    @type offset: C{int}
    @param offset: the offset into the relevant Wordnet dictionary file.
    @return: the L{Synset} object extracted from the Wordnet dictionary file.
    """
    return dictionaryFor(pos).getSynset(offset)

# Utility functions

def _check_datafile(datafile):
    if datafile == "":
        raise RuntimeError, "You must supply the path of a datafile containing frequency information, as generated by brown_information_content() in 'brown_ic.py'"

def _load_ic_data(filename):
    """
    Load in some precomputed frequency distribution data from a file. It is
    expected that this data has been stored as two pickled dicts.

    TODO: Possibly place the dicts into a global variable or something so
    that they don't have to be repeatedly loaded from disk.
    """
    infile = open(filename, "rb")
    noun_freqs = pickle.load(infile)
    verb_freqs = pickle.load(infile)
    infile.close()

    return (noun_freqs, verb_freqs)

# Private Utility Functions

def _index(key, sequence, testfn=None, keyfn=None):
    """
    Return the index of key within sequence, using testfn for
    comparison and transforming items of sequence by keyfn first.

    >>> _index('e', 'hello')
    1
    >>> _index('E', 'hello', testfn=_equalsIgnoreCase)
    1
    >>> _index('x', 'hello')
    """
    index = 0
    for element in sequence:
        value = element
        if keyfn:
            value = keyfn(value)
        if (not testfn and value == key) or (testfn and testfn(value, key)):
            return index
        index = index + 1
    return None

def _partition(sequence, size, count):
    """
    Partition sequence into C{count} subsequences of
    length C{size}, and a remainder.

    Return C{(partitions, remainder)}, where C{partitions} is a sequence of
    C{count} subsequences of cardinality C{size}, and
    C{apply(append, partitions) + remainder == sequence}.
    """

    partitions = []
    for index in range(0, size * count, size):
        partitions.append(sequence[index:index + size])
    return (partitions, sequence[size * count:])
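
# For example:
# >>> _partition(['a', '1', 'b', '2', 'rest'], 2, 2)
# ([['a', '1'], ['b', '2']], ['rest'])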

def _compareInstances(a, b, fields):
    """
    Return -1, 0, or 1 according to a comparison first by type,
    then by class, and finally by each of fields. Used when comparing two
    Wordnet objects (Synsets, Words, or Senses) to each other.
    """
    if not hasattr(b, '__class__'):
        return cmp(type(a), type(b))
    elif a.__class__ != b.__class__:
        return cmp(a.__class__, b.__class__)

    for field in fields:
        diff = cmp(getattr(a, field), getattr(b, field))
        if diff: return diff

    return 0

def _equalsIgnoreCase(a, b):
    """
    Return true iff a and b have the same lowercase representation.

    >>> _equalsIgnoreCase('dog', 'Dog')
    1
    >>> _equalsIgnoreCase('dOg', 'DOG')
    1
    """
    return a == b or a.lower() == b.lower()


def demo():
    from nltk_lite.wordnet import N, V, ADJ, ADV, HYPERNYM
    from pprint import pprint

    dog = N['dog']
    cat = N['cat']

    print "N['dog']"
    print 'dog' in N
    print dog
    print dog.pos, dog.form
    print dog.taggedSenseCount
    print dog.synsets()
    print dog.isTagged()
    # ADJ['clear'].getAdjectivePositions()
    # N['cat'] < N['dog']
    # N['dog'] < V['dog']

    print "Verb Frames:",
    print V['think'][0].verbFrameStrings

    print "Relations:"
    print dog[0].relations()
    print dog[0].relation(HYPERNYM)

    print "Glosses:"
    print dog[0].gloss
    print dog[0].relation(HYPERNYM)[0].gloss

    print
    print "Paths and Distances:"
    print

    print dog[0].hypernym_paths()
    print dog[0].hypernym_distances(0)
    print dog[0].shortest_path_distance(cat[0])

    print
    print "Closures and Trees:"
    print

    print ADJ['red'][0].closure(SIMILAR, depth=1)
    print ADJ['red'][0].closure(SIMILAR, depth=2)
    pprint(dog[0].tree(HYPERNYM))

    # Adjectives that are transitively SIMILAR to any of the senses of 'red'
    #flatten1(map(lambda sense:closure(sense, SIMILAR), ADJ['red'])) # too verbose

    print "All the words in the hyponym synsets of dog[0]"
    print [word for synset in dog[0][HYPONYM] for word in synset]

    print "Hyponyms of the first (and only) sense of 'animal' that are homophonous with verbs:"
    print [word for synset in N['animal'][0].closure(HYPONYM) for word in synset if word in V]

    # BROKEN
    print "Senses of 'raise'(v.) and 'lower'(v.) that are antonyms:"
    print filter(lambda p: p[0] in p[1][ANTONYM], [(r, l) for r in V['raise'] for l in V['lower']])

    print
    print "Similarity: dog~cat"
    print

    print "Path Distance Similarity:",
    print dog[0].path_similarity(cat[0])
    print "Leacock Chodorow Similarity:",
    print dog[0].lch_similarity(cat[0])
    print "Wu Palmer Similarity:",
    print dog[0].wup_similarity(cat[0])

    # set up the data file
    # print "Resnik Similarity:",
    # print dog[0].resnik_similarity(cat[0], datafile)
    # print "Jiang-Conrath Similarity:",
    # print dog[0].jiang_conrath_similarity(cat[0], datafile)
    # print "Lin Similarity:",
    # print dog[0].lin_similarity(cat[0], datafile)

if __name__ == '__main__':
    demo()