Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package kimmo :: Module kimmo
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.kimmo.kimmo

  1  # pykimmo 3.0.0 -- a two-level morphology tool for nltk_lite 1.7 
  2  # by Rob Speer (rspeer@mit.edu) 
  3  # based on code from Carl de Marcken, Beracah Yankama, and Rob Speer 
  4   
  5  from rules import KimmoArrowRule, KimmoFSARule 
  6  from pairs import KimmoPair, sort_subsets 
  7  from morphology import * 
  8  from fsa import FSA 
  9  import yaml 
 10   
11 -def _pairify(state):
12 newstate = {} 13 for label, targets in state.items(): 14 newstate[KimmoPair.make(label)] = targets 15 return newstate
16 17
18 -class KimmoRuleSet(yaml.YAMLObject):
19 """ 20 An object that represents the morphological rules for a language. 21 22 The KimmoRuleSet stores a list of rules which must all succeed when they 23 process a given string. These rules can be used for generating a surface 24 form from a lexical form, or recognizing a lexical form from a surface 25 form. 26 """ 27 yaml_tag = '!KimmoRuleSet'
28 - def __init__(self, subsets, defaults, rules, morphology=None, null='0', boundary='#'):
29 """ 30 Creates a KimmoRuleSet. You may not want to do this directly, but use 31 KimmoRuleSet.load to load one from a YAML file. 32 33 A KimmoRuleSet takes these parameters: 34 subsets: a dictionary mapping strings to lists of strings. The strings 35 in the map become subsets representing all of the strings in the 36 list. 37 defaults: a list of KimmoPairs that can appear without being 38 specifically mentioned by a rule. 39 rules: a list of KimmoFSARules or KimmoArrowRules that define the 40 two-level morphology rules. 41 morphology: a KimmoMorphology object that defines a lexicon of word 42 roots and affixes. 43 null: the symbol representing the empty string in rules. 44 boundary: the symbol that will always appear at the end of lexical and 45 surface forms. 46 """ 47 self.debug = False 48 self._rules = list(rules) 49 self._pair_alphabet = set() 50 self._subsets = {} 51 self._null = null 52 self._boundary = boundary 53 self._subsets = subsets 54 self._morphology = morphology 55 56 for pair in defaults: 57 # defaults shouldn't contain subsets 58 if self.is_subset(pair.input()) or self.is_subset(pair.output()): 59 raise ValueError('default ' + str(pair) + ' contains subset') 60 self._pair_alphabet.add( pair ) 61 62 for r in self.rules(): 63 for kp in r.pairs(): 64 if (not (self.is_subset(kp.input()) or self.is_subset(kp.output()))): 65 self._pair_alphabet.add( kp )
66
67 - def rules(self):
68 "The list of rules in this ruleset." 69 return self._rules
70 - def subsets(self):
71 "The dictionary defining subsets of characters of the language." 72 return self._subsets
73 - def is_subset(self, key):
74 "Is this string a subset representing other strings?" 75 return key[0] == '~' or key in self.subsets()
76 - def null(self):
77 "The null symbol for this ruleset." 78 return self._null
79 - def morphology(self):
80 """The morphological lexicon (as a KimmoMorphology). Could be None, if 81 the ruleset is only used for generation. 82 """ 83 return self._morphology
84
85 - def _pairtext(self, char):
86 if char == self._null: return '' 87 else: return char
88
89 - def _generate(self, pairs, state_list, morphology_state=None, word='', 90 lexical=None, surface=None, features='', log=None):
91 if morphology_state: 92 morph = self._morphology 93 morphed = False 94 for state, feat in morph.next_states(morphology_state, word): 95 if feat is not None: 96 newfeat = combine_features(features, feat) 97 else: newfeat = features 98 for result in self._generate(pairs, state_list, 99 state, '', lexical, surface, newfeat, log): 100 yield result 101 morphed = True 102 if morphed: return 103 lexical_chars = list(morph.valid_lexical(morphology_state, 104 word, self._pair_alphabet)) + list(self._null) 105 else: lexical_chars = None 106 107 if lexical == '' or surface == '': 108 if morphology_state is None or morphology_state.lower() == 'end': 109 # check that all rules are in accepting states 110 for r in range(len(self._rules)): 111 rule = self._rules[r] 112 state = state_list[r] 113 if state not in rule.fsa().finals(): 114 return 115 if log: 116 log.succeed(pairs) 117 yield pairs, features 118 return 119 120 next_pairs = [p for p in self._pair_alphabet if 121 (lexical is None or lexical.startswith(self._pairtext(p.input()))) and 122 (surface is None or surface.startswith(self._pairtext(p.output())))] 123 for pair in next_pairs: 124 if pair.input() == self._null and pair.output() == self._null: 125 continue 126 if lexical_chars is not None and pair.input() not in lexical_chars: 127 continue 128 new_states = state_list[:] 129 for r in range(len(self._rules)): 130 rule = self._rules[r] 131 state = state_list[r] 132 next_state = self._advance_rule(rule, state, pair) 133 new_states[r] = next_state 134 135 newword = word + self._pairtext(pair.input()) 136 137 if log: 138 log.step(pairs, pair, self._rules, state_list, new_states, 139 morphology_state, newword) 140 fail = False 141 for new_state in new_states: 142 if new_state is None or str(new_state) == '0'\ 143 or str(new_state) == 'reject': 144 fail = True 145 break 146 if fail: continue 147 newlex, newsurf = lexical, surface 148 if lexical: newlex = lexical[len(self._pairtext(pair.input())):] 149 if surface: newsurf = surface[len(self._pairtext(pair.output())):] 150 for result in self._generate(pairs+[pair], new_states, 151 morphology_state, newword, newlex, newsurf, features, log): 152 yield result
153
154 - def generate(self, lexical, log=None):
155 """ 156 Given a lexical form, return all possible surface forms that fit 157 these rules. 158 159 Optionally, a 'log' object such as TextTrace(1) can be provided; this 160 object will display to the user all the steps of the Kimmo algorithm. 161 """ 162 if log: log.reset() 163 if not lexical.endswith(self._boundary): 164 lexical += self._boundary 165 got = self._generate([], [rule.fsa().start() for rule in 166 self._rules], lexical=lexical, log=log) 167 results = [] 168 for (pairs, features) in got: 169 results.append(''.join(self._pairtext(pair.output()).strip(self._boundary) for pair in pairs)) 170 return results
171
172 - def recognize(self, surface, log=None):
173 """ 174 Given a surface form, return all possible lexical forms that fit 175 these rules. Because the components of a lexical form can include 176 features such as the grammatical part of speech, each surface form 177 is returned as a 2-tuple of (surface text, features). 178 179 Optionally, a 'log' object such as TextTrace(1) can be provided; this 180 object will display to the user all the steps of the Kimmo algorithm. 181 """ 182 if log: log.reset() 183 if not surface.endswith(self._boundary): 184 surface += self._boundary 185 got = self._generate([], [rule.fsa().start() for rule in 186 self._rules], morphology_state='Begin', surface=surface, log=log) 187 results = [] 188 for (pairs, features) in got: 189 results.append((''.join(self._pairtext(pair.input()).strip(self._boundary) for pair in pairs), features)) 190 return results
191
192 - def _advance_rule(self, rule, state, pair):
193 trans = rule.fsa()._transitions[state] 194 expected_pairs = sort_subsets(trans.keys(), self._subsets) 195 for comppair in expected_pairs: 196 if comppair.includes(pair, self._subsets): 197 return rule.fsa().nextState(state, comppair) 198 return None
199
200 - def _test_case(self, input, outputs, arrow, method):
201 outputs.sort() 202 if arrow == '<=': 203 print '%s %s %s' % (', '.join(outputs), arrow, input) 204 else: 205 print '%s %s %s' % (input, arrow, ', '.join(outputs)) 206 value = method(input) 207 if len(value) and isinstance(value[0], tuple): 208 results = [v[0] for v in value] 209 else: results = value 210 results.sort() 211 if outputs != results: 212 print ' Failed: got %s' % (', '.join(results) or 'no results') 213 return False 214 else: return True
215
216 - def batch_test(self, filename):
217 """ 218 Test a rule set by reading lines from a file. 219 220 Each line contains one or more lexical forms on the left, and one or 221 more surface forms on the right (separated by commas if there are more 222 than one). In between, there is an arrow (=>, <=, or <=>), indicating 223 whether recognition, generation, or both should be tested. Comments 224 can be marked with ;. 225 226 Each form should produce the exact list of forms on the other side of 227 the arrow; if one is missing, or an extra one is produced, the test 228 will fail. 229 230 Examples of test lines: 231 cat+s => cats ; test generation only 232 conoc+o <=> conozco ; test generation and recognition 233 <= conoco ; this string should fail to be recognized 234 """ 235 f = open(filename) 236 try: 237 for line in f: 238 line = line[:line.find(';')].strip() 239 if not line: continue 240 arrow = None 241 for arrow_to_try in ['<=>', '=>', '<=']: 242 if line.find(arrow_to_try) >= 0: 243 lexicals, surfaces = line.split(arrow_to_try) 244 arrow = arrow_to_try 245 break 246 if arrow is None: 247 raise ValueError, "Can't find arrow in line: %s" % line 248 lexicals = lexicals.strip().split(', ') 249 surfaces = surfaces.strip().split(', ') 250 if lexicals == ['']: lexicals = [] 251 if surfaces == ['']: surfaces = [] 252 if arrow == '=>' or arrow == '<=>': 253 outputs = surfaces 254 for input in lexicals: 255 self._test_case(input, outputs, '=>', self.generate) 256 if arrow == '<=' or arrow == '<=>': 257 outputs = lexicals 258 for input in surfaces: 259 self._test_case(input, outputs, '<=', self.recognize) 260 finally: 261 f.close()
262 263 @classmethod
264 - def from_yaml(cls, loader, node):
265 """ 266 Loads a KimmoRuleSet from a parsed YAML node. 267 """ 268 map = loader.construct_mapping(node) 269 return cls.from_yaml_dict(map)
270 271 @classmethod
272 - def load(cls, filename):
273 """ 274 Loads a KimmoRuleSet from a YAML file. 275 276 The YAML file should contain a dictionary, with the following keys: 277 lexicon: the filename of the lexicon to load. 278 subsets: a dictionary mapping subset characters to space-separated 279 lists of symbols. One of these should usually be '@', mapping 280 to the entire alphabet. 281 defaults: a space-separated list of KimmoPairs that should be allowed 282 without a rule explicitly mentioning them. 283 null: the symbol that will be used to represent 'null' (usually '0'). 284 boundary: the symbol that represents the end of the word 285 (usually '#'). 286 rules: a dictionary mapping rule names to YAML representations of 287 those rules. 288 289 A rule can take these forms: 290 * a dictionary of states, where each state is a dictionary mapping 291 input pairs to following states. The start state is named 'start', 292 the state named 'reject' instantly rejects, and state names can be 293 prefixed with the word 'rejecting' so that they reject if the machine 294 ends in that state. 295 296 i-y-spelling: 297 start: 298 'i:y': step1 299 '@': start 300 rejecting step1: 301 'e:0': step2 302 '@': reject 303 rejecting step2: 304 '+:0': step3 305 '@': reject 306 rejecting step3: 307 'i:i': start 308 '@': reject 309 310 311 * a block of text with a DFA table in it, of the form used by 312 PC-KIMMO. The text should begin with a | so that YAML keeps your 313 line breaks, and the next line should be 'FSA'. State 0 instantly 314 rejects, and states with a period instead of a colon reject if the 315 machine ends in that state. 316 Examples: 317 318 i-y-spelling: | # this is the same rule as above 319 FSA 320 i e + i @ 321 y 0 0 i @ 322 1: 2 1 1 1 1 323 2. 0 3 0 0 0 324 3. 0 0 4 0 0 325 4. 0 0 0 1 0 326 327 epenthesis: | 328 FSA 329 c h s Csib y + # 0 @ 330 c h s Csib i 0 # e @ 331 1: 2 1 4 3 3 1 1 0 1 332 2: 2 3 3 3 3 1 1 0 1 333 3: 2 1 3 3 3 5 1 0 1 334 4: 2 3 3 3 3 5 1 0 1 335 5: 2 1 2 2 2 1 1 6 1 336 6. 0 0 7 0 0 0 0 0 0 337 7. 0 0 0 0 0 1 1 0 0 338 339 """ 340 f = open(filename) 341 result = cls._from_yaml_dict(yaml.load(f)) 342 f.close() 343 return result
344 345 @classmethod
346 - def _from_yaml_dict(cls, map):
347 lexicon = map.get('lexicon') 348 if lexicon: 349 lexicon = KimmoMorphology.load(lexicon) 350 subsets = map['subsets'] 351 for key, value in subsets.items(): 352 if isinstance(value, basestring): 353 subsets[key] = value.split() 354 defaults = map['defaults'] 355 if isinstance(defaults, basestring): 356 defaults = defaults.split() 357 defaults = [KimmoPair.make(text) for text in defaults] 358 ruledic = map['rules'] 359 rules = [] 360 for (name, rule) in ruledic.items(): 361 if isinstance(rule, dict): 362 rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets)) 363 elif isinstance(rule, basestring): 364 if rule.strip().startswith('FSA'): 365 rules.append(KimmoFSARule.parse_table(name, rule, subsets)) 366 else: rules.append(KimmoArrowRule(name, rule, subsets)) 367 else: 368 raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % (name, rule) 369 return cls(subsets, defaults, rules, lexicon)
370
371 - def gui(self, startTk=True):
372 import draw 373 return draw.KimmoGUI(self, startTk)
374 draw_graphs = gui
375
376 -class TextTrace(object):
377 """ 378 Supply a TextTrace object as the 'log' argument to KimmoRuleSet.generate or 379 KimmoRuleSet.recognize, and it will output the steps it goes through 380 on a text terminal. 381 """
382 - def __init__(self, verbosity=1):
383 """ 384 Creates a TextTrace. The 'verbosity' argument ranges from 1 to 3, and 385 specifies how much text will be output to the screen. 386 """ 387 self.verbosity = verbosity
388 - def reset(self): pass
389 - def step(self, pairs, curr, rules, prev_states, states, 390 morphology_state, word):
391 lexical = ''.join(p.input() for p in pairs) 392 surface = ''.join(p.output() for p in pairs) 393 indent = ' '*len(lexical) 394 if self.verbosity > 2: 395 print '%s%s<%s>' % (indent, lexical, curr.input()) 396 print '%s%s<%s>' % (indent, surface, curr.output()) 397 for rule, state1, state2 in zip(rules, prev_states, states): 398 print '%s%s: %s => %s' % (indent, rule.name(), state1, state2) 399 if morphology_state: 400 print '%sMorphology: %r => %s' % (indent, word, morphology_state) 401 print 402 elif self.verbosity > 1: 403 print '%s%s<%s>' % (indent, lexical, curr.input()) 404 print '%s%s<%s>' % (indent, surface, curr.output()) 405 z = zip(prev_states, states) 406 if morphology_state: 407 z.append((word, morphology_state)) 408 print indent + (" ".join('%s>%s' % (old, new) for old, new in z)) 409 blocked = [] 410 for rule, state in zip(rules, states): 411 if str(state).lower() in ['0', 'reject']: 412 blocked.append(rule.name()) 413 if blocked: 414 print '%s[blocked by %s]' % (indent, ", ".join(blocked)) 415 print 416 else: 417 print '%s%s<%s> | %s<%s>' % (indent, lexical, curr.input(), 418 surface, curr.output()), 419 if morphology_state: 420 print '\t%r => %s' % (word, morphology_state), 421 blocked = [] 422 for rule, state in zip(rules, states): 423 if str(state).lower() in ['0', 'reject']: 424 blocked.append(rule.name()) 425 if blocked: 426 print ' [blocked by %s]' % (", ".join(blocked)), 427 print
428
429 - def succeed(self, pairs):
430 lexical = ''.join(p.input() for p in pairs) 431 surface = ''.join(p.output() for p in pairs) 432 indent = ' '*len(lexical) 433 434 print '%s%s' % (indent, lexical) 435 print '%s%s' % (indent, surface) 436 print '%sSUCCESS: %s <=> %s' % (indent, lexical, surface) 437 print 438 print
439
440 -def load(filename):
441 """ 442 Loads a ruleset from a file in YAML format. 443 444 See KimmoRuleSet.load for a more detailed description. 445 """ 446 return KimmoRuleSet.load(filename)
447
448 -def guidemo():
449 "An example of loading rules into the GUI." 450 rules = load('turkish.yaml') 451 rules.gui()
452
453 -def main():
454 """If a YAML file is specified on the command line, load it as a 455 KimmoRuleSet in the GUI.""" 456 import sys 457 if len(sys.argv) > 1: 458 filename = sys.argv[1] 459 k = load(filename) 460 k.gui()
461 462 if __name__ == '__main__': main() 463