1
2
3
4
5
6
7
8
9
10
11 """
12 This module provides functionality for reading settings files for Toolbox.
13 Settings files provide information (metadata) concerning lexicons and texts,
14 such as which fields are found within them and what kind of values those
15 fields can have.
16 """
17
18 from nltk_lite.etree.ElementTree import TreeBuilder
19 from nltk_lite.corpora.toolbox import StandardFormat
20
21
60
66
86
88 """This class is a container for FieldMetadata objects. A marker set
89 contains a list of the fields in a database together with information
90 about those fields.
91
92 The raw SFB looks like this::
93
94 \\+mkrset
95 \\lngDefault Default
96 \\mkrRecord lx
97
98 \\+mkr dt
99 \\nam Date Last Edited
100 \\lng Default
101 \\mkrOverThis lx
102 \\-mkr
103
104 \\+mkr lx
105 \\nam Rotokas Word
106 \\lng Rotokas
107 \\-mkr
108 \\-mkrset
109 """
110
113
115 """Obtain a list of all of the field markers for the marker set.
116 @returns: list of field markers
117 @rtype: list of strings"""
118 return self._dict.keys()
119
125
133
148
182
183
279
280
282 """This class is used to parse and manipulate settings file for
283 lexicons."""
284
286 self._file = file
287 self._markerset = MarkerSet()
288 self._tree = None
289
def parse(self, encoding=None):
    """Parse a settings file with lexicon metadata.

    The file stored in self._file is read through a Settings object and
    the resulting tree is kept in self._tree.  One FieldMetadata object
    is added to self._markerset for every 'mkrset/mkr' element, and every
    top-level 'rngset' element then replaces the range set of the marker
    it names.

    @param encoding: passed through unchanged to Settings.parse()
    """
    s = Settings()
    s.open(self._file)
    try:
        self._tree = s.parse(encoding=encoding)
    finally:
        # Release the settings file even when parsing fails.
        s.close()

    # One metadata record per marker declared in the marker set.
    for mkr in self._tree.findall('mkrset/mkr'):
        # Look the inline range set up once; a missing or empty value
        # leaves the field without a range set.
        rangeset = self.__parse_value(mkr, "rngset")
        if rangeset:
            rangeset = rangeset.split()
        else:
            rangeset = None
        fm = FieldMetadata(marker=mkr.text,
                           name=self.__parse_value(mkr, "nam"),
                           desc=self.__parse_value(mkr, "desc"),
                           lang=self.__parse_value(mkr, "lng"),
                           rangeset=rangeset,
                           multiword=self.__parse_boolean(mkr, "MultipleWordItems"),
                           required=self.__parse_boolean(mkr, "MustHaveData"),
                           parent_mkr=self.__parse_value(mkr, "mkrOverThis"))
        self._markerset.add_field_metadata(fm)

    # Stand-alone range sets: each names its marker in a 'mkr' child and
    # lists the permitted values in 'dat' children.
    for rs in self._tree.findall("rngset"):
        marker = rs.findtext("mkr")
        fm = self._markerset.get_metadata_by_marker(marker)
        fm.set_rangeset([d.text for d in rs.findall("dat")])
        self._markerset.add_field_metadata(fm)
320
322 return self._tree.find('mkrset/mkrRecord').text
323
325 return self._markerset
326
328 if mkr.find(name) == None :
329 return False
330 else :
331 return True
332
334 try :
335 return mkr.find(name).text
336 except :
337 return None
338
340 """This class represents a process for text interlinearization."""
341
def __init__(self,
             from_mkr=None,
             to_mkr=None,
             out_mkr=None,
             gloss_sep=None,
             fail_mark=None,
             parse_proc=None,
             show_fail_mark=None,
             show_root_guess=None):
    """Store the settings describing one interlinearization process.

    Every argument is kept unchanged in a private attribute of the same
    name; nothing is validated or converted here.

    @param from_mkr: the marker searched for in the lookup process
    @param to_mkr: the marker found in the lookup process
    @param out_mkr: the output marker
    @param gloss_sep: the gloss separator
    @param fail_mark: the string used in the case of lookup failure
    @param parse_proc: whether this is a parse process (as opposed to a
        lookup process)
    @param show_fail_mark: the show-failure-mark setting
    @param show_root_guess: the show-root-guess setting
    """
    # Markers the process reads from and writes to.
    self.__from_mkr, self.__to_mkr, self.__out_mkr = from_mkr, to_mkr, out_mkr
    # Strings used when producing output.
    self.__gloss_sep, self.__fail_mark = gloss_sep, fail_mark
    # Flags controlling the kind of process and what it displays.
    self.__parse_proc = parse_proc
    self.__show_fail_mark, self.__show_root_guess = show_fail_mark, show_root_guess
359
361 return self.__out_mkr
362
364 """The marker searched for in the lookup process."""
365 return self.__from_mkr
366
368 """The marker found in the lookup process."""
369 return self.__to_mkr
370
372 """???"""
373 return self.__gloss_sep
374
376 """The string used in the case of lookup failure."""
377 return self.__fail_mark
378
380 """Determine whether this process is a parse process (as opposed to a lookup process)."""
381 return self.__parse_proc
382
384 """???"""
385 return self.__show_fail_mark
386
388 """???"""
389 return self.__show_root_guess
390
391
394
395
398
399
400 -class TextSettings(ToolboxSettings) :
401 """This class is used to parse and manipulate settings files for
402 texts."""
403
def __init__(self, file):
    """Remember which settings file to read and start with empty state.

    @param file: the settings file; it is handed to Settings.open() when
        parse() is called
    """
    self._tree = None              # filled in by parse()
    self._markerset = MarkerSet()  # populated by parse()
    self._file = file
408
def parse(self, encoding=None):
    """Parse a settings file with text metadata.

    The file stored in self._file is read through a Settings object and
    the resulting tree is kept in self._tree.  For every
    'intprclst/intprc' element a ParseProcess object is built and a
    summary of it is printed to standard output; the process object is
    not stored on the instance.  One FieldMetadata object is then added
    to self._markerset for every 'mkrset/mkr' element, and every
    top-level 'rngset' element replaces the range set of the marker it
    names.

    @param encoding: passed through unchanged to Settings.parse()
    """
    s = Settings()
    s.open(self._file)
    try:
        self._tree = s.parse(encoding=encoding)
    finally:
        # Release the settings file even when parsing fails.
        s.close()

    for proc in self._tree.findall("intprclst/intprc"):
        # 'mkrTo' may be absent or empty; only strip a real string.
        to_mkr = self.__parse_value(proc, "mkrTo")
        if to_mkr is not None:
            to_mkr = to_mkr.strip()
        ip = ParseProcess(from_mkr=self.__parse_value(proc, "mkrFrom"),
                          to_mkr=to_mkr,
                          gloss_sep=self.__parse_value(proc, "GlossSeparator"),
                          fail_mark=self.__parse_value(proc, "FailMark"),
                          parse_proc=self.__parse_boolean(proc, "bParseProc"),
                          show_fail_mark=self.__parse_boolean(proc, "bShowFailMark"),
                          show_root_guess=self.__parse_boolean(proc, "bShowRootGuess"),
                          out_mkr=self.__parse_value(proc, "mkrOut"))

        # Single-argument print(...) is valid as a statement in Python 2
        # and as a function call in Python 3.
        print("----- Interlinear Process -----")
        print(" FROM: [%s]" % ip.get_from_marker())
        print(" TO: [%s]" % ip.get_to_marker())
        print(" GLOSS SEP: [%s]" % ip.get_gloss_separator())
        print(" FAIL MARK: [%s]" % ip.get_failure_marker())
        print(" SHOW FAIL MARK: [%s]" % ip.show_failure_marker())
        print(" SHOW ROOT GUESS: [%s]" % ip.show_root_guess())
        print(" PARSE PROCESS: [%s]" % ip.is_parse_process())

        # Only triPref and triRoot have their file and marker lists shown.
        self._print_lookup(proc.find("triLook"), "trilook", False)
        self._print_lookup(proc.find("triPref"), "tripref", True)
        self._print_lookup(proc.find("triRoot"), "triroot", True)

        print("")

    # One metadata record per marker declared in the marker set.
    for mkr in self._tree.findall('mkrset/mkr'):
        # Look the inline range set up once; a missing or empty value
        # leaves the field without a range set.
        rangeset = self.__parse_value(mkr, "rngset")
        if rangeset:
            rangeset = rangeset.split()
        else:
            rangeset = None
        fm = FieldMetadata(marker=mkr.text,
                           name=self.__parse_value(mkr, "nam"),
                           desc=self.__parse_value(mkr, "desc"),
                           lang=self.__parse_value(mkr, "lng"),
                           rangeset=rangeset,
                           multiword=self.__parse_boolean(mkr, "MultipleWordItems"),
                           required=self.__parse_boolean(mkr, "MustHaveData"),
                           parent_mkr=self.__parse_value(mkr, "mkrOverThis"))
        self._markerset.add_field_metadata(fm)

    # Stand-alone range sets: each names its marker in a 'mkr' child and
    # lists the permitted values in 'dat' children.
    for rs in self._tree.findall("rngset"):
        marker = rs.findtext("mkr")
        fm = self._markerset.get_metadata_by_marker(marker)
        fm.set_rangeset([d.text for d in rs.findall("dat")])
        self._markerset.add_field_metadata(fm)

def _print_lookup(self, lookup, label, show_lists):
    """Print the summary of one lookup element of an interlinear process.

    @param lookup: the element to describe, or None when the process has
        no such element (nothing is printed in that case)
    @param label: the name shown in the section header
    @param show_lists: when true, also print the 'File' value of every
        'drflst/drf' element and the text of every child of 'mrflst'
    """
    # Compare with None: an element without children is false in a
    # boolean context even though it is present.
    if lookup is None:
        return
    print(" -- %s --" % label)
    print(" DB TYPE: [%s]" % self.__parse_value(lookup, "dbtyp"))
    print(" MKR OUTPUT: [%s]" % self.__parse_value(lookup, "mkrOut"))
    if not show_lists:
        return
    for drf in lookup.findall("drflst/drf"):
        print(" DB: [%s]" % self.__parse_value(drf, "File"))
    # 'mrflst' is optional; skip the marker list when it is missing.
    mrflst = lookup.find("mrflst")
    if mrflst is not None:
        for mrf in mrflst:
            print(" MKR: [%s]" % mrf.text)
511
513 return self._tree.find('mkrset/mkrRecord').text
514
def get_version(self):
    """Return the text of the 'ver' element of the parsed settings tree.

    parse() has to be called first, because it is what fills self._tree.
    """
    version = self._tree.find('ver')
    return version.text
517
def get_description(self):
    """Return the text of the 'desc' element of the parsed settings tree.

    parse() has to be called first, because it is what fills self._tree.
    """
    description = self._tree.find('desc')
    return description.text
520
def get_marker_set(self):
    """Return the marker set object held by this settings instance.

    The same object is returned on every call (no copy is made); parse()
    is what adds field metadata to it.
    """
    markerset = self._markerset
    return markerset
523
def __parse_boolean(self, mkr, name):
    """Report whether the element mkr contains an element matching name.

    Only presence is tested; the text of the matched element is ignored.

    @param mkr: the element to search
    @param name: the tag or path handed to mkr.find()
    @returns: True when mkr.find(name) finds an element, False otherwise
    @rtype: boolean
    """
    # Identity test against None: find() returns None when nothing matches.
    return mkr.find(name) is not None
529
def __parse_value(self, mkr, name):
    """Return the text of the first element matching name inside mkr.

    @param mkr: the element to search
    @param name: the tag or path handed to mkr.find()
    @returns: the text of the matched element, or None when nothing
        matches (or when mkr itself is None)
    """
    try:
        return mkr.find(name).text
    except AttributeError:
        # find() returned None (no match) or mkr was None; any other
        # error is allowed to propagate instead of being swallowed.
        return None
535
546
# Call demo() only when this module is executed as a script, not on import.
if __name__ == "__main__":
    demo()
549