Package nltk_lite :: Package contrib :: Package toolbox :: Module language
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.toolbox.language

  1  #!/usr/bin/env python 
  2  # -*- coding: utf8 -*- 
  3   
  4  # Natural Language Toolkit: Toolbox Settings Parser 
  5  # 
  6  # Copyright (C) 2001-2006 University of Pennsylvania 
  7  # Author: Greg Aumann <greg_aumann@sil.org> 
  8  # URL: <http://nltk.sf.net> 
  9  # For license information, see LICENSE.TXT 
 10   
 11  """ 
 12  This module provides functionality for reading language settings files for  
 13  Toolbox.  
 14  """ 
 15   
 16  from nltk_lite.etree.ElementTree import TreeBuilder 
 17  from nltk_lite.contrib.toolbox.settings import ToolboxSettings 
 18  import re 
 19   
class Letter(object):
    """Upper and lower case forms associated with one letter.

    Both attributes start out as C{None}; L{Language.init_case} fills them
    in when it builds the case table.

    @ivar upper: the upper case form of the letter
    @ivar lower: the lower case form of the letter
    """

    # Only these two attributes exist; no per-instance __dict__ is created.
    __slots__ = ('upper', 'lower')

    def __init__(self):
        """Create a letter with no case forms assigned yet."""
        self.upper = self.lower = None
class Language(object):
    """Class for Toolbox Language settings.

    @ivar case: maps every upper and lower case letter named in the
        settings to a L{Letter} holding both of its case forms
    @ivar sort_order: maps the name of each sort order to its L{SortOrder}
    @ivar default_order: the L{SortOrder} named by C{srtset/srtDefault}
    """

    def __init__(self, fname, encoding=None):
        """Initialise from the settings file.

        @param fname: name of the Toolbox language settings file
        @param encoding: passed unchanged to the settings parser
        @raise KeyError: if the default sort order named in the settings
            is not one of the sort orders defined there
        """
        # Deliberately not called ``set`` so the builtin is not shadowed.
        parser = ToolboxSettings()
        parser.open(fname)
        settings = parser.parse(unwrap=False, encoding=encoding)

        self.init_case(settings.findtext('case'))
        self.sort_order = {}
        for sort_order in settings.findall('srtset/srt'):
            so = SortOrder(sort_order)
            self.sort_order[so.name] = so
        self.default_order = self.sort_order[settings.findtext('srtset/srtDefault')]

    def init_case(self, case_pairs):
        """Build the case table from the text of the C{case} field.

        Each line of C{case_pairs} must hold exactly two
        whitespace-separated items: the upper case form followed by the
        lower case form.  Both forms are entered in C{self.case}.

        @param case_pairs: text of the case field, or C{None} when the
            settings have no such field (the case table is then empty)
        @raise ValueError: if a line does not hold exactly two items
        """
        self.case = case = {}
        if case_pairs is None:
            # findtext() returns None for a missing field; nothing to add.
            return
        for line in case_pairs.splitlines():
            val = line.split()
            if len(val) != 2:
                raise ValueError('"%s" is not a valid case association' % line)
            u, l = val
            let_u = case[u] = Letter()
            let_l = case[l] = Letter()
            let_u.upper = let_l.upper = u
            let_u.lower = let_l.lower = l

    def lower(self, let):
        """return the lower case form of the letter.

        @rtype: string
        @raise KeyError: if C{let} is not in the case table
        """
        return self.case[let].lower

    def upper(self, let):
        """return the upper case form of the letter.

        @rtype: string
        @raise KeyError: if C{let} is not in the case table
        """
        return self.case[let].upper
class Graph(object):
    """Sort information for one grapheme of a sort order.

    Both attributes start out as C{None}; L{SortOrder} assigns them.

    @ivar order: for a primary, a tuple C{(primary, tertiary, secondary)}
        of sort positions; for a secondary, a single integer position
    @ivar type: C{'p'} for a primary, C{'s'} for a secondary
    """

    __slots__ = ('order', 'type')

    def __init__(self):
        """Create a grapheme with no sort information assigned yet."""
        self.order = self.type = None
class SortOrder(object):
    """Class for Shoebox sort orders

    @ivar name: text of the sort order element
    @ivar desc: text of its C{desc} child, or C{None}
    @ivar ignore: items listed in the C{ignore} child; they are recorded
        here but L{transform} does not consult them
    @ivar sec_after: true when the element has a C{SecAfterBase} child,
        meaning a secondary modifies the primary before it
    @ivar graphs: maps each primary and secondary string to its L{Graph}
    @ivar letter_pat: compiled pattern matching any primary except space
    @ivar graph_pat: compiled pattern matching any primary or secondary
    """

    def __init__(self, srt_order):
        """Build the sort order from a parsed C{srt} settings element.

        @param srt_order: element providing C{text}, C{find} and
            C{findtext} in the manner of an ElementTree element
        @raise ValueError: if the same string is listed more than once
            among the primaries and secondaries
        """
        self.name = srt_order.text
        self.desc = srt_order.findtext('desc')
        # Missing fields become empty lists so no later test is needed.
        primary = self._field_text(srt_order, 'primary').splitlines()
        sec_pre = self._field_text(srt_order, 'SecPreceding').split()
        sec_fol = self._field_text(srt_order, 'SecFollowing').split()
        self.ignore = self._field_text(srt_order, 'ignore').split()
        self.sec_after = srt_order.find('SecAfterBase') is not None

        self.graphs = graphs = {}
        # Secondary position given to a primary that carries no secondary:
        # it falls after every preceding secondary and before every
        # following one.
        unmarked = len(sec_pre) + 1
        # Space is always the first primary group.
        primaries = [[' ']] + [p.split() for p in primary]
        for i, group in enumerate(primaries, 1):
            # i is the primary position of the group, j the tertiary
            # position of each member within it.
            for j, m in enumerate(group, 1):
                if m in graphs:
                    raise ValueError('primary "%s" already in sort order' % m)
                graphs[m] = g = Graph()
                g.type = 'p'
                g.order = (i, j, unmarked)
        prims = [p for p in graphs if p != ' ']
        self.letter_pat = self.make_pattern(prims)

        i = 1
        for s in sec_pre:
            if s in graphs:
                raise ValueError('secondary preceding "%s" already in sort order' % s)
            graphs[s] = g = Graph()
            g.type = 's'
            g.order = i
            i += 1

        # increment for unmarked case
        i += 1
        for s in sec_fol:
            if s in graphs:
                raise ValueError('secondary following "%s" already in sort order' % s)
            graphs[s] = g = Graph()
            g.type = 's'
            g.order = i
            i += 1

        self.graph_pat = self.make_pattern(list(graphs))

    @staticmethod
    def _field_text(srt_order, tag):
        """Return the text of child C{tag}, or C{''} when it is absent."""
        text = srt_order.findtext(tag)
        if text is None:
            return ''
        return text

    def make_pattern(self, slist):
        """Return a regular expression pattern to match the strings in slist

        Longer strings are tried first so that a multi-character string
        wins over a shorter one that is its prefix.  An empty C{slist}
        gives a pattern that never matches.
        """
        # sort the longest first
        sorted_list = sorted(slist, key=lambda s: (len(s), s), reverse=True)
        if not sorted_list:
            # An empty alternation would match the empty string everywhere.
            return re.compile('(?!)')
        return re.compile('|'.join([re.escape(g) for g in sorted_list]))

    def first_primary(self, s):
        """return the first primary in the string s

        @raise ValueError: if C{s} contains no primary
        """
        match = self.letter_pat.search(s)
        if match is None:
            raise ValueError('no primary found in "%s"' % s)
        return match.group()

    def transform(self, s):
        """Return the sort key for the string s.

        Parts of C{s} that match no primary or secondary are skipped.

        @return: a tuple C{(primary, secondary, tertiary)} of three tuples
            of equal length, holding one position per primary found in
            C{s}
        """
        graphs = self.graphs
        prim = []
        sec = []
        tert = []
        sec_order = None
        for g in self.graph_pat.findall(s):
            graph = graphs[g]
            order = graph.order
            kind = graph.type
            if kind == 'p':
                prim.append(order[0])
                tert.append(order[1])
                if sec_order is None:
                    sec.append(order[2])
                else:
                    sec.append(sec_order)
                    sec_order = None
            elif kind == 's':
                # this ignores the situation of multiple consecutive secondaries
                if self.sec_after:
                    # A secondary with no primary before it has nothing to
                    # modify, so it is skipped.
                    if sec:
                        sec[-1] = order
                else:
                    # secondary is before the primary so save it for later
                    sec_order = order
        return (tuple(prim), tuple(sec), tuple(tert))