Package nltk_lite :: Package corpora :: Module toolbox
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.corpora.toolbox

  1  #!/usr/bin/env python 
  2  # -*- coding: utf8 -*- 
  3   
  4  # Natural Language Toolkit: Toolbox Reader 
  5  # 
  6  # Copyright (C) 2001-2007 University of Pennsylvania 
  7  # Author: Greg Aumann <greg_aumann@sil.org> 
  8  #         Stuart Robinson <Stuart.Robinson@mpi.nl> 
  9  #         Steven Bird <sb@csse.unimelb.edu.au> 
 10  # URL: <http://nltk.sf.net> 
 11  # For license information, see LICENSE.TXT 
 12   
 13  """ 
 14  Module for reading, writing and manipulating Toolbox databases. 
 15  """ 
 16   
 17  import os, re 
 18  from nltk_lite.corpora import get_basedir 
 19  from string import split 
 20  from itertools import imap 
 21  from StringIO import StringIO 
 22  from nltk_lite.etree.ElementTree import TreeBuilder, Element 
 23   
class StandardFormat(object):
    """
    Class for reading and processing standard format marker files and strings.

    A standard format marker (SFM) file consists of fields.  Each field is
    introduced by a backslash-prefixed marker at the start of a line
    (e.g. ``\\lx kaa``); a field's value may continue over the following
    lines that do not themselves begin with a marker.
    """

    def open(self, sfm_file):
        """Open a standard format marker file for sequential reading.

        @param sfm_file: name of the standard format marker input file
        @type sfm_file: string
        """
        # 'rU' = universal-newline mode, so '\r\n' and '\r' line endings
        # are normalised to '\n' before parsing.
        self._file = file(sfm_file, 'rU')

    def open_string(self, s):
        """Open a standard format marker string for sequential reading.

        @param s: string to parse as a standard format marker input file
        @type s: string
        """
        # A StringIO object supports the same iteration protocol as a
        # real file, so the rest of the class needs no special casing.
        self._file = StringIO(s)

    def raw_fields(self):
        """Return an iterator for the fields in the standard format marker
        file.

        @return: an iterator that returns the next field in a (marker, value)
            tuple. Linebreaks and trailing white space are preserved except
            for the final newline in each field.
        @rtype: iterator over C{(marker, value)} tuples
        """
        join_string = '\n'
        # Group 1 is the marker (without its backslash), group 2 the rest of
        # the line.  The marker part is optional so continuation lines also
        # match, with group 1 == None.  The %s slot lets the first-line
        # pattern additionally absorb an optional Unicode byte order mark.
        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
        first_line_pat = re.compile(line_regexp % u'\ufeff?')
        line_pat = re.compile(line_regexp % '')
        # need to get first line outside the loop for correct handling
        # of the first marker if it spans multiple lines
        file_iter = iter(self._file)
        line = file_iter.next()
        mobj = re.match(first_line_pat, line)
        mkr, line_value = mobj.groups()
        value_lines = [line_value,]
        # line_num tracks the most recently consumed input line (0-based);
        # exposed so callers can report where a problem field starts.
        self.line_num = 0
        for line in file_iter:
            self.line_num += 1
            mobj = re.match(line_pat, line)
            line_mkr, line_value = mobj.groups()
            if line_mkr:
                # A new marker begins: emit the completed previous field.
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [line_value,]
            else:
                # Continuation line: accumulate into the current field.
                value_lines.append(line_value)
        self.line_num += 1
        # Emit the final field, which the loop above never flushes.
        yield (mkr, join_string.join(value_lines))

    def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """Return an iterator for the fields in the standard format marker file.

        @param strip: strip trailing whitespace from the last line of each field
        @type strip: boolean
        @param unwrap: Convert newlines in a field to spaces.
        @type unwrap: boolean
        @param encoding: Name of an encoding to use. If it is specified then
            the C{fields} method returns unicode strings rather than non
            unicode strings.
        @type encoding: string or None
        @param errors: Error handling scheme for codec. Same as the C{decode}
            inbuilt string method.
        @type errors: string
        @param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            C{encoding='utf8'} and leave C{unicode_fields} with its default
            value of None.
        @type unicode_fields: set or dictionary (actually any sequence that
            supports the 'in' operator).
        @return: an iterator that returns the next field in a C{(marker, value)}
            tuple. C{marker} and C{value} are unicode strings if an C{encoding}
            was specified in the C{fields} method. Otherwise they are
            non-unicode strings.
        @rtype: iterator over C{(marker, value)} tuples
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError, 'unicode_fields is set but not encoding.'
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding:
                # Fields listed in unicode_fields are always UTF-8,
                # regardless of the encoding of the rest of the file.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        # line_num is only meaningful while a file is open; discard it if
        # raw_fields() ever set it.
        try:
            del self.line_num
        except AttributeError:
            pass
127
class ToolboxData(StandardFormat):
    """Reader that presents a toolbox data file as an element tree."""

    def __init__(self):
        super(ToolboxData, self).__init__()

    def parse(self, *args, **kwargs):
        """Parse the opened toolbox file; see L{_record_parse}."""
        return self._record_parse(*args, **kwargs)

    def _record_parse(self, key=None, **kwargs):
        """
        Return an element tree structure for a toolbox data file, with all
        markers at the same level.

        The result has a C{toolbox_data} root holding one C{header} element
        (all fields before the first record) followed by one C{record}
        element per record; every field becomes a child element named after
        its marker.  For example::

            <toolbox_data>
              <header>
                <_sh>v3.0 400 Rotokas Dictionary</_sh>
                <_DateStampHasFourDigitYear/>
              </header>
              <record>
                <lx>kaa</lx>
                <ps>V.A</ps>
                <ge>gag</ge>
              </record>
            </toolbox_data>

        @param key: Name of key marker at the start of each record. If set to
            None (the default value) the first marker that doesn't begin with
            an underscore is assumed to be the key.
        @type key: string
        @param kwargs: Keyword arguments passed to L{StandardFormat.fields()}
        @type kwargs: keyword arguments dictionary
        @rtype: ElementTree._ElementInterface
        @return: contents of toolbox data divided into header and records
        """
        tb = TreeBuilder()
        tb.start('toolbox_data', {})
        tb.start('header', {})
        seen_record = False
        for marker, value in self.fields(**kwargs):
            if key is None and not seen_record and marker[0] != '_':
                # Auto-detect: first non-underscore marker is the record key.
                key = marker
            if marker == key:
                # A new record starts: close whatever section we are in.
                if seen_record:
                    tb.end('record')
                else:
                    tb.end('header')
                seen_record = True
                tb.start('record', {})
            tb.start(marker, {})
            tb.data(value)
            tb.end(marker)
        # Close the final open section before closing the root.
        if seen_record:
            tb.end('record')
        else:
            tb.end('header')
        tb.end('toolbox_data')
        return tb.close()
210 211
def parse_corpus(file_name, key=None, **kwargs):
    """
    Return an element tree resulting from parsing the toolbox datafile.

    Convenience wrapper that builds a C{ToolboxData} object, opens
    C{file_name} from the toolbox subdirectory of the directory where NLTK
    looks for corpora (see L{corpora.get_basedir()}) and parses it.

    @param file_name: Name of file in toolbox corpus directory
    @type file_name: string
    @param key: marker at the start of each record
    @type key: string
    @param kwargs: Keyword arguments passed to L{ToolboxData.parse()}
    @type kwargs: keyword arguments dictionary
    @rtype: ElementTree._ElementInterface
    @return: contents of toolbox data divided into header and records
    """
    path = os.path.join(get_basedir(), 'toolbox', file_name)
    data = ToolboxData()
    data.open(path)
    return data.parse(key, **kwargs)
232 233 import re 234 235 _is_value = re.compile(r"\S") 236
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    @param tree: flat representation of toolbox data (whole database or single record)
    @type tree: ElementTree._ElementInterface
    @param encoding: Name of an encoding to use.
    @type encoding: string
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: string
    @param unicode_fields: marker names whose values are always encoded
        as UTF-8 regardless of C{encoding}
    @type unicode_fields: any sequence that supports the 'in' operator
    @rtype: string
    @return: string using standard format markup
    @raise ValueError: if C{tree} is neither a C{record} nor a
        C{toolbox_data} element, or if C{unicode_fields} is given
        without C{encoding}
    """
    if tree.tag == 'record':
        # Wrap a bare record so both inputs are handled uniformly below.
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")
    # Hoisted, locally compiled test for "field has a real value".
    has_value = re.compile(r"\S").search
    l = []
    for rec in tree:
        l.append('\n')  # blank line between records (dropped for the first)
        for field in rec:
            mkr = field.tag
            value = field.text
            if value is None:
                # BUG FIX: empty elements (e.g. <_DateStampHasFourDigitYear/>)
                # have text == None, which previously crashed re.search.
                value = ''
            if encoding is not None:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                if has_value(value):
                    l.append((u"\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors))
                else:
                    l.append((u"\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors))
            else:
                if has_value(value):
                    l.append("\\%s %s\n" % (mkr, value))
                else:
                    l.append("\\%s%s\n" % (mkr, value))
    # Skip the first element: the leading record separator.
    return ''.join(l[1:])
284
285 -def _parse_record(s):
286 """ 287 Deprecated: use C{StandardFormat.fields()} 288 289 @param s: toolbox record as a string 290 @type s: L{string} 291 @rtype: iterator over L{list(string)} 292 """ 293 294 s = "\n" + s # Fields (even first) must start w/ a carriage return 295 if s.endswith("\n") : s = s[:-1] # Remove single extra carriage return 296 for field in split(s, sep="\n\\")[1:] : # Parse by carriage return followed by backslash 297 parsed_field = split(field, sep=" ", maxsplit=1) # Split properly delineated field 298 try : 299 yield (parsed_field[0], parsed_field[1]) 300 except IndexError : 301 yield (parsed_field[0], '')
302 303
def raw(files='rotokas.dic', include_header=False, head_field_marker=None):
    """
    Deprecated: use C{StandardFormat.fields()}

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: flag that determines whether to treat header as record (default is no)
    @type include_header: boolean
    @param head_field_marker: option for explicitly setting which marker to use as the head field
        when parsing the file (default is automatically determining it from
        the first field of the first record)
    @type head_field_marker: string
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process? If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)

    for fname in files:
        path = os.path.join(get_basedir(), "toolbox", fname)
        # BUG FIX: close the file handle instead of leaking it.
        f = open(path, "U")
        fc = f.read()
        f.close()
        if fc.strip().startswith(r"\_"):
            # A header (markers beginning with \_) is separated from the
            # body by the first blank line.
            header, body = fc.split("\n\n", 1)
            if include_header:
                yield list(_parse_record(header))
        else:
            body = fc

        # Deal with head field marker
        if head_field_marker:
            # BUG FIX: this previously read "\\" + hfm, referencing an
            # undefined name and raising NameError whenever
            # head_field_marker was supplied.
            hfm_with_backslash = "\\" + head_field_marker
        else:
            ff = body.split("\n", 1)[0]                  # first field
            hfm_with_backslash = ff.split(" ", 1)[0]     # raw marker of first field
        recordsep = "\n\n" + hfm_with_backslash  # separates records from one another

        # Parse records
        for r in ("\n\n" + body).split(recordsep)[1:]:
            yield list(_parse_record(hfm_with_backslash + r))
343 344 # assumes headwords are unique
def dictionary(files='rotokas.dic', include_header=False):
    """
    Deprecated: use C{ToolboxData.parse()}

    Yield one dict per entry, mapping each field marker to its value.
    Assumes headwords are unique; a marker occurring twice in an entry
    keeps only its last value.

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: treat header as entry?
    @type include_header: boolean
    @rtype: iterator over L{dict}
    """
    return (dict(entry) for entry in raw(files, include_header))
356
357 -def _dict_list_entry(entry):
358 d = {} 359 for field in entry: 360 if len(field) == 2: 361 name, value = field 362 if name not in d: 363 d[name] = [] 364 d[name].append(value) 365 return d
366 367 # if two entries have the same headword this key maps to a list of entries
def dict_list(files='rotokas.dic', include_header=False):
    """
    Deprecated: use C{ToolboxData.parse()}

    Like L{dictionary()}, but when two fields of an entry share a marker
    all values are kept: each yielded dict maps a marker to the list of
    values it took in that entry.

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: treat header as entry?
    @type include_header: boolean
    @rtype: iterator over L{dict}
    """
    # Accept a bare filename as well as a sequence of filenames.
    if type(files) is str:
        files = (files,)
    for record in raw(files, include_header):
        yield _dict_list_entry(record)
384
385 -def demo():
386 from nltk_lite.corpora import toolbox 387 from itertools import islice 388 from pprint import pprint 389 390 print 'Raw:' 391 pprint(list(islice(toolbox.raw(), 3))) 392 393 print 'Dictionary:' 394 pprint(list(islice(toolbox.dictionary(), 3))) 395 396 print 'Dictionary-List:' 397 pprint(list(islice(toolbox.dict_list(), 3))) 398 399 print 'Complex test cases, no header' 400 pprint(list(toolbox.raw("test.dic"))) 401 402 print 'Complex test cases, no header, dictionary' 403 pprint(list(toolbox.dictionary("test.dic"))) 404 405 print 'Complex test cases, no header, dictionary list' 406 pprint(list(toolbox.dict_list("test.dic"))) 407 408 print 'Complex test cases, with header' 409 pprint(list(toolbox.raw("test.dic", include_header=True)))
410 411 if __name__ == '__main__': 412 demo() 413