"""
Functions for tokenizing a text, based on a regular expression
which matches tokens or gaps.
"""

import re, sre_parse, sre_constants, sre_compile

WHITESPACE = r'\s+'
NEWLINE = r'\n'
BLANKLINE = r'\s*\n\s*\n\s*'
WORD = r'\w+'
WORDPUNCT = r'[a-zA-Z]+|[^a-zA-Z\s]+'
SHOEBOXSEP = r'^\\'
TREEBANK = r'^\(.*?(?=^\(|\Z)'

def _remove_group_identifiers(parsed_re):
    """
    Modifies the given parsed regular expression, replacing all groupings
    (as indicated by parentheses in the regular expression string) with
    non-grouping variants (indicated with '(?:...)').  This works on the
    output of sre_parse.parse, modifying the group identifier in
    SUBPATTERN structures to None.

    @param parsed_re: the output of sre_parse.parse(string)
    @type parsed_re: C{SubPattern}
    """
    if isinstance(parsed_re, sre_parse.SubPattern):
        # If it's a SubPattern, replace each item in place and return the
        # same object.
        for i in range(len(parsed_re)):
            parsed_re[i] = _remove_group_identifiers(parsed_re[i])
        return parsed_re
    elif isinstance(parsed_re, list) or isinstance(parsed_re, tuple):
        # Lists and tuples are copied before processing.  If this is a
        # SUBPATTERN structure, null out its group identifier first.
        to_process = list(parsed_re)
        if to_process[0] == sre_constants.SUBPATTERN:
            sub_item = list(to_process[1])
            sub_item[0] = None
            to_process[1] = tuple(sub_item)

        # Process each item in the sequence recursively.
        processed = map(_remove_group_identifiers, to_process)

        # Return a sequence of the same type as the input.
        if isinstance(parsed_re, list):
            return processed
        else:
            return tuple(processed)
    else:
        # Anything else (operators, literals, etc.) is returned unchanged.
        return parsed_re
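
# Example (illustrative): a pattern with one capturing group parses to a
# SUBPATTERN entry whose first element is the group number; after processing,
# that element is None, so the group compiles as if it were non-capturing.
#
#     >>> parsed = sre_parse.parse(r'(\w+)')
#     >>> _remove_group_identifiers(parsed)[0][1][0] is None
#     True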
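
# The helper that compiles an "advanced" pattern (one that uses () groups) is
# given below as a minimal sketch: it parses the pattern, strips the group
# identifiers with _remove_group_identifiers() so that user-supplied groups
# cannot disturb token_split(), and compiles the result with the same flags
# used for simple patterns.
def _compile(pattern):
    parsed_re = sre_parse.parse(pattern)
    parsed_re = _remove_group_identifiers(parsed_re)
    return sre_compile.compile(parsed_re,
                               re.UNICODE | re.MULTILINE | re.DOTALL)
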
def token_split(text, pattern, advanced=False):
    """
    @return: An iterator that generates tokens and the gaps between them
    """

    if advanced:
        regex = _compile(pattern)     # pattern may contain () groups
    else:
        regex = re.compile(pattern, re.UNICODE | re.MULTILINE | re.DOTALL)

    # If a single string was given, wrap it in a tuple so that the loop
    # below can treat it as a one-element sequence of substrings.
    if isinstance(text, (str, unicode)):
        text = (text,)

    # Scan each substring in turn, yielding the gap before each match and
    # then the match itself.  Unmatched material at the end of a substring
    # is carried over into the next one via 'leftover'.
    leftover = ''
    offset = 0
    for substring in text:
        position = 0  # position within the current substring

        # Check for a match at the start of the substring.
        match = regex.match(substring)
        if match:
            yield leftover+substring[position:match.start()]
            yield substring[match.start():match.end()]
            position = match.end()
            leftover = ''

        # Find the remaining matches within this substring.
        while position < len(substring):
            match = regex.search(substring, position)
            if match:
                yield leftover+substring[position:match.start()]
                yield substring[match.start():match.end()]
                position = match.end()
                leftover = ''
            else:
                leftover = substring[position:]
                break

        # Update the running offset within the full text.
        offset += position

    # Yield any leftover material as the final gap.
    if leftover:
        yield leftover

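# For example (illustrative), token_split() alternates the material between
# matches (even indices) with the matches themselves (odd indices):
#
#     >>> list(token_split('two  words', WHITESPACE))
#     ['two', '  ', 'words']
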
def regexp(text, pattern, gaps=False, advanced=False):
    """
    Tokenize the text according to the regular expression pattern.

    @param text: the string or string iterator to be tokenized
    @type text: C{string} or C{iter(string)}
    @param pattern: the regular expression
    @type pattern: C{string}
    @param gaps: set to True if the pattern matches material between tokens
    @type gaps: C{boolean}
    @param advanced: set to True if the pattern is complex, making use of () groups
    @type advanced: C{boolean}
    @return: An iterator over tokens
    """

    # token_split() yields gaps at even indices and matches at odd indices;
    # keep whichever half the caller asked for, skipping empty strings.
    for (i, token) in enumerate(token_split(text, pattern, advanced)):
        if ((i % 2 == 0) == gaps and token != ''):
            yield token
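
# For example (illustrative):
#
#     >>> list(regexp('Good muffins cost $3.88\nin New York.', pattern=r'\w+'))
#     ['Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York']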

def whitespace(s):
    """
    Tokenize the text at whitespace.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WHITESPACE, gaps=True)
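
# For example (illustrative):
#
#     >>> list(whitespace('Good muffins\ncost $3.88'))
#     ['Good', 'muffins', 'cost', '$3.88']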

def line(s):
    """
    Tokenize the text into lines.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=NEWLINE, gaps=True)
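
# For example (illustrative):
#
#     >>> list(line('Please buy me\ntwo of them.\n'))
#     ['Please buy me', 'two of them.']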

def blankline(s):
    """
    Tokenize the text into paragraphs (separated by blank lines).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=BLANKLINE, gaps=True)
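
# For example (illustrative):
#
#     >>> list(blankline('Please buy me\ntwo of them.\n\nThanks.'))
#     ['Please buy me\ntwo of them.', 'Thanks.']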

def word(s):
    """
    Tokenize the text into sequences of word characters (letters, digits
    and underscore).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WORD)
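
# For example (illustrative):
#
#     >>> list(word("That's $3.88."))
#     ['That', 's', '3', '88']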

def wordpunct(s):
    """
    Tokenize the text into sequences of alphabetic and non-alphabetic
    characters.  E.g. "She said 'hello.'" would be tokenized to
    ["She", "said", "'", "hello", ".'"]

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=WORDPUNCT)
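
# For example (illustrative):
#
#     >>> list(wordpunct("She said 'hello.'"))
#     ['She', 'said', "'", 'hello', ".'"]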

def shoebox(s):
    """
    Tokenize a Shoebox entry into its fields (separated by backslash markers).

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=SHOEBOXSEP, gaps=True)
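
# For example (illustrative), each field begins with a line-initial backslash
# marker such as \lx or \ps, and the marker character itself is dropped:
#
#     >>> list(shoebox('\\lx kaa\n\\ps N\n'))
#     ['lx kaa\n', 'ps N\n']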

def treebank(s):
    """
    Tokenize a Treebank file into its tree strings.

    @param s: the string or string iterator to be tokenized
    @type s: C{string} or C{iter(string)}
    @return: An iterator over tokens
    """
    return regexp(s, pattern=TREEBANK, advanced=True)
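
# For example (illustrative), each bracketed tree that starts at the beginning
# of a line becomes one token:
#
#     >>> list(treebank('(S (NP I))\n(S (NP You))\n'))
#     ['(S (NP I))\n', '(S (NP You))\n']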


def _display(tokens):
    """
    A helper function for L{demo} that displays a list of tokens.
    """

    # Convert the token iterator to a string representation, and wrap the
    # lines at column 70.
    str = ' '+`list(tokens)`+' '
    str = re.sub(r"(.{,70})\s", r'\1\n ', str).rstrip()

    # If the result is longer than three lines, truncate it and close the
    # list with an ellipsis.
    str = re.sub(r'(.+\n.+\n.+)\s\S+\n[\s\S]+(?!$)', r'\1 ...]', str)

    print str

def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from nltk_lite import tokenize

    # A sample text with sentence, line and paragraph breaks.
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print

if __name__ == '__main__':
    demo()