Ruby 1.9.3p448 (2013-06-27 revision 41675)
utf_8.c
Go to the documentation of this file.
1 /**********************************************************************
2  utf_8.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/* Switch on the "virtual code point" scheme for the bytes 0xfe and 0xff. */
#define USE_INVALID_CODE_SCHEME

#if defined(USE_INVALID_CODE_SCHEME)
/* Virtual code point values standing in for the invalid encoding bytes
 * 0xfe and 0xff. */
#  define INVALID_CODE_FE   0xfffffffe
#  define INVALID_CODE_FF   0xffffffff
/* 0x7fffffff; not referenced in the visible part of this file. */
#  define VALID_CODE_LIMIT  0x7fffffff
#endif

/* True unless the top two bits of c are 10, i.e. unless c has the form of
 * a UTF-8 trailing (continuation) byte. */
#define utf8_islead(c)  ((UChar )((c) & 0xc0) != 0x80)
42 
/*
 * Byte length indexed by the first byte of a sequence.  Bytes 0x00-0xbf and
 * 0xf5-0xff map to 1, 0xc0-0xdf to 2, 0xe0-0xef to 3 and 0xf0-0xf4 to 4.
 */
static const int EncLen_UTF8[] = {
  /* 0x00-0x0f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x10-0x1f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x20-0x2f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x30-0x3f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x40-0x4f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x50-0x5f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x60-0x6f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x70-0x7f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x80-0x8f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0x90-0x9f */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xa0-0xaf */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xb0-0xbf */ 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
  /* 0xc0-0xcf */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xd0-0xdf */ 2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
  /* 0xe0-0xef */ 3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
  /* 0xf0-0xff */ 4, 4, 4, 4,  4, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1
};
61 
/*
 * States of the byte-wise UTF-8 automaton driven by mbc_enc_len().
 *
 * The two terminal results are negative so that a single `s < 0` test
 * detects them; S0..S7 are 0..7 and index directly into trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT = -1,
  S0, S1, S2, S3,
  S4, S5, S6, S7
} state_t;

/* Short aliases used only while spelling out the table below. */
#define A ACCEPT
#define F FAILURE
/* Sixteen (resp. 64, 128) consecutive entries that all hold the value v. */
#define TRANS_ROW(v)   v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define TRANS_ROWS4(v) TRANS_ROW(v), TRANS_ROW(v), TRANS_ROW(v), TRANS_ROW(v)
#define TRANS_ROWS8(v) TRANS_ROWS4(v), TRANS_ROWS4(v)

/*
 * trans[state][byte] is the next state (or ACCEPT / FAILURE) after reading
 * `byte` while in `state`.  Each row below lists which byte ranges are
 * allowed to continue the sequence; every other byte yields FAILURE.
 */
static const signed char trans[][0x100] = {
  { /* S0: first byte of a character */
    /* 00-7f */ TRANS_ROWS8(A),
    /* 80-bf */ TRANS_ROWS4(F),
    /* c0-cf */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d0-df */ TRANS_ROW(1),
    /* e0-ef */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* f0-ff */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S1: last trailing byte, 80-bf completes the character */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(A),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S2: entered from S0 on e0; only a0-bf may follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-9f */ TRANS_ROW(F), TRANS_ROW(F),
    /* a0-bf */ TRANS_ROW(1), TRANS_ROW(1),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S3: two trailing bytes remain; 80-bf moves to S1 */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(1),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S4: entered from S0 on ed; only 80-9f may follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-9f */ TRANS_ROW(1), TRANS_ROW(1),
    /* a0-bf */ TRANS_ROW(F), TRANS_ROW(F),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S5: entered from S0 on f0; only 90-bf may follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ TRANS_ROW(F),
    /* 90-bf */ TRANS_ROW(3), TRANS_ROW(3), TRANS_ROW(3),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S6: entered from S0 on f1-f3; 80-bf moves to S3 */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-bf */ TRANS_ROWS4(3),
    /* c0-ff */ TRANS_ROWS4(F)
  },
  { /* S7: entered from S0 on f4; only 80-8f may follow */
    /* 00-7f */ TRANS_ROWS8(F),
    /* 80-8f */ TRANS_ROW(3),
    /* 90-bf */ TRANS_ROW(F), TRANS_ROW(F), TRANS_ROW(F),
    /* c0-ff */ TRANS_ROWS4(F)
  },
};
#undef TRANS_ROWS8
#undef TRANS_ROWS4
#undef TRANS_ROW
#undef A
#undef F
218 
219 static int
221 {
222  int firstbyte = *p++;
223  state_t s;
224  s = trans[0][firstbyte];
225  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
227 
228  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
229  s = trans[s][*p++];
230  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
232 
233  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
234  s = trans[s][*p++];
235  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
237 
238  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
239  s = trans[s][*p++];
240  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
242 }
243 
244 static int
245 is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
246 {
247  if (p < end) {
248  if (*p == 0x0a) return 1;
249 
250 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
251 #ifndef USE_CRNL_AS_LINE_TERMINATOR
252  if (*p == 0x0d) return 1;
253 #endif
254  if (p + 1 < end) {
255  if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
256  return 1;
257  if (p + 2 < end) {
258  if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
259  && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
260  return 1;
261  }
262  }
263 #endif
264  }
265 
266  return 0;
267 }
268 
269 static OnigCodePoint
270 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
271 {
272  int c, len;
273  OnigCodePoint n;
274 
275  len = enclen(enc, p, end);
276  c = *p++;
277  if (len > 1) {
278  len--;
279  n = c & ((1 << (6 - len)) - 1);
280  while (len--) {
281  c = *p++;
282  n = (n << 6) | (c & ((1 << 6) - 1));
283  }
284  return n;
285  }
286  else {
287 #ifdef USE_INVALID_CODE_SCHEME
288  if (c > 0xfd) {
289  return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
290  }
291 #endif
292  return (OnigCodePoint )c;
293  }
294 }
295 
296 static int
298 {
299  if ((code & 0xffffff80) == 0) return 1;
300  else if ((code & 0xfffff800) == 0) return 2;
301  else if ((code & 0xffff0000) == 0) return 3;
302  else if ((code & 0xffe00000) == 0) return 4;
303  else if ((code & 0xfc000000) == 0) return 5;
304  else if ((code & 0x80000000) == 0) return 6;
305 #ifdef USE_INVALID_CODE_SCHEME
306  else if (code == INVALID_CODE_FE) return 1;
307  else if (code == INVALID_CODE_FF) return 1;
308 #endif
309  else
311 }
312 
313 static int
315 {
316 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
317 #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
318 
319  if ((code & 0xffffff80) == 0) {
320  *buf = (UChar )code;
321  return 1;
322  }
323  else {
324  UChar *p = buf;
325 
326  if ((code & 0xfffff800) == 0) {
327  *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
328  }
329  else if ((code & 0xffff0000) == 0) {
330  *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
331  *p++ = UTF8_TRAILS(code, 6);
332  }
333  else if ((code & 0xffe00000) == 0) {
334  *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
335  *p++ = UTF8_TRAILS(code, 12);
336  *p++ = UTF8_TRAILS(code, 6);
337  }
338  else if ((code & 0xfc000000) == 0) {
339  *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
340  *p++ = UTF8_TRAILS(code, 18);
341  *p++ = UTF8_TRAILS(code, 12);
342  *p++ = UTF8_TRAILS(code, 6);
343  }
344  else if ((code & 0x80000000) == 0) {
345  *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
346  *p++ = UTF8_TRAILS(code, 24);
347  *p++ = UTF8_TRAILS(code, 18);
348  *p++ = UTF8_TRAILS(code, 12);
349  *p++ = UTF8_TRAILS(code, 6);
350  }
351 #ifdef USE_INVALID_CODE_SCHEME
352  else if (code == INVALID_CODE_FE) {
353  *p = 0xfe;
354  return 1;
355  }
356  else if (code == INVALID_CODE_FF) {
357  *p = 0xff;
358  return 1;
359  }
360 #endif
361  else {
363  }
364 
365  *p++ = UTF8_TRAIL0(code);
366  return (int)(p - buf);
367  }
368 }
369 
370 static int
372  const UChar* end, UChar* fold, OnigEncoding enc)
373 {
374  const UChar* p = *pp;
375 
376  if (ONIGENC_IS_MBC_ASCII(p)) {
377 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
378  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
379  if (*p == 0x49) {
380  *fold++ = 0xc4;
381  *fold = 0xb1;
382  (*pp)++;
383  return 2;
384  }
385  }
386 #endif
387 
389  (*pp)++;
390  return 1; /* return byte length of converted char to lower */
391  }
392  else {
393  return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
394  }
395 }
396 
397 
398 static int
400  const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
401 {
402  *sb_out = 0x80;
403  return onigenc_unicode_ctype_code_range(ctype, ranges);
404 }
405 
406 
407 static UChar*
408 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
409 {
410  const UChar *p;
411 
412  if (s <= start) return (UChar* )s;
413  p = s;
414 
415  while (!utf8_islead(*p) && p > start) p--;
416  return (UChar* )p;
417 }
418 
419 static int
421  const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
422  OnigEncoding enc)
423 {
424  return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
425 }
426 
427 OnigEncodingDefine(utf_8, UTF_8) = {
428  mbc_enc_len,
429  "UTF-8", /* name */
430  6, /* max byte length */
431  1, /* min byte length */
433  mbc_to_code,
435  code_to_mbc,
444 };
445 ENC_ALIAS("CP65001", "UTF-8")
446 
447 /*
448  * Name: UTF8-MAC
449  * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
450  * Link: http://developer.apple.com/qa/qa2001/qa1235.html
451  * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
452  * Link: http://www.gnu.org/software/emacs/NEWS.23.2
453  */
454 ENC_REPLICATE("UTF8-MAC", "UTF-8")
455 ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
456 ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */
457 
int onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
Definition: unicode.c:2042
static OnigCodePoint mbc_to_code(const UChar *p, const UChar *end, OnigEncoding enc)
Definition: utf_8.c:270
unsigned int OnigCodePoint
Definition: oniguruma.h:111
int onigenc_always_true_is_allowed_reverse_match(const UChar *s ARG_UNUSED, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:649
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: oniguruma.h:559
static int mbc_case_fold(OnigCaseFoldType flag, const UChar **pp, const UChar *end, UChar *fold, OnigEncoding enc)
Definition: utf_8.c:371
static UChar * left_adjust_char_head(const UChar *start, const UChar *s, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: utf_8.c:408
#define UTF8_TRAILS(code, shift)
#define ONIGENC_IS_MBC_ASCII(p)
Definition: oniguruma.h:222
static int mbc_enc_len(const UChar *p, const UChar *e, OnigEncoding enc ARG_UNUSED)
Definition: utf_8.c:220
unsigned int OnigCaseFoldType
Definition: oniguruma.h:117
#define A
Definition: utf_8.c:68
Definition: utf_8.c:66
unsigned char UTF8
Definition: generator.h:86
Definition: utf_8.c:66
Definition: utf_8.c:63
state_t
Definition: big5.c:90
static int is_mbc_newline(const UChar *p, const UChar *end, OnigEncoding enc)
Definition: utf_8.c:245
#define ARG_UNUSED
static const int EncLen_UTF8[]
Definition: utf_8.c:43
state_t
Definition: utf_8.c:62
#define F
Definition: utf_8.c:69
static int code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
Definition: utf_8.c:297
#define UTF8_TRAIL0(code)
#define ENC_ALIAS(name, orig)
Definition: encdb.c:18
Definition: utf_8.c:66
static const signed char trans[][0x100]
Definition: utf_8.c:70
int onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar *name, UChar *end)
Definition: unicode.c:2086
Definition: utf_8.c:65
unsigned char OnigUChar
Definition: oniguruma.h:110
static int get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint *ranges[], OnigEncoding enc ARG_UNUSED)
Definition: utf_8.c:399
Win32OLEIDispatch * p
Definition: win32ole.c:778
int onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint *ranges[])
Definition: unicode.c:2061
int onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar **pp, const UChar *end, UChar *fold)
Definition: unicode.c:2219
#define enclen(enc, p, e)
#define INVALID_CODE_FE
Definition: utf_8.c:36
OnigEncodingDefine(utf_8, UTF_8)
Definition: utf_8.c:65
int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void *arg, OnigEncoding enc ARG_UNUSED)
Definition: unicode.c:2273
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)
Definition: oniguruma.h:250
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: oniguruma.h:123
unsigned int OnigCtype
Definition: oniguruma.h:112
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:3913
Definition: utf_8.c:65
#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n)
Definition: oniguruma.h:243
Definition: utf_8.c:66
register unsigned int len
Definition: name2ctype.h:22210
#define UChar
Definition: oniguruma.h:107
static int code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
Definition: utf_8.c:314
#define ONIGENC_CONSTRUCT_MBCLEN_INVALID()
Definition: oniguruma.h:247
#define ENC_REPLICATE(name, orig)
Definition: encdb.c:17
int onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar *p, const OnigUChar *end, OnigCaseFoldCodeItem items[])
Definition: unicode.c:2409
Definition: utf_8.c:65
#define utf8_islead(c)
Definition: utf_8.c:41
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c)
Definition: nkf.c:108
static int get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar *p, const OnigUChar *end, OnigCaseFoldCodeItem items[], OnigEncoding enc)
Definition: utf_8.c:420
Definition: utf_8.c:64
#define INVALID_CODE_FF
Definition: utf_8.c:37