Ruby 1.9.3p448 (2013-06-27 revision 41675)
utf_16le.c
Go to the documentation of this file.
1 /**********************************************************************
2  utf_16le.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Surrogate classification of a single byte value.  Elsewhere in this file
 * the argument is the high-order byte of a 16-bit code unit (p[1] or p[3]
 * in little-endian order).
 *
 *   UTF16_IS_SURROGATE_FIRST   true for 0xd8..0xdb
 *   UTF16_IS_SURROGATE_SECOND  true for 0xdc..0xdf
 *   UTF16_IS_SURROGATE         true for 0xd8..0xdf
 *
 * The masks only inspect the low eight bits of the argument.
 */
32 #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8)
33 #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
34 #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8)
35 
/*
 * Length table with 256 entries, one per byte value.  Every entry is 2
 * except indices 0xd8..0xdb, which are 4 (the same byte range that
 * UTF16_IS_SURROGATE_FIRST accepts).  The only visible use indexes it with
 * *(p+1), i.e. the high-order byte of a code unit.
 */
36 static const int EncLen_UTF16[] = {
37  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
38  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
39  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
40  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
41  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
42  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
43  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
44  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
45  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
46  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* Row 0xd0..0xdf: the four 4s sit at 0xd8..0xdb. */
50  2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
51  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
52  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
53 };
54 
/*
 * Length check for the character starting at p, with e bounding the input.
 *
 * What the visible statements do: compute len = e - p, require at least
 * two bytes before reading p[1] (the high-order byte of the first code
 * unit), classify that byte with the surrogate macros, and for a leading
 * surrogate require four bytes before testing p[3].
 *
 * NOTE(review): the signature lines and the statement controlled by each
 * `if` are not present in this listing, so the values returned on each
 * path cannot be confirmed from here.  Consult the complete file before
 * editing this function.
 */
55 static int
58 {
59  int len = (int)(e - p);
60  UChar byte;
61  if (len < 2)
63  byte = p[1];
64  if (!UTF16_IS_SURROGATE(byte)) {
66  }
67  if (UTF16_IS_SURROGATE_FIRST(byte)) {
68  if (len < 4)
70  if (UTF16_IS_SURROGATE_SECOND(p[3]))
72  }
74 }
75 
76 static int
77 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
79 {
80  if (p + 1 < end) {
81  if (*p == 0x0a && *(p+1) == 0x00)
82  return 1;
83 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
84  if ((
85 #ifndef USE_CRNL_AS_LINE_TERMINATOR
86  *p == 0x0d ||
87 #endif
88  *p == 0x85) && *(p+1) == 0x00)
89  return 1;
90  if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
91  return 1;
92 #endif
93  }
94  return 0;
95 }
96 
97 static OnigCodePoint
99  OnigEncoding enc ARG_UNUSED)
100 {
101  OnigCodePoint code;
102  UChar c0 = *p;
103  UChar c1 = *(p+1);
104 
105  if (UTF16_IS_SURROGATE_FIRST(c1)) {
106  code = ((((c1 << 8) + c0) & 0x03ff) << 10)
107  + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
108  }
109  else {
110  code = c1 * 256 + p[0];
111  }
112  return code;
113 }
114 
115 static int
118 {
119  return (code > 0xffff ? 4 : 2);
120 }
121 
122 static int
125 {
126  UChar* p = buf;
127 
128  if (code > 0xffff) {
129  unsigned int high = (code >> 10) + 0xD7C0;
130  unsigned int low = (code & 0x3FF) + 0xDC00;
131  *p++ = high & 0xFF;
132  *p++ = (high >> 8) & 0xFF;
133  *p++ = low & 0xFF;
134  *p++ = (low >> 8) & 0xFF;
135  return 4;
136  }
137  else {
138  *p++ = (UChar )(code & 0xff);
139  *p++ = (UChar )((code & 0xff00) >> 8);
140  return 2;
141  }
142 }
143 
144 static int
146  const UChar** pp, const UChar* end, UChar* fold,
147  OnigEncoding enc)
148 {
149  const UChar* p = *pp;
150 
151  if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
152 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
153  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
154  if (*p == 0x49) {
155  *fold++ = 0x31;
156  *fold = 0x01;
157  (*pp) += 2;
158  return 2;
159  }
160  }
161 #endif
162 
163  *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
164  *fold = 0;
165  *pp += 2;
166  return 2;
167  }
168  else
169  return onigenc_unicode_mbc_case_fold(enc, flag, pp,
170  end, fold);
171 }
172 
/*
 * Disabled code (compiled out by #if 0).
 *
 * Advances *pp by the EncLen_UTF16 entry for the high-order byte, then,
 * for code units whose high-order byte is zero, decides TRUE/FALSE from
 * the low-order byte; all other characters yield FALSE.
 *
 * NOTE(review): the continuation of the
 * ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE( call is not present in this
 * listing, so the statement that assigns v is incomplete as shown.
 *
 * NOTE(review): `(v | BIT_CTYPE_LOWER) != 0` ORs in a constant, so it is
 * true whenever BIT_CTYPE_LOWER is nonzero and the final
 * `return (v != 0 ...)` would then be unreachable.  `&` was presumably
 * intended -- confirm before re-enabling this function.
 */
173 #if 0
174 static int
175 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
176  const UChar* end)
177 {
178  const UChar* p = *pp;
179 
180  (*pp) += EncLen_UTF16[*(p+1)];
181 
182  if (*(p+1) == 0) {
183  int c, v;
184 
185  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
186  return TRUE;
187  }
188 
189  c = *p;
190  v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
192  if ((v | BIT_CTYPE_LOWER) != 0) {
193  /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
194  if (c >= 0xaa && c <= 0xba)
195  return FALSE;
196  else
197  return TRUE;
198  }
199  return (v != 0 ? TRUE : FALSE);
200  }
201 
202  return FALSE;
203 }
204 #endif
205 
206 static UChar*
207 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
209 {
210  if (s <= start) return (UChar* )s;
211 
212  if ((s - start) % 2 == 1) {
213  s--;
214  }
215 
216  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
217  s -= 2;
218 
219  return (UChar* )s;
220 }
221 
/*
 * Case-fold candidate lookup for the string p..end, filling items.
 *
 * NOTE(review): the line carrying the function name and first parameter,
 * and the start of the statement that ends in `flag, p, end, items);`,
 * are not present in this listing.  The body appears to forward its
 * arguments to a single call -- confirm against the complete file.
 */
222 static int
224  const OnigUChar* p, const OnigUChar* end,
225  OnigCaseFoldCodeItem items[],
226  OnigEncoding enc)
227 {
229  flag, p, end, items);
230 }
231 
/*
 * Encoding descriptor for UTF-16LE.
 *
 * NOTE(review): only the name and the maximum/minimum byte lengths are
 * visible here; the other initializer entries are not present in this
 * listing.
 */
232 OnigEncodingDefine(utf_16le, UTF_16LE) = {
234  "UTF-16LE", /* name */
235  4, /* max byte length */
236  2, /* min byte length */
249 };
int onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
Definition: unicode.c:2042
unsigned int OnigCodePoint
Definition: oniguruma.h:111
static UChar * utf16le_left_adjust_char_head(const UChar *start, const UChar *s, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:207
#define FALSE
Definition: nkf.h:185
unsigned int OnigCaseFoldType
Definition: oniguruma.h:117
Definition: nkf.c:115
#define ARG_UNUSED
#define ONIGENC_IS_ASCII_CODE(code)
int onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar *name, UChar *end)
Definition: unicode.c:2086
#define UTF16_IS_SURROGATE_SECOND(c)
Definition: utf_16le.c:33
unsigned char OnigUChar
Definition: oniguruma.h:110
static const int EncLen_UTF16[]
Definition: utf_16le.c:36
Win32OLEIDispatch * p
Definition: win32ole.c:778
static OnigCodePoint utf16le_mbc_to_code(const UChar *p, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:98
int onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar **pp, const UChar *end, UChar *fold)
Definition: unicode.c:2219
static int utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar *p, const OnigUChar *end, OnigCaseFoldCodeItem items[], OnigEncoding enc)
Definition: utf_16le.c:223
OnigEncodingDefine(utf_16le, UTF_16LE)
int onigenc_always_false_is_allowed_reverse_match(const UChar *s ARG_UNUSED, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:656
static int utf16le_is_mbc_newline(const UChar *p, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:77
static int utf16le_mbc_case_fold(OnigCaseFoldType flag, const UChar **pp, const UChar *end, UChar *fold, OnigEncoding enc)
Definition: utf_16le.c:145
int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void *arg, OnigEncoding enc ARG_UNUSED)
Definition: unicode.c:2273
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)
Definition: oniguruma.h:250
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: oniguruma.h:123
#define TRUE
Definition: nkf.h:186
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:3913
#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n)
Definition: oniguruma.h:243
register unsigned int len
Definition: name2ctype.h:22210
#define UTF16_IS_SURROGATE(c)
Definition: utf_16le.c:34
#define BIT_CTYPE_UPPER
#define UChar
Definition: oniguruma.h:107
#define ONIGENC_CONSTRUCT_MBCLEN_INVALID()
Definition: oniguruma.h:247
static int utf16le_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:116
int onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar *p, const OnigUChar *end, OnigCaseFoldCodeItem items[])
Definition: unicode.c:2409
v
Definition: win32ole.c:790
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c)
#define UTF16_IS_SURROGATE_FIRST(c)
Definition: utf_16le.c:32
#define INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
Definition: oniguruma.h:124
static int utf16le_mbc_enc_len(const UChar *p, const OnigUChar *e, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:56
#define BIT_CTYPE_LOWER
int onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint *ranges[], struct OnigEncodingTypeST *enc ARG_UNUSED)
Definition: unicode.c:2073
static int utf16le_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
Definition: utf_16le.c:123