Ruby  1.9.3p448(2013-06-27revision41675)
iso_8859_1.c
Go to the documentation of this file.
1 /**********************************************************************
2  iso8859_1.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Element count of a true array (not a pointer), as an int.
 * The whole expansion is parenthesized so the macro behaves as a single
 * primary expression wherever it is used (e.g. as a `sizeof` operand).
 */
#define numberof(array) ((int)(sizeof(array) / sizeof((array)[0])))
33 
/*
 * Evaluates to 1 when the table word for `code` shares a set bit with
 * CTYPE_TO_BIT(ctype), otherwise 0.  `code` is used directly as an index
 * into the 256-entry EncISO_8859_1_CtypeTable, so it must be below 256.
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  (0 != (CTYPE_TO_BIT(ctype) & EncISO_8859_1_CtypeTable[code]))
36 
/*
 * Character-type flag words for ISO-8859-1, one entry per byte value
 * (index = byte, 0x00..0xff).  Individual flags are tested through
 * ENC_IS_ISO_8859_1_CTYPE().  Each row below holds eight entries; the
 * leading comment gives the byte value of the row's first entry.
 */
static const unsigned short EncISO_8859_1_CtypeTable[256] = {
  /* 0x00 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x08 */ 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  /* 0x10 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x18 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 */ 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x28 */ 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x30 */ 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  /* 0x38 */ 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 */ 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  /* 0x48 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x50 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x58 */ 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 */ 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  /* 0x68 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x70 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x78 */ 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  /* 0x80 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x88 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x90 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0x98 */ 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
  /* 0xa0 */ 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
  /* 0xa8 */ 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
  /* 0xb0 */ 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
  /* 0xb8 */ 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
  /* 0xc0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0xc8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
  /* 0xd0 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
  /* 0xd8 */ 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  /* 0xe0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0xe8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  /* 0xf0 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  /* 0xf8 */ 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
71 
73  { 0xc0, 0xe0 },
74  { 0xc1, 0xe1 },
75  { 0xc2, 0xe2 },
76  { 0xc3, 0xe3 },
77  { 0xc4, 0xe4 },
78  { 0xc5, 0xe5 },
79  { 0xc6, 0xe6 },
80  { 0xc7, 0xe7 },
81  { 0xc8, 0xe8 },
82  { 0xc9, 0xe9 },
83  { 0xca, 0xea },
84  { 0xcb, 0xeb },
85  { 0xcc, 0xec },
86  { 0xcd, 0xed },
87  { 0xce, 0xee },
88  { 0xcf, 0xef },
89 
90  { 0xd0, 0xf0 },
91  { 0xd1, 0xf1 },
92  { 0xd2, 0xf2 },
93  { 0xd3, 0xf3 },
94  { 0xd4, 0xf4 },
95  { 0xd5, 0xf5 },
96  { 0xd6, 0xf6 },
97  { 0xd8, 0xf8 },
98  { 0xd9, 0xf9 },
99  { 0xda, 0xfa },
100  { 0xdb, 0xfb },
101  { 0xdc, 0xfc },
102  { 0xdd, 0xfd },
103  { 0xde, 0xfe }
104 };
105 
106 static int
108  OnigApplyAllCaseFoldFunc f, void* arg,
110 {
112  numberof(CaseFoldMap), CaseFoldMap, 1,
113  flag, f, arg);
114 }
115 
116 static int
118  const OnigUChar* p, const OnigUChar* end,
119  OnigCaseFoldCodeItem items[],
121 {
122  if (0x41 <= *p && *p <= 0x5a) {
123  items[0].byte_len = 1;
124  items[0].code_len = 1;
125  items[0].code[0] = (OnigCodePoint )(*p + 0x20);
126  if (*p == 0x53 && end > p + 1
127  && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */
128  items[1].byte_len = 2;
129  items[1].code_len = 1;
130  items[1].code[0] = (OnigCodePoint )0xdf;
131  return 2;
132  }
133  else
134  return 1;
135  }
136  else if (0x61 <= *p && *p <= 0x7a) {
137  items[0].byte_len = 1;
138  items[0].code_len = 1;
139  items[0].code[0] = (OnigCodePoint )(*p - 0x20);
140  if (*p == 0x73 && end > p + 1
141  && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */
142  items[1].byte_len = 2;
143  items[1].code_len = 1;
144  items[1].code[0] = (OnigCodePoint )0xdf;
145  return 2;
146  }
147  else
148  return 1;
149  }
150  else if (0xc0 <= *p && *p <= 0xcf) {
151  items[0].byte_len = 1;
152  items[0].code_len = 1;
153  items[0].code[0] = (OnigCodePoint )(*p + 0x20);
154  return 1;
155  }
156  else if (0xd0 <= *p && *p <= 0xdf) {
157  if (*p == 0xdf) {
158  items[0].byte_len = 1;
159  items[0].code_len = 2;
160  items[0].code[0] = (OnigCodePoint )'s';
161  items[0].code[1] = (OnigCodePoint )'s';
162 
163  items[1].byte_len = 1;
164  items[1].code_len = 2;
165  items[1].code[0] = (OnigCodePoint )'S';
166  items[1].code[1] = (OnigCodePoint )'S';
167 
168  items[2].byte_len = 1;
169  items[2].code_len = 2;
170  items[2].code[0] = (OnigCodePoint )'s';
171  items[2].code[1] = (OnigCodePoint )'S';
172 
173  items[3].byte_len = 1;
174  items[3].code_len = 2;
175  items[3].code[0] = (OnigCodePoint )'S';
176  items[3].code[1] = (OnigCodePoint )'s';
177 
178  return 4;
179  }
180  else if (*p != 0xd7) {
181  items[0].byte_len = 1;
182  items[0].code_len = 1;
183  items[0].code[0] = (OnigCodePoint )(*p + 0x20);
184  return 1;
185  }
186  }
187  else if (0xe0 <= *p && *p <= 0xef) {
188  items[0].byte_len = 1;
189  items[0].code_len = 1;
190  items[0].code[0] = (OnigCodePoint )(*p - 0x20);
191  return 1;
192  }
193  else if (0xf0 <= *p && *p <= 0xfe) {
194  if (*p != 0xf7) {
195  items[0].byte_len = 1;
196  items[0].code_len = 1;
197  items[0].code[0] = (OnigCodePoint )(*p - 0x20);
198  return 1;
199  }
200  }
201 
202  return 0;
203 }
204 
205 static int
206 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED,
207  UChar* lower, OnigEncoding enc ARG_UNUSED)
208 {
209  const UChar* p = *pp;
210 
211  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
212  *lower++ = 's';
213  *lower = 's';
214  (*pp)++;
215  return 2;
216  }
217 
219  (*pp)++;
220  return 1;
221 }
222 
#if 0
/*
 * Compiled out by the surrounding `#if 0`; kept for reference.
 *
 * Reports whether the character at *pp has a case counterpart, and
 * advances *pp by one byte.  0xdf counts as ambiguous when `flag` has
 * INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR set.
 *
 * NOTE(review): the lower-case exclusion below is the range 0xaa..0xba,
 * while the inline comment names individual codes (and 0xdf); confirm the
 * intended set before re-enabling this function.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  int v;
  const UChar* p = *pp;

  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    (*pp)++;
    return TRUE;
  }

  (*pp)++;
  /* Keep only the upper/lower flags of this byte. */
  v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  /* Must be a bit test: OR-ing in BIT_CTYPE_LOWER would always be nonzero. */
  if ((v & BIT_CTYPE_LOWER) != 0) {
    /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    if (*p >= 0xaa && *p <= 0xba)
      return FALSE;
    else
      return TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
249 
250 static int
251 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
252 {
253  if (code < 256)
254  return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
255  else
256  return FALSE;
257 }
258 
259 OnigEncodingDefine(iso_8859_1, ISO_8859_1) = {
261  "ISO-8859-1", /* name */
262  1, /* max enc length */
263  1, /* min enc length */
276 };
277 ENC_ALIAS("ISO8859-1", "ISO-8859-1")
278 
279 /*
280  * Name: windows-1252
281  * MIBenum: 2252
282  * Link: http://www.iana.org/assignments/character-sets
283  * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx
284  * Link: http://en.wikipedia.org/wiki/Windows-1252
285  */
286 ENC_REPLICATE("Windows-1252", "ISO-8859-1")
287 ENC_ALIAS("CP1252", "Windows-1252")
static int is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
Definition: iso_8859_1.c:251
unsigned int OnigCodePoint
Definition: oniguruma.h:111
int onigenc_always_true_is_allowed_reverse_match(const UChar *s ARG_UNUSED, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:649
#define FALSE
Definition: nkf.h:185
OnigCodePoint onigenc_single_byte_mbc_to_code(const UChar *p, const UChar *end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:619
#define numberof(array)
Definition: iso_8859_1.c:32
unsigned int OnigCaseFoldType
Definition: oniguruma.h:117
static int get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar *p, const OnigUChar *end, OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
Definition: iso_8859_1.c:117
#define ARG_UNUSED
#define ENC_ALIAS(name, orig)
Definition: encdb.c:18
unsigned char OnigUChar
Definition: oniguruma.h:110
Win32OLEIDispatch * p
Definition: win32ole.c:778
static int apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void *arg, OnigEncoding enc ARG_UNUSED)
Definition: iso_8859_1.c:107
#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c)
static const unsigned short EncISO_8859_1_CtypeTable[256]
Definition: iso_8859_1.c:37
int(* OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint *to, int to_len, void *arg)
Definition: oniguruma.h:154
#define ENC_IS_ISO_8859_1_CTYPE(code, ctype)
Definition: iso_8859_1.c:34
int onigenc_apply_all_case_fold_with_map(int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void *arg)
Definition: regenc.c:462
int onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:626
arg
Definition: ripper.y:1283
UChar * onigenc_single_byte_left_adjust_char_head(const UChar *start ARG_UNUSED, const UChar *s, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:641
int onigenc_is_mbc_newline_0x0a(const UChar *p, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:580
#define TRUE
Definition: nkf.h:186
static int mbc_case_fold(OnigCaseFoldType flag, const UChar **pp, const UChar *end ARG_UNUSED, UChar *lower, OnigEncoding enc ARG_UNUSED)
Definition: iso_8859_1.c:206
int onigenc_single_byte_mbc_enc_len(const UChar *p ARG_UNUSED, const UChar *e ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:612
int onigenc_not_support_get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint *ranges[], OnigEncoding enc)
Definition: regenc.c:572
#define BIT_CTYPE_UPPER
#define UChar
Definition: oniguruma.h:107
#define ENC_REPLICATE(name, orig)
Definition: encdb.c:17
static const OnigPairCaseFoldCodes CaseFoldMap[]
Definition: iso_8859_1.c:72
int onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:632
v
Definition: win32ole.c:790
OnigCodePoint code[ONIGENC_MAX_COMP_CASE_FOLD_CODE_LEN]
Definition: oniguruma.h:142
#define INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
Definition: oniguruma.h:124
#define BIT_CTYPE_LOWER
OnigEncodingDefine(iso_8859_1, ISO_8859_1)
int onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar *p, UChar *end)
Definition: regenc.c:790
Definition: nkf.c:88