Ruby 1.9.3p448 (2013-06-27 revision 41675)
transcode.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author: usa $
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "internal.h"
15 #include "transcode_data.h"
16 #include <ctype.h>
17 
18 #define ENABLE_ECONV_NEWLINE_OPTION 1
19 
20 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
24 
26 
32 #ifdef ENABLE_ECONV_NEWLINE_OPTION
34 #endif
36 
44 
45 static unsigned char *
46 allocate_converted_string(const char *sname, const char *dname,
47  const unsigned char *str, size_t len,
48  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
49  size_t *dst_len_ptr);
50 
51 /* dynamic structure, one per conversion (similar to iconv_t) */
52 /* may carry conversion state (e.g. for iso-2022-jp) */
53 typedef struct rb_transcoding {
55 
56  int flags;
57 
59  unsigned int next_table;
61  unsigned char next_byte;
62  unsigned int output_index;
63 
64  ssize_t recognized_len; /* already interpreted */
65  ssize_t readagain_len; /* not yet interpreted */
66  union {
67  unsigned char ary[8]; /* max_input <= sizeof(ary) */
68  unsigned char *ptr; /* length: max_input */
69  } readbuf; /* recognized_len + readagain_len used */
70 
71  ssize_t writebuf_off;
72  ssize_t writebuf_len;
73  union {
74  unsigned char ary[8]; /* max_output <= sizeof(ary) */
75  unsigned char *ptr; /* length: max_output */
76  } writebuf;
77 
78  union rb_transcoding_state_t { /* opaque data for stateful encoding */
79  void *ptr;
80  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
82  } state;
/* Read buffer of tc: the embedded readbuf.ary when the transcoder's
 * max_input fits into it, otherwise the separate readbuf.ptr buffer. */
84 #define TRANSCODING_READBUF(tc) \
85  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
86  (tc)->readbuf.ary : \
87  (tc)->readbuf.ptr)
/* Write buffer of tc, selected the same way using max_output. */
88 #define TRANSCODING_WRITEBUF(tc) \
89  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
90  (tc)->writebuf.ary : \
91  (tc)->writebuf.ptr)
/* Capacity of the buffer chosen by TRANSCODING_WRITEBUF: the size of the
 * embedded array, or max_output when the pointer variant is in use. */
92 #define TRANSCODING_WRITEBUF_SIZE(tc) \
93  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
94  sizeof((tc)->writebuf.ary) : \
95  (size_t)(tc)->transcoder->max_output)
/* Largest state_size that can be kept inside the state union itself. */
96 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* State area of tc: embedded state.ary when state_size fits into the
 * union, otherwise state.ptr. */
97 #define TRANSCODING_STATE(tc) \
98  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
99  (tc)->state.ary : \
100  (tc)->state.ptr)
101 
102 typedef struct {
104  unsigned char *out_buf_start;
105  unsigned char *out_data_start;
106  unsigned char *out_data_end;
107  unsigned char *out_buf_end;
110 
111 struct rb_econv_t {
112  int flags;
113  const char *source_encoding_name;
115 
116  int started;
117 
118  const unsigned char *replacement_str;
120  const char *replacement_enc;
122 
123  unsigned char *in_buf_start;
124  unsigned char *in_data_start;
125  unsigned char *in_data_end;
126  unsigned char *in_buf_end;
132 
133  /* last error */
134  struct {
137  const char *source_encoding;
138  const char *destination_encoding;
139  const unsigned char *error_bytes_start;
142  } last_error;
143 
144  /* The following fields are only for Encoding::Converter.
145  * rb_econv_open set them NULL. */
148 };
149 
150 /*
151  * Dispatch data and logic
152  */
153 
154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
155 
156 typedef struct {
157  const char *sname;
158  const char *dname;
159  const char *lib; /* null means no need to load a library */
162 
164 
/*
 * Return the entry stored for the (sname, dname) pair, adding one to the
 * tables when the lookups miss.
 *
 * transcoder_table is searched by sname; the value found there is used as
 * a second table that is searched by dname.  A newly added entry has
 * lib and transcoder set to NULL.
 *
 * NOTE(review): this listing appears to be missing the statements that
 * assign `val` before the first st_add_direct and that declare/allocate
 * `entry` -- confirm against the full source.
 */
165 static transcoder_entry_t *
166 make_transcoder_entry(const char *sname, const char *dname)
167 {
168  st_data_t val;
169  st_table *table2;
170 
 /* first level: keyed by source name */
171  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
173  st_add_direct(transcoder_table, (st_data_t)sname, val);
174  }
175  table2 = (st_table *)val;
 /* second level: keyed by destination name */
176  if (!st_lookup(table2, (st_data_t)dname, &val)) {
178  entry->sname = sname;
179  entry->dname = dname;
180  entry->lib = NULL;
181  entry->transcoder = NULL;
182  val = (st_data_t)entry;
183  st_add_direct(table2, (st_data_t)dname, val);
184  }
185  return (transcoder_entry_t *)val;
186 }
187 
188 static transcoder_entry_t *
189 get_transcoder_entry(const char *sname, const char *dname)
190 {
191  st_data_t val;
192  st_table *table2;
193 
194  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
195  return NULL;
196  }
197  table2 = (st_table *)val;
198  if (!st_lookup(table2, (st_data_t)dname, &val)) {
199  return NULL;
200  }
201  return (transcoder_entry_t *)val;
202 }
203 
204 void
206 {
207  const char *const sname = tr->src_encoding;
208  const char *const dname = tr->dst_encoding;
209 
210  transcoder_entry_t *entry;
211 
212  entry = make_transcoder_entry(sname, dname);
213  if (entry->transcoder) {
214  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
215  sname, dname);
216  }
217 
218  entry->transcoder = tr;
219 }
220 
221 static void
222 declare_transcoder(const char *sname, const char *dname, const char *lib)
223 {
224  transcoder_entry_t *entry;
225 
226  entry = make_transcoder_entry(sname, dname);
227  entry->lib = lib;
228 }
229 
230 #define MAX_TRANSCODER_LIBNAME_LEN 64
231 static const char transcoder_lib_prefix[] = "enc/trans/";
232 
233 void
234 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
235 {
236  if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
237  rb_raise(rb_eArgError, "invalid library name - %s",
238  lib ? lib : "(null)");
239  }
240  declare_transcoder(enc1, enc2, lib);
241 }
242 
243 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
244 
245 typedef struct search_path_queue_tag {
247  const char *enc;
249 
250 typedef struct {
254  const char *base_enc;
256 
257 static int
259 {
260  const char *dname = (const char *)key;
261  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
263 
264  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
265  return ST_CONTINUE;
266  }
267 
269  q->enc = dname;
270  q->next = NULL;
271  *bfs->queue_last_ptr = q;
272  bfs->queue_last_ptr = &q->next;
273 
274  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
275  return ST_CONTINUE;
276 }
277 
/*
 * Search for a chain of registered transcoders leading from sname to dname.
 *
 * Encodings are taken from a queue one at a time; the search stops at the
 * first one whose second-level table contains dname.  bfs.visited maps an
 * encoding name to the name it was reached from, and is walked backwards
 * from dname afterwards.
 *
 * callback is invoked once per step as callback(from, to, depth, arg),
 * starting with the step that ends in dname (depth == pathlen-1) and
 * counting depth down.
 *
 * Returns the number of steps, or -1 when sname and dname compare equal
 * under encoding_equal() or no chain is found.
 *
 * NOTE(review): this listing is missing several lines (declaration and
 * allocation of `q`, creation/seeding of bfs.visited, and the statement
 * between the two bfs.base_enc assignments) -- confirm against the full
 * source.
 */
278 static int
279 transcode_search_path(const char *sname, const char *dname,
280  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
281  void *arg)
282 {
283  search_path_bfs_t bfs;
285  st_data_t val;
286  st_table *table2;
287  int found;
288  int pathlen = -1;
289 
290  if (encoding_equal(sname, dname))
291  return -1;
292 
 /* seed the queue with the source encoding */
294  q->enc = sname;
295  q->next = NULL;
296  bfs.queue_last_ptr = &q->next;
297  bfs.queue = q;
298 
301 
302  while (bfs.queue) {
 /* pop the head; reset the tail pointer when the queue becomes empty */
303  q = bfs.queue;
304  bfs.queue = q->next;
305  if (!bfs.queue)
306  bfs.queue_last_ptr = &bfs.queue;
307 
308  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
309  xfree(q);
310  continue;
311  }
312  table2 = (st_table *)val;
313 
 /* direct conversion from q->enc to dname exists: record it and stop */
314  if (st_lookup(table2, (st_data_t)dname, &val)) {
315  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
316  xfree(q);
317  found = 1;
318  goto cleanup;
319  }
320 
321  bfs.base_enc = q->enc;
323  bfs.base_enc = NULL;
324 
325  xfree(q);
326  }
327  found = 0;
328 
329  cleanup:
 /* release queue nodes that were never processed */
330  while (bfs.queue) {
331  q = bfs.queue;
332  bfs.queue = q->next;
333  xfree(q);
334  }
335 
336  if (found) {
337  const char *enc = dname;
338  int depth;
339  pathlen = 0;
 /* first pass: count steps by following visited links until a zero value */
340  while (1) {
341  st_lookup(bfs.visited, (st_data_t)enc, &val);
342  if (!val)
343  break;
344  pathlen++;
345  enc = (const char *)val;
346  }
347  depth = pathlen;
348  enc = dname;
 /* second pass: report each step, last step first */
349  while (1) {
350  st_lookup(bfs.visited, (st_data_t)enc, &val);
351  if (!val)
352  break;
353  callback((const char *)val, enc, --depth, arg);
354  enc = (const char *)val;
355  }
356  }
357 
358  st_free_table(bfs.visited);
359 
360  return pathlen; /* is -1 if not found */
361 }
362 
363 static const rb_transcoder *
365 {
366  if (entry->transcoder)
367  return entry->transcoder;
368 
369  if (entry->lib) {
370  const char *lib = entry->lib;
371  size_t len = strlen(lib);
373  VALUE fn;
374  const int safe = rb_safe_level();
375 
376  entry->lib = NULL;
377 
378  if (len > MAX_TRANSCODER_LIBNAME_LEN)
379  return NULL;
380  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
381  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
382  fn = rb_str_new2(path);
384  OBJ_FREEZE(fn);
385  if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
386  return NULL;
387  }
388 
389  if (entry->transcoder)
390  return entry->transcoder;
391 
392  return NULL;
393 }
394 
/*
 * Pick the replacement character used for the encoding named encname.
 *
 * When encname matches "UTF-8" (via encoding_equal) the result is the
 * three-byte sequence EF BF BD, reported as UTF-8; for every other name
 * it is "?", reported as US-ASCII.
 *
 * len_ret          receives the byte length of the returned string.
 * repl_encname_ptr receives the encoding name of the returned string.
 * Returns a pointer to a string literal.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
409 
410 /*
411  * Transcoding engine logic
412  */
413 
414 static const unsigned char *
416  const unsigned char *in_start,
417  const unsigned char *inchar_start,
418  const unsigned char *in_p,
419  size_t *char_len_ptr)
420 {
421  const unsigned char *ptr;
422  if (inchar_start - in_start < tc->recognized_len) {
424  inchar_start, unsigned char, in_p - inchar_start);
425  ptr = TRANSCODING_READBUF(tc);
426  }
427  else {
428  ptr = inchar_start - tc->recognized_len;
429  }
430  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
431  return ptr;
432 }
433 
/*
 * Resumable conversion loop for a single transcoding `tc`.
 *
 * Input is read from *in_pos up to in_stop and output is written from
 * *out_pos up to out_stop.  Whenever the function has to hand control
 * back to the caller it goes through SUSPEND, which stores a resume
 * number in tc->resume_position, publishes the current positions through
 * in_pos/out_pos and returns.  The next call jumps straight back to the
 * matching resume_label via the switch on tc->resume_position.
 *
 * opt is tested for ECONV_PARTIAL_INPUT and ECONV_AFTER_OUTPUT.
 *
 * NOTE(review): this listing is missing many source lines (gaps in the
 * embedded numbering, e.g. the table lookup after follow_byte, the STR1
 * loop header, the writebuf-based function calls and the bodies after the
 * invalid/incomplete/undef labels) -- consult the full source before
 * changing anything here.
 */
434 static rb_econv_result_t
435 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
436  const unsigned char *in_stop, unsigned char *out_stop,
437  rb_transcoding *tc,
438  const int opt)
439 {
440  const rb_transcoder *tr = tc->transcoder;
441  int unitlen = tr->input_unit_length;
442  ssize_t readagain_len = 0;
443 
444  const unsigned char *inchar_start;
445  const unsigned char *in_p;
446 
447  unsigned char *out_p;
448 
449  in_p = inchar_start = *in_pos;
450 
451  out_p = *out_pos;
452 
 /* SUSPEND(ret, num): remember resume point `num`, copy the bytes of the
  * current character consumed so far into the read buffer, publish the
  * in/out positions, account for bytes to be read again, and return `ret`.
  * Execution continues at resume_label<num> on the next call. */
453 #define SUSPEND(ret, num) \
454  do { \
455  tc->resume_position = (num); \
456  if (0 < in_p - inchar_start) \
457  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
458  inchar_start, unsigned char, in_p - inchar_start); \
459  *in_pos = in_p; \
460  *out_pos = out_p; \
461  tc->recognized_len += in_p - inchar_start; \
462  if (readagain_len) { \
463  tc->recognized_len -= readagain_len; \
464  tc->readagain_len = readagain_len; \
465  } \
466  return (ret); \
467  resume_label ## num:; \
468  } while (0)
 /* SUSPEND_OBUF(num): suspend with econv_destination_buffer_full for as
  * long as there is no room for at least one output byte. */
469 #define SUSPEND_OBUF(num) \
470  do { \
471  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
472  } while (0)
473 
 /* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and
  * output was produced during this call, suspend with econv_after_output. */
474 #define SUSPEND_AFTER_OUTPUT(num) \
475  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
476  SUSPEND(econv_after_output, num); \
477  }
478 
 /* Shorthands for fields of tc; the values live in tc rather than in
  * locals, so they are still there when the function is re-entered. */
479 #define next_table (tc->next_table)
480 #define next_info (tc->next_info)
481 #define next_byte (tc->next_byte)
482 #define writebuf_len (tc->writebuf_len)
483 #define writebuf_off (tc->writebuf_off)
484 
 /* re-entry dispatch: 0 starts from the top, any other value jumps to the
  * label left behind by the SUSPEND that returned last time */
485  switch (tc->resume_position) {
486  case 0: break;
487  case 1: goto resume_label1;
488  case 2: goto resume_label2;
489  case 3: goto resume_label3;
490  case 4: goto resume_label4;
491  case 5: goto resume_label5;
492  case 6: goto resume_label6;
493  case 7: goto resume_label7;
494  case 8: goto resume_label8;
495  case 9: goto resume_label9;
496  case 10: goto resume_label10;
497  case 11: goto resume_label11;
498  case 12: goto resume_label12;
499  case 13: goto resume_label13;
500  case 14: goto resume_label14;
501  case 15: goto resume_label15;
502  case 16: goto resume_label16;
503  case 17: goto resume_label17;
504  case 18: goto resume_label18;
505  case 19: goto resume_label19;
506  case 20: goto resume_label20;
507  case 21: goto resume_label21;
508  case 22: goto resume_label22;
509  case 23: goto resume_label23;
510  case 24: goto resume_label24;
511  case 25: goto resume_label25;
512  case 26: goto resume_label26;
513  case 27: goto resume_label27;
514  case 28: goto resume_label28;
515  case 29: goto resume_label29;
516  case 30: goto resume_label30;
517  case 31: goto resume_label31;
518  case 32: goto resume_label32;
519  case 33: goto resume_label33;
520  case 34: goto resume_label34;
521  }
522 
 /* main loop: one iteration per input character */
523  while (1) {
524  inchar_start = in_p;
525  tc->recognized_len = 0;
527 
529 
530  if (in_stop <= in_p) {
531  if (!(opt & ECONV_PARTIAL_INPUT))
532  break;
534  continue;
535  }
536 
537 #define BYTE_ADDR(index) (tr->byte_array + (index))
538 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
539 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
540 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
541 #define BL_MIN_BYTE (BL_BASE[0])
542 #define BL_MAX_BYTE (BL_BASE[1])
543 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
544 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
545 
546  next_byte = (unsigned char)*in_p++;
547  follow_byte:
549  next_info = INVALID;
550  else {
552  }
553  follow_info:
 /* the low five bits of next_info select the action */
554  switch (next_info & 0x1F) {
555  case NOMAP:
556  {
 /* copy the input bytes of this character through the write buffer */
557  const unsigned char *p = inchar_start;
558  writebuf_off = 0;
559  while (p < in_p) {
560  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
561  }
563  writebuf_off = 0;
564  while (writebuf_off < writebuf_len) {
565  SUSPEND_OBUF(3);
566  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
567  }
568  }
569  continue;
570  case 0x00: case 0x04: case 0x08: case 0x0C:
571  case 0x10: case 0x14: case 0x18: case 0x1C:
 /* next_info becomes the next table: fetch another byte and look it up */
573  while (in_p >= in_stop) {
574  if (!(opt & ECONV_PARTIAL_INPUT))
575  goto incomplete;
577  }
578  next_byte = (unsigned char)*in_p++;
579  next_table = (unsigned int)next_info;
580  goto follow_byte;
581  case ZERObt: /* drop input */
582  continue;
583  case ONEbt:
584  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
585  continue;
586  case TWObt:
587  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
588  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
589  continue;
590  case THREEbt:
591  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
592  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
593  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
594  continue;
595  case FOURbt:
596  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
597  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
598  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
599  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
600  continue;
601  case GB4bt:
602  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
603  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
604  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
605  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
606  continue;
607  case STR1:
608  tc->output_index = 0;
611  tc->output_index++;
612  }
613  continue;
614  case FUNii:
616  goto follow_info;
617  case FUNsi:
618  {
 /* func_si maps the input character to a new info value, which is then
  * dispatched again */
619  const unsigned char *char_start;
620  size_t char_len;
621  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
622  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
623  goto follow_info;
624  }
625  case FUNio:
626  SUSPEND_OBUF(13);
 /* write directly when max_output bytes fit; otherwise drain the write
  * buffer byte by byte */
627  if (tr->max_output <= out_stop - out_p)
628  out_p += tr->func_io(TRANSCODING_STATE(tc),
629  next_info, out_p, out_stop - out_p);
630  else {
632  next_info,
634  writebuf_off = 0;
635  while (writebuf_off < writebuf_len) {
636  SUSPEND_OBUF(20);
637  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
638  }
639  }
640  break;
641  case FUNso:
642  {
643  const unsigned char *char_start;
644  size_t char_len;
645  SUSPEND_OBUF(14);
646  if (tr->max_output <= out_stop - out_p) {
647  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
648  out_p += tr->func_so(TRANSCODING_STATE(tc),
649  char_start, (size_t)char_len,
650  out_p, out_stop - out_p);
651  }
652  else {
653  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
655  char_start, (size_t)char_len,
657  writebuf_off = 0;
658  while (writebuf_off < writebuf_len) {
659  SUSPEND_OBUF(22);
660  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
661  }
662  }
663  break;
664  }
665  case FUNsio:
666  {
667  const unsigned char *char_start;
668  size_t char_len;
669  SUSPEND_OBUF(33);
670  if (tr->max_output <= out_stop - out_p) {
671  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
672  out_p += tr->func_sio(TRANSCODING_STATE(tc),
673  char_start, (size_t)char_len, next_info,
674  out_p, out_stop - out_p);
675  }
676  else {
677  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
679  char_start, (size_t)char_len, next_info,
681  writebuf_off = 0;
682  while (writebuf_off < writebuf_len) {
683  SUSPEND_OBUF(34);
684  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
685  }
686  }
687  break;
688  }
689  case INVALID:
 /* work out how many bytes belong to the invalid sequence, in multiples
  * of the input unit length, and how many must be read again */
690  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
691  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
693  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
694  in_p = in_stop;
696  }
697  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
698  in_p = in_stop;
699  }
700  else {
701  in_p = inchar_start + (unitlen - tc->recognized_len);
702  }
703  }
704  else {
705  ssize_t invalid_len; /* including the last byte which causes invalid */
706  ssize_t discard_len;
707  invalid_len = tc->recognized_len + (in_p - inchar_start);
708  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
709  readagain_len = invalid_len - discard_len;
710  }
711  goto invalid;
712  case UNDEF:
713  goto undef;
714  default:
715  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
716  }
717  continue;
718 
719  invalid:
721  continue;
722 
723  incomplete:
725  continue;
726 
727  undef:
729  continue;
730  }
731 
732  /* cleanup */
733  if (tr->finish_func) {
734  SUSPEND_OBUF(4);
735  if (tr->max_output <= out_stop - out_p) {
736  out_p += tr->finish_func(TRANSCODING_STATE(tc),
737  out_p, out_stop - out_p);
738  }
739  else {
742  writebuf_off = 0;
743  while (writebuf_off < writebuf_len) {
744  SUSPEND_OBUF(23);
745  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
746  }
747  }
748  }
749  while (1)
751 #undef SUSPEND
752 #undef next_table
753 #undef next_info
754 #undef next_byte
755 #undef writebuf_len
756 #undef writebuf_off
757 }
758 
/*
 * Front end for transcode_restartable0() that first replays bytes marked
 * for re-reading.
 *
 * When tc->readagain_len is non-zero, those bytes are copied from the
 * read buffer (starting at offset tc->recognized_len) into a stack
 * buffer, tc->readagain_len is cleared, and the copy is converted with
 * ECONV_PARTIAL_INPUT added to opt.  Any result other than
 * econv_source_buffer_empty is returned to the caller after the count of
 * unconsumed replay bytes has been added back to tc->readagain_len.
 * Otherwise the caller's own input is converted.
 *
 * NOTE(review): the first line of the statement that saves the unconsumed
 * replay bytes is missing from this listing.
 */
759 static rb_econv_result_t
760 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
761  const unsigned char *in_stop, unsigned char *out_stop,
762  rb_transcoding *tc,
763  const int opt)
764 {
765  if (tc->readagain_len) {
766  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
767  const unsigned char *readagain_pos = readagain_buf;
768  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
769  rb_econv_result_t res;
770 
771  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
772  unsigned char, tc->readagain_len);
773  tc->readagain_len = 0;
774  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
775  if (res != econv_source_buffer_empty) {
777  readagain_pos, unsigned char, readagain_stop - readagain_pos);
778  tc->readagain_len += readagain_stop - readagain_pos;
779  return res;
780  }
781  }
782  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
783 }
784 
785 static rb_transcoding *
787 {
788  rb_transcoding *tc;
789 
790  tc = ALLOC(rb_transcoding);
791  tc->transcoder = tr;
792  tc->flags = flags;
793  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
794  tc->state.ptr = xmalloc(tr->state_size);
795  if (tr->state_init_func) {
796  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
797  }
798  tc->resume_position = 0;
799  tc->recognized_len = 0;
800  tc->readagain_len = 0;
801  tc->writebuf_len = 0;
802  tc->writebuf_off = 0;
803  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
804  tc->readbuf.ptr = xmalloc(tr->max_input);
805  }
806  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
807  tc->writebuf.ptr = xmalloc(tr->max_output);
808  }
809  return tc;
810 }
811 
812 static rb_econv_result_t
814  const unsigned char **input_ptr, const unsigned char *input_stop,
815  unsigned char **output_ptr, unsigned char *output_stop,
816  int flags)
817 {
818  return transcode_restartable(
819  input_ptr, output_ptr,
820  input_stop, output_stop,
821  tc, flags);
822 }
823 
824 static void
826 {
827  const rb_transcoder *tr = tc->transcoder;
828  if (tr->state_fini_func) {
829  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
830  }
831  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
832  xfree(tc->state.ptr);
833  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
834  xfree(tc->readbuf.ptr);
835  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
836  xfree(tc->writebuf.ptr);
837  xfree(tc);
838 }
839 
840 static size_t
842 {
843  size_t size = sizeof(rb_transcoding);
844  const rb_transcoder *tr = tc->transcoder;
845 
846  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
847  size += tr->state_size;
848  }
849  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
850  size += tr->max_input;
851  }
852  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
853  size += tr->max_output;
854  }
855  return size;
856 }
857 
/*
 * Allocate an rb_econv_t and put its fields into an initial state:
 * counters and flags are zero, pointers are NULL.
 *
 * n_hint is stored in num_allocated; values <= 0 are replaced by 1.
 * Returns the new object.
 *
 * NOTE(review): several field initialisations are missing from this
 * listing (gaps in the embedded numbering), so the visible assignments
 * are not the complete set.
 */
858 static rb_econv_t *
859 rb_econv_alloc(int n_hint)
860 {
861  rb_econv_t *ec;
862 
863  if (n_hint <= 0)
864  n_hint = 1;
865 
866  ec = ALLOC(rb_econv_t);
867  ec->flags = 0;
870  ec->started = 0;
871  ec->replacement_str = NULL;
872  ec->replacement_len = 0;
873  ec->replacement_enc = NULL;
874  ec->replacement_allocated = 0;
875  ec->in_buf_start = NULL;
876  ec->in_data_start = NULL;
877  ec->in_data_end = NULL;
878  ec->in_buf_end = NULL;
879  ec->num_allocated = n_hint;
880  ec->num_trans = 0;
882  ec->num_finished = 0;
883  ec->last_tc = NULL;
885  ec->last_error.error_tc = NULL;
889  ec->last_error.error_bytes_len = 0;
890  ec->last_error.readagain_len = 0;
891  ec->source_encoding = NULL;
893  return ec;
894 }
895 
896 static int
898 {
899  int n, j;
900  int bufsize = 4096;
901  unsigned char *p;
902 
903  if (ec->num_trans == ec->num_allocated) {
904  n = ec->num_allocated * 2;
906  ec->num_allocated = n;
907  }
908 
909  p = xmalloc(bufsize);
910 
911  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
912 
914  ec->elems[i].out_buf_start = p;
915  ec->elems[i].out_buf_end = p + bufsize;
916  ec->elems[i].out_data_start = p;
917  ec->elems[i].out_data_end = p;
919 
920  ec->num_trans++;
921 
922  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
923  for (j = ec->num_trans-1; i <= j; j--) {
924  rb_transcoding *tc = ec->elems[j].tc;
925  const rb_transcoder *tr2 = tc->transcoder;
926  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
927  ec->last_tc = tc;
928  break;
929  }
930  }
931 
932  return 0;
933 }
934 
935 static rb_econv_t *
937 {
938  rb_econv_t *ec;
939  int i, ret;
940 
941  for (i = 0; i < n; i++) {
942  const rb_transcoder *tr;
943  tr = load_transcoder_entry(entries[i]);
944  if (!tr)
945  return NULL;
946  }
947 
948  ec = rb_econv_alloc(n);
949 
950  for (i = 0; i < n; i++) {
951  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
952  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
953  if (ret == -1) {
954  rb_econv_close(ec);
955  return NULL;
956  }
957  }
958 
959  return ec;
960 }
961 
962 struct trans_open_t {
965 };
966 
967 static void
968 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
969 {
970  struct trans_open_t *toarg = arg;
971 
972  if (!toarg->entries) {
973  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
974  }
975  toarg->entries[depth] = get_transcoder_entry(sname, dname);
976 }
977 
/*
 * Open a converter from sname to dname without decorators.
 *
 * When both names are empty the converter is built from zero entries.
 * Otherwise transcode_search_path() collects the entries through
 * trans_open_i(); a negative result frees the entries and returns NULL.
 * On success the converter's flags and source/destination encoding names
 * are set from the arguments (the name pointers are stored, not copied).
 *
 * Returns the converter, or NULL when no path exists or
 * rb_econv_open_by_transcoder_entries() fails.
 *
 * NOTE(review): the declaration of `entries` is missing from this
 * listing.  senc/denc are looked up below but not used again in the
 * visible code.
 */
978 static rb_econv_t *
979 rb_econv_open0(const char *sname, const char *dname, int ecflags)
980 {
982  int num_trans;
983  rb_econv_t *ec;
984 
985  rb_encoding *senc, *denc;
986  int sidx, didx;
987 
988  senc = NULL;
989  if (*sname) {
990  sidx = rb_enc_find_index(sname);
991  if (0 <= sidx) {
992  senc = rb_enc_from_index(sidx);
993  }
994  }
995 
996  denc = NULL;
997  if (*dname) {
998  didx = rb_enc_find_index(dname);
999  if (0 <= didx) {
1000  denc = rb_enc_from_index(didx);
1001  }
1002  }
1003 
1004  if (*sname == '\0' && *dname == '\0') {
1005  num_trans = 0;
1006  entries = NULL;
1007  }
1008  else {
1009  struct trans_open_t toarg;
1010  toarg.entries = NULL;
1011  toarg.num_additional = 0;
1012  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1013  entries = toarg.entries;
1014  if (num_trans < 0) {
1015  xfree(entries);
1016  return NULL;
1017  }
1018  }
1019 
 /* the entries array is only needed while the converter is being built */
1020  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1021  xfree(entries);
1022  if (!ec)
1023  return NULL;
1024 
1025  ec->flags = ecflags;
1026  ec->source_encoding_name = sname;
1027  ec->destination_encoding_name = dname;
1028 
1029  return ec;
1030 }
1031 
1032 #define MAX_ECFLAGS_DECORATORS 32
1033 
/*
 * Translate the decorator bits of ecflags into decorator names.
 *
 * Names are written to decorators_ret in a fixed order: the XML
 * decorators first (text escape, attribute content escape, attribute
 * quote), then the newline decorators (crlf, cr, universal).
 *
 * Returns the number of names stored, or -1 when the flag combination is
 * rejected by one of the checks at the top.
 *
 * NOTE(review): some case labels of the newline switch and the second
 * half of the XML condition are missing from this listing.
 */
1034 static int
1035 decorator_names(int ecflags, const char **decorators_ret)
1036 {
1037  int num_decorators;
1038 
1039  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1043  case 0:
1044  break;
1045  default:
1046  return -1;
1047  }
1048 
1049  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1051  return -1;
1052 
1053  num_decorators = 0;
1054 
1055  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1056  decorators_ret[num_decorators++] = "xml_text_escape";
1057  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1058  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1059  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1060  decorators_ret[num_decorators++] = "xml_attr_quote";
1061 
1062  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1063  decorators_ret[num_decorators++] = "crlf_newline";
1064  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1065  decorators_ret[num_decorators++] = "cr_newline";
1066  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1067  decorators_ret[num_decorators++] = "universal_newline";
1068 
1069  return num_decorators;
1070 }
1071 
1072 rb_econv_t *
1073 rb_econv_open(const char *sname, const char *dname, int ecflags)
1074 {
1075  rb_econv_t *ec;
1076  int num_decorators;
1077  const char *decorators[MAX_ECFLAGS_DECORATORS];
1078  int i;
1079 
1080  num_decorators = decorator_names(ecflags, decorators);
1081  if (num_decorators == -1)
1082  return NULL;
1083 
1084  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1085  if (!ec)
1086  return NULL;
1087 
1088  for (i = 0; i < num_decorators; i++)
1089  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1090  rb_econv_close(ec);
1091  return NULL;
1092  }
1093 
1094  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1095 
1096  return ec;
1097 }
1098 
1099 static int
1101  const unsigned char **input_ptr, const unsigned char *input_stop,
1102  unsigned char **output_ptr, unsigned char *output_stop,
1103  int flags,
1104  int start)
1105 {
1106  int try;
1107  int i, f;
1108 
1109  const unsigned char **ipp, *is, *iold;
1110  unsigned char **opp, *os, *oold;
1111  rb_econv_result_t res;
1112 
1113  try = 1;
1114  while (try) {
1115  try = 0;
1116  for (i = start; i < ec->num_trans; i++) {
1117  rb_econv_elem_t *te = &ec->elems[i];
1118 
1119  if (i == 0) {
1120  ipp = input_ptr;
1121  is = input_stop;
1122  }
1123  else {
1124  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1125  ipp = (const unsigned char **)&prev_te->out_data_start;
1126  is = prev_te->out_data_end;
1127  }
1128 
1129  if (i == ec->num_trans-1) {
1130  opp = output_ptr;
1131  os = output_stop;
1132  }
1133  else {
1134  if (te->out_buf_start != te->out_data_start) {
1135  ssize_t len = te->out_data_end - te->out_data_start;
1136  ssize_t off = te->out_data_start - te->out_buf_start;
1137  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1138  te->out_data_start = te->out_buf_start;
1139  te->out_data_end -= off;
1140  }
1141  opp = &te->out_data_end;
1142  os = te->out_buf_end;
1143  }
1144 
1145  f = flags;
1146  if (ec->num_finished != i)
1147  f |= ECONV_PARTIAL_INPUT;
1148  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1149  start = 1;
1150  flags &= ~ECONV_AFTER_OUTPUT;
1151  }
1152  if (i != 0)
1153  f &= ~ECONV_AFTER_OUTPUT;
1154  iold = *ipp;
1155  oold = *opp;
1156  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1157  if (iold != *ipp || oold != *opp)
1158  try = 1;
1159 
1160  switch (res) {
1164  case econv_after_output:
1165  return i;
1166 
1169  break;
1170 
1171  case econv_finished:
1172  ec->num_finished = i+1;
1173  break;
1174  }
1175  }
1176  }
1177  return -1;
1178 }
1179 
1180 static rb_econv_result_t
1182  const unsigned char **input_ptr, const unsigned char *input_stop,
1183  unsigned char **output_ptr, unsigned char *output_stop,
1184  int flags,
1185  int *result_position_ptr)
1186 {
1187  int i;
1188  int needreport_index;
1189  int sweep_start;
1190 
1191  unsigned char empty_buf;
1192  unsigned char *empty_ptr = &empty_buf;
1193 
1194  if (!input_ptr) {
1195  input_ptr = (const unsigned char **)&empty_ptr;
1196  input_stop = empty_ptr;
1197  }
1198 
1199  if (!output_ptr) {
1200  output_ptr = &empty_ptr;
1201  output_stop = empty_ptr;
1202  }
1203 
1204  if (ec->elems[0].last_result == econv_after_output)
1206 
1207  needreport_index = -1;
1208  for (i = ec->num_trans-1; 0 <= i; i--) {
1209  switch (ec->elems[i].last_result) {
1213  case econv_after_output:
1214  case econv_finished:
1215  sweep_start = i+1;
1216  needreport_index = i;
1217  goto found_needreport;
1218 
1221  break;
1222 
1223  default:
1224  rb_bug("unexpected transcode last result");
1225  }
1226  }
1227 
1228  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1229 
1231  (flags & ECONV_AFTER_OUTPUT)) {
1232  rb_econv_result_t res;
1233 
1234  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1235  (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
1236  result_position_ptr);
1237 
1238  if (res == econv_source_buffer_empty)
1239  return econv_after_output;
1240  return res;
1241  }
1242 
1243  sweep_start = 0;
1244 
1245  found_needreport:
1246 
1247  do {
1248  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1249  sweep_start = needreport_index + 1;
1250  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1251 
1252  for (i = ec->num_trans-1; 0 <= i; i--) {
1254  rb_econv_result_t res = ec->elems[i].last_result;
1255  if (res == econv_invalid_byte_sequence ||
1256  res == econv_incomplete_input ||
1257  res == econv_undefined_conversion ||
1258  res == econv_after_output) {
1260  }
1261  if (result_position_ptr)
1262  *result_position_ptr = i;
1263  return res;
1264  }
1265  }
1266  if (result_position_ptr)
1267  *result_position_ptr = -1;
1269 }
1270 
/*
 * One conversion step over the converter's transcoder chain.
 *
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from this listing, as are several `res = ...;` assignments
 * below.  The call `rb_econv_convert0(ec, input_ptr, input_stop, output_ptr,
 * output_stop, flags)` later in this file matches this signature - confirm
 * against upstream.
 *
 * Clears ec->last_error, drains bytes that are still buffered inside EC,
 * then runs rb_trans_conv() on the caller's buffers.  The final result is
 * stored in ec->last_error.result; for invalid byte sequence, incomplete
 * input and undefined conversion the failing transcoding and its
 * recognized/readagain byte counts are recorded too.
 */
1271 static rb_econv_result_t
1273  const unsigned char **input_ptr, const unsigned char *input_stop,
1274  unsigned char **output_ptr, unsigned char *output_stop,
1275  int flags)
1276 {
1277  rb_econv_result_t res;
1278  int result_position;
1279  int has_output = 0;
1280 
1281  memset(&ec->last_error, 0, sizeof(ec->last_error));
1282 
/* No transcoders at all: bytes are copied through unchanged. */
1283  if (ec->num_trans == 0) {
1284  size_t len;
/* First flush whatever is still held in ec's own input buffer. */
1285  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1286  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1287  len = output_stop - *output_ptr;
1288  memcpy(*output_ptr, ec->in_data_start, len);
1289  *output_ptr = output_stop;
1290  ec->in_data_start += len;
1292  goto gotresult;
1293  }
1294  len = ec->in_data_end - ec->in_data_start;
1295  memcpy(*output_ptr, ec->in_data_start, len);
1296  *output_ptr += len;
1297  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1298  if (flags & ECONV_AFTER_OUTPUT) {
1299  res = econv_after_output;
1300  goto gotresult;
1301  }
1302  }
/* len = min(free output space, remaining input). */
1303  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1304  len = output_stop - *output_ptr;
1305  }
1306  else {
1307  len = input_stop - *input_ptr;
1308  }
/* With ECONV_AFTER_OUTPUT only a single byte is copied before returning. */
1309  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1310  *(*output_ptr)++ = *(*input_ptr)++;
1311  res = econv_after_output;
1312  goto gotresult;
1313  }
1314  memcpy(*output_ptr, *input_ptr, len);
1315  *output_ptr += len;
1316  *input_ptr += len;
1317  if (*input_ptr != input_stop)
1319  else if (flags & ECONV_PARTIAL_INPUT)
1321  else
1322  res = econv_finished;
1323  goto gotresult;
1324  }
1325 
/* Flush output that is still buffered in the last transcoder element. */
1326  if (ec->elems[ec->num_trans-1].out_data_start) {
1327  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1328  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1329  if (data_start != data_end) {
1330  size_t len;
1331  if (output_stop - *output_ptr < data_end - data_start) {
1332  len = output_stop - *output_ptr;
1333  memcpy(*output_ptr, data_start, len);
1334  *output_ptr = output_stop;
1335  ec->elems[ec->num_trans-1].out_data_start += len;
1337  goto gotresult;
1338  }
1339  len = data_end - data_start;
1340  memcpy(*output_ptr, data_start, len);
1341  *output_ptr += len;
1342  ec->elems[ec->num_trans-1].out_data_start =
1343  ec->elems[ec->num_trans-1].out_data_end =
1344  ec->elems[ec->num_trans-1].out_buf_start;
1345  has_output = 1;
1346  }
1347  }
1348 
/* Convert input buffered inside ec before touching the caller's input. */
1349  if (ec->in_buf_start &&
1350  ec->in_data_start != ec->in_data_end) {
1351  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1352  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1353  if (res != econv_source_buffer_empty)
1354  goto gotresult;
1355  }
1356 
/*
 * Output was already produced above and the caller asked for
 * ECONV_AFTER_OUTPUT: run the chain with an empty input range
 * (input_stop is moved back to *input_ptr) and report econv_after_output.
 */
1357  if (has_output &&
1358  (flags & ECONV_AFTER_OUTPUT) &&
1359  *input_ptr != input_stop) {
1360  input_stop = *input_ptr;
1361  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1362  if (res == econv_source_buffer_empty)
1363  res = econv_after_output;
1364  }
1365  else if ((flags & ECONV_AFTER_OUTPUT) ||
1366  ec->num_trans == 1) {
1367  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1368  }
1369  else {
/* Several transcoders and no ECONV_AFTER_OUTPUT requested: set the flag
 * internally and keep calling while econv_after_output comes back. */
1370  flags |= ECONV_AFTER_OUTPUT;
1371  do {
1372  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1373  } while (res == econv_after_output);
1374  }
1375 
1376  gotresult:
1377  ec->last_error.result = res;
1378  if (res == econv_invalid_byte_sequence ||
1379  res == econv_incomplete_input ||
1380  res == econv_undefined_conversion) {
/* result_position indexes the element whose transcoding reported the error. */
1381  rb_transcoding *error_tc = ec->elems[result_position].tc;
1382  ec->last_error.error_tc = error_tc;
1386  ec->last_error.error_bytes_len = error_tc->recognized_len;
1387  ec->last_error.readagain_len = error_tc->readagain_len;
1388  }
1389 
1390  return res;
1391 }
1392 
1394 
/*
 * Insert the bytes of the last conversion error into the output as
 * hexadecimal character references of the form "&#x...;".
 *
 * NOTE(review): the name/parameter line is absent from this listing; the
 * body matches the `output_hex_charref(ec)` call later in this file.  The
 * start of the call that fills `utf` in the else-branch is missing as well
 * - confirm both against upstream.
 *
 * The error bytes are used directly when the error's source encoding is
 * UTF-32BE; otherwise `utf` comes from the (partly missing) call that takes
 * utfbuf as a scratch buffer.  Returns 0 on success and -1 on failure.
 */
1395 static int
1397 {
1398  int ret;
1399  unsigned char utfbuf[1024];
1400  const unsigned char *utf;
1401  size_t utf_len;
1402  int utf_allocated = 0;
1403  char charef_buf[16];
1404  const unsigned char *p;
1405 
1406  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1407  utf = ec->last_error.error_bytes_start;
1408  utf_len = ec->last_error.error_bytes_len;
1409  }
1410  else {
1413  utfbuf, sizeof(utfbuf),
1414  &utf_len);
1415  if (!utf)
1416  return -1;
/* A result that is neither the scratch buffer nor the error bytes themselves
 * is treated as separately allocated and is xfree'd before returning. */
1417  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1418  utf_allocated = 1;
1419  }
1420 
/* The loop below consumes exactly 4 bytes per code point. */
1421  if (utf_len % 4 != 0)
1422  goto fail;
1423 
1424  p = utf;
1425  while (4 <= utf_len) {
/* Assemble one code point from 4 bytes, most significant byte first. */
1426  unsigned int u = 0;
1427  u += p[0] << 24;
1428  u += p[1] << 16;
1429  u += p[2] << 8;
1430  u += p[3];
1431  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1432 
1433  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1434  if (ret == -1)
1435  goto fail;
1436 
1437  p += 4;
1438  utf_len -= 4;
1439  }
1440 
1441  if (utf_allocated)
1442  xfree((void *)utf);
1443  return 0;
1444 
1445  fail:
1446  if (utf_allocated)
1447  xfree((void *)utf);
1448  return -1;
1449 }
1450 
1453  const unsigned char **input_ptr, const unsigned char *input_stop,
1454  unsigned char **output_ptr, unsigned char *output_stop,
1455  int flags)
1456 {
/*
 * NOTE(review): the return-type and name lines of this definition are
 * absent from this listing; other functions in this file call it as
 * rb_econv_convert(ec, &sp, se, &dp, de, flags) - confirm against upstream.
 *
 * Marks the converter as started, substitutes an empty dummy buffer for a
 * NULL input_ptr or output_ptr, and runs rb_econv_convert0().  When the
 * result is an error for which ec->flags selects a replacement policy, the
 * replacement is inserted into the output and the conversion is resumed;
 * otherwise the result is returned to the caller.
 */
1457  rb_econv_result_t ret;
1458 
1459  unsigned char empty_buf;
1460  unsigned char *empty_ptr = &empty_buf;
1461 
1462  ec->started = 1;
1463 
1464  if (!input_ptr) {
1465  input_ptr = (const unsigned char **)&empty_ptr;
1466  input_stop = empty_ptr;
1467  }
1468 
1469  if (!output_ptr) {
1470  output_ptr = &empty_ptr;
1471  output_stop = empty_ptr;
1472  }
1473 
1474  resume:
1475  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1476 
1477  if (ret == econv_invalid_byte_sequence ||
1478  ret == econv_incomplete_input) {
1479  /* deal with invalid byte sequence */
1480  /* todo: add more alternative behaviors */
1481  switch (ec->flags & ECONV_INVALID_MASK) {
1482  case ECONV_INVALID_REPLACE:
1483  if (output_replacement_character(ec) == 0)
1484  goto resume;
1485  }
1486  }
1487 
1488  if (ret == econv_undefined_conversion) {
1489  /* valid character in source encoding
1490  * but no related character(s) in destination encoding */
1491  /* todo: add more alternative behaviors */
1492  switch (ec->flags & ECONV_UNDEF_MASK) {
1493  case ECONV_UNDEF_REPLACE:
1494  if (output_replacement_character(ec) == 0)
1495  goto resume;
1496  break;
1497 
/* NOTE(review): the `case` label guarding the hex-charref branch below is
 * missing from this listing. */
1499  if (output_hex_charref(ec) == 0)
1500  goto resume;
1501  break;
1502  }
1503  }
1504 
1505  return ret;
1506 }
1507 
/*
 * Name of the encoding in which text handed to the insert-output routine
 * has to be expressed.
 *
 * NOTE(review): the name/parameter line and the condition that chooses
 * between the two returns are absent from this listing; callers in this
 * file use `rb_econv_encoding_to_insert_output(ec)` - confirm upstream.
 *
 * Returns "" when the converter has no last transcoding, otherwise either
 * the source or the destination encoding name of that transcoder.
 */
1508 const char *
1510 {
1511  rb_transcoding *tc = ec->last_tc;
1512  const rb_transcoder *tr;
1513 
1514  if (tc == NULL)
1515  return "";
1516 
1517  tr = tc->transcoder;
1518 
1520  return tr->src_encoding;
1521  return tr->dst_encoding;
1522 }
1523 
1524 static unsigned char *
1525 allocate_converted_string(const char *sname, const char *dname,
1526  const unsigned char *str, size_t len,
1527  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1528  size_t *dst_len_ptr)
1529 {
1530  unsigned char *dst_str;
1531  size_t dst_len;
1532  size_t dst_bufsize;
1533 
1534  rb_econv_t *ec;
1535  rb_econv_result_t res;
1536 
1537  const unsigned char *sp;
1538  unsigned char *dp;
1539 
1540  if (caller_dst_buf)
1541  dst_bufsize = caller_dst_bufsize;
1542  else if (len == 0)
1543  dst_bufsize = 1;
1544  else
1545  dst_bufsize = len;
1546 
1547  ec = rb_econv_open(sname, dname, 0);
1548  if (ec == NULL)
1549  return NULL;
1550  if (caller_dst_buf)
1551  dst_str = caller_dst_buf;
1552  else
1553  dst_str = xmalloc(dst_bufsize);
1554  dst_len = 0;
1555  sp = str;
1556  dp = dst_str+dst_len;
1557  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1558  dst_len = dp - dst_str;
1559  while (res == econv_destination_buffer_full) {
1560  if (SIZE_MAX/2 < dst_bufsize) {
1561  goto fail;
1562  }
1563  dst_bufsize *= 2;
1564  if (dst_str == caller_dst_buf) {
1565  unsigned char *tmp;
1566  tmp = xmalloc(dst_bufsize);
1567  memcpy(tmp, dst_str, dst_bufsize/2);
1568  dst_str = tmp;
1569  }
1570  else {
1571  dst_str = xrealloc(dst_str, dst_bufsize);
1572  }
1573  dp = dst_str+dst_len;
1574  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1575  dst_len = dp - dst_str;
1576  }
1577  if (res != econv_finished) {
1578  goto fail;
1579  }
1580  rb_econv_close(ec);
1581  *dst_len_ptr = dst_len;
1582  return dst_str;
1583 
1584  fail:
1585  if (dst_str != caller_dst_buf)
1586  xfree(dst_str);
1587  rb_econv_close(ec);
1588  return NULL;
1589 }
1590 
1591 /* result: 0:success -1:failure */
/*
 * Queue LEN bytes of STR (encoded in STR_ENCODING) so that they become part
 * of the converter's output.
 *
 * NOTE(review): the name/first-parameter line is absent from this listing;
 * callers in this file use `rb_econv_insert_output(ec, str, len, encname)`
 * - confirm against upstream.
 *
 * The text is first brought into the encoding reported by
 * rb_econv_encoding_to_insert_output(), using insert_buf as scratch space,
 * and is then appended to one of the converter's internal buffers.
 * Returns 0 on success (also when LEN is 0) and -1 on failure.
 */
1592 int
1594  const unsigned char *str, size_t len, const char *str_encoding)
1595 {
1596  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1597  unsigned char insert_buf[4096];
1598  const unsigned char *insert_str = NULL;
1599  size_t insert_len;
1600 
1601  int last_trans_index;
1602  rb_transcoding *tc;
1603 
1604  unsigned char **buf_start_p;
1605  unsigned char **data_start_p;
1606  unsigned char **data_end_p;
1607  unsigned char **buf_end_p;
1608 
1609  size_t need;
1610 
1611  ec->started = 1;
1612 
1613  if (len == 0)
1614  return 0;
1615 
1616  if (encoding_equal(insert_encoding, str_encoding)) {
1617  insert_str = str;
1618  insert_len = len;
1619  }
1620  else {
1621  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1622  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1623  if (insert_str == NULL)
1624  return -1;
1625  }
1626 
1627  need = insert_len;
1628 
/*
 * Pick the buffer to append to:
 *  - no transcoders: the converter's own input buffer;
 *  - last transcoder is an asciicompat_encoder: the buffer that feeds it
 *    (the converter's input buffer, or the previous element's output
 *    buffer), with room for its pending read-again bytes as well;
 *  - otherwise: the last element's output buffer.
 */
1629  last_trans_index = ec->num_trans-1;
1630  if (ec->num_trans == 0) {
1631  tc = NULL;
1632  buf_start_p = &ec->in_buf_start;
1633  data_start_p = &ec->in_data_start;
1634  data_end_p = &ec->in_data_end;
1635  buf_end_p = &ec->in_buf_end;
1636  }
1637  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1638  tc = ec->elems[last_trans_index].tc;
1639  need += tc->readagain_len;
/* need wrapped around: the addition overflowed size_t. */
1640  if (need < insert_len)
1641  goto fail;
1642  if (last_trans_index == 0) {
1643  buf_start_p = &ec->in_buf_start;
1644  data_start_p = &ec->in_data_start;
1645  data_end_p = &ec->in_data_end;
1646  buf_end_p = &ec->in_buf_end;
1647  }
1648  else {
1649  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1650  buf_start_p = &ee->out_buf_start;
1651  data_start_p = &ee->out_data_start;
1652  data_end_p = &ee->out_data_end;
1653  buf_end_p = &ee->out_buf_end;
1654  }
1655  }
1656  else {
1657  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1658  buf_start_p = &ee->out_buf_start;
1659  data_start_p = &ee->out_data_start;
1660  data_end_p = &ee->out_data_end;
1661  buf_end_p = &ee->out_buf_end;
1662  tc = ec->elems[last_trans_index].tc;
1663  }
1664 
/* Make room for `need` bytes: allocate a fresh buffer, or slide the pending
 * data to the front and, if that is still not enough, grow the buffer. */
1665  if (*buf_start_p == NULL) {
1666  unsigned char *buf = xmalloc(need);
1667  *buf_start_p = buf;
1668  *data_start_p = buf;
1669  *data_end_p = buf;
1670  *buf_end_p = buf+need;
1671  }
1672  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1673  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1674  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1675  *data_start_p = *buf_start_p;
1676  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1677  unsigned char *buf;
1678  size_t s = (*data_end_p - *buf_start_p) + need;
1679  if (s < need)
1680  goto fail;
1681  buf = xrealloc(*buf_start_p, s);
1682  *data_start_p = buf;
1683  *data_end_p = buf + (*data_end_p - *buf_start_p);
1684  *buf_start_p = buf;
1685  *buf_end_p = buf + s;
1686  }
1687  }
1688 
1689  memcpy(*data_end_p, insert_str, insert_len);
1690  *data_end_p += insert_len;
/* For an asciicompat_encoder, move its read-again bytes behind the inserted
 * text and clear its read-again count. */
1691  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1692  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1693  *data_end_p += tc->readagain_len;
1694  tc->readagain_len = 0;
1695  }
1696 
/* Only a heap buffer handed out by allocate_converted_string is freed. */
1697  if (insert_str != str && insert_str != insert_buf)
1698  xfree((void*)insert_str);
1699  return 0;
1700 
1701  fail:
1702  if (insert_str != str && insert_str != insert_buf)
1703  xfree((void*)insert_str);
1704  return -1;
1705 }
1706 
/*
 * Release a converter and everything it owns: the replacement string (when
 * it was allocated for this converter), every transcoding together with its
 * output buffer, the input buffer, the element array and EC itself.
 *
 * NOTE(review): the name/parameter line is absent from this listing; the
 * body matches the `rb_econv_close(ec)` calls elsewhere in this file.
 */
1707 void
1709 {
1710  int i;
1711 
1712  if (ec->replacement_allocated) {
1713  xfree((void *)ec->replacement_str);
1714  }
1715  for (i = 0; i < ec->num_trans; i++) {
1716  rb_transcoding_close(ec->elems[i].tc);
1717  if (ec->elems[i].out_buf_start)
1718  xfree(ec->elems[i].out_buf_start);
1719  }
1720  xfree(ec->in_buf_start);
1721  xfree(ec->elems);
1722  xfree(ec);
1723 }
1724 
/*
 * Byte count attributed to a converter: the rb_econv_t itself, an allocated
 * replacement string, each transcoding (via rb_transcoding_memsize) and its
 * output buffer, the input buffer, and the allocated element array.
 *
 * NOTE(review): the name/parameter line is absent from this listing;
 * presumably rb_econv_memsize(const rb_econv_t *ec) - confirm upstream.
 */
1725 size_t
1727 {
1728  size_t size = sizeof(rb_econv_t);
1729  int i;
1730 
1731  if (ec->replacement_allocated) {
1732  size += ec->replacement_len;
1733  }
1734  for (i = 0; i < ec->num_trans; i++) {
1735  size += rb_transcoding_memsize(ec->elems[i].tc);
1736 
1737  if (ec->elems[i].out_buf_start) {
1738  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1739  }
1740  }
1741  size += ec->in_buf_end - ec->in_buf_start;
1742  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1743 
1744  return size;
1745 }
1746 
/*
 * Number of read-again bytes held by the first transcoder, clamped to
 * INT_MAX where size_t is wider than int; 0 when the converter has no
 * transcoders.
 *
 * NOTE(review): the name/parameter line is absent from this listing;
 * presumably rb_econv_putbackable(rb_econv_t *ec) - confirm upstream.
 */
1747 int
1749 {
1750  if (ec->num_trans == 0)
1751  return 0;
1752 #if SIZEOF_SIZE_T > SIZEOF_INT
1753  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1754 #endif
1755  return (int)ec->elems[0].tc->readagain_len;
1756 }
1757 
1758 void
1759 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1760 {
1761  rb_transcoding *tc;
1762  if (ec->num_trans == 0 || n == 0)
1763  return;
1764  tc = ec->elems[0].tc;
1765  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1766  tc->readagain_len -= n;
1767 }
1768 
/* NOTE(review): the `struct asciicompat_encoding_t {` header line is absent
 * from this listing. */
/* Result slot: set by the st_foreach callback to a transcoder's dst_encoding. */
1770  const char *ascii_compat_name;
/* presumably the encoding name being looked up - TODO confirm (the
 * assignment is not visible in this listing) */
1771  const char *ascii_incompat_name;
1772 };
1773 
/*
 * st_foreach callback used when looking up an ASCII-compatible counterpart.
 *
 * NOTE(review): the name/parameter line is absent from this listing; it is
 * passed to st_foreach() as `asciicompat_encoding_i` later in this file.
 *
 * Skips decorator entries.  Loads the entry's transcoder and, if it is of
 * type asciicompat_decoder, stores its destination encoding in the
 * struct asciicompat_encoding_t passed as ARG and stops the iteration.
 */
1774 static int
1776 {
1777  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1778  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1779  const rb_transcoder *tr;
1780 
1781  if (DECORATOR_P(entry->sname, entry->dname))
1782  return ST_CONTINUE;
1783  tr = load_transcoder_entry(entry);
1784  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1785  data->ascii_compat_name = tr->dst_encoding;
1786  return ST_STOP;
1787  }
1788  return ST_CONTINUE;
1789 }
1790 
/*
 * Look up the ASCII-compatible encoding that corresponds to
 * ascii_incompat_name.
 *
 * NOTE(review): the name/parameter line and one statement before the
 * st_foreach() call are absent from this listing; presumably
 * rb_econv_asciicompat_encoding(const char *ascii_incompat_name) - confirm.
 *
 * Returns NULL when transcoder_table has no entry for the name or when the
 * per-source table does not hold exactly one transcoder; otherwise returns
 * whatever asciicompat_encoding_i recorded (possibly NULL).
 */
1791 const char *
1793 {
1794  st_data_t v;
1795  st_table *table2;
1796  struct asciicompat_encoding_t data;
1797 
1798  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1799  return NULL;
1800  table2 = (st_table *)v;
1801 
1802  /*
1803  * Assumption:
1804  * There is at most one transcoder for
1805  * converting from ASCII incompatible encoding.
1806  *
1807  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1808  */
1809  if (table2->num_entries != 1)
1810  return NULL;
1811 
1813  data.ascii_compat_name = NULL;
1814  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1815  return data.ascii_compat_name;
1816 }
1817 
/*
 * Convert LEN bytes of SRC, starting at byte offset OFF, through EC and
 * append the result to DST.  A nil DST is replaced by a new string buffer.
 * Returns DST.
 *
 * NOTE(review): three lines are absent from this listing: one inside the
 * `if (ec->destination_encoding)` branch, the statement that gives `res`
 * its initial value before the loop, and one at the end of the loop body
 * - confirm against upstream.
 */
1818 VALUE
1819 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1820 {
1821  unsigned const char *ss, *sp, *se;
1822  unsigned char *ds, *dp, *de;
1823  rb_econv_result_t res;
1824  int max_output;
1825 
1826  if (NIL_P(dst)) {
1827  dst = rb_str_buf_new(len);
1828  if (ec->destination_encoding)
1830  }
1831 
1832  if (ec->last_tc)
1833  max_output = ec->last_tc->transcoder->max_output;
1834  else
1835  max_output = 1;
1836 
1838  while (res == econv_destination_buffer_full) {
1839  long dlen = RSTRING_LEN(dst);
/* Ensure at least len + max_output free bytes behind the current content. */
1840  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1841  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1842  if (LONG_MAX < new_capa)
1843  rb_raise(rb_eArgError, "too long string");
/* Resize to obtain the capacity, then restore the logical length. */
1844  rb_str_resize(dst, new_capa);
1845  rb_str_set_len(dst, dlen);
1846  }
1847  ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
1848  se = ss + len;
1849  ds = (unsigned char *)RSTRING_PTR(dst);
1850  de = ds + rb_str_capacity(dst);
1851  dp = ds += dlen;
1852  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
/* Advance past the source bytes consumed by this round. */
1853  off += sp - ss;
1854  len -= sp - ss;
1855  rb_str_set_len(dst, dlen + (dp - ds));
1857  }
1858 
1859  return dst;
1860 }
1861 
1862 VALUE
1863 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1864 {
1865  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1866 }
1867 
1868 VALUE
1869 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1870 {
1871  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1872 }
1873 
/*
 * Convert the whole of SRC and return the result; passes a nil destination
 * to rb_econv_substr_append() over bytes 0..RSTRING_LEN(src).
 *
 * NOTE(review): the name/parameter line is absent from this listing;
 * presumably rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags).
 */
1874 VALUE
1876 {
1877  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1878 }
1879 
1880 static int
1881 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1882 {
1883  transcoder_entry_t *entry;
1884  const rb_transcoder *tr;
1885 
1886  if (ec->started != 0)
1887  return -1;
1888 
1889  entry = get_transcoder_entry(sname, dname);
1890  if (!entry)
1891  return -1;
1892 
1893  tr = load_transcoder_entry(entry);
1894  if (!tr) return -1;
1895 
1896  return rb_econv_add_transcoder_at(ec, tr, n);
1897 }
1898 
1899 static int
1900 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1901 {
1902  return rb_econv_add_converter(ec, "", decorator_name, n);
1903 }
1904 
/*
 * Add a decorator near the front of the transcoder chain.
 *
 * With no transcoders the decorator goes to position 0.  Otherwise it goes
 * to position 1 when the first transcoder is not itself a decorator and a
 * second condition holds, and to position 0 in all other cases.
 *
 * NOTE(review): the second half of that condition is absent from this
 * listing - confirm against upstream.
 */
1905 int
1906 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1907 {
1908  const rb_transcoder *tr;
1909 
1910  if (ec->num_trans == 0)
1911  return rb_econv_decorate_at(ec, decorator_name, 0);
1912 
1913  tr = ec->elems[0].tc->transcoder;
1914 
1915  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1917  return rb_econv_decorate_at(ec, decorator_name, 1);
1918 
1919  return rb_econv_decorate_at(ec, decorator_name, 0);
1920 }
1921 
/*
 * Add a decorator near the end of the transcoder chain.
 *
 * With no transcoders the decorator goes to position 0.  Otherwise it goes
 * just before the last transcoder (index num_trans-1) when that transcoder
 * is not itself a decorator and a second condition holds, and after the
 * last transcoder (index num_trans) in all other cases.
 *
 * NOTE(review): the second half of that condition is absent from this
 * listing - confirm against upstream.
 */
1922 int
1923 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1924 {
1925  const rb_transcoder *tr;
1926 
1927  if (ec->num_trans == 0)
1928  return rb_econv_decorate_at(ec, decorator_name, 0);
1929 
1930  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1931 
1932  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1934  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1935 
1936  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1937 }
1938 
/*
 * Remove the newline decorators from a converter's transcoder chain.
 *
 * NOTE(review): the name/parameter line, the flag test that opens the
 * "universal_newline" branch, and one statement after the removal loop are
 * absent from this listing; presumably rb_econv_binmode(rb_econv_t *ec)
 * - confirm against upstream.
 *
 * Collects the already-loaded transcoders of the newline decorators
 * selected by ec->flags, then compacts ec->elems in place: elements whose
 * transcoder is in that set are closed, their output buffer is freed and
 * num_trans is decremented; all other elements are kept in order.
 */
1939 void
1941 {
1942  const rb_transcoder *trs[3];
1943  int n, i, j;
1944  transcoder_entry_t *entry;
1945  int num_trans;
1946 
1947  n = 0;
1949  entry = get_transcoder_entry("", "universal_newline");
1950  if (entry->transcoder)
1951  trs[n++] = entry->transcoder;
1952  }
1953  if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
1954  entry = get_transcoder_entry("", "crlf_newline");
1955  if (entry->transcoder)
1956  trs[n++] = entry->transcoder;
1957  }
1958  if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
1959  entry = get_transcoder_entry("", "cr_newline");
1960  if (entry->transcoder)
1961  trs[n++] = entry->transcoder;
1962  }
1963 
/* num_trans is snapshotted because ec->num_trans shrinks inside the loop. */
1964  num_trans = ec->num_trans;
1965  j = 0;
1966  for (i = 0; i < num_trans; i++) {
1967  int k;
1968  for (k = 0; k < n; k++)
1969  if (trs[k] == ec->elems[i].tc->transcoder)
1970  break;
/* k == n: element i is not one of the newline decorators, keep it. */
1971  if (k == n) {
1972  ec->elems[j] = ec->elems[i];
1973  j++;
1974  }
1975  else {
1976  rb_transcoding_close(ec->elems[i].tc);
1977  xfree(ec->elems[i].out_buf_start);
1978  ec->num_trans--;
1979  }
1980  }
1981 
1983 
1984 }
1985 
/*
 * Append a human-readable description of a conversion to MESG and return
 * it; a nil MESG is replaced by a new empty string.
 *
 * The text is "<sname> to <dname>" (or just the non-empty one of the two),
 * followed by " with " and a comma-separated list of the decorators set in
 * ECFLAGS.  When neither part applies the text is "no-conversion".
 *
 * NOTE(review): the remaining operands of the decorator mask test are
 * absent from this listing - confirm against upstream.
 */
1986 static VALUE
1987 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1988 {
1989  int has_description = 0;
1990 
1991  if (NIL_P(mesg))
1992  mesg = rb_str_new(NULL, 0);
1993 
1994  if (*sname != '\0' || *dname != '\0') {
1995  if (*sname == '\0')
1996  rb_str_cat2(mesg, dname);
1997  else if (*dname == '\0')
1998  rb_str_cat2(mesg, sname);
1999  else
2000  rb_str_catf(mesg, "%s to %s", sname, dname);
2001  has_description = 1;
2002  }
2003 
2004  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
/* `pre` is empty before the first decorator name and "," afterwards. */
2008  const char *pre = "";
2009  if (has_description)
2010  rb_str_cat2(mesg, " with ");
2011  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2012  rb_str_cat2(mesg, pre); pre = ",";
2013  rb_str_cat2(mesg, "universal_newline");
2014  }
2015  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2016  rb_str_cat2(mesg, pre); pre = ",";
2017  rb_str_cat2(mesg, "crlf_newline");
2018  }
2019  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2020  rb_str_cat2(mesg, pre); pre = ",";
2021  rb_str_cat2(mesg, "cr_newline");
2022  }
2023  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2024  rb_str_cat2(mesg, pre); pre = ",";
2025  rb_str_cat2(mesg, "xml_text");
2026  }
2027  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2028  rb_str_cat2(mesg, pre); pre = ",";
2029  rb_str_cat2(mesg, "xml_attr_content");
2030  }
2031  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2032  rb_str_cat2(mesg, pre); pre = ",";
2033  rb_str_cat2(mesg, "xml_attr_quote");
2034  }
2035  has_description = 1;
2036  }
2037  if (!has_description) {
2038  rb_str_cat2(mesg, "no-conversion");
2039  }
2040 
2041  return mesg;
2042 }
2043 
/*
 * Build (without raising) the exception used when no converter can be
 * opened.  Its message is "code converter not found (<description>)",
 * where the description comes from econv_description().
 *
 * NOTE(review): the statement that assigns `exc` is absent from this
 * listing - confirm the exception class against upstream.
 */
2044 VALUE
2045 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2046 {
2047  VALUE mesg, exc;
2048  mesg = rb_str_new_cstr("code converter not found (");
2049  econv_description(sname, dname, ecflags, mesg);
2050  rb_str_cat2(mesg, ")");
2052  return exc;
2053 }
2054 
/*
 * Build an exception object describing ec->last_error, or return Qnil when
 * neither of the two branches below applies.
 *
 * NOTE(review): many lines are absent from this listing: the
 * name/parameter line (callers use `make_econv_exception(ec)`), the
 * conditions that open both branches, trailing rb_sprintf arguments, the
 * statements that create `exc`, and the statements that compute `idx`.
 * Confirm all of them against upstream before relying on these comments.
 *
 * First branch: the message is built from the dumped error bytes (and the
 * read-again bytes, if any); the bytes are stored in the error_bytes,
 * readagain_bytes and incomplete_input instance variables.
 * Second branch: the message names the offending character, shown as U+XXXX
 * when the source encoding is UTF-8 and the bytes form exactly one
 * character; the bytes are stored in the error_char instance variable.
 * Both branches then record source/destination encoding names at set_encs.
 */
2055 static VALUE
2057 {
2058  VALUE mesg, exc;
2061  const char *err = (const char *)ec->last_error.error_bytes_start;
2062  size_t error_len = ec->last_error.error_bytes_len;
2063  VALUE bytes = rb_str_new(err, error_len);
2064  VALUE dumped = rb_str_dump(bytes);
2065  size_t readagain_len = ec->last_error.readagain_len;
2066  VALUE bytes2 = Qnil;
2067  VALUE dumped2;
2068  int idx;
2070  mesg = rb_sprintf("incomplete %s on %s",
2071  StringValueCStr(dumped),
2073  }
2074  else if (readagain_len) {
/* The read-again bytes directly follow the error bytes in memory. */
2075  bytes2 = rb_str_new(err+error_len, readagain_len);
2076  dumped2 = rb_str_dump(bytes2);
2077  mesg = rb_sprintf("%s followed by %s on %s",
2078  StringValueCStr(dumped),
2079  StringValueCStr(dumped2),
2081  }
2082  else {
2083  mesg = rb_sprintf("%s on %s",
2084  StringValueCStr(dumped),
2086  }
2087 
2089  rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2090  rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2091  rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2092 
/* Shared tail: the second branch jumps here as well. */
2093  set_encs:
2094  rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2095  rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2097  if (0 <= idx)
2098  rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2100  if (0 <= idx)
2101  rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2102  return exc;
2103  }
2105  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2107  VALUE dumped = Qnil;
2108  int idx;
2109  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2110  rb_encoding *utf8 = rb_utf8_encoding();
2111  const char *start, *end;
2112  int n;
2113  start = (const char *)ec->last_error.error_bytes_start;
2114  end = start + ec->last_error.error_bytes_len;
2115  n = rb_enc_precise_mbclen(start, end, utf8);
/* Only use the U+XXXX form when the error bytes are exactly one character. */
2116  if (MBCLEN_CHARFOUND_P(n) &&
2117  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2118  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2119  dumped = rb_sprintf("U+%04X", cc);
2120  }
2121  }
2122  if (dumped == Qnil)
2123  dumped = rb_str_dump(bytes);
/* Short message when the failing step spans the whole conversion; otherwise
 * the message lists every non-decorator step of the chain. */
2124  if (strcmp(ec->last_error.source_encoding,
2125  ec->source_encoding_name) == 0 &&
2126  strcmp(ec->last_error.destination_encoding,
2127  ec->destination_encoding_name) == 0) {
2128  mesg = rb_sprintf("%s from %s to %s",
2129  StringValueCStr(dumped),
2132  }
2133  else {
2134  int i;
2135  mesg = rb_sprintf("%s to %s in conversion from %s",
2136  StringValueCStr(dumped),
2138  ec->source_encoding_name);
2139  for (i = 0; i < ec->num_trans; i++) {
2140  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2141  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2142  rb_str_catf(mesg, " to %s",
2143  ec->elems[i].tc->transcoder->dst_encoding);
2144  }
2145  }
2148  if (0 <= idx)
2149  rb_enc_associate_index(bytes, idx);
2150  rb_ivar_set(exc, rb_intern("error_char"), bytes);
2151  goto set_encs;
2152  }
2153  return Qnil;
2154 }
2155 
/*
 * Enlarge the output buffer of a conversion loop.
 *
 * NOTE(review): the name line is absent from this listing; the parameter
 * list matches the `more_output_buffer(destination, resize_destination,
 * max_output, &out_start, out_pos, &out_stop)` calls later in this file.
 *
 * The new size is (bytes written so far + max_output) * 2.  The buffer is
 * obtained through the resize_destination callback, and the start, current
 * position and stop pointers are rebased onto it, keeping the write offset.
 */
2156 static void
2158  VALUE destination,
2159  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2160  int max_output,
2161  unsigned char **out_start_ptr,
2162  unsigned char **out_pos,
2163  unsigned char **out_stop_ptr)
2164 {
2165  size_t len = (*out_pos - *out_start_ptr);
2166  size_t new_len = (len + max_output) * 2;
2167  *out_start_ptr = resize_destination(destination, len, new_len);
2168  *out_pos = *out_start_ptr + len;
2169  *out_stop_ptr = *out_start_ptr + new_len;
2170 }
2171 
/*
 * Install the default replacement string on a converter that has none yet.
 *
 * NOTE(review): the name/parameter line is absent from this listing; the
 * body matches the `make_replacement(ec)` call later in this file.
 *
 * Does nothing when ec->replacement_str is already set.  Otherwise the
 * replacement comes from get_replacement_character() for the insert-output
 * encoding, or is "?" with an empty encoding name when that encoding name
 * is empty.  The string is recorded as not allocated.  Always returns 0.
 */
2172 static int
2174 {
2175  rb_transcoding *tc;
2176  const rb_transcoder *tr;
2177  rb_encoding *enc;
2178  const unsigned char *replacement;
2179  const char *repl_enc;
2180  const char *ins_enc;
2181  size_t len;
2182 
2183  if (ec->replacement_str)
2184  return 0;
2185 
2186  ins_enc = rb_econv_encoding_to_insert_output(ec);
2187 
2188  tc = ec->last_tc;
2189  if (*ins_enc) {
2190  tr = tc->transcoder;
/* NOTE(review): `enc` is assigned here but not read again in the visible
 * code. */
2191  enc = rb_enc_find(tr->dst_encoding);
2192  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2193  }
2194  else {
2195  replacement = (unsigned char *)"?";
2196  len = 1;
2197  repl_enc = "";
2198  }
2199 
2200  ec->replacement_str = replacement;
2201  ec->replacement_len = len;
2202  ec->replacement_enc = repl_enc;
2203  ec->replacement_allocated = 0;
2204  return 0;
2205 }
2206 
/*
 * Set the converter's replacement string from LEN bytes of STR, which are
 * encoded in ENCNAME.
 *
 * NOTE(review): the name/first-parameter line is absent from this listing;
 * presumably rb_econv_set_replacement(rb_econv_t *ec, ...) - confirm.
 *
 * The converter keeps its own heap copy: a plain copy when ENCNAME equals
 * the insert-output encoding, otherwise the result of converting STR into
 * that encoding.  A previously allocated replacement is freed.
 * Returns 0 on success and -1 when the conversion fails.
 */
2207 int
2209  const unsigned char *str, size_t len, const char *encname)
2210 {
2211  unsigned char *str2;
2212  size_t len2;
2213  const char *encname2;
2214 
2215  encname2 = rb_econv_encoding_to_insert_output(ec);
2216 
2217  if (encoding_equal(encname, encname2)) {
2218  str2 = xmalloc(len);
2219  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2220  len2 = len;
2221  encname2 = encname;
2222  }
2223  else {
/* NULL/0 scratch buffer: the callee allocates the result. */
2224  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2225  if (!str2)
2226  return -1;
2227  }
2228 
2229  if (ec->replacement_allocated) {
2230  xfree((void *)ec->replacement_str);
2231  }
2232  ec->replacement_allocated = 1;
2233  ec->replacement_str = str2;
2234  ec->replacement_len = len2;
2235  ec->replacement_enc = encname2;
2236  return 0;
2237 }
2238 
/*
 * Emit the converter's replacement string in place of an unconvertible
 * input.  Returns 0 on success and -1 on failure.
 *
 * NOTE(review): the name/parameter line and the statement that assigns
 * `ret` are absent from this listing; callers in this file use
 * `output_replacement_character(ec)` - confirm against upstream.
 */
2239 static int
2241 {
2242  int ret;
2243 
/* Make sure a replacement string is set before it is inserted. */
2244  if (make_replacement(ec) == -1)
2245  return -1;
2246 
2248  if (ret == -1)
2249  return -1;
2250 
2251  return 0;
2252 }
2253 
2254 #if 1
2255 #define hash_fallback rb_hash_aref
2256 
/*
 * Fallback adapter for Proc objects: calls FALLBACK with C as its single
 * argument and returns the result.
 * NOTE(review): the name/parameter line is absent from this listing;
 * transcode_loop selects `proc_fallback` when rb_obj_is_proc() is true.
 */
2257 static VALUE
2259 {
2260  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2261 }
2262 
/*
 * Fallback adapter for Method objects: calls FALLBACK with C as its single
 * argument and returns the result.
 * NOTE(review): the name/parameter line is absent from this listing;
 * transcode_loop selects `method_fallback` when rb_obj_is_method() is true.
 */
2263 static VALUE
2265 {
2266  return rb_method_call(1, &c, fallback);
2267 }
2268 
/*
 * Generic fallback adapter: sends the sym_aref method to FALLBACK with C
 * as its single argument and returns the result.
 * NOTE(review): the name/parameter line is absent from this listing;
 * transcode_loop selects `aref_fallback` as the last resort.
 */
2269 static VALUE
2271 {
2272  return rb_funcall3(fallback, sym_aref, 1, &c);
2273 }
2274 
/*
 * Convert the bytes in [*in_pos, in_stop) from src_encoding to dst_encoding,
 * writing to [*out_pos, out_stop) and enlarging the destination through
 * resize_destination whenever it fills up.
 *
 * Raises the exception from rb_econv_open_exc() when no converter can be
 * opened, and the one from make_econv_exception() on invalid byte sequence,
 * incomplete input or undefined conversion.  If ECOPTS is a hash with a
 * non-nil sym_fallback entry, an undefined conversion first consults that
 * fallback: a non-nil result is inserted into the output and the conversion
 * is resumed.
 *
 * NOTE(review): two argument lines of the rb_enc_str_new() call are absent
 * from this listing - confirm against upstream.
 */
2275 static void
2276 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2277  const unsigned char *in_stop, unsigned char *out_stop,
2278  VALUE destination,
2279  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2280  const char *src_encoding,
2281  const char *dst_encoding,
2282  int ecflags,
2283  VALUE ecopts)
2284 {
2285  rb_econv_t *ec;
2286  rb_transcoding *last_tc;
2287  rb_econv_result_t ret;
2288  unsigned char *out_start = *out_pos;
2289  int max_output;
2290  VALUE exc;
2291  VALUE fallback = Qnil;
2292  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2293 
2294  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2295  if (!ec)
2296  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2297 
/* Choose how the fallback object is invoked: hash lookup, Proc call, Method
 * call, or a generic sym_aref send. */
2298  if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) {
2299  fallback = rb_hash_aref(ecopts, sym_fallback);
2300  if (RB_TYPE_P(fallback, T_HASH)) {
2301  fallback_func = hash_fallback;
2302  }
2303  else if (rb_obj_is_proc(fallback)) {
2304  fallback_func = proc_fallback;
2305  }
2306  else if (rb_obj_is_method(fallback)) {
2307  fallback_func = method_fallback;
2308  }
2309  else {
2310  fallback_func = aref_fallback;
2311  }
2312  }
2313  last_tc = ec->last_tc;
2314  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2315 
2316  resume:
2317  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2318 
2319  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2320  VALUE rep = rb_enc_str_new(
2321  (const char *)ec->last_error.error_bytes_start,
2324  rep = (*fallback_func)(fallback, rep);
2325  if (rep != Qundef && !NIL_P(rep)) {
2326  StringValue(rep);
2327  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2328  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
/* rb_econv_insert_output returns an int (0 or -1) that is stored in `ret`. */
2329  if ((int)ret == -1) {
2330  rb_raise(rb_eArgError, "too big fallback string");
2331  }
2332  goto resume;
2333  }
2334  }
2335 
2336  if (ret == econv_invalid_byte_sequence ||
2337  ret == econv_incomplete_input ||
2338  ret == econv_undefined_conversion) {
/* Build the exception first: it reads ec, which is closed before raising. */
2339  exc = make_econv_exception(ec);
2340  rb_econv_close(ec);
2341  rb_exc_raise(exc);
2342  }
2343 
2344  if (ret == econv_destination_buffer_full) {
2345  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2346  goto resume;
2347  }
2348 
2349  rb_econv_close(ec);
2350  return;
2351 }
2352 #else
2353 /* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative transcode_loop, compiled only if the `#if 1` above is
 * changed: feeds the converter one input byte at a time with
 * ECONV_PARTIAL_INPUT and finishes with a call without that flag.
 *
 * NOTE(review): the statement that gives `ret` its initial value and
 * several `case` labels of the switch are absent from this listing
 * - confirm against upstream.
 */
2354 static void
2355 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2356  const unsigned char *in_stop, unsigned char *out_stop,
2357  VALUE destination,
2358  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2359  const char *src_encoding,
2360  const char *dst_encoding,
2361  int ecflags,
2362  VALUE ecopts)
2363 {
2364  rb_econv_t *ec;
2365  rb_transcoding *last_tc;
2366  rb_econv_result_t ret;
2367  unsigned char *out_start = *out_pos;
2368  const unsigned char *ptr;
2369  int max_output;
2370  VALUE exc;
2371 
2372  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2373  if (!ec)
2374  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2375 
2376  last_tc = ec->last_tc;
2377  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2378 
2380  ptr = *in_pos;
2381  while (ret != econv_finished) {
2382  unsigned char input_byte;
2383  const unsigned char *p = &input_byte;
2384 
/* The converter wants input: hand over one byte, or signal end of input by
 * calling without ECONV_PARTIAL_INPUT. */
2385  if (ret == econv_source_buffer_empty) {
2386  if (ptr < in_stop) {
2387  input_byte = *ptr;
2388  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2389  }
2390  else {
2391  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2392  }
2393  }
2394  else {
2395  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2396  }
/* Advance by however many bytes the converter consumed (0 or 1). */
2397  if (&input_byte != p)
2398  ptr += p - &input_byte;
2399  switch (ret) {
2403  exc = make_econv_exception(ec);
2404  rb_econv_close(ec);
2405  rb_exc_raise(exc);
2406  break;
2407 
2409  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2410  break;
2411 
2413  break;
2414 
2415  case econv_finished:
2416  break;
2417  }
2418  }
2419  rb_econv_close(ec);
2420  *in_pos = in_stop;
2421  return;
2422 }
2423 #endif
2424 
2425 
2426 /*
2427  * String-specific code
2428  */
2429 
2430 static unsigned char *
2431 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2432 {
2433  rb_str_resize(destination, new_len);
2434  return (unsigned char *)RSTRING_PTR(destination);
2435 }
2436 
/*
 * Translate the converter options in hash OPT into ECONV_* flag bits,
 * starting from ECFLAGS, and return the resulting flags.
 *
 * Options read: sym_invalid, sym_undef, sym_replace, sym_xml, and either
 * sym_newline or the individual newline decorator options.  Unrecognised
 * option values raise ArgumentError.
 *
 * NOTE(review): several flag-setting statements (in the xml branches, the
 * universal-newline branch, and the start of the per-decorator block) are
 * absent from this listing - confirm against upstream.
 */
2437 static int
2438 econv_opts(VALUE opt, int ecflags)
2439 {
2440  VALUE v;
2441 
2442  v = rb_hash_aref(opt, sym_invalid);
2443  if (NIL_P(v)) {
2444  }
2445  else if (v==sym_replace) {
2446  ecflags |= ECONV_INVALID_REPLACE;
2447  }
2448  else {
2449  rb_raise(rb_eArgError, "unknown value for invalid character option");
2450  }
2451 
2452  v = rb_hash_aref(opt, sym_undef);
2453  if (NIL_P(v)) {
2454  }
2455  else if (v==sym_replace) {
2456  ecflags |= ECONV_UNDEF_REPLACE;
2457  }
2458  else {
2459  rb_raise(rb_eArgError, "unknown value for undefined character option");
2460  }
2461 
/* A replacement given without invalid-replace turns on undef-replace. */
2462  v = rb_hash_aref(opt, sym_replace);
2463  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2464  ecflags |= ECONV_UNDEF_REPLACE;
2465  }
2466 
2467  v = rb_hash_aref(opt, sym_xml);
2468  if (!NIL_P(v)) {
2469  if (v==sym_text) {
2471  }
2472  else if (v==sym_attr) {
2474  }
2475  else if (TYPE(v) == T_SYMBOL) {
2476  rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
2477  }
2478  else {
2479  rb_raise(rb_eArgError, "unexpected value for xml option");
2480  }
2481  }
2482 
2483 #ifdef ENABLE_ECONV_NEWLINE_OPTION
/* A sym_newline option replaces any newline decorator bits already set. */
2484  v = rb_hash_aref(opt, sym_newline);
2485  if (!NIL_P(v)) {
2486  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2487  if (v == sym_universal) {
2489  }
2490  else if (v == sym_crlf) {
2491  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2492  }
2493  else if (v == sym_cr) {
2494  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2495  }
2496  else if (v == sym_lf) {
2497  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2498  }
2499  else if (SYMBOL_P(v)) {
2500  rb_raise(rb_eArgError, "unexpected value for newline option: %s",
2501  rb_id2name(SYM2ID(v)));
2502  }
2503  else {
2504  rb_raise(rb_eArgError, "unexpected value for newline option");
2505  }
2506  }
2507  else
2508 #endif
2509  {
/* setflags collects decorators whose option is truthy; newlineflag records
 * whether any of these options was present (non-nil) at all. */
2510  int setflags = 0, newlineflag = 0;
2511 
2513  if (RTEST(v))
2515  newlineflag |= !NIL_P(v);
2516 
2517  v = rb_hash_aref(opt, sym_crlf_newline);
2518  if (RTEST(v))
2519  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2520  newlineflag |= !NIL_P(v);
2521 
2522  v = rb_hash_aref(opt, sym_cr_newline);
2523  if (RTEST(v))
2524  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2525  newlineflag |= !NIL_P(v);
2526 
/* Existing newline bits are only overridden when an option was present. */
2527  if (newlineflag) {
2528  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2529  ecflags |= setflags;
2530  }
2531  }
2532 
2533  return ecflags;
2534 }
2535 
2536 int
2537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2538 {
2539  VALUE newhash = Qnil;
2540  VALUE v;
2541 
2542  if (NIL_P(opthash)) {
2543  *opts = Qnil;
2544  return ecflags;
2545  }
2546  ecflags = econv_opts(opthash, ecflags);
2547 
2548  v = rb_hash_aref(opthash, sym_replace);
2549  if (!NIL_P(v)) {
2550  StringValue(v);
2552  VALUE dumped = rb_str_dump(v);
2553  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2554  StringValueCStr(dumped),
2555  rb_enc_name(rb_enc_get(v)));
2556  }
2557  v = rb_str_new_frozen(v);
2558  newhash = rb_hash_new();
2559  rb_hash_aset(newhash, sym_replace, v);
2560  }
2561 
2562  v = rb_hash_aref(opthash, sym_fallback);
2563  if (!NIL_P(v)) {
2564  VALUE h = rb_check_hash_type(v);
2565  if (NIL_P(h)
2567  : (v = h, 1)) {
2568  if (NIL_P(newhash))
2569  newhash = rb_hash_new();
2570  rb_hash_aset(newhash, sym_fallback, v);
2571  }
2572  }
2573 
2574  if (!NIL_P(newhash))
2575  rb_hash_freeze(newhash);
2576  *opts = newhash;
2577 
2578  return ecflags;
2579 }
2580 
2581 int
2583 {
2584  return rb_econv_prepare_options(opthash, opts, 0);
2585 }
2586 
/*
 * Open a converter and apply the :replace entry of an option hash.
 *
 * source_encoding, destination_encoding and ecflags are handed to
 * rb_econv_open() unchanged.
 * opthash must be nil or a frozen Hash; anything else triggers rb_bug().
 * If opthash has a non-nil :replace value, that string and the name of
 * its encoding are installed with rb_econv_set_replacement().
 *
 * Returns the converter, or NULL when rb_econv_open() fails or when
 * rb_econv_set_replacement() returns -1 (the converter is closed first).
 */
2587 rb_econv_t *
2588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2589 {
2590  rb_econv_t *ec;
2591  VALUE replacement;
2592 
2593  if (NIL_P(opthash)) {
2594  replacement = Qnil;
2595  }
2596  else {
2597  if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
2598  rb_bug("rb_econv_open_opts called with invalid opthash");
2599  replacement = rb_hash_aref(opthash, sym_replace); /* nil when :replace is absent */
2600  }
2601 
2602  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2603  if (!ec)
2604  return ec; /* ec is NULL here: pass the failure on to the caller */
2605 
2606  if (!NIL_P(replacement)) {
2607  int ret;
2608  rb_encoding *enc = rb_enc_get(replacement);
2609 
2610  ret = rb_econv_set_replacement(ec,
2611  (const unsigned char *)RSTRING_PTR(replacement),
2612  RSTRING_LEN(replacement),
2613  rb_enc_name(enc));
2614  if (ret == -1) {
2615  rb_econv_close(ec); /* do not leak the converter on failure */
2616  return NULL;
2617  }
2618  }
2619  return ec;
2620 }
2621 
/*
 * Resolve an encoding argument to a name and, when possible, an
 * rb_encoding.
 *
 * arg:    the argument to resolve; read through the pointer.
 * name_p: receives the encoding name.
 * enc_p:  receives the rb_encoding, or NULL when none was found.
 *
 * When rb_to_encoding_index() yields a negative index, or
 * rb_enc_from_index() yields NULL for it, *enc_p is NULL, the name is
 * taken from *arg with StringValueCStr(), and 0 is returned.
 * Otherwise the name is rb_enc_name() of the encoding and the encoding
 * index is returned.
 */
2622 static int
2623 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
2624 {
2625  rb_encoding *enc;
2626  const char *n;
2627  int encidx;
2628  VALUE encval; /* assigned below but never read afterwards */
2629 
2630  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2631  !(enc = rb_enc_from_index(encidx))) {
2632  enc = NULL;
2633  encidx = 0; /* unresolved encodings are reported with index 0 */
2634  n = StringValueCStr(*arg);
2635  }
2636  else {
2637  n = rb_enc_name(enc);
2638  }
2639 
2640  *name_p = n;
2641  *enc_p = enc;
2642 
2643  return encidx;
2644 }
2645 
/*
 * Resolve the destination and source encodings for transcoding +str+.
 *
 * arg1: destination encoding argument, resolved with enc_arg().
 * arg2: source encoding argument; when *arg2 is nil the encoding of
 *       +str+ itself is used as the source.
 * sname_p/senc_p: receive the source name and rb_encoding.
 * dname_p/denc_p: receive the destination name and rb_encoding.
 *
 * Returns the destination index as reported by enc_arg(). The source
 * index is computed but not passed back to the caller.
 */
2646 static int
2647 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
2648  const char **sname_p, rb_encoding **senc_p,
2649  const char **dname_p, rb_encoding **denc_p)
2650 {
2651  rb_encoding *senc, *denc;
2652  const char *sname, *dname;
2653  int sencidx, dencidx;
2654 
2655  dencidx = enc_arg(arg1, &dname, &denc);
2656 
2657  if (NIL_P(*arg2)) {
2658  sencidx = rb_enc_get_index(str); /* no explicit source: fall back to str's encoding */
2659  senc = rb_enc_from_index(sencidx);
2660  sname = rb_enc_name(senc);
2661  }
2662  else {
2663  sencidx = enc_arg(arg2, &sname, &senc);
2664  }
2665 
2666  *sname_p = sname;
2667  *senc_p = senc;
2668  *dname_p = dname;
2669  *denc_p = denc;
2670  return dencidx;
2671 }
2672 
2673 static int
2674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2675 {
2676  VALUE dest;
2677  VALUE str = *self;
2678  volatile VALUE arg1, arg2;
2679  long blen, slen;
2680  unsigned char *buf, *bp, *sp;
2681  const unsigned char *fromp;
2682  rb_encoding *senc, *denc;
2683  const char *sname, *dname;
2684  int dencidx;
2685 
2686  if (argc <0 || argc > 2) {
2687  rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
2688  }
2689 
2690  if (argc == 0) {
2691  arg1 = rb_enc_default_internal();
2692  if (NIL_P(arg1)) {
2693  if (!ecflags) return -1;
2694  arg1 = rb_obj_encoding(str);
2695  }
2697  }
2698  else {
2699  arg1 = argv[0];
2700  }
2701  arg2 = argc<=1 ? Qnil : argv[1];
2702  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2703 
2704  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2708  if (senc && senc == denc) {
2709  return NIL_P(arg2) ? -1 : dencidx;
2710  }
2711  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2713  return dencidx;
2714  }
2715  }
2716  if (encoding_equal(sname, dname)) {
2717  return NIL_P(arg2) ? -1 : dencidx;
2718  }
2719  }
2720  else {
2721  if (encoding_equal(sname, dname)) {
2722  sname = "";
2723  dname = "";
2724  }
2725  }
2726 
2727  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2728  slen = RSTRING_LEN(str);
2729  blen = slen + 30; /* len + margin */
2730  dest = rb_str_tmp_new(blen);
2731  bp = (unsigned char *)RSTRING_PTR(dest);
2732 
2733  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2734  if (fromp != sp+slen) {
2735  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2736  }
2737  buf = (unsigned char *)RSTRING_PTR(dest);
2738  *bp = '\0';
2739  rb_str_set_len(dest, bp - buf);
2740 
2741  /* set encoding */
2742  if (!denc) {
2743  dencidx = rb_define_dummy_encoding(dname);
2744  }
2745  *self = dest;
2746 
2747  return dencidx;
2748 }
2749 
2750 static int
2752 {
2753  VALUE opt;
2754  int ecflags = 0;
2755  VALUE ecopts = Qnil;
2756 
2757  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2758  if (!NIL_P(opt)) {
2759  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2760  }
2761  return str_transcode0(argc, argv, self, ecflags, ecopts);
2762 }
2763 
2764 static inline VALUE
2765 str_encode_associate(VALUE str, int encidx)
2766 {
2767  int cr = 0;
2768 
2769  rb_enc_associate_index(str, encidx);
2770 
2771  /* transcoded string never be broken. */
2772  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2774  }
2775  else {
2776  cr = ENC_CODERANGE_VALID;
2777  }
2778  ENC_CODERANGE_SET(str, cr);
2779  return str;
2780 }
2781 
2782 /*
2783  * call-seq:
2784  * str.encode!(encoding [, options] ) -> str
2785  * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2786  *
2787  * The first form transcodes the contents of <i>str</i> from
2788  * str.encoding to +encoding+.
2789  * The second form transcodes the contents of <i>str</i> from
2790  * src_encoding to dst_encoding.
2791  * The options Hash gives details for conversion. See String#encode
2792  * for details.
2793  * Returns the string even if no changes were made.
2794  */
2795 
2796 static VALUE
2798 {
2799  VALUE newstr;
2800  int encidx;
2801 
2802  rb_check_frozen(str);
2803 
2804  newstr = str;
2805  encidx = str_transcode(argc, argv, &newstr);
2806 
2807  if (encidx < 0) return str;
2808  if (newstr == str) {
2809  rb_enc_associate_index(str, encidx);
2810  return str;
2811  }
2812  rb_str_shared_replace(str, newstr);
2813  return str_encode_associate(str, encidx);
2814 }
2815 
2816 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2817 
2818 /*
2819  * call-seq:
2820  * str.encode(encoding [, options] ) -> str
2821  * str.encode(dst_encoding, src_encoding [, options] ) -> str
2822  * str.encode([options]) -> str
2823  *
2824  * The first form returns a copy of +str+ transcoded
2825  * to encoding +encoding+.
2826  * The second form returns a copy of +str+ transcoded
2827  * from src_encoding to dst_encoding.
2828  * The last form returns a copy of +str+ transcoded to
2829  * <tt>Encoding.default_internal</tt>.
2830  *
2831  * By default, the first and second form raise
2832  * Encoding::UndefinedConversionError for characters that are
2833  * undefined in the destination encoding, and
2834  * Encoding::InvalidByteSequenceError for invalid byte sequences
2835  * in the source encoding. The last form by default does not raise
2836  * exceptions but uses replacement strings.
2837  *
2838  * Please note that conversion from an encoding +enc+ to the
2839  * same encoding +enc+ is a no-op, i.e. the receiver is returned without
2840  * any changes, and no exceptions are raised, even if there are invalid bytes.
2841  *
2842  * The +options+ Hash gives details for conversion and can have the following
2843  * keys:
2844  *
2845  * :invalid ::
2846  * If the value is +:replace+, #encode replaces invalid byte sequences in
2847  * +str+ with the replacement character. The default is to raise the
2848  * Encoding::InvalidByteSequenceError exception
2849  * :undef ::
2850  * If the value is +:replace+, #encode replaces characters which are
2851  * undefined in the destination encoding with the replacement character.
2852  * The default is to raise the Encoding::UndefinedConversionError.
2853  * :replace ::
2854  * Sets the replacement string to the given value. The default replacement
2855  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2856  * :fallback ::
2857  * Sets the replacement string for an undefined character by means of
2858  * the given object. The object should be a Hash, a Proc, a Method, or an
2859  * object which has a [] method.
2860  * Its key is the undefined character encoded in the source encoding
2861  * of the current transcoder. Its value can be in any encoding as long as
2862  * it can be converted into the destination encoding of the transcoder.
2863  * :xml ::
2864  * The value must be +:text+ or +:attr+.
2865  * If the value is +:text+ #encode replaces undefined characters with their
2866  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2867  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2868  * If the value is +:attr+, #encode also quotes the replacement result
2869  * (using '"'), and replaces '"' with "&quot;".
2870  * :cr_newline ::
2871  * Replaces LF ("\n") with CR ("\r") if value is true.
2872  * :crlf_newline ::
2873  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2874  * :universal_newline ::
2875  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2876  */
2877 
2878 static VALUE
2880 {
2881  VALUE newstr = str;
2882  int encidx = str_transcode(argc, argv, &newstr);
2883  return encoded_dup(newstr, str, encidx);
2884 }
2885 
/*
 * Transcode +str+ to the encoding given by +to+.
 *
 * Calls str_transcode0() with +to+ as the single encoding argument and
 * with the supplied ecflags/ecopts, then builds the return value with
 * encoded_dup(). +str+ itself is passed by value and is not reassigned.
 */
2886 VALUE
2887 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2888 {
2889  int argc = 1;
2890  VALUE *argv = &to; /* present the single argument as an argv of length 1 */
2891  VALUE newstr = str;
2892  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2893  return encoded_dup(newstr, str, encidx);
2894 }
2895 
/*
 * Build the return value of a non-destructive encode.
 *
 * newstr: the string left in place by the transcoding step; equal to
 *         +str+ when no new string was produced.
 * str:    the receiver.
 * encidx: the index returned by the transcoding step.
 *
 * - encidx < 0: returns a plain duplicate of +str+.
 * - newstr == str: returns a duplicate of +str+ associated with encidx.
 * - otherwise: gives +newstr+ the class of +str+ and finishes through
 *   str_encode_associate().
 */
2896 static VALUE
2897 encoded_dup(VALUE newstr, VALUE str, int encidx)
2898 {
2899  if (encidx < 0) return rb_str_dup(str);
2900  if (newstr == str) {
2901  newstr = rb_str_dup(str);
2902  rb_enc_associate_index(newstr, encidx); /* bytes unchanged, only the encoding label differs */
2903  return newstr;
2904  }
2905  else {
2906  RBASIC(newstr)->klass = rb_obj_class(str); /* result takes the receiver's class */
2907  }
2908  return str_encode_associate(newstr, encidx);
2909 }
2910 
/*
 * Release a converter: treats +ptr+ as an rb_econv_t and closes it
 * with rb_econv_close().
 * NOTE(review): presumably the free function of econv_data_type -- the
 * initializer is not fully visible here; confirm.
 */
2911 static void
2912 econv_free(void *ptr)
2913 {
2914  rb_econv_t *ec = ptr;
2915  rb_econv_close(ec);
2916 }
2917 
/*
 * Report the size of a converter: sizeof(rb_econv_t) for a non-NULL
 * +ptr+, 0 otherwise. Only the struct itself is counted.
 */
2918 static size_t
2919 econv_memsize(const void *ptr)
2920 {
2921  return ptr ? sizeof(rb_econv_t) : 0;
2922 }
2923 
2925  "econv",
2927 };
2928 
2929 static VALUE
2931 {
2932  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2933 }
2934 
2935 static rb_encoding *
2937 {
2938  rb_encoding *enc;
2939  int idx;
2940  idx = rb_define_dummy_encoding(name);
2941  enc = rb_enc_from_index(idx);
2942  return enc;
2943 }
2944 
/*
 * Return the encoding called +name+: the result of rb_enc_find() when
 * it finds one, otherwise the result of make_dummy_encoding(name).
 */
2945 static rb_encoding *
2946 make_encoding(const char *name)
2947 {
2948  rb_encoding *enc;
2949  enc = rb_enc_find(name);
2950  if (!enc)
2951  enc = make_dummy_encoding(name); /* not found: fall back to a dummy encoding */
2952  return enc;
2953 }
2954 
/*
 * Return the Encoding object for +name+: make_encoding(name) wrapped
 * with rb_enc_from_encoding().
 */
2955 static VALUE
2956 make_encobj(const char *name)
2957 {
2958  return rb_enc_from_encoding(make_encoding(name));
2959 }
2960 
2961 /*
2962  * call-seq:
2963  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2964  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2965  *
2966  * Returns the corresponding ASCII compatible encoding.
2967  *
2968  * Returns nil if the argument is an ASCII compatible encoding.
2969  *
2970  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2971  * can represent exactly the same characters as the given ASCII incompatible encoding.
2972  * So, no undefined conversion error occurs when converting between the two encodings.
2973  *
2974  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2975  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2976  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2977  *
2978  */
2979 static VALUE
2981 {
2982  const char *arg_name, *result_name;
2983  rb_encoding *arg_enc, *result_enc;
2984 
2985  enc_arg(&arg, &arg_name, &arg_enc);
2986 
2987  result_name = rb_econv_asciicompat_encoding(arg_name);
2988 
2989  if (result_name == NULL)
2990  return Qnil;
2991 
2992  result_enc = make_encoding(result_name);
2993 
2994  return rb_enc_from_encoding(result_enc);
2995 }
2996 
2997 static void
2999  volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
3000  const char **sname_p, const char **dname_p,
3001  rb_encoding **senc_p, rb_encoding **denc_p,
3002  int *ecflags_p,
3003  VALUE *ecopts_p)
3004 {
3005  VALUE opt, flags_v, ecopts;
3006  int sidx, didx;
3007  const char *sname, *dname;
3008  rb_encoding *senc, *denc;
3009  int ecflags;
3010 
3011  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3012 
3013  if (!NIL_P(flags_v)) {
3014  if (!NIL_P(opt)) {
3015  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)",
3016  argc + 1);
3017  }
3018  ecflags = NUM2INT(rb_to_int(flags_v));
3019  ecopts = Qnil;
3020  }
3021  else if (!NIL_P(opt)) {
3022  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3023  }
3024  else {
3025  ecflags = 0;
3026  ecopts = Qnil;
3027  }
3028 
3029  senc = NULL;
3030  sidx = rb_to_encoding_index(*snamev_p);
3031  if (0 <= sidx) {
3032  senc = rb_enc_from_index(sidx);
3033  }
3034  else {
3035  StringValue(*snamev_p);
3036  }
3037 
3038  denc = NULL;
3039  didx = rb_to_encoding_index(*dnamev_p);
3040  if (0 <= didx) {
3041  denc = rb_enc_from_index(didx);
3042  }
3043  else {
3044  StringValue(*dnamev_p);
3045  }
3046 
3047  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3048  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3049 
3050  *sname_p = sname;
3051  *dname_p = dname;
3052  *senc_p = senc;
3053  *denc_p = denc;
3054  *ecflags_p = ecflags;
3055  *ecopts_p = ecopts;
3056 }
3057 
3058 static int
3059 decorate_convpath(VALUE convpath, int ecflags)
3060 {
3061  int num_decorators;
3062  const char *decorators[MAX_ECFLAGS_DECORATORS];
3063  int i;
3064  int n, len;
3065 
3066  num_decorators = decorator_names(ecflags, decorators);
3067  if (num_decorators == -1)
3068  return -1;
3069 
3070  len = n = RARRAY_LENINT(convpath);
3071  if (n != 0) {
3072  VALUE pair = RARRAY_PTR(convpath)[n-1];
3073  if (TYPE(pair) == T_ARRAY) {
3074  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
3075  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
3076  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3077  const rb_transcoder *tr = load_transcoder_entry(entry);
3078  if (!tr)
3079  return -1;
3080  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3082  n--;
3083  rb_ary_store(convpath, len + num_decorators - 1, pair);
3084  }
3085  }
3086  else {
3087  rb_ary_store(convpath, len + num_decorators - 1, pair);
3088  }
3089  }
3090 
3091  for (i = 0; i < num_decorators; i++)
3092  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3093 
3094  return 0;
3095 }
3096 
/*
 * Callback passed to transcode_search_path() that records one step of
 * a conversion path.
 *
 * sname, dname: source and destination names of the step.
 * depth:        array index at which the step is stored.
 * arg:          pointer to a VALUE holding the result array; the array
 *               is created on first use when the VALUE is still nil.
 *
 * A decorator step is stored as a String holding dname; any other step
 * is stored as a two-element array of Encoding objects.
 */
3097 static void
3098 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3099 {
3100  VALUE *ary_p = arg;
3101  VALUE v;
3102 
3103  if (*ary_p == Qnil) {
3104  *ary_p = rb_ary_new(); /* first step seen: create the result array */
3105  }
3106 
3107  if (DECORATOR_P(sname, dname)) {
3108  v = rb_str_new_cstr(dname);
3109  }
3110  else {
3111  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3112  }
3113  rb_ary_store(*ary_p, depth, v);
3114 }
3115 
3116 /*
3117  * call-seq:
3118  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3119  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3120  *
3121  * Returns a conversion path.
3122  *
3123  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3124  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3125  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3126  *
3127  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3128  * or
3129  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3130  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3131  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3132  * # "universal_newline"]
3133  *
3134  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3135  * or
3136  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3137  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3138  * # "universal_newline",
3139  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3140  */
3141 static VALUE
3143 {
3144  volatile VALUE snamev, dnamev;
3145  const char *sname, *dname;
3146  rb_encoding *senc, *denc;
3147  int ecflags;
3148  VALUE ecopts;
3149  VALUE convpath;
3150 
3151  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3152 
3153  convpath = Qnil;
3154  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3155 
3156  if (NIL_P(convpath))
3157  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3158 
3159  if (decorate_convpath(convpath, ecflags) == -1)
3160  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3161 
3162  return convpath;
3163 }
3164 
3165 /*
3166  * Check the existence of a conversion path.
3167  * Returns RTEST() of the path array collected by search_convpath_i().
3168  * result: true: a path was collected  false: convpath stayed nil
3169  */
3170 int
3171 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3172 {
3173  VALUE convpath = Qnil; /* only assigned by search_convpath_i() */
3174  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3175  &convpath);
3176  return RTEST(convpath); /* the return value of transcode_search_path() is ignored */
3177 }
3178 
3181  int index;
3182  int ret;
3183 };
3184 
/*
 * Callback passed to transcode_search_path() by
 * rb_econv_init_by_convpath(): adds one conversion step to a converter.
 *
 * sname, dname: source and destination names of the step.
 * depth:        not used here.
 * arg:          a struct rb_econv_init_by_convpath_t supplying the
 *               converter (ec), the insertion index (index) and the
 *               running status (ret).
 *
 * Once ret is -1 every later call returns immediately. Otherwise the
 * result of rb_econv_add_converter() is stored in ret. The index field
 * is not modified here.
 */
3185 static void
3186 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3187 {
3188  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3189  int ret;
3190 
3191  if (a->ret == -1)
3192  return; /* an earlier step already failed */
3193 
3194  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3195 
3196  a->ret = ret;
3197  return;
3198 }
3200 static rb_econv_t *
3202  const char **sname_p, const char **dname_p,
3203  rb_encoding **senc_p, rb_encoding**denc_p)
3204 {
3205  rb_econv_t *ec;
3206  long i;
3207  int ret, first=1;
3208  VALUE elt;
3209  rb_encoding *senc = 0, *denc = 0;
3210  const char *sname, *dname;
3211 
3212  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3213  DATA_PTR(self) = ec;
3214 
3215  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3216  volatile VALUE snamev, dnamev;
3217  VALUE pair;
3218  elt = rb_ary_entry(convpath, i);
3219  if (!NIL_P(pair = rb_check_array_type(elt))) {
3220  if (RARRAY_LEN(pair) != 2)
3221  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3222  snamev = rb_ary_entry(pair, 0);
3223  enc_arg(&snamev, &sname, &senc);
3224  dnamev = rb_ary_entry(pair, 1);
3225  enc_arg(&dnamev, &dname, &denc);
3226  }
3227  else {
3228  sname = "";
3229  dname = StringValueCStr(elt);
3230  }
3231  if (DECORATOR_P(sname, dname)) {
3232  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3233  if (ret == -1)
3234  rb_raise(rb_eArgError, "decoration failed: %s", dname);
3235  }
3236  else {
3237  int j = ec->num_trans;
3238  struct rb_econv_init_by_convpath_t arg;
3239  arg.ec = ec;
3240  arg.index = ec->num_trans;
3241  arg.ret = 0;
3242  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3243  if (ret == -1 || arg.ret == -1)
3244  rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
3245  if (first) {
3246  first = 0;
3247  *senc_p = senc;
3248  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3249  }
3250  *denc_p = denc;
3251  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3252  }
3253  }
3254 
3255  if (first) {
3256  *senc_p = NULL;
3257  *denc_p = NULL;
3258  *sname_p = "";
3259  *dname_p = "";
3260  }
3261 
3262  ec->source_encoding_name = *sname_p;
3263  ec->destination_encoding_name = *dname_p;
3264 
3265  return ec;
3266 }
3267 
3268 /*
3269  * call-seq:
3270  * Encoding::Converter.new(source_encoding, destination_encoding)
3271  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3272  * Encoding::Converter.new(convpath)
3273  *
3274  * possible options elements:
3275  * hash form:
3276  * :invalid => nil # raise error on invalid byte sequence (default)
3277  * :invalid => :replace # replace invalid byte sequence
3278  * :undef => nil # raise error on undefined conversion (default)
3279  * :undef => :replace # replace undefined conversion
3280  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3281  * :newline => :universal # decorator for converting CRLF and CR to LF
3282  * :newline => :crlf # decorator for converting LF to CRLF
3283  * :newline => :cr # decorator for converting LF to CR
3284  * :universal_newline => true # decorator for converting CRLF and CR to LF
3285  * :crlf_newline => true # decorator for converting LF to CRLF
3286  * :cr_newline => true # decorator for converting LF to CR
3287  * :xml => :text # escape as XML CharData.
3288  * :xml => :attr # escape as XML AttValue
3289  * integer form:
3290  * Encoding::Converter::INVALID_REPLACE
3291  * Encoding::Converter::UNDEF_REPLACE
3292  * Encoding::Converter::UNDEF_HEX_CHARREF
3293  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3294  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3295  * Encoding::Converter::CR_NEWLINE_DECORATOR
3296  * Encoding::Converter::XML_TEXT_DECORATOR
3297  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3298  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3299  *
3300  * Encoding::Converter.new creates an instance of Encoding::Converter.
3301  *
3302  * Source_encoding and destination_encoding should be a string or
3303  * Encoding object.
3304  *
3305  * opt should be nil, a hash or an integer.
3306  *
3307  * convpath should be an array.
3308  * convpath may contain
3309  * - two-element arrays which contain encodings or encoding names, or
3310  * - strings representing decorator names.
3311  *
3312  * Encoding::Converter.new optionally takes an option.
3313  * The option should be a hash or an integer.
3314  * The option hash can contain :invalid => nil, etc.
3315  * The option integer should be logical-or of constants such as
3316  * Encoding::Converter::INVALID_REPLACE, etc.
3317  *
3318  * [:invalid => nil]
3319  * Raise an error on an invalid byte sequence. This is the default behavior.
3320  * [:invalid => :replace]
3321  * Replace an invalid byte sequence with the replacement string.
3322  * [:undef => nil]
3323  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3324  * This is the default behavior.
3325  * [:undef => :replace]
3326  * Replace undefined character in destination_encoding with replacement string.
3327  * [:replace => string]
3328  * Specify the replacement string.
3329  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3330  * [:universal_newline => true]
3331  * Convert CRLF and CR to LF.
3332  * [:crlf_newline => true]
3333  * Convert LF to CRLF.
3334  * [:cr_newline => true]
3335  * Convert LF to CR.
3336  * [:xml => :text]
3337  * Escape as XML CharData.
3338  * This form can be used as a HTML 4.0 #PCDATA.
3339  * - '&' -> '&amp;'
3340  * - '<' -> '&lt;'
3341  * - '>' -> '&gt;'
3342  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3343  * [:xml => :attr]
3344  * Escape as XML AttValue.
3345  * The converted result is quoted as "...".
3346  * This form can be used as a HTML 4.0 attribute value.
3347  * - '&' -> '&amp;'
3348  * - '<' -> '&lt;'
3349  * - '>' -> '&gt;'
3350  * - '"' -> '&quot;'
3351  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3352  *
3353  * Examples:
3354  * # UTF-16BE to UTF-8
3355  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3356  *
3357  * # Usually, decorators such as newline conversion are inserted last.
3358  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3359  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3360  * # "universal_newline"]
3361  *
3362  * # But, if the last encoding is ASCII incompatible,
3363  * # decorators are inserted before the last conversion.
3364  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3365  * p ec.convpath #=> ["crlf_newline",
3366  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3367  *
3368  * # Conversion path can be specified directly.
3369  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3370  * p ec.convpath #=> ["universal_newline",
3371  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3372  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3373  */
3374 static VALUE
3376 {
3377  VALUE ecopts;
3378  volatile VALUE snamev, dnamev;
3379  const char *sname, *dname;
3380  rb_encoding *senc, *denc;
3381  rb_econv_t *ec;
3382  int ecflags;
3383  VALUE convpath;
3384 
3385  if (rb_check_typeddata(self, &econv_data_type)) {
3386  rb_raise(rb_eTypeError, "already initialized");
3387  }
3388 
3389  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3390  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3391  ecflags = 0;
3392  ecopts = Qnil;
3393  }
3394  else {
3395  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3396  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3397  }
3398 
3399  if (!ec) {
3400  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3401  }
3402 
3403  if (!DECORATOR_P(sname, dname)) {
3404  if (!senc)
3405  senc = make_dummy_encoding(sname);
3406  if (!denc)
3407  denc = make_dummy_encoding(dname);
3408  }
3409 
3410  ec->source_encoding = senc;
3411  ec->destination_encoding = denc;
3412 
3413  DATA_PTR(self) = ec;
3414 
3415  return self;
3416 }
3417 
3418 /*
3419  * call-seq:
3420  * ec.inspect -> string
3421  *
3422  * Returns a printable version of <i>ec</i>
3423  *
3424  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3425  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3426  *
3427  */
3428 static VALUE
3430 {
3431  const char *cname = rb_obj_classname(self);
3432  rb_econv_t *ec;
3433 
3434  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3435  if (!ec)
3436  return rb_sprintf("#<%s: uninitialized>", cname);
3437  else {
3438  const char *sname = ec->source_encoding_name;
3439  const char *dname = ec->destination_encoding_name;
3440  VALUE str;
3441  str = rb_sprintf("#<%s: ", cname);
3442  econv_description(sname, dname, ec->flags, str);
3443  rb_str_cat2(str, ">");
3444  return str;
3445  }
3446 }
3447 
3448 static rb_econv_t *
3450 {
3451  rb_econv_t *ec;
3452 
3453  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3454  if (!ec) {
3455  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3456  }
3457  return ec;
3458 }
3459 
3460 /*
3461  * call-seq:
3462  * ec.source_encoding -> encoding
3463  *
3464  * Returns the source encoding as an Encoding object.
3465  */
3466 static VALUE
3468 {
3469  rb_econv_t *ec = check_econv(self);
3470  if (!ec->source_encoding)
3471  return Qnil;
3473 }
3474 
3475 /*
3476  * call-seq:
3477  * ec.destination_encoding -> encoding
3478  *
3479  * Returns the destination encoding as an Encoding object.
3480  */
3481 static VALUE
3483 {
3484  rb_econv_t *ec = check_econv(self);
3485  if (!ec->destination_encoding)
3486  return Qnil;
3488 }
3489 
3490 /*
3491  * call-seq:
3492  * ec.convpath -> ary
3493  *
3494  * Returns the conversion path of ec.
3495  *
3496  * The result is an array of conversions.
3497  *
3498  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3499  * p ec.convpath
3500  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3501  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3502  * # "crlf_newline"]
3503  *
3504  * Each element of the array is a pair of encodings or a string.
3505  * A pair means an encoding conversion.
3506  * A string means a decorator.
3507  *
3508  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3509  * a converter from ISO-8859-1 to UTF-8.
3510  * "crlf_newline" means newline converter from LF to CRLF.
3511  */
3512 static VALUE
3514 {
3515  rb_econv_t *ec = check_econv(self);
3516  VALUE result;
3517  int i;
3518 
3519  result = rb_ary_new();
3520  for (i = 0; i < ec->num_trans; i++) {
3521  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3522  VALUE v;
3523  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3524  v = rb_str_new_cstr(tr->dst_encoding);
3525  else
3527  rb_ary_push(result, v);
3528  }
3529  return result;
3530 }
3531 
3532 /*
3533  * call-seq:
3534  * ec == other -> true or false
3535  */
3536 static VALUE
3538 {
3539  rb_econv_t *ec1 = check_econv(self);
3540  rb_econv_t *ec2;
3541  int i;
3542 
3543  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3544  return Qnil;
3545  }
3546  ec2 = DATA_PTR(other);
3547  if (!ec2) return Qfalse;
3548  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3549  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3550  return Qfalse;
3553  return Qfalse;
3554  if (ec1->flags != ec2->flags) return Qfalse;
3555  if (ec1->replacement_enc != ec2->replacement_enc &&
3556  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3557  return Qfalse;
3558  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3559  if (ec1->replacement_str != ec2->replacement_str &&
3561  return Qfalse;
3562 
3563  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3564  for (i = 0; i < ec1->num_trans; i++) {
3565  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3566  return Qfalse;
3567  }
3568  return Qtrue;
3569 }
3570 
3571 static VALUE
3573 {
3574  switch (res) {
3580  case econv_finished: return sym_finished;
3581  case econv_after_output: return sym_after_output;
3582  default: return INT2NUM(res); /* should not be reached */
3583  }
3584 }
3585 
3586 /*
3587  * call-seq:
3588  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3589  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3590  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3591  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3592  *
3593  * possible opt elements:
3594  * hash form:
3595  * :partial_input => true # source buffer may be part of larger source
3596  * :after_output => true # stop conversion after output before input
3597  * integer form:
3598  * Encoding::Converter::PARTIAL_INPUT
3599  * Encoding::Converter::AFTER_OUTPUT
3600  *
3601  * possible results:
3602  * :invalid_byte_sequence
3603  * :incomplete_input
3604  * :undefined_conversion
3605  * :after_output
3606  * :destination_buffer_full
3607  * :source_buffer_empty
3608  * :finished
3609  *
3610  * primitive_convert converts source_buffer into destination_buffer.
3611  *
3612  * source_buffer should be a string or nil.
3613  * nil means an empty string.
3614  *
3615  * destination_buffer should be a string.
3616  *
3617  * destination_byteoffset should be an integer or nil.
3618  * nil means the end of destination_buffer.
3619  * If it is omitted, nil is assumed.
3620  *
3621  * destination_bytesize should be an integer or nil.
3622  * nil means unlimited.
3623  * If it is omitted, nil is assumed.
3624  *
3625  * opt should be nil, a hash or an integer.
3626  * nil means no flags.
3627  * If it is omitted, nil is assumed.
3628  *
3629  * primitive_convert converts the content of source_buffer from beginning
3630  * and store the result into destination_buffer.
3631  *
3632  * destination_byteoffset and destination_bytesize specify the region which
3633  * the converted result is stored.
3634  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3635  * If destination_byteoffset is nil,
3636  * destination_buffer.bytesize is used for appending the result.
3637  * destination_bytesize specifies maximum number of bytes.
3638  * If destination_bytesize is nil,
3639  * destination size is unlimited.
3640  * After conversion, destination_buffer is resized to
3641  * destination_byteoffset + actually produced number of bytes.
3642  * Also destination_buffer's encoding is set to destination_encoding.
3643  *
3644  * primitive_convert drops the converted part of source_buffer.
3645  * the dropped part is converted in destination_buffer or
3646  * buffered in Encoding::Converter object.
3647  *
3648  * primitive_convert stops conversion when one of the following conditions is met.
3649  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3650  * - unexpected end of source buffer (:incomplete_input)
3651  * this occurs only when :partial_input is not specified.
3652  * - character not representable in output encoding (:undefined_conversion)
3653  * - after some output is generated, before input is done (:after_output)
3654  * this occurs only when :after_output is specified.
3655  * - destination buffer is full (:destination_buffer_full)
3656  * this occurs only when destination_bytesize is non-nil.
3657  * - source buffer is empty (:source_buffer_empty)
3658  * this occurs only when :partial_input is specified.
3659  * - conversion is finished (:finished)
3660  *
3661  * example:
3662  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3663  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3664  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3665  *
3666  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3667  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3668  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3669  * ret = ec.primitive_convert(src, dst="", nil, 1)
3670  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3671  * ret = ec.primitive_convert(src, dst="", nil, 1)
3672  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3673  * ret = ec.primitive_convert(src, dst="", nil, 1)
3674  * p [ret, src, dst] #=> [:finished, "", "i"]
3675  *
3676  */
3677 static VALUE
3679 {
3680  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3681  rb_econv_t *ec = check_econv(self);
3682  rb_econv_result_t res;
3683  const unsigned char *ip, *is;
3684  unsigned char *op, *os;
3685  long output_byteoffset, output_bytesize;
3686  unsigned long output_byteend;
3687  int flags;
3688 
3689  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3690 
3691  if (NIL_P(output_byteoffset_v))
3692  output_byteoffset = 0; /* dummy */
3693  else
3694  output_byteoffset = NUM2LONG(output_byteoffset_v);
3695 
3696  if (NIL_P(output_bytesize_v))
3697  output_bytesize = 0; /* dummy */
3698  else
3699  output_bytesize = NUM2LONG(output_bytesize_v);
3700 
3701  if (!NIL_P(flags_v)) {
3702  if (!NIL_P(opt)) {
3703  rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..5)",
3704  argc + 1);
3705  }
3706  flags = NUM2INT(rb_to_int(flags_v));
3707  }
3708  else if (!NIL_P(opt)) {
3709  VALUE v;
3710  flags = 0;
3711  v = rb_hash_aref(opt, sym_partial_input);
3712  if (RTEST(v))
3713  flags |= ECONV_PARTIAL_INPUT;
3714  v = rb_hash_aref(opt, sym_after_output);
3715  if (RTEST(v))
3716  flags |= ECONV_AFTER_OUTPUT;
3717  }
3718  else {
3719  flags = 0;
3720  }
3721 
3722  StringValue(output);
3723  if (!NIL_P(input))
3724  StringValue(input);
3725  rb_str_modify(output);
3726 
3727  if (NIL_P(output_bytesize_v)) {
3728  output_bytesize = RSTRING_EMBED_LEN_MAX;
3729  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3730  output_bytesize = RSTRING_LEN(input);
3731  }
3732 
3733  retry:
3734 
3735  if (NIL_P(output_byteoffset_v))
3736  output_byteoffset = RSTRING_LEN(output);
3737 
3738  if (output_byteoffset < 0)
3739  rb_raise(rb_eArgError, "negative output_byteoffset");
3740 
3741  if (RSTRING_LEN(output) < output_byteoffset)
3742  rb_raise(rb_eArgError, "output_byteoffset too big");
3743 
3744  if (output_bytesize < 0)
3745  rb_raise(rb_eArgError, "negative output_bytesize");
3746 
3747  output_byteend = (unsigned long)output_byteoffset +
3748  (unsigned long)output_bytesize;
3749 
3750  if (output_byteend < (unsigned long)output_byteoffset ||
3751  LONG_MAX < output_byteend)
3752  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3753 
3754  if (rb_str_capacity(output) < output_byteend)
3755  rb_str_resize(output, output_byteend);
3756 
3757  if (NIL_P(input)) {
3758  ip = is = NULL;
3759  }
3760  else {
3761  ip = (const unsigned char *)RSTRING_PTR(input);
3762  is = ip + RSTRING_LEN(input);
3763  }
3764 
3765  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3766  os = op + output_bytesize;
3767 
3768  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3769  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3770  if (!NIL_P(input))
3771  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3772 
3773  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3774  if (LONG_MAX / 2 < output_bytesize)
3775  rb_raise(rb_eArgError, "too long conversion result");
3776  output_bytesize *= 2;
3777  output_byteoffset_v = Qnil;
3778  goto retry;
3779  }
3780 
3781  if (ec->destination_encoding) {
3783  }
3784 
3785  return econv_result_to_symbol(res);
3786 }
3787 
3788 /*
3789  * call-seq:
3790  * ec.convert(source_string) -> destination_string
3791  *
3792  * Convert source_string and return destination_string.
3793  *
3794  * source_string is assumed as a part of source.
3795  * i.e. :partial_input=>true is specified internally.
3796  * finish method should be used last.
3797  *
3798  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3799  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3800  * puts ec.finish.dump #=> ""
3801  *
3802  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3803  * puts ec.convert("\xA4").dump #=> ""
3804  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3805  * puts ec.finish.dump #=> ""
3806  *
3807  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3808  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3809  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3810  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3811  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3812  *
3813  * If a conversion error occurs,
3814  * Encoding::UndefinedConversionError or
3815  * Encoding::InvalidByteSequenceError is raised.
3816  * Encoding::Converter#convert doesn't supply methods to recover or restart
3817  * from these exceptions.
3818  * When you want to handle these conversion errors,
3819  * use Encoding::Converter#primitive_convert.
3820  *
3821  */
3822 static VALUE
3823 econv_convert(VALUE self, VALUE source_string)
3824 {
3825  VALUE ret, dst;
3826  VALUE av[5];
3827  int ac;
3828  rb_econv_t *ec = check_econv(self);
3829 
3830  StringValue(source_string);
3831 
3832  dst = rb_str_new(NULL, 0);
3833 
3834  av[0] = rb_str_dup(source_string);
3835  av[1] = dst;
3836  av[2] = Qnil;
3837  av[3] = Qnil;
3838  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3839  ac = 5;
3840 
3841  ret = econv_primitive_convert(ac, av, self);
3842 
3843  if (ret == sym_invalid_byte_sequence ||
3844  ret == sym_undefined_conversion ||
3845  ret == sym_incomplete_input) {
3846  VALUE exc = make_econv_exception(ec);
3847  rb_exc_raise(exc);
3848  }
3849 
3850  if (ret == sym_finished) {
3851  rb_raise(rb_eArgError, "converter already finished");
3852  }
3853 
3854  if (ret != sym_source_buffer_empty) {
3855  rb_bug("unexpected result of econv_primitive_convert");
3856  }
3857 
3858  return dst;
3859 }
3860 
3861 /*
3862  * call-seq:
3863  * ec.finish -> string
3864  *
3865  * Finishes the converter.
3866  * It returns the last part of the converted string.
3867  *
3868  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3869  * p ec.convert("\u3042") #=> "\e$B$\""
3870  * p ec.finish #=> "\e(B"
3871  */
3872 static VALUE
3874 {
3875  VALUE ret, dst;
3876  VALUE av[5];
3877  int ac;
3878  rb_econv_t *ec = check_econv(self);
3879 
3880  dst = rb_str_new(NULL, 0);
3881 
3882  av[0] = Qnil;
3883  av[1] = dst;
3884  av[2] = Qnil;
3885  av[3] = Qnil;
3886  av[4] = INT2NUM(0);
3887  ac = 5;
3888 
3889  ret = econv_primitive_convert(ac, av, self);
3890 
3891  if (ret == sym_invalid_byte_sequence ||
3892  ret == sym_undefined_conversion ||
3893  ret == sym_incomplete_input) {
3894  VALUE exc = make_econv_exception(ec);
3895  rb_exc_raise(exc);
3896  }
3897 
3898  if (ret != sym_finished) {
3899  rb_bug("unexpected result of econv_primitive_convert");
3900  }
3901 
3902  return dst;
3903 }
3904 
3905 /*
3906  * call-seq:
3907  * ec.primitive_errinfo -> array
3908  *
3909  * primitive_errinfo returns important information regarding the last error
3910  * as a 5-element array:
3911  *
3912  * [result, enc1, enc2, error_bytes, readagain_bytes]
3913  *
3914  * result is the last result of primitive_convert.
3915  *
3916  * Other elements are only meaningful when result is
3917  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3918  *
3919  * enc1 and enc2 indicate a conversion step as a pair of strings.
3920  * For example, a converter from EUC-JP to ISO-8859-1 converts
3921  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3922  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3923  *
3924  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3925  * error_bytes is discarded portion.
3926  * readagain_bytes is buffered portion which is read again on next conversion.
3927  *
3928  * Example:
3929  *
3930  * # \xff is invalid as EUC-JP.
3931  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3932  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3933  * p ec.primitive_errinfo
3934  * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3935  *
3936  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3937  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3938  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3939  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3940  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3941  * p ec.primitive_errinfo
3942  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3943  *
3944  * # partial character is invalid
3945  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3946  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3947  * p ec.primitive_errinfo
3948  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3949  *
3950  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3951  * # partial characters.
3952  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3953  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3954  * p ec.primitive_errinfo
3955  * #=> [:source_buffer_empty, nil, nil, nil, nil]
3956  *
3957  * # \xd8\x00\x00@ is invalid as UTF-16BE because
3958  * # no low surrogate after high surrogate (\xd8\x00).
3959  * # It is detected by the 3rd byte (\x00), which is part of the next character.
3960  * # So the high surrogate (\xd8\x00) is discarded and
3961  * # the 3rd byte is read again later.
3962  * # Since the byte is buffered in ec, it is dropped from src.
3963  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3964  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3965  * p ec.primitive_errinfo
3966  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3967  * p src
3968  * #=> "@"
3969  *
3970  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3971  * # The problem is detected by 4th byte.
3972  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3973  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3974  * p ec.primitive_errinfo
3975  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3976  * p src
3977  * #=> ""
3978  *
3979  */
3980 static VALUE
3982 {
3983  rb_econv_t *ec = check_econv(self);
3984 
3985  VALUE ary;
3986 
3987  ary = rb_ary_new2(5);
3988 
3990  rb_ary_store(ary, 4, Qnil);
3991 
3992  if (ec->last_error.source_encoding)
3994 
3997 
3998  if (ec->last_error.error_bytes_start) {
4001  }
4002 
4003  return ary;
4004 }
4005 
4006 /*
4007  * call-seq:
4008  * ec.insert_output(string) -> nil
4009  *
4010  * Inserts string into the encoding converter.
4011  * The string will be converted to the destination encoding and
4012  * output on later conversions.
4013  *
4014  * If the destination encoding is stateful,
4015  * string is converted according to the state and the state is updated.
4016  *
4017  * This method should be used only when a conversion error occurs.
4018  *
4019  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4020  * src = "HIRAGANA LETTER A is \u{3042}."
4021  * dst = ""
4022  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4023  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4024  * ec.insert_output("<err>")
4025  * p ec.primitive_convert(src, dst) #=> :finished
4026  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4027  *
4028  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4029  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4030  * dst = ""
4031  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4032  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4033  * ec.insert_output "?" # state change required to output "?".
4034  * p ec.primitive_convert(src, dst) #=> :finished
4035  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4036  *
4037  */
4038 static VALUE
4040 {
4041  const char *insert_enc;
4042 
4043  int ret;
4044 
4045  rb_econv_t *ec = check_econv(self);
4046 
4047  StringValue(string);
4048  insert_enc = rb_econv_encoding_to_insert_output(ec);
4049  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4050 
4051  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4052  if (ret == -1) {
4053  rb_raise(rb_eArgError, "too big string");
4054  }
4055 
4056  return Qnil;
4057 }
4058 
4059 /*
4060  * call-seq:
4061  * ec.putback -> string
4062  * ec.putback(max_numbytes) -> string
4063  *
4064  * Put back the bytes which will be converted.
4065  *
4066  * The bytes are left over from an invalid_byte_sequence error.
4067  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4068  * some bytes are buffered to be converted later.
4069  * The latter bytes can be put back.
4070  * It can be observed by
4071  * Encoding::InvalidByteSequenceError#readagain_bytes and
4072  * Encoding::Converter#primitive_errinfo.
4073  *
4074  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4075  * src = "\x00\xd8\x61\x00"
4076  * dst = ""
4077  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4078  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4079  * p ec.putback #=> "a\x00"
4080  * p ec.putback #=> "" # no more bytes to put back
4081  *
4082  */
4083 static VALUE
4085 {
4086  rb_econv_t *ec = check_econv(self);
4087  int n;
4088  int putbackable;
4089  VALUE str, max;
4090 
4091  rb_scan_args(argc, argv, "01", &max);
4092 
4093  if (NIL_P(max))
4094  n = rb_econv_putbackable(ec);
4095  else {
4096  n = NUM2INT(max);
4097  putbackable = rb_econv_putbackable(ec);
4098  if (putbackable < n)
4099  n = putbackable;
4100  }
4101 
4102  str = rb_str_new(NULL, n);
4103  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4104 
4105  if (ec->source_encoding) {
4107  }
4108 
4109  return str;
4110 }
4111 
4112 /*
4113  * call-seq:
4114  * ec.last_error -> exception or nil
4115  *
4116  * Returns an exception object for the last conversion.
4117  * Returns nil if the last conversion did not produce an error.
4118  *
4119  * "error" means
4120  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4121  * Encoding::Converter#convert, and
4122  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4123  * Encoding::Converter#primitive_convert.
4124  *
4125  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4126  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4127  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4128  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4129  * p ec.last_error #=> nil
4130  *
4131  */
4132 static VALUE
4134 {
4135  rb_econv_t *ec = check_econv(self);
4136  VALUE exc;
4137 
4138  exc = make_econv_exception(ec);
4139  if (NIL_P(exc))
4140  return Qnil;
4141  return exc;
4142 }
4143 
4144 /*
4145  * call-seq:
4146  * ec.replacement -> string
4147  *
4148  * Returns the replacement string.
4149  *
4150  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4151  * p ec.replacement #=> "?"
4152  *
4153  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4154  * p ec.replacement #=> "\uFFFD"
4155  */
4156 static VALUE
4158 {
4159  rb_econv_t *ec = check_econv(self);
4160  int ret;
4161  rb_encoding *enc;
4162 
4163  ret = make_replacement(ec);
4164  if (ret == -1) {
4165  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4166  }
4167 
4168  enc = rb_enc_find(ec->replacement_enc);
4169  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4170 }
4171 
4172 /*
4173  * call-seq:
4174  * ec.replacement = string
4175  *
4176  * Sets the replacement string.
4177  *
4178  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4179  * ec.replacement = "<undef>"
4180  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4181  */
4182 static VALUE
4184 {
4185  rb_econv_t *ec = check_econv(self);
4186  VALUE string = arg;
4187  int ret;
4188  rb_encoding *enc;
4189 
4190  StringValue(string);
4191  enc = rb_enc_get(string);
4192 
4193  ret = rb_econv_set_replacement(ec,
4194  (const unsigned char *)RSTRING_PTR(string),
4195  RSTRING_LEN(string),
4196  rb_enc_name(enc));
4197 
4198  if (ret == -1) {
4199  /* xxx: rb_eInvalidByteSequenceError? */
4200  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4201  }
4202 
4203  return arg;
4204 }
4205 
4206 VALUE
4208 {
4209  return make_econv_exception(ec);
4210 }
4211 
4212 void
4214 {
4215  VALUE exc;
4216 
4217  exc = make_econv_exception(ec);
4218  if (NIL_P(exc))
4219  return;
4220  rb_exc_raise(exc);
4221 }
4222 
4223 /*
4224  * call-seq:
4225  * ecerr.source_encoding_name -> string
4226  *
4227  * Returns the source encoding name as a string.
4228  */
4229 static VALUE
4231 {
4232  return rb_attr_get(self, rb_intern("source_encoding_name"));
4233 }
4234 
4235 /*
4236  * call-seq:
4237  * ecerr.source_encoding -> encoding
4238  *
4239  * Returns the source encoding as an encoding object.
4240  *
4241  * Note that the result may not be equal to the source encoding of
4242  * the encoding converter if the conversion has multiple steps.
4243  *
4244  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4245  * begin
4246  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4247  * rescue Encoding::UndefinedConversionError
4248  * p $!.source_encoding #=> #<Encoding:UTF-8>
4249  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4250  * p $!.source_encoding_name #=> "UTF-8"
4251  * p $!.destination_encoding_name #=> "EUC-JP"
4252  * end
4253  *
4254  */
4255 static VALUE
4257 {
4258  return rb_attr_get(self, rb_intern("source_encoding"));
4259 }
4260 
4261 /*
4262  * call-seq:
4263  * ecerr.destination_encoding_name -> string
4264  *
4265  * Returns the destination encoding name as a string.
4266  */
4267 static VALUE
4269 {
4270  return rb_attr_get(self, rb_intern("destination_encoding_name"));
4271 }
4272 
4273 /*
4274  * call-seq:
4275  * ecerr.destination_encoding -> encoding
4276  *
4277  * Returns the destination encoding as an encoding object.
4278  */
4279 static VALUE
4281 {
4282  return rb_attr_get(self, rb_intern("destination_encoding"));
4283 }
4284 
4285 /*
4286  * call-seq:
4287  * ecerr.error_char -> string
4288  *
4289  * Returns the one-character string which caused Encoding::UndefinedConversionError.
4290  *
4291  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4292  * begin
4293  * ec.convert("\xa0")
4294  * rescue Encoding::UndefinedConversionError
4295  * puts $!.error_char.dump #=> "\xC2\xA0"
4296  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4297  * end
4298  *
4299  */
4300 static VALUE
4302 {
4303  return rb_attr_get(self, rb_intern("error_char"));
4304 }
4305 
4306 /*
4307  * call-seq:
4308  * ecerr.error_bytes -> string
4309  *
4310  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4311  *
4312  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4313  * begin
4314  * ec.convert("abc\xA1\xFFdef")
4315  * rescue Encoding::InvalidByteSequenceError
4316  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4317  * puts $!.error_bytes.dump #=> "\xA1"
4318  * puts $!.readagain_bytes.dump #=> "\xFF"
4319  * end
4320  */
4321 static VALUE
4323 {
4324  return rb_attr_get(self, rb_intern("error_bytes"));
4325 }
4326 
4327 /*
4328  * call-seq:
4329  * ecerr.readagain_bytes -> string
4330  *
4331  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4332  */
4333 static VALUE
4335 {
4336  return rb_attr_get(self, rb_intern("readagain_bytes"));
4337 }
4338 
4339 /*
4340  * call-seq:
4341  * ecerr.incomplete_input? -> true or false
4342  *
4343  * Returns true if the invalid byte sequence error is caused by
4344  * premature end of string.
4345  *
4346  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4347  *
4348  * begin
4349  * ec.convert("abc\xA1z")
4350  * rescue Encoding::InvalidByteSequenceError
4351  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4352  * p $!.incomplete_input? #=> false
4353  * end
4354  *
4355  * begin
4356  * ec.convert("abc\xA1")
4357  * ec.finish
4358  * rescue Encoding::InvalidByteSequenceError
4359  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4360  * p $!.incomplete_input? #=> true
4361  * end
4362  */
4363 static VALUE
4365 {
4366  return rb_attr_get(self, rb_intern("incomplete_input"));
4367 }
4368 
4369 /*
4370  * Document-class: Encoding::UndefinedConversionError
4371  *
4372  * Raised by Encoding and String methods when a transcoding operation
4373  * fails.
4374  */
4375 
4376 /*
4377  * Document-class: Encoding::InvalidByteSequenceError
4378  *
4379  * Raised by Encoding and String methods when the string being
4380  * transcoded contains a byte invalid for either the source or
4381  * target encoding.
4382  */
4383 
4384 /*
4385  * Document-class: Encoding::ConverterNotFoundError
4386  *
4387  * Raised by transcoding methods when a named encoding does not
4388  * correspond with a known converter.
4389  */
4390 
/*
 * Initialization routine for this file (its name line is absent from this
 * listing; presumably Init_transcode -- TODO confirm).
 *
 * Visible statements: creates transcoder_table with st_init_strcasetable(),
 * interns the option and result symbols used throughout this file, defines
 * String#encode and String#encode!, and finally calls Init_newline().
 *
 * NOTE(review): the embedded line numbers skip 4392, 4394-4396, 4432-4450,
 * 4452-4464, 4466-4470 and 4472-4478, so statements are missing from this
 * listing at those points; restore them from the original file.
 */
4391 void
4393 {
4397 
4398  transcoder_table = st_init_strcasetable();
4399 
4400  sym_invalid = ID2SYM(rb_intern("invalid"));
4401  sym_undef = ID2SYM(rb_intern("undef"));
4402  sym_replace = ID2SYM(rb_intern("replace"));
4403  sym_fallback = ID2SYM(rb_intern("fallback"));
4404  sym_aref = ID2SYM(rb_intern("[]"));
4405  sym_xml = ID2SYM(rb_intern("xml"));
4406  sym_text = ID2SYM(rb_intern("text"));
4407  sym_attr = ID2SYM(rb_intern("attr"));
4408 
4409  sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4410  sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4411  sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4412  sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4413  sym_finished = ID2SYM(rb_intern("finished"));
4414  sym_after_output = ID2SYM(rb_intern("after_output"));
4415  sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4416  sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4417  sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4418  sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4419  sym_partial_input = ID2SYM(rb_intern("partial_input"));
4420 
4421 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4422  sym_newline = ID2SYM(rb_intern("newline"));
4423  sym_universal = ID2SYM(rb_intern("universal"));
4424  sym_crlf = ID2SYM(rb_intern("crlf"));
4425  sym_cr = ID2SYM(rb_intern("cr"));
4426  sym_lf = ID2SYM(rb_intern("lf"));
4427 #endif
4428 
4429  rb_define_method(rb_cString, "encode", str_encode, -1);
4430  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4431 
4451 
4465 
4471 
4479 
4480  Init_newline();
4481 }
RUBY_EXTERN VALUE rb_cString
Definition: ruby.h:1276
#define BL_ACTION(byte)
#define FOURbt
static VALUE sym_replace
Definition: transcode.c:27
const char * ascii_incompat_name
Definition: transcode.c:1771
unsigned char ary[8]
Definition: transcode.c:67
#define RSTRING_LEN(string)
Definition: generator.h:45
static long NUM2LONG(VALUE x)
Definition: ruby.h:510
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2582
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:320
#define T_SYMBOL
Definition: ruby.h:430
Definition: string.c:4963
#define FUNio
VALUE(* func_si)(void *, const unsigned char *, size_t)
search_path_queue_t * queue
Definition: transcode.c:252
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:651
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4213
VALUE next_info
Definition: transcode.c:60
RUBY_EXTERN VALUE rb_cData
Definition: ruby.h:1253
static VALUE econv_destination_encoding(VALUE self)
Definition: transcode.c:3482
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:135
static VALUE sym_undefined_conversion
Definition: transcode.c:38
#define NOMAP
VALUE rb_eConverterNotFoundError
Definition: transcode.c:23
VALUE rb_ary_new4(long n, const VALUE *elts)
Definition: array.c:366
rb_econv_result_t
Definition: encoding.h:238
int(* state_fini_func)(void *)
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:956
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:136
unsigned char * in_buf_end
Definition: transcode.c:126
const unsigned char * error_bytes_start
Definition: transcode.c:139
void rb_bug(const char *fmt,...)
Definition: error.c:265
rb_econv_result_t last_result
Definition: transcode.c:108
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:152
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4207
const char * dst_encoding
rb_econv_result_t result
Definition: transcode.c:135
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:230
static VALUE sym_invalid_byte_sequence
Definition: transcode.c:37
size_t strlen(const char *)
struct search_path_queue_tag search_path_queue_t
#define DECORATOR_P(sname, dname)
Definition: transcode.c:154
int i
Definition: win32ole.c:776
Definition: st.h:77
#define GB4bt
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1452
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2045
Definition: st.h:100
VALUE rb_cEncoding
Definition: encoding.c:39
#define NUM2INT(x)
Definition: ruby.h:536
static int max(int a, int b)
#define ZERObt
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1342
static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2276
VALUE rb_eInvalidByteSequenceError
Definition: transcode.c:22
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:321
static void econv_args(int argc, VALUE *argv, volatile VALUE *snamev_p, volatile VALUE *dnamev_p, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p, int *ecflags_p, VALUE *ecopts_p)
Definition: transcode.c:2998
int(* state_init_func)(void *)
#define getGB4bt1(a)
#define FL_TAINT
Definition: ruby.h:925
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1940
ssize_t writebuf_len
Definition: transcode.c:72
#define st_foreach
Definition: regint.h:150
static void rb_transcoding_close(rb_transcoding *tc)
Definition: transcode.c:825
rb_encoding * source_encoding
Definition: transcode.c:146
static VALUE sym_newline
Definition: transcode.c:33
#define Qtrue
Definition: ruby.h:366
unsigned char * out_data_start
Definition: transcode.c:105
static int decorate_convpath(VALUE convpath, int ecflags)
Definition: transcode.c:3059
static int enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
Definition: transcode.c:2623
static VALUE sym_crlf_newline
Definition: transcode.c:30
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:826
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1032
static size_t rb_transcoding_memsize(rb_transcoding *tc)
Definition: transcode.c:841
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:63
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:840
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2537
unsigned char * in_data_start
Definition: transcode.c:124
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:303
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1923
static int str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, const char **sname_p, rb_encoding **senc_p, const char **dname_p, rb_encoding **denc_p)
Definition: transcode.c:2647
VALUE rb_method_call(int, VALUE *, VALUE)
Definition: proc.c:1408
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:179
#define getBT3(a)
rb_encoding * destination_encoding
Definition: transcode.c:147
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:324
struct rb_transcoding * tc
Definition: transcode.c:103
#define SUSPEND(ret, num)
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:102
#define next_byte
static VALUE sym_cr_newline
Definition: transcode.c:31
#define bp()
Definition: debug.h:27
VALUE rb_eTypeError
Definition: error.c:467
#define next_table
static rb_encoding * to_encoding(VALUE enc)
Definition: encoding.c:163
static int str_transcode(int argc, VALUE *argv, VALUE *self)
Definition: transcode.c:2751
static VALUE sym_aref
Definition: transcode.c:27
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1819
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:740
VALUE rb_eEncodingError
Definition: error.c:473
static VALUE econv_last_error(VALUE self)
Definition: transcode.c:4133
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:891
static VALUE INT2NUM(int v)
Definition: ruby.h:981
#define UNDEF
struct rb_transcoding * error_tc
Definition: transcode.c:136
static rb_econv_t * rb_econv_alloc(int n_hint)
Definition: transcode.c:859
#define RSTRING_PTR(string)
Definition: generator.h:42
int rb_enc_str_coderange(VALUE)
Definition: string.c:324
static rb_econv_t * rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
Definition: transcode.c:936
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:514
VALUE rb_to_int(VALUE)
Definition: object.c:2141
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:1574
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:727
unsigned int conv_tree_start
static void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3186
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define T_HASH
Definition: ruby.h:421
const char * lib
Definition: transcode.c:159
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2887
#define RARRAY_LEN(ARRAY)
Definition: generator.h:39
#define THREEbt
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2588
#define STR1
union rb_transcoding::@72 writebuf
#define DATA_PTR(dta)
Definition: ruby.h:795
const rb_transcoder * transcoder
Definition: transcode.c:160
#define next_info
static int output_replacement_character(rb_econv_t *ec)
Definition: transcode.c:2240
#define T_ARRAY
Definition: ruby.h:420
#define st_lookup
Definition: regint.h:149
const char * dname
Definition: transcode.c:158
static rb_econv_result_t rb_trans_conv(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int *result_position_ptr)
Definition: transcode.c:1181
void callback(ffi_cif *cif, void *resp, void **args, void *ctx)
Definition: closure.c:53
static rb_econv_result_t transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:760
static VALUE econv_finish(VALUE self)
Definition: transcode.c:3873
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1125
static VALUE econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
Definition: transcode.c:1987
static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:166
static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry)
Definition: transcode.c:364
VALUE rb_str_tmp_new(long)
static int transcode_search_path(const char *sname, const char *dname, void(*callback)(const char *sname, const char *dname, int depth, void *arg), void *arg)
Definition: transcode.c:279
#define ID2SYM(i)
Definition: cparse.c:63
unsigned char * in_buf_start
Definition: transcode.c:123
static rb_econv_t * rb_econv_open0(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:979
static void econv_free(void *ptr)
Definition: transcode.c:2912
const char * enc
Definition: transcode.c:247
static VALUE sym_source_buffer_empty
Definition: transcode.c:40
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1759
#define FUNsio
ssize_t(* func_so)(void *, const unsigned char *, size_t, unsigned char *, size_t)
#define ENC_CODERANGE_7BIT
Definition: encoding.h:58
size_t error_bytes_len
Definition: transcode.c:140
const char * rb_obj_classname(VALUE)
Definition: variable.c:318
#define getGB4bt2(a)
static VALUE sym_crlf
Definition: transcode.c:33
static VALUE econv_convert(VALUE self, VALUE source_string)
Definition: transcode.c:3823
static VALUE sym_partial_input
Definition: transcode.c:35
#define dp(v)
Definition: debug.h:23
static const char transcoder_lib_prefix[]
Definition: transcode.c:231
static rb_econv_t * rb_econv_init_by_convpath(VALUE self, VALUE convpath, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p)
Definition: transcode.c:3201
Win32OLEIDispatch * p
Definition: win32ole.c:778
void rb_exc_raise(VALUE mesg)
Definition: eval.c:460
static unsigned char * output
Definition: nkf.c:32
static const char * get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
Definition: transcode.c:396
static VALUE str_encode_associate(VALUE str, int encidx)
Definition: transcode.c:2765
st_table * st_init_strcasetable(void)
Definition: st.c:229
#define FUNii
st_table * visited
Definition: transcode.c:251
#define RB_TYPE_P(obj, type)
Definition: ruby.h:1353
static VALUE ecerr_incomplete_input(VALUE self)
Definition: transcode.c:4364
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3171
#define fail()
#define FL_UNTRUSTED
Definition: ruby.h:926
static unsigned char * str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
Definition: transcode.c:2431
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:145
ssize_t readagain_len
Definition: transcode.c:65
static VALUE econv_primitive_errinfo(VALUE self)
Definition: transcode.c:3981
ssize_t(* finish_func)(void *, unsigned char *, size_t)
unsigned int output_index
Definition: transcode.c:62
unsigned int input
Definition: nkf.c:3916
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:84
static size_t econv_memsize(const void *ptr)
Definition: transcode.c:2919
#define ALLOC_N(type, n)
Definition: ruby.h:1034
void Init_transcode(void)
Definition: transcode.c:4392
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1123
unsigned char * in_data_end
Definition: transcode.c:125
static VALUE str_encode_bang(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2797
Definition: transcode.c:156
static VALUE str_encode(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2879
int num_finished
Definition: transcode.c:130
const char * destination_encoding
Definition: transcode.c:138
static int rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
Definition: transcode.c:1900
#define MAX_TRANSCODER_LIBNAME_LEN
Definition: transcode.c:230
int resume_position
Definition: transcode.c:58
#define ECONV_INVALID_MASK
Definition: encoding.h:305
VALUE rb_eRuntimeError
Definition: error.c:466
#define SYM2ID(v)
Definition: cparse.c:66
#define RSTRING_END(str)
Definition: ruby.h:680
struct rb_econv_t rb_econv_t
Definition: encoding.h:248
#define SUSPEND_AFTER_OUTPUT(num)
#define getGB4bt3(a)
void Init_newline(void)
Definition: newline.c:183
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:430
VALUE rb_str_cat2(VALUE, const char *)
Definition: string.c:1900
#define ECONV_INVALID_REPLACE
Definition: encoding.h:306
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1708
VALUE rb_ary_new(void)
Definition: array.c:339
static VALUE econv_get_replacement(VALUE self)
Definition: transcode.c:4157
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:335
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:336
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:374
static void more_output_buffer(VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), int max_output, unsigned char **out_start_ptr, unsigned char **out_pos, unsigned char **out_stop_ptr)
Definition: transcode.c:2157
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:1923
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:635
static VALUE sym_attr
Definition: transcode.c:28
static VALUE econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
Definition: transcode.c:3142
#define OBJ_FROZEN(x)
Definition: ruby.h:969
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1863
static st_table * transcoder_table
Definition: transcode.c:163
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1593
const char * sname
Definition: transcode.c:157
#define TYPE(x)
Definition: ruby.h:441
int argc
Definition: ruby.c:120
#define Qfalse
Definition: ruby.h:365
static VALUE make_econv_exception(rb_econv_t *ec)
Definition: transcode.c:2056
VALUE rb_cEncodingConverter
Definition: transcode.c:25
VALUE rb_require_safe(VALUE, int)
Definition: load.c:591
static const rb_data_type_t econv_data_type
Definition: transcode.c:2924
ssize_t(* func_sio)(void *, const unsigned char *, size_t, VALUE, unsigned char *, size_t)
#define ALLOCA_N(type, n)
Definition: ruby.h:1038
static VALUE econv_set_replacement(VALUE self, VALUE arg)
Definition: transcode.c:4183
#define TRANSCODING_STATE(tc)
Definition: transcode.c:97
#define LONG_MAX
Definition: ruby.h:185
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1053
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:60
static VALUE sym_fallback
Definition: transcode.c:27
char ary[sizeof(double) > sizeof(void *)?sizeof(double):sizeof(void *)]
Definition: transcode.c:80
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:709
int err
Definition: win32.c:78
arg
Definition: ripper.y:1283
#define OBJ_FREEZE(x)
Definition: ruby.h:970
static VALUE method_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2264
rb_transcoder_asciicompat_type_t asciicompat_type
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:234
#define PRIdPTRDIFF
Definition: ruby.h:155
static VALUE econv_equal(VALUE self, VALUE other)
Definition: transcode.c:3537
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1906
#define ENC_CODERANGE_VALID
Definition: encoding.h:59
#define ECONV_UNDEF_MASK
Definition: encoding.h:308
#define ALLOC(type)
Definition: ruby.h:1035
#define SUSPEND_OBUF(num)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:1771
static int str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
Definition: transcode.c:2674
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:205
size_t rb_str_capacity(VALUE)
Definition: string.c:357
unsigned char * out_buf_start
Definition: transcode.c:104
static int transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:258
#define getGB4bt0(a)
static VALUE econv_putback(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:4084
ssize_t recognized_len
Definition: transcode.c:64
static VALUE sym_xml
Definition: transcode.c:28
int num_trans
Definition: transcode.c:129
#define FUNso
static void search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3098
static rb_econv_t * check_econv(VALUE self)
Definition: transcode.c:3449
int num_additional
Definition: transcode.c:964
#define REALLOC_N(var, type, n)
Definition: ruby.h:1036
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:88
static VALUE econv_s_allocate(VALUE klass)
Definition: transcode.c:2930
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:253
VALUE rb_sprintf(const char *format,...)
Definition: sprintf.c:1203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:859
static VALUE econv_insert_output(VALUE self, VALUE string)
Definition: transcode.c:4039
static VALUE ecerr_destination_encoding(VALUE self)
Definition: transcode.c:4280
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1748
#define rb_enc_name(enc)
Definition: encoding.h:121
#define RSTRING_EMBED_LEN_MAX
Definition: ruby.h:651
unsigned char * out_buf_end
Definition: transcode.c:107
static int decorator_names(int ecflags, const char **decorators_ret)
Definition: transcode.c:1035
unsigned char next_byte
Definition: transcode.c:61
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2208
struct rb_transcoding * last_tc
Definition: transcode.c:131
#define MEMMOVE(p1, p2, type, n)
Definition: ruby.h:1054
#define STR1_BYTEINDEX(w)
VALUE rb_hash_new(void)
Definition: hash.c:229
static VALUE aref_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2270
static VALUE make_encobj(const char *name)
Definition: transcode.c:2956
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1415
const char * base_enc
Definition: transcode.c:254
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1038
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:423
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:3913
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:460
#define NULL
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:318
const char * source_encoding
Definition: transcode.c:137
#define Qnil
Definition: ruby.h:367
static VALUE sym_lf
Definition: transcode.c:33
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:375
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1869
static VALUE econv_init(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3375
unsigned long VALUE
Definition: ruby.h:88
static VALUE result
Definition: nkf.c:40
#define RBASIC(obj)
Definition: ruby.h:904
union rb_transcoding::@71 readbuf
static VALUE sym_universal_newline
Definition: transcode.c:29
union rb_transcoding::rb_transcoding_state_t state
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:313
const char * src_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:831
register unsigned int len
Definition: name2ctype.h:22210
#define RARRAY_PTR(ARRAY)
Definition: generator.h:36
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:310
#define getBT1(a)
static void trans_open_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:968
#define rb_enc_asciicompat(enc)
Definition: encoding.h:181
static VALUE sym_universal
Definition: transcode.c:33
VALUE rb_str_new_cstr(const char *)
Definition: string.c:432
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
static VALUE ecerr_error_char(VALUE self)
Definition: transcode.c:4301
VALUE rb_str_dump(VALUE)
Definition: string.c:4522
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:574
const char * ascii_compat_name
Definition: transcode.c:1770
unsigned char * ptr
Definition: transcode.c:68
static rb_encoding * make_encoding(const char *name)
Definition: transcode.c:2946
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:319
#define RARRAY_LENINT(ary)
Definition: ruby.h:718
VALUE rb_str_dup(VALUE)
Definition: string.c:905
static VALUE econv_source_encoding(VALUE self)
Definition: transcode.c:3467
static VALUE proc_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2258
static VALUE sym_cr
Definition: transcode.c:33
static VALUE sym_finished
Definition: transcode.c:41
VALUE rb_funcall3(VALUE, ID, int, const VALUE *)
Calls a method.
Definition: vm_eval.c:684
long st_data_t
Definition: syck.h:69
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:30
#define FUNsi
void xfree(void *)
#define FL_UNSET(x, f)
Definition: ruby.h:960
#define INVALID
#define BL_MIN_BYTE
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1231
#define StringValueCStr(v)
Definition: ruby.h:468
static int make_replacement(rb_econv_t *ec)
Definition: transcode.c:2173
#define writebuf_len
static rb_econv_result_t transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:435
#define ONEbt
#define SYMBOL_P(v)
Definition: cparse.c:69
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:309
void rb_str_modify(VALUE)
Definition: string.c:1319
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:733
static VALUE sym_after_output
Definition: transcode.c:42
int size
Definition: encoding.c:51
static VALUE econv_inspect(VALUE self)
Definition: transcode.c:3429
#define INT2FIX(i)
Definition: ruby.h:225
static rb_transcoding * rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
Definition: transcode.c:786
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:440
VALUE rb_exc_new3(VALUE etype, VALUE str)
Definition: error.c:504
unsigned char * out_data_end
Definition: transcode.c:106
static rb_econv_result_t rb_econv_convert0(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1272
#define xmalloc
Definition: defines.h:64
#define SIZE_MAX
Definition: ruby.h:266
static int asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:1775
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1726
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1073
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:88
static const unsigned char * transcode_char_start(rb_transcoding *tc, const unsigned char *in_start, const unsigned char *inchar_start, const unsigned char *in_p, size_t *char_len_ptr)
Definition: transcode.c:415
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:472
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:508
static VALUE ecerr_error_bytes(VALUE self)
Definition: transcode.c:4322
static rb_econv_result_t rb_transcoding_convert(rb_transcoding *tc, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:813
VALUE rb_str_catf(VALUE str, const char *format,...)
Definition: sprintf.c:1239
uint8_t key[16]
Definition: random.c:1284
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:816
#define RTEST(v)
Definition: ruby.h:373
#define rb_str_set_len(str, length)
Definition: ruby_missing.h:30
static void declare_transcoder(const char *sname, const char *dname, const char *lib)
Definition: transcode.c:222
unsigned int next_table
Definition: transcode.c:59
size_t readagain_len
Definition: transcode.c:141
static int rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
Definition: transcode.c:1881
static VALUE sym_invalid
Definition: transcode.c:27
#define st_add_direct
Definition: regint.h:151
static int rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
Definition: transcode.c:897
v
Definition: win32ole.c:790
#define getBT2(a)
static VALUE econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
Definition: transcode.c:2980
int num_allocated
Definition: transcode.c:128
#define BYTE_ADDR(index)
const char * destination_encoding_name
Definition: transcode.c:114
static VALUE econv_convpath(VALUE self)
Definition: transcode.c:3513
static int trans_sweep(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int start)
Definition: transcode.c:1100
struct rb_econv_t::@73 last_error
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1339
VALUE rb_ary_new2(long capa)
Definition: array.c:332
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1792
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:424
#define rb_safe_level()
Definition: tcltklib.c:90
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1509
static VALUE ecerr_source_encoding(VALUE self)
Definition: transcode.c:4256
static int output_hex_charref(rb_econv_t *ec)
Definition: transcode.c:1396
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1875
#define hash_fallback
Definition: transcode.c:2255
static VALUE ecerr_readagain_bytes(VALUE self)
Definition: transcode.c:4334
const char * name
Definition: nkf.c:208
#define xrealloc
Definition: defines.h:67
VALUE rb_eUndefinedConversionError
Definition: transcode.c:21
const char * rb_id2name(ID id)
Definition: ripper.c:15493
int started
Definition: transcode.c:116
rb_econv_elem_t * elems
Definition: transcode.c:127
static VALUE sym_text
Definition: transcode.c:28
const char * replacement_enc
Definition: transcode.c:120
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:672
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:3238
const char * source_encoding_name
Definition: transcode.c:113
#define st_free_table
Definition: regint.h:152
size_t replacement_len
Definition: transcode.c:119
int replacement_allocated
Definition: transcode.c:121
static VALUE sym_undef
Definition: transcode.c:27
#define BL_MAX_BYTE
struct search_path_queue_tag * next
Definition: transcode.c:246
int rb_enc_find_index(const char *name)
Definition: encoding.c:596
static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx)
Definition: transcode.c:2897
static int econv_opts(VALUE opt, int ecflags)
Definition: transcode.c:2438
#define rb_check_frozen(obj)
Definition: intern.h:242
static VALUE sym_destination_buffer_full
Definition: transcode.c:39
#define getBT0(a)
static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, unsigned char *caller_dst_buf, size_t caller_dst_bufsize, size_t *dst_len_ptr)
Definition: transcode.c:1525
#define long
Definition: name2ctype.h:37
const rb_transcoder * transcoder
Definition: transcode.c:54
static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:189
#define rb_intern(str)
ssize_t writebuf_off
Definition: transcode.c:71
VALUE rb_str_buf_new(long)
Definition: string.c:736
#define TWObt
struct rb_transcoding rb_transcoding
#define Qundef
Definition: ruby.h:368
st_index_t num_entries
Definition: st.h:93
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1209
const unsigned char * replacement_str
Definition: transcode.c:118
VALUE rb_str_new2(const char *)
#define STR1_LENGTH(byte_addr)
VALUE(* func_ii)(void *, VALUE)
#define encoding_equal(enc1, enc2)
Definition: transcode.c:243
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:92
static rb_encoding * make_dummy_encoding(const char *name)
Definition: transcode.c:2936
VALUE rb_eArgError
Definition: error.c:468
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:317
#define writebuf_off
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:620
transcoder_entry_t ** entries
Definition: transcode.c:963
static VALUE econv_result_to_symbol(rb_econv_result_t res)
Definition: transcode.c:3572
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1032
char ** argv
Definition: ruby.c:121
#define StringValue(v)
Definition: ruby.h:466
static VALUE ecerr_source_encoding_name(VALUE self)
Definition: transcode.c:4230
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:512
static VALUE econv_primitive_convert(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3678
ssize_t(* func_io)(void *, VALUE, const unsigned char *, size_t)
VALUE rb_obj_class(VALUE)
Definition: object.c:177
VALUE rb_str_new(const char *, long)
Definition: string.c:410
static VALUE ecerr_destination_encoding_name(VALUE self)
Definition: transcode.c:4268
static VALUE sym_incomplete_input
Definition: transcode.c:43