Ruby 2.0.0p648 (2015-12-16 revision 53162)
transcode.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author: drbrain $
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "internal.h"
15 #include "transcode_data.h"
16 #include <ctype.h>
17 
18 #define ENABLE_ECONV_NEWLINE_OPTION 1
19 
20 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
24 
26 
32 #ifdef ENABLE_ECONV_NEWLINE_OPTION
34 #endif
36 
44 
45 static unsigned char *
46 allocate_converted_string(const char *sname, const char *dname,
47  const unsigned char *str, size_t len,
48  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
49  size_t *dst_len_ptr);
50 
51 /* dynamic structure, one per conversion (similar to iconv_t) */
52 /* may carry conversion state (e.g. for iso-2022-jp) */
53 typedef struct rb_transcoding {
55 
56  int flags;
57 
59  unsigned int next_table;
61  unsigned char next_byte;
62  unsigned int output_index;
63 
64  ssize_t recognized_len; /* already interpreted */
65  ssize_t readagain_len; /* not yet interpreted */
66  union {
67  unsigned char ary[8]; /* max_input <= sizeof(ary) */
68  unsigned char *ptr; /* length: max_input */
69  } readbuf; /* recognized_len + readagain_len used */
70 
71  ssize_t writebuf_off;
72  ssize_t writebuf_len;
73  union {
74  unsigned char ary[8]; /* max_output <= sizeof(ary) */
75  unsigned char *ptr; /* length: max_output */
76  } writebuf;
77 
78  union rb_transcoding_state_t { /* opaque data for stateful encoding */
79  void *ptr;
80  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
82  } state;
/*
 * Accessors for the small-buffer unions of rb_transcoding.
 *
 * readbuf, writebuf and state each hold either an embedded array (ary) or
 * a pointer (ptr).  The embedded array is used whenever the transcoder's
 * requirement (max_input, max_output, state_size) fits in it; otherwise
 * the pointer member is used.
 *
 * Each macro evaluates its argument more than once, so pass an expression
 * without side effects.
 */

/* Start of the read buffer currently in use. */
#define TRANSCODING_READBUF(tc) \
    ((int)sizeof((tc)->readbuf.ary) < (tc)->transcoder->max_input ? \
     (tc)->readbuf.ptr : \
     (tc)->readbuf.ary)

/* Start of the write buffer currently in use. */
#define TRANSCODING_WRITEBUF(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (tc)->writebuf.ptr : \
     (tc)->writebuf.ary)

/* Capacity in bytes (size_t) of the write buffer currently in use. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (size_t)(tc)->transcoder->max_output : \
     sizeof((tc)->writebuf.ary))

/* Largest state_size that still fits in the embedded state storage. */
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))

/* Start of the transcoder's opaque state area. */
#define TRANSCODING_STATE(tc) \
    ((int)sizeof((tc)->state) < (tc)->transcoder->state_size ? \
     (tc)->state.ptr : \
     (tc)->state.ary)
101 
102 typedef struct {
104  unsigned char *out_buf_start;
105  unsigned char *out_data_start;
106  unsigned char *out_data_end;
107  unsigned char *out_buf_end;
110 
111 struct rb_econv_t {
112  int flags;
113  const char *source_encoding_name;
115 
116  int started;
117 
118  const unsigned char *replacement_str;
120  const char *replacement_enc;
122 
123  unsigned char *in_buf_start;
124  unsigned char *in_data_start;
125  unsigned char *in_data_end;
126  unsigned char *in_buf_end;
132 
133  /* last error */
134  struct {
137  const char *source_encoding;
138  const char *destination_encoding;
139  const unsigned char *error_bytes_start;
142  } last_error;
143 
144  /* The following fields are only for Encoding::Converter.
145  * rb_econv_open sets them to NULL. */
148 };
149 
150 /*
151  * Dispatch data and logic
152  */
153 
154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
155 
156 typedef struct {
157  const char *sname;
158  const char *dname;
159  const char *lib; /* null means no need to load a library */
162 
164 
165 static transcoder_entry_t *
166 make_transcoder_entry(const char *sname, const char *dname)
167 {
168  st_data_t val;
169  st_table *table2;
170 
171  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
173  st_add_direct(transcoder_table, (st_data_t)sname, val);
174  }
175  table2 = (st_table *)val;
176  if (!st_lookup(table2, (st_data_t)dname, &val)) {
178  entry->sname = sname;
179  entry->dname = dname;
180  entry->lib = NULL;
181  entry->transcoder = NULL;
182  val = (st_data_t)entry;
183  st_add_direct(table2, (st_data_t)dname, val);
184  }
185  return (transcoder_entry_t *)val;
186 }
187 
188 static transcoder_entry_t *
189 get_transcoder_entry(const char *sname, const char *dname)
190 {
191  st_data_t val;
192  st_table *table2;
193 
194  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
195  return NULL;
196  }
197  table2 = (st_table *)val;
198  if (!st_lookup(table2, (st_data_t)dname, &val)) {
199  return NULL;
200  }
201  return (transcoder_entry_t *)val;
202 }
203 
204 void
206 {
207  const char *const sname = tr->src_encoding;
208  const char *const dname = tr->dst_encoding;
209 
210  transcoder_entry_t *entry;
211 
212  entry = make_transcoder_entry(sname, dname);
213  if (entry->transcoder) {
214  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
215  sname, dname);
216  }
217 
218  entry->transcoder = tr;
219 }
220 
221 static void
222 declare_transcoder(const char *sname, const char *dname, const char *lib)
223 {
224  transcoder_entry_t *entry;
225 
226  entry = make_transcoder_entry(sname, dname);
227  entry->lib = lib;
228 }
229 
230 static const char transcoder_lib_prefix[] = "enc/trans/";
231 
232 void
233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
234 {
235  if (!lib) {
236  rb_raise(rb_eArgError, "invalid library name - (null)");
237  }
238  declare_transcoder(enc1, enc2, lib);
239 }
240 
241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
242 
243 typedef struct search_path_queue_tag {
245  const char *enc;
247 
248 typedef struct {
252  const char *base_enc;
254 
255 static int
257 {
258  const char *dname = (const char *)key;
259  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
261 
262  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
263  return ST_CONTINUE;
264  }
265 
267  q->enc = dname;
268  q->next = NULL;
269  *bfs->queue_last_ptr = q;
270  bfs->queue_last_ptr = &q->next;
271 
272  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
273  return ST_CONTINUE;
274 }
275 
276 static int
277 transcode_search_path(const char *sname, const char *dname,
278  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
279  void *arg)
280 {
281  search_path_bfs_t bfs;
283  st_data_t val;
284  st_table *table2;
285  int found;
286  int pathlen = -1;
287 
288  if (encoding_equal(sname, dname))
289  return -1;
290 
292  q->enc = sname;
293  q->next = NULL;
294  bfs.queue_last_ptr = &q->next;
295  bfs.queue = q;
296 
299 
300  while (bfs.queue) {
301  q = bfs.queue;
302  bfs.queue = q->next;
303  if (!bfs.queue)
304  bfs.queue_last_ptr = &bfs.queue;
305 
306  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
307  xfree(q);
308  continue;
309  }
310  table2 = (st_table *)val;
311 
312  if (st_lookup(table2, (st_data_t)dname, &val)) {
313  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
314  xfree(q);
315  found = 1;
316  goto cleanup;
317  }
318 
319  bfs.base_enc = q->enc;
321  bfs.base_enc = NULL;
322 
323  xfree(q);
324  }
325  found = 0;
326 
327  cleanup:
328  while (bfs.queue) {
329  q = bfs.queue;
330  bfs.queue = q->next;
331  xfree(q);
332  }
333 
334  if (found) {
335  const char *enc = dname;
336  int depth;
337  pathlen = 0;
338  while (1) {
339  st_lookup(bfs.visited, (st_data_t)enc, &val);
340  if (!val)
341  break;
342  pathlen++;
343  enc = (const char *)val;
344  }
345  depth = pathlen;
346  enc = dname;
347  while (1) {
348  st_lookup(bfs.visited, (st_data_t)enc, &val);
349  if (!val)
350  break;
351  callback((const char *)val, enc, --depth, arg);
352  enc = (const char *)val;
353  }
354  }
355 
356  st_free_table(bfs.visited);
357 
358  return pathlen; /* is -1 if not found */
359 }
360 
361 static const rb_transcoder *
363 {
364  if (entry->transcoder)
365  return entry->transcoder;
366 
367  if (entry->lib) {
368  const char *const lib = entry->lib;
369  const size_t len = strlen(lib);
370  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
371  const VALUE fn = rb_str_new(0, total_len);
372  char *const path = RSTRING_PTR(fn);
373  const int safe = rb_safe_level();
374 
375  entry->lib = NULL;
376 
377  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
378  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
379  rb_str_set_len(fn, total_len);
381  OBJ_FREEZE(fn);
382  if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
383  return NULL;
384  }
385 
386  if (entry->transcoder)
387  return entry->transcoder;
388 
389  return NULL;
390 }
391 
/*
 * Pick the default replacement character for the encoding `encname`.
 *
 * For UTF-8 (compared with encoding_equal) the replacement is the 3-byte
 * sequence EF BF BD, reported as encoded in "UTF-8".  For every other
 * encoding it is "?", reported as encoded in "US-ASCII".
 *
 * len_ret          receives the byte length of the returned string.
 * repl_encname_ptr receives the name of the encoding the returned bytes
 *                  are in.
 * Returns a pointer to a string literal; the caller must not free it.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
406 
407 /*
408  * Transcoding engine logic
409  */
410 
411 static const unsigned char *
413  const unsigned char *in_start,
414  const unsigned char *inchar_start,
415  const unsigned char *in_p,
416  size_t *char_len_ptr)
417 {
418  const unsigned char *ptr;
419  if (inchar_start - in_start < tc->recognized_len) {
421  inchar_start, unsigned char, in_p - inchar_start);
422  ptr = TRANSCODING_READBUF(tc);
423  }
424  else {
425  ptr = inchar_start - tc->recognized_len;
426  }
427  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
428  return ptr;
429 }
430 
431 static rb_econv_result_t
432 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
433  const unsigned char *in_stop, unsigned char *out_stop,
434  rb_transcoding *tc,
435  const int opt)
436 {
437  const rb_transcoder *tr = tc->transcoder;
438  int unitlen = tr->input_unit_length;
439  ssize_t readagain_len = 0;
440 
441  const unsigned char *inchar_start;
442  const unsigned char *in_p;
443 
444  unsigned char *out_p;
445 
446  in_p = inchar_start = *in_pos;
447 
448  out_p = *out_pos;
449 
450 #define SUSPEND(ret, num) \
451  do { \
452  tc->resume_position = (num); \
453  if (0 < in_p - inchar_start) \
454  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
455  inchar_start, unsigned char, in_p - inchar_start); \
456  *in_pos = in_p; \
457  *out_pos = out_p; \
458  tc->recognized_len += in_p - inchar_start; \
459  if (readagain_len) { \
460  tc->recognized_len -= readagain_len; \
461  tc->readagain_len = readagain_len; \
462  } \
463  return (ret); \
464  resume_label ## num:; \
465  } while (0)
466 #define SUSPEND_OBUF(num) \
467  do { \
468  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
469  } while (0)
470 
471 #define SUSPEND_AFTER_OUTPUT(num) \
472  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
473  SUSPEND(econv_after_output, num); \
474  }
475 
476 #define next_table (tc->next_table)
477 #define next_info (tc->next_info)
478 #define next_byte (tc->next_byte)
479 #define writebuf_len (tc->writebuf_len)
480 #define writebuf_off (tc->writebuf_off)
481 
482  switch (tc->resume_position) {
483  case 0: break;
484  case 1: goto resume_label1;
485  case 2: goto resume_label2;
486  case 3: goto resume_label3;
487  case 4: goto resume_label4;
488  case 5: goto resume_label5;
489  case 6: goto resume_label6;
490  case 7: goto resume_label7;
491  case 8: goto resume_label8;
492  case 9: goto resume_label9;
493  case 10: goto resume_label10;
494  case 11: goto resume_label11;
495  case 12: goto resume_label12;
496  case 13: goto resume_label13;
497  case 14: goto resume_label14;
498  case 15: goto resume_label15;
499  case 16: goto resume_label16;
500  case 17: goto resume_label17;
501  case 18: goto resume_label18;
502  case 19: goto resume_label19;
503  case 20: goto resume_label20;
504  case 21: goto resume_label21;
505  case 22: goto resume_label22;
506  case 23: goto resume_label23;
507  case 24: goto resume_label24;
508  case 25: goto resume_label25;
509  case 26: goto resume_label26;
510  case 27: goto resume_label27;
511  case 28: goto resume_label28;
512  case 29: goto resume_label29;
513  case 30: goto resume_label30;
514  case 31: goto resume_label31;
515  case 32: goto resume_label32;
516  case 33: goto resume_label33;
517  case 34: goto resume_label34;
518  }
519 
520  while (1) {
521  inchar_start = in_p;
522  tc->recognized_len = 0;
523  next_table = tr->conv_tree_start;
524 
526 
527  if (in_stop <= in_p) {
528  if (!(opt & ECONV_PARTIAL_INPUT))
529  break;
531  continue;
532  }
533 
534 #define BYTE_ADDR(index) (tr->byte_array + (index))
535 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
536 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
537 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
538 #define BL_MIN_BYTE (BL_BASE[0])
539 #define BL_MAX_BYTE (BL_BASE[1])
540 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
541 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
542 
543  next_byte = (unsigned char)*in_p++;
544  follow_byte:
545  if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
546  next_info = INVALID;
547  else {
548  next_info = (VALUE)BL_ACTION(next_byte);
549  }
550  follow_info:
551  switch (next_info & 0x1F) {
552  case NOMAP:
553  {
554  const unsigned char *p = inchar_start;
555  writebuf_off = 0;
556  while (p < in_p) {
557  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
558  }
559  writebuf_len = writebuf_off;
560  writebuf_off = 0;
561  while (writebuf_off < writebuf_len) {
562  SUSPEND_OBUF(3);
563  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
564  }
565  }
566  continue;
567  case 0x00: case 0x04: case 0x08: case 0x0C:
568  case 0x10: case 0x14: case 0x18: case 0x1C:
570  while (in_p >= in_stop) {
571  if (!(opt & ECONV_PARTIAL_INPUT))
572  goto incomplete;
574  }
575  next_byte = (unsigned char)*in_p++;
576  next_table = (unsigned int)next_info;
577  goto follow_byte;
578  case ZERObt: /* drop input */
579  continue;
580  case ONEbt:
581  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
582  continue;
583  case TWObt:
584  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
585  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
586  continue;
587  case THREEbt:
588  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
589  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
590  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
591  continue;
592  case FOURbt:
593  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
594  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
595  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
596  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
597  continue;
598  case GB4bt:
599  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
600  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
601  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
602  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
603  continue;
604  case STR1:
605  tc->output_index = 0;
606  while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
607  SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
608  tc->output_index++;
609  }
610  continue;
611  case FUNii:
612  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
613  goto follow_info;
614  case FUNsi:
615  {
616  const unsigned char *char_start;
617  size_t char_len;
618  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
619  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
620  goto follow_info;
621  }
622  case FUNio:
623  SUSPEND_OBUF(13);
624  if (tr->max_output <= out_stop - out_p)
625  out_p += tr->func_io(TRANSCODING_STATE(tc),
626  next_info, out_p, out_stop - out_p);
627  else {
628  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
629  next_info,
631  writebuf_off = 0;
632  while (writebuf_off < writebuf_len) {
633  SUSPEND_OBUF(20);
634  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
635  }
636  }
637  break;
638  case FUNso:
639  {
640  const unsigned char *char_start;
641  size_t char_len;
642  SUSPEND_OBUF(14);
643  if (tr->max_output <= out_stop - out_p) {
644  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
645  out_p += tr->func_so(TRANSCODING_STATE(tc),
646  char_start, (size_t)char_len,
647  out_p, out_stop - out_p);
648  }
649  else {
650  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
651  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
652  char_start, (size_t)char_len,
654  writebuf_off = 0;
655  while (writebuf_off < writebuf_len) {
656  SUSPEND_OBUF(22);
657  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
658  }
659  }
660  break;
661  }
662  case FUNsio:
663  {
664  const unsigned char *char_start;
665  size_t char_len;
666  SUSPEND_OBUF(33);
667  if (tr->max_output <= out_stop - out_p) {
668  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
669  out_p += tr->func_sio(TRANSCODING_STATE(tc),
670  char_start, (size_t)char_len, next_info,
671  out_p, out_stop - out_p);
672  }
673  else {
674  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
675  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
676  char_start, (size_t)char_len, next_info,
678  writebuf_off = 0;
679  while (writebuf_off < writebuf_len) {
680  SUSPEND_OBUF(34);
681  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682  }
683  }
684  break;
685  }
686  case INVALID:
687  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
688  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
690  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
691  in_p = in_stop;
693  }
694  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
695  in_p = in_stop;
696  }
697  else {
698  in_p = inchar_start + (unitlen - tc->recognized_len);
699  }
700  }
701  else {
702  ssize_t invalid_len; /* including the last byte which causes invalid */
703  ssize_t discard_len;
704  invalid_len = tc->recognized_len + (in_p - inchar_start);
705  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
706  readagain_len = invalid_len - discard_len;
707  }
708  goto invalid;
709  case UNDEF:
710  goto undef;
711  default:
712  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
713  }
714  continue;
715 
716  invalid:
718  continue;
719 
720  incomplete:
722  continue;
723 
724  undef:
726  continue;
727  }
728 
729  /* cleanup */
730  if (tr->finish_func) {
731  SUSPEND_OBUF(4);
732  if (tr->max_output <= out_stop - out_p) {
733  out_p += tr->finish_func(TRANSCODING_STATE(tc),
734  out_p, out_stop - out_p);
735  }
736  else {
737  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
739  writebuf_off = 0;
740  while (writebuf_off < writebuf_len) {
741  SUSPEND_OBUF(23);
742  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
743  }
744  }
745  }
746  while (1)
748 #undef SUSPEND
749 #undef next_table
750 #undef next_info
751 #undef next_byte
752 #undef writebuf_len
753 #undef writebuf_off
754 }
755 
756 static rb_econv_result_t
757 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
758  const unsigned char *in_stop, unsigned char *out_stop,
759  rb_transcoding *tc,
760  const int opt)
761 {
762  if (tc->readagain_len) {
763  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
764  const unsigned char *readagain_pos = readagain_buf;
765  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
766  rb_econv_result_t res;
767 
768  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
769  unsigned char, tc->readagain_len);
770  tc->readagain_len = 0;
771  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
772  if (res != econv_source_buffer_empty) {
774  readagain_pos, unsigned char, readagain_stop - readagain_pos);
775  tc->readagain_len += readagain_stop - readagain_pos;
776  return res;
777  }
778  }
779  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
780 }
781 
782 static rb_transcoding *
784 {
785  rb_transcoding *tc;
786 
787  tc = ALLOC(rb_transcoding);
788  tc->transcoder = tr;
789  tc->flags = flags;
790  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
791  tc->state.ptr = xmalloc(tr->state_size);
792  if (tr->state_init_func) {
793  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
794  }
795  tc->resume_position = 0;
796  tc->recognized_len = 0;
797  tc->readagain_len = 0;
798  tc->writebuf_len = 0;
799  tc->writebuf_off = 0;
800  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
801  tc->readbuf.ptr = xmalloc(tr->max_input);
802  }
803  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
804  tc->writebuf.ptr = xmalloc(tr->max_output);
805  }
806  return tc;
807 }
808 
809 static rb_econv_result_t
811  const unsigned char **input_ptr, const unsigned char *input_stop,
812  unsigned char **output_ptr, unsigned char *output_stop,
813  int flags)
814 {
815  return transcode_restartable(
816  input_ptr, output_ptr,
817  input_stop, output_stop,
818  tc, flags);
819 }
820 
821 static void
823 {
824  const rb_transcoder *tr = tc->transcoder;
825  if (tr->state_fini_func) {
826  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
827  }
828  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
829  xfree(tc->state.ptr);
830  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
831  xfree(tc->readbuf.ptr);
832  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
833  xfree(tc->writebuf.ptr);
834  xfree(tc);
835 }
836 
837 static size_t
839 {
840  size_t size = sizeof(rb_transcoding);
841  const rb_transcoder *tr = tc->transcoder;
842 
843  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
844  size += tr->state_size;
845  }
846  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
847  size += tr->max_input;
848  }
849  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
850  size += tr->max_output;
851  }
852  return size;
853 }
854 
855 static rb_econv_t *
856 rb_econv_alloc(int n_hint)
857 {
858  rb_econv_t *ec;
859 
860  if (n_hint <= 0)
861  n_hint = 1;
862 
863  ec = ALLOC(rb_econv_t);
864  ec->flags = 0;
867  ec->started = 0;
868  ec->replacement_str = NULL;
869  ec->replacement_len = 0;
870  ec->replacement_enc = NULL;
871  ec->replacement_allocated = 0;
872  ec->in_buf_start = NULL;
873  ec->in_data_start = NULL;
874  ec->in_data_end = NULL;
875  ec->in_buf_end = NULL;
876  ec->num_allocated = n_hint;
877  ec->num_trans = 0;
879  ec->num_finished = 0;
880  ec->last_tc = NULL;
882  ec->last_error.error_tc = NULL;
886  ec->last_error.error_bytes_len = 0;
887  ec->last_error.readagain_len = 0;
888  ec->source_encoding = NULL;
890  return ec;
891 }
892 
893 static int
895 {
896  int n, j;
897  int bufsize = 4096;
898  unsigned char *p;
899 
900  if (ec->num_trans == ec->num_allocated) {
901  n = ec->num_allocated * 2;
903  ec->num_allocated = n;
904  }
905 
906  p = xmalloc(bufsize);
907 
908  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
909 
911  ec->elems[i].out_buf_start = p;
912  ec->elems[i].out_buf_end = p + bufsize;
913  ec->elems[i].out_data_start = p;
914  ec->elems[i].out_data_end = p;
916 
917  ec->num_trans++;
918 
919  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
920  for (j = ec->num_trans-1; i <= j; j--) {
921  rb_transcoding *tc = ec->elems[j].tc;
922  const rb_transcoder *tr2 = tc->transcoder;
923  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
924  ec->last_tc = tc;
925  break;
926  }
927  }
928 
929  return 0;
930 }
931 
932 static rb_econv_t *
934 {
935  rb_econv_t *ec;
936  int i, ret;
937 
938  for (i = 0; i < n; i++) {
939  const rb_transcoder *tr;
940  tr = load_transcoder_entry(entries[i]);
941  if (!tr)
942  return NULL;
943  }
944 
945  ec = rb_econv_alloc(n);
946 
947  for (i = 0; i < n; i++) {
948  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
949  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
950  if (ret == -1) {
951  rb_econv_close(ec);
952  return NULL;
953  }
954  }
955 
956  return ec;
957 }
958 
959 struct trans_open_t {
962 };
963 
964 static void
965 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
966 {
967  struct trans_open_t *toarg = arg;
968 
969  if (!toarg->entries) {
970  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
971  }
972  toarg->entries[depth] = get_transcoder_entry(sname, dname);
973 }
974 
975 static rb_econv_t *
976 rb_econv_open0(const char *sname, const char *dname, int ecflags)
977 {
979  int num_trans;
980  rb_econv_t *ec;
981 
982  int sidx, didx;
983 
984  if (*sname) {
985  sidx = rb_enc_find_index(sname);
986  if (0 <= sidx) {
987  rb_enc_from_index(sidx);
988  }
989  }
990 
991  if (*dname) {
992  didx = rb_enc_find_index(dname);
993  if (0 <= didx) {
994  rb_enc_from_index(didx);
995  }
996  }
997 
998  if (*sname == '\0' && *dname == '\0') {
999  num_trans = 0;
1000  entries = NULL;
1001  }
1002  else {
1003  struct trans_open_t toarg;
1004  toarg.entries = NULL;
1005  toarg.num_additional = 0;
1006  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1007  entries = toarg.entries;
1008  if (num_trans < 0) {
1009  xfree(entries);
1010  return NULL;
1011  }
1012  }
1013 
1014  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1015  xfree(entries);
1016  if (!ec)
1017  return NULL;
1018 
1019  ec->flags = ecflags;
1020  ec->source_encoding_name = sname;
1021  ec->destination_encoding_name = dname;
1022 
1023  return ec;
1024 }
1025 
1026 #define MAX_ECFLAGS_DECORATORS 32
1027 
1028 static int
1029 decorator_names(int ecflags, const char **decorators_ret)
1030 {
1031  int num_decorators;
1032 
1033  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1037  case 0:
1038  break;
1039  default:
1040  return -1;
1041  }
1042 
1043  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1045  return -1;
1046 
1047  num_decorators = 0;
1048 
1049  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1050  decorators_ret[num_decorators++] = "xml_text_escape";
1051  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1052  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1053  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1054  decorators_ret[num_decorators++] = "xml_attr_quote";
1055 
1056  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1057  decorators_ret[num_decorators++] = "crlf_newline";
1058  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1059  decorators_ret[num_decorators++] = "cr_newline";
1060  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1061  decorators_ret[num_decorators++] = "universal_newline";
1062 
1063  return num_decorators;
1064 }
1065 
1066 rb_econv_t *
1067 rb_econv_open(const char *sname, const char *dname, int ecflags)
1068 {
1069  rb_econv_t *ec;
1070  int num_decorators;
1071  const char *decorators[MAX_ECFLAGS_DECORATORS];
1072  int i;
1073 
1074  num_decorators = decorator_names(ecflags, decorators);
1075  if (num_decorators == -1)
1076  return NULL;
1077 
1078  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1079  if (!ec)
1080  return NULL;
1081 
1082  for (i = 0; i < num_decorators; i++)
1083  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1084  rb_econv_close(ec);
1085  return NULL;
1086  }
1087 
1088  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1089 
1090  return ec;
1091 }
1092 
1093 static int
1095  const unsigned char **input_ptr, const unsigned char *input_stop,
1096  unsigned char **output_ptr, unsigned char *output_stop,
1097  int flags,
1098  int start)
1099 {
1100  int try;
1101  int i, f;
1102 
1103  const unsigned char **ipp, *is, *iold;
1104  unsigned char **opp, *os, *oold;
1105  rb_econv_result_t res;
1106 
1107  try = 1;
1108  while (try) {
1109  try = 0;
1110  for (i = start; i < ec->num_trans; i++) {
1111  rb_econv_elem_t *te = &ec->elems[i];
1112 
1113  if (i == 0) {
1114  ipp = input_ptr;
1115  is = input_stop;
1116  }
1117  else {
1118  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1119  ipp = (const unsigned char **)&prev_te->out_data_start;
1120  is = prev_te->out_data_end;
1121  }
1122 
1123  if (i == ec->num_trans-1) {
1124  opp = output_ptr;
1125  os = output_stop;
1126  }
1127  else {
1128  if (te->out_buf_start != te->out_data_start) {
1129  ssize_t len = te->out_data_end - te->out_data_start;
1130  ssize_t off = te->out_data_start - te->out_buf_start;
1131  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1132  te->out_data_start = te->out_buf_start;
1133  te->out_data_end -= off;
1134  }
1135  opp = &te->out_data_end;
1136  os = te->out_buf_end;
1137  }
1138 
1139  f = flags;
1140  if (ec->num_finished != i)
1141  f |= ECONV_PARTIAL_INPUT;
1142  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1143  start = 1;
1144  flags &= ~ECONV_AFTER_OUTPUT;
1145  }
1146  if (i != 0)
1147  f &= ~ECONV_AFTER_OUTPUT;
1148  iold = *ipp;
1149  oold = *opp;
1150  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1151  if (iold != *ipp || oold != *opp)
1152  try = 1;
1153 
1154  switch (res) {
1158  case econv_after_output:
1159  return i;
1160 
1163  break;
1164 
1165  case econv_finished:
1166  ec->num_finished = i+1;
1167  break;
1168  }
1169  }
1170  }
1171  return -1;
1172 }
1173 
1174 static rb_econv_result_t
1176  const unsigned char **input_ptr, const unsigned char *input_stop,
1177  unsigned char **output_ptr, unsigned char *output_stop,
1178  int flags,
1179  int *result_position_ptr)
1180 {
1181  int i;
1182  int needreport_index;
1183  int sweep_start;
1184 
1185  unsigned char empty_buf;
1186  unsigned char *empty_ptr = &empty_buf;
1187 
1188  if (!input_ptr) {
1189  input_ptr = (const unsigned char **)&empty_ptr;
1190  input_stop = empty_ptr;
1191  }
1192 
1193  if (!output_ptr) {
1194  output_ptr = &empty_ptr;
1195  output_stop = empty_ptr;
1196  }
1197 
1198  if (ec->elems[0].last_result == econv_after_output)
1200 
1201  needreport_index = -1;
1202  for (i = ec->num_trans-1; 0 <= i; i--) {
1203  switch (ec->elems[i].last_result) {
1207  case econv_after_output:
1208  case econv_finished:
1209  sweep_start = i+1;
1210  needreport_index = i;
1211  goto found_needreport;
1212 
1215  break;
1216 
1217  default:
1218  rb_bug("unexpected transcode last result");
1219  }
1220  }
1221 
1222  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1223 
1225  (flags & ECONV_AFTER_OUTPUT)) {
1226  rb_econv_result_t res;
1227 
1228  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1229  (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
1230  result_position_ptr);
1231 
1232  if (res == econv_source_buffer_empty)
1233  return econv_after_output;
1234  return res;
1235  }
1236 
1237  sweep_start = 0;
1238 
1239  found_needreport:
1240 
1241  do {
1242  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1243  sweep_start = needreport_index + 1;
1244  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1245 
1246  for (i = ec->num_trans-1; 0 <= i; i--) {
1248  rb_econv_result_t res = ec->elems[i].last_result;
1249  if (res == econv_invalid_byte_sequence ||
1250  res == econv_incomplete_input ||
1251  res == econv_undefined_conversion ||
1252  res == econv_after_output) {
1254  }
1255  if (result_position_ptr)
1256  *result_position_ptr = i;
1257  return res;
1258  }
1259  }
1260  if (result_position_ptr)
1261  *result_position_ptr = -1;
1263 }
1264 
1265 static rb_econv_result_t
1267  const unsigned char **input_ptr, const unsigned char *input_stop,
1268  unsigned char **output_ptr, unsigned char *output_stop,
1269  int flags)
1270 {
1271  rb_econv_result_t res;
1272  int result_position;
1273  int has_output = 0;
1274 
1275  memset(&ec->last_error, 0, sizeof(ec->last_error));
1276 
1277  if (ec->num_trans == 0) {
1278  size_t len;
1279  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1280  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1281  len = output_stop - *output_ptr;
1282  memcpy(*output_ptr, ec->in_data_start, len);
1283  *output_ptr = output_stop;
1284  ec->in_data_start += len;
1286  goto gotresult;
1287  }
1288  len = ec->in_data_end - ec->in_data_start;
1289  memcpy(*output_ptr, ec->in_data_start, len);
1290  *output_ptr += len;
1291  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1292  if (flags & ECONV_AFTER_OUTPUT) {
1293  res = econv_after_output;
1294  goto gotresult;
1295  }
1296  }
1297  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1298  len = output_stop - *output_ptr;
1299  }
1300  else {
1301  len = input_stop - *input_ptr;
1302  }
1303  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1304  *(*output_ptr)++ = *(*input_ptr)++;
1305  res = econv_after_output;
1306  goto gotresult;
1307  }
1308  memcpy(*output_ptr, *input_ptr, len);
1309  *output_ptr += len;
1310  *input_ptr += len;
1311  if (*input_ptr != input_stop)
1313  else if (flags & ECONV_PARTIAL_INPUT)
1315  else
1316  res = econv_finished;
1317  goto gotresult;
1318  }
1319 
1320  if (ec->elems[ec->num_trans-1].out_data_start) {
1321  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1322  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1323  if (data_start != data_end) {
1324  size_t len;
1325  if (output_stop - *output_ptr < data_end - data_start) {
1326  len = output_stop - *output_ptr;
1327  memcpy(*output_ptr, data_start, len);
1328  *output_ptr = output_stop;
1329  ec->elems[ec->num_trans-1].out_data_start += len;
1331  goto gotresult;
1332  }
1333  len = data_end - data_start;
1334  memcpy(*output_ptr, data_start, len);
1335  *output_ptr += len;
1336  ec->elems[ec->num_trans-1].out_data_start =
1337  ec->elems[ec->num_trans-1].out_data_end =
1338  ec->elems[ec->num_trans-1].out_buf_start;
1339  has_output = 1;
1340  }
1341  }
1342 
1343  if (ec->in_buf_start &&
1344  ec->in_data_start != ec->in_data_end) {
1345  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1346  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1347  if (res != econv_source_buffer_empty)
1348  goto gotresult;
1349  }
1350 
1351  if (has_output &&
1352  (flags & ECONV_AFTER_OUTPUT) &&
1353  *input_ptr != input_stop) {
1354  input_stop = *input_ptr;
1355  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1356  if (res == econv_source_buffer_empty)
1357  res = econv_after_output;
1358  }
1359  else if ((flags & ECONV_AFTER_OUTPUT) ||
1360  ec->num_trans == 1) {
1361  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1362  }
1363  else {
1364  flags |= ECONV_AFTER_OUTPUT;
1365  do {
1366  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1367  } while (res == econv_after_output);
1368  }
1369 
1370  gotresult:
1371  ec->last_error.result = res;
1372  if (res == econv_invalid_byte_sequence ||
1373  res == econv_incomplete_input ||
1374  res == econv_undefined_conversion) {
1375  rb_transcoding *error_tc = ec->elems[result_position].tc;
1376  ec->last_error.error_tc = error_tc;
1380  ec->last_error.error_bytes_len = error_tc->recognized_len;
1381  ec->last_error.readagain_len = error_tc->readagain_len;
1382  }
1383 
1384  return res;
1385 }
1386 
1388 
1389 static int
1391 {
1392  int ret;
1393  unsigned char utfbuf[1024];
1394  const unsigned char *utf;
1395  size_t utf_len;
1396  int utf_allocated = 0;
1397  char charef_buf[16];
1398  const unsigned char *p;
1399 
1400  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1401  utf = ec->last_error.error_bytes_start;
1402  utf_len = ec->last_error.error_bytes_len;
1403  }
1404  else {
1407  utfbuf, sizeof(utfbuf),
1408  &utf_len);
1409  if (!utf)
1410  return -1;
1411  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1412  utf_allocated = 1;
1413  }
1414 
1415  if (utf_len % 4 != 0)
1416  goto fail;
1417 
1418  p = utf;
1419  while (4 <= utf_len) {
1420  unsigned int u = 0;
1421  u += p[0] << 24;
1422  u += p[1] << 16;
1423  u += p[2] << 8;
1424  u += p[3];
1425  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1426 
1427  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1428  if (ret == -1)
1429  goto fail;
1430 
1431  p += 4;
1432  utf_len -= 4;
1433  }
1434 
1435  if (utf_allocated)
1436  xfree((void *)utf);
1437  return 0;
1438 
1439  fail:
1440  if (utf_allocated)
1441  xfree((void *)utf);
1442  return -1;
1443 }
1444 
1447  const unsigned char **input_ptr, const unsigned char *input_stop,
1448  unsigned char **output_ptr, unsigned char *output_stop,
1449  int flags)
1450 {
1451  rb_econv_result_t ret;
1452 
1453  unsigned char empty_buf;
1454  unsigned char *empty_ptr = &empty_buf;
1455 
1456  ec->started = 1;
1457 
1458  if (!input_ptr) {
1459  input_ptr = (const unsigned char **)&empty_ptr;
1460  input_stop = empty_ptr;
1461  }
1462 
1463  if (!output_ptr) {
1464  output_ptr = &empty_ptr;
1465  output_stop = empty_ptr;
1466  }
1467 
1468  resume:
1469  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1470 
1471  if (ret == econv_invalid_byte_sequence ||
1472  ret == econv_incomplete_input) {
1473  /* deal with invalid byte sequence */
1474  /* todo: add more alternative behaviors */
1475  switch (ec->flags & ECONV_INVALID_MASK) {
1476  case ECONV_INVALID_REPLACE:
1477  if (output_replacement_character(ec) == 0)
1478  goto resume;
1479  }
1480  }
1481 
1482  if (ret == econv_undefined_conversion) {
1483  /* valid character in source encoding
1484  * but no related character(s) in destination encoding */
1485  /* todo: add more alternative behaviors */
1486  switch (ec->flags & ECONV_UNDEF_MASK) {
1487  case ECONV_UNDEF_REPLACE:
1488  if (output_replacement_character(ec) == 0)
1489  goto resume;
1490  break;
1491 
1493  if (output_hex_charref(ec) == 0)
1494  goto resume;
1495  break;
1496  }
1497  }
1498 
1499  return ret;
1500 }
1501 
1502 const char *
1504 {
1505  rb_transcoding *tc = ec->last_tc;
1506  const rb_transcoder *tr;
1507 
1508  if (tc == NULL)
1509  return "";
1510 
1511  tr = tc->transcoder;
1512 
1514  return tr->src_encoding;
1515  return tr->dst_encoding;
1516 }
1517 
1518 static unsigned char *
1519 allocate_converted_string(const char *sname, const char *dname,
1520  const unsigned char *str, size_t len,
1521  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1522  size_t *dst_len_ptr)
1523 {
1524  unsigned char *dst_str;
1525  size_t dst_len;
1526  size_t dst_bufsize;
1527 
1528  rb_econv_t *ec;
1529  rb_econv_result_t res;
1530 
1531  const unsigned char *sp;
1532  unsigned char *dp;
1533 
1534  if (caller_dst_buf)
1535  dst_bufsize = caller_dst_bufsize;
1536  else if (len == 0)
1537  dst_bufsize = 1;
1538  else
1539  dst_bufsize = len;
1540 
1541  ec = rb_econv_open(sname, dname, 0);
1542  if (ec == NULL)
1543  return NULL;
1544  if (caller_dst_buf)
1545  dst_str = caller_dst_buf;
1546  else
1547  dst_str = xmalloc(dst_bufsize);
1548  dst_len = 0;
1549  sp = str;
1550  dp = dst_str+dst_len;
1551  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1552  dst_len = dp - dst_str;
1553  while (res == econv_destination_buffer_full) {
1554  if (SIZE_MAX/2 < dst_bufsize) {
1555  goto fail;
1556  }
1557  dst_bufsize *= 2;
1558  if (dst_str == caller_dst_buf) {
1559  unsigned char *tmp;
1560  tmp = xmalloc(dst_bufsize);
1561  memcpy(tmp, dst_str, dst_bufsize/2);
1562  dst_str = tmp;
1563  }
1564  else {
1565  dst_str = xrealloc(dst_str, dst_bufsize);
1566  }
1567  dp = dst_str+dst_len;
1568  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1569  dst_len = dp - dst_str;
1570  }
1571  if (res != econv_finished) {
1572  goto fail;
1573  }
1574  rb_econv_close(ec);
1575  *dst_len_ptr = dst_len;
1576  return dst_str;
1577 
1578  fail:
1579  if (dst_str != caller_dst_buf)
1580  xfree(dst_str);
1581  rb_econv_close(ec);
1582  return NULL;
1583 }
1584 
1585 /* result: 0:success -1:failure */
1586 int
1588  const unsigned char *str, size_t len, const char *str_encoding)
1589 {
1590  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1591  unsigned char insert_buf[4096];
1592  const unsigned char *insert_str = NULL;
1593  size_t insert_len;
1594 
1595  int last_trans_index;
1596  rb_transcoding *tc;
1597 
1598  unsigned char **buf_start_p;
1599  unsigned char **data_start_p;
1600  unsigned char **data_end_p;
1601  unsigned char **buf_end_p;
1602 
1603  size_t need;
1604 
1605  ec->started = 1;
1606 
1607  if (len == 0)
1608  return 0;
1609 
1610  if (encoding_equal(insert_encoding, str_encoding)) {
1611  insert_str = str;
1612  insert_len = len;
1613  }
1614  else {
1615  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1616  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1617  if (insert_str == NULL)
1618  return -1;
1619  }
1620 
1621  need = insert_len;
1622 
1623  last_trans_index = ec->num_trans-1;
1624  if (ec->num_trans == 0) {
1625  tc = NULL;
1626  buf_start_p = &ec->in_buf_start;
1627  data_start_p = &ec->in_data_start;
1628  data_end_p = &ec->in_data_end;
1629  buf_end_p = &ec->in_buf_end;
1630  }
1631  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1632  tc = ec->elems[last_trans_index].tc;
1633  need += tc->readagain_len;
1634  if (need < insert_len)
1635  goto fail;
1636  if (last_trans_index == 0) {
1637  buf_start_p = &ec->in_buf_start;
1638  data_start_p = &ec->in_data_start;
1639  data_end_p = &ec->in_data_end;
1640  buf_end_p = &ec->in_buf_end;
1641  }
1642  else {
1643  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1644  buf_start_p = &ee->out_buf_start;
1645  data_start_p = &ee->out_data_start;
1646  data_end_p = &ee->out_data_end;
1647  buf_end_p = &ee->out_buf_end;
1648  }
1649  }
1650  else {
1651  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1652  buf_start_p = &ee->out_buf_start;
1653  data_start_p = &ee->out_data_start;
1654  data_end_p = &ee->out_data_end;
1655  buf_end_p = &ee->out_buf_end;
1656  tc = ec->elems[last_trans_index].tc;
1657  }
1658 
1659  if (*buf_start_p == NULL) {
1660  unsigned char *buf = xmalloc(need);
1661  *buf_start_p = buf;
1662  *data_start_p = buf;
1663  *data_end_p = buf;
1664  *buf_end_p = buf+need;
1665  }
1666  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1667  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1668  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1669  *data_start_p = *buf_start_p;
1670  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1671  unsigned char *buf;
1672  size_t s = (*data_end_p - *buf_start_p) + need;
1673  if (s < need)
1674  goto fail;
1675  buf = xrealloc(*buf_start_p, s);
1676  *data_start_p = buf;
1677  *data_end_p = buf + (*data_end_p - *buf_start_p);
1678  *buf_start_p = buf;
1679  *buf_end_p = buf + s;
1680  }
1681  }
1682 
1683  memcpy(*data_end_p, insert_str, insert_len);
1684  *data_end_p += insert_len;
1685  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1686  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1687  *data_end_p += tc->readagain_len;
1688  tc->readagain_len = 0;
1689  }
1690 
1691  if (insert_str != str && insert_str != insert_buf)
1692  xfree((void*)insert_str);
1693  return 0;
1694 
1695  fail:
1696  if (insert_str != str && insert_str != insert_buf)
1697  xfree((void*)insert_str);
1698  return -1;
1699 }
1700 
1701 void
1703 {
1704  int i;
1705 
1706  if (ec->replacement_allocated) {
1707  xfree((void *)ec->replacement_str);
1708  }
1709  for (i = 0; i < ec->num_trans; i++) {
1710  rb_transcoding_close(ec->elems[i].tc);
1711  if (ec->elems[i].out_buf_start)
1712  xfree(ec->elems[i].out_buf_start);
1713  }
1714  xfree(ec->in_buf_start);
1715  xfree(ec->elems);
1716  xfree(ec);
1717 }
1718 
1719 size_t
1721 {
1722  size_t size = sizeof(rb_econv_t);
1723  int i;
1724 
1725  if (ec->replacement_allocated) {
1726  size += ec->replacement_len;
1727  }
1728  for (i = 0; i < ec->num_trans; i++) {
1729  size += rb_transcoding_memsize(ec->elems[i].tc);
1730 
1731  if (ec->elems[i].out_buf_start) {
1732  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1733  }
1734  }
1735  size += ec->in_buf_end - ec->in_buf_start;
1736  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1737 
1738  return size;
1739 }
1740 
1741 int
1743 {
1744  if (ec->num_trans == 0)
1745  return 0;
1746 #if SIZEOF_SIZE_T > SIZEOF_INT
1747  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1748 #endif
1749  return (int)ec->elems[0].tc->readagain_len;
1750 }
1751 
1752 void
1753 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1754 {
1755  rb_transcoding *tc;
1756  if (ec->num_trans == 0 || n == 0)
1757  return;
1758  tc = ec->elems[0].tc;
1759  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1760  tc->readagain_len -= n;
1761 }
1762 
1764  const char *ascii_compat_name;
1765  const char *ascii_incompat_name;
1766 };
1767 
1768 static int
1770 {
1771  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1772  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1773  const rb_transcoder *tr;
1774 
1775  if (DECORATOR_P(entry->sname, entry->dname))
1776  return ST_CONTINUE;
1777  tr = load_transcoder_entry(entry);
1778  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1779  data->ascii_compat_name = tr->dst_encoding;
1780  return ST_STOP;
1781  }
1782  return ST_CONTINUE;
1783 }
1784 
1785 const char *
1787 {
1788  st_data_t v;
1789  st_table *table2;
1790  struct asciicompat_encoding_t data;
1791 
1792  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1793  return NULL;
1794  table2 = (st_table *)v;
1795 
1796  /*
1797  * Assumption:
1798  * There is at most one transcoder for
1799  * converting from ASCII incompatible encoding.
1800  *
1801  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1802  */
1803  if (table2->num_entries != 1)
1804  return NULL;
1805 
1807  data.ascii_compat_name = NULL;
1808  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1809  return data.ascii_compat_name;
1810 }
1811 
1812 VALUE
1813 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1814 {
1815  unsigned const char *ss, *sp, *se;
1816  unsigned char *ds, *dp, *de;
1817  rb_econv_result_t res;
1818  int max_output;
1819 
1820  if (NIL_P(dst)) {
1821  dst = rb_str_buf_new(len);
1822  if (ec->destination_encoding)
1824  }
1825 
1826  if (ec->last_tc)
1827  max_output = ec->last_tc->transcoder->max_output;
1828  else
1829  max_output = 1;
1830 
1832  while (res == econv_destination_buffer_full) {
1833  long dlen = RSTRING_LEN(dst);
1834  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1835  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1836  if (LONG_MAX < new_capa)
1837  rb_raise(rb_eArgError, "too long string");
1838  rb_str_resize(dst, new_capa);
1839  rb_str_set_len(dst, dlen);
1840  }
1841  ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
1842  se = ss + len;
1843  ds = (unsigned char *)RSTRING_PTR(dst);
1844  de = ds + rb_str_capacity(dst);
1845  dp = ds += dlen;
1846  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1847  off += sp - ss;
1848  len -= sp - ss;
1849  rb_str_set_len(dst, dlen + (dp - ds));
1851  }
1852 
1853  return dst;
1854 }
1855 
1856 VALUE
1857 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1858 {
1859  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1860 }
1861 
1862 VALUE
1863 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1864 {
1865  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1866 }
1867 
1868 VALUE
1870 {
1871  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1872 }
1873 
1874 static int
1875 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1876 {
1877  transcoder_entry_t *entry;
1878  const rb_transcoder *tr;
1879 
1880  if (ec->started != 0)
1881  return -1;
1882 
1883  entry = get_transcoder_entry(sname, dname);
1884  if (!entry)
1885  return -1;
1886 
1887  tr = load_transcoder_entry(entry);
1888  if (!tr) return -1;
1889 
1890  return rb_econv_add_transcoder_at(ec, tr, n);
1891 }
1892 
1893 static int
1894 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1895 {
1896  return rb_econv_add_converter(ec, "", decorator_name, n);
1897 }
1898 
1899 int
1900 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1901 {
1902  const rb_transcoder *tr;
1903 
1904  if (ec->num_trans == 0)
1905  return rb_econv_decorate_at(ec, decorator_name, 0);
1906 
1907  tr = ec->elems[0].tc->transcoder;
1908 
1909  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1911  return rb_econv_decorate_at(ec, decorator_name, 1);
1912 
1913  return rb_econv_decorate_at(ec, decorator_name, 0);
1914 }
1915 
1916 int
1917 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1918 {
1919  const rb_transcoder *tr;
1920 
1921  if (ec->num_trans == 0)
1922  return rb_econv_decorate_at(ec, decorator_name, 0);
1923 
1924  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1925 
1926  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1928  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1929 
1930  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1931 }
1932 
1933 void
1935 {
1936  const char *dname = 0;
1937 
1938  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1940  dname = "universal_newline";
1941  break;
1943  dname = "crlf_newline";
1944  break;
1946  dname = "cr_newline";
1947  break;
1948  }
1949 
1950  if (dname) {
1951  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1952  int num_trans = ec->num_trans;
1953  int i, j = 0;
1954 
1955  for (i=0; i < num_trans; i++) {
1956  if (transcoder == ec->elems[i].tc->transcoder) {
1957  rb_transcoding_close(ec->elems[i].tc);
1958  xfree(ec->elems[i].out_buf_start);
1959  ec->num_trans--;
1960  }
1961  else
1962  ec->elems[j++] = ec->elems[i];
1963  }
1964  }
1965 
1967 }
1968 
1969 static VALUE
1970 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1971 {
1972  int has_description = 0;
1973 
1974  if (NIL_P(mesg))
1975  mesg = rb_str_new(NULL, 0);
1976 
1977  if (*sname != '\0' || *dname != '\0') {
1978  if (*sname == '\0')
1979  rb_str_cat2(mesg, dname);
1980  else if (*dname == '\0')
1981  rb_str_cat2(mesg, sname);
1982  else
1983  rb_str_catf(mesg, "%s to %s", sname, dname);
1984  has_description = 1;
1985  }
1986 
1987  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
1991  const char *pre = "";
1992  if (has_description)
1993  rb_str_cat2(mesg, " with ");
1994  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
1995  rb_str_cat2(mesg, pre); pre = ",";
1996  rb_str_cat2(mesg, "universal_newline");
1997  }
1998  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
1999  rb_str_cat2(mesg, pre); pre = ",";
2000  rb_str_cat2(mesg, "crlf_newline");
2001  }
2002  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2003  rb_str_cat2(mesg, pre); pre = ",";
2004  rb_str_cat2(mesg, "cr_newline");
2005  }
2006  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2007  rb_str_cat2(mesg, pre); pre = ",";
2008  rb_str_cat2(mesg, "xml_text");
2009  }
2010  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2011  rb_str_cat2(mesg, pre); pre = ",";
2012  rb_str_cat2(mesg, "xml_attr_content");
2013  }
2014  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2015  rb_str_cat2(mesg, pre); pre = ",";
2016  rb_str_cat2(mesg, "xml_attr_quote");
2017  }
2018  has_description = 1;
2019  }
2020  if (!has_description) {
2021  rb_str_cat2(mesg, "no-conversion");
2022  }
2023 
2024  return mesg;
2025 }
2026 
2027 VALUE
2028 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2029 {
2030  VALUE mesg, exc;
2031  mesg = rb_str_new_cstr("code converter not found (");
2032  econv_description(sname, dname, ecflags, mesg);
2033  rb_str_cat2(mesg, ")");
2035  return exc;
2036 }
2037 
2038 static VALUE
2040 {
2041  VALUE mesg, exc;
2044  const char *err = (const char *)ec->last_error.error_bytes_start;
2045  size_t error_len = ec->last_error.error_bytes_len;
2046  VALUE bytes = rb_str_new(err, error_len);
2047  VALUE dumped = rb_str_dump(bytes);
2048  size_t readagain_len = ec->last_error.readagain_len;
2049  VALUE bytes2 = Qnil;
2050  VALUE dumped2;
2051  int idx;
2053  mesg = rb_sprintf("incomplete %s on %s",
2054  StringValueCStr(dumped),
2056  }
2057  else if (readagain_len) {
2058  bytes2 = rb_str_new(err+error_len, readagain_len);
2059  dumped2 = rb_str_dump(bytes2);
2060  mesg = rb_sprintf("%s followed by %s on %s",
2061  StringValueCStr(dumped),
2062  StringValueCStr(dumped2),
2064  }
2065  else {
2066  mesg = rb_sprintf("%s on %s",
2067  StringValueCStr(dumped),
2069  }
2070 
2072  rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2073  rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2074  rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2075 
2076  set_encs:
2077  rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2078  rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2080  if (0 <= idx)
2081  rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2083  if (0 <= idx)
2084  rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2085  return exc;
2086  }
2088  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2090  VALUE dumped = Qnil;
2091  int idx;
2092  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2093  rb_encoding *utf8 = rb_utf8_encoding();
2094  const char *start, *end;
2095  int n;
2096  start = (const char *)ec->last_error.error_bytes_start;
2097  end = start + ec->last_error.error_bytes_len;
2098  n = rb_enc_precise_mbclen(start, end, utf8);
2099  if (MBCLEN_CHARFOUND_P(n) &&
2100  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2101  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2102  dumped = rb_sprintf("U+%04X", cc);
2103  }
2104  }
2105  if (dumped == Qnil)
2106  dumped = rb_str_dump(bytes);
2107  if (strcmp(ec->last_error.source_encoding,
2108  ec->source_encoding_name) == 0 &&
2109  strcmp(ec->last_error.destination_encoding,
2110  ec->destination_encoding_name) == 0) {
2111  mesg = rb_sprintf("%s from %s to %s",
2112  StringValueCStr(dumped),
2115  }
2116  else {
2117  int i;
2118  mesg = rb_sprintf("%s to %s in conversion from %s",
2119  StringValueCStr(dumped),
2121  ec->source_encoding_name);
2122  for (i = 0; i < ec->num_trans; i++) {
2123  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2124  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2125  rb_str_catf(mesg, " to %s",
2126  ec->elems[i].tc->transcoder->dst_encoding);
2127  }
2128  }
2131  if (0 <= idx)
2132  rb_enc_associate_index(bytes, idx);
2133  rb_ivar_set(exc, rb_intern("error_char"), bytes);
2134  goto set_encs;
2135  }
2136  return Qnil;
2137 }
2138 
2139 static void
2141  VALUE destination,
2142  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2143  int max_output,
2144  unsigned char **out_start_ptr,
2145  unsigned char **out_pos,
2146  unsigned char **out_stop_ptr)
2147 {
2148  size_t len = (*out_pos - *out_start_ptr);
2149  size_t new_len = (len + max_output) * 2;
2150  *out_start_ptr = resize_destination(destination, len, new_len);
2151  *out_pos = *out_start_ptr + len;
2152  *out_stop_ptr = *out_start_ptr + new_len;
2153 }
2154 
2155 static int
2157 {
2158  rb_transcoding *tc;
2159  const rb_transcoder *tr;
2160  const unsigned char *replacement;
2161  const char *repl_enc;
2162  const char *ins_enc;
2163  size_t len;
2164 
2165  if (ec->replacement_str)
2166  return 0;
2167 
2168  ins_enc = rb_econv_encoding_to_insert_output(ec);
2169 
2170  tc = ec->last_tc;
2171  if (*ins_enc) {
2172  tr = tc->transcoder;
2174  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2175  }
2176  else {
2177  replacement = (unsigned char *)"?";
2178  len = 1;
2179  repl_enc = "";
2180  }
2181 
2182  ec->replacement_str = replacement;
2183  ec->replacement_len = len;
2184  ec->replacement_enc = repl_enc;
2185  ec->replacement_allocated = 0;
2186  return 0;
2187 }
2188 
2189 int
2191  const unsigned char *str, size_t len, const char *encname)
2192 {
2193  unsigned char *str2;
2194  size_t len2;
2195  const char *encname2;
2196 
2197  encname2 = rb_econv_encoding_to_insert_output(ec);
2198 
2199  if (encoding_equal(encname, encname2)) {
2200  str2 = xmalloc(len);
2201  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2202  len2 = len;
2203  encname2 = encname;
2204  }
2205  else {
2206  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2207  if (!str2)
2208  return -1;
2209  }
2210 
2211  if (ec->replacement_allocated) {
2212  xfree((void *)ec->replacement_str);
2213  }
2214  ec->replacement_allocated = 1;
2215  ec->replacement_str = str2;
2216  ec->replacement_len = len2;
2217  ec->replacement_enc = encname2;
2218  return 0;
2219 }
2220 
2221 static int
2223 {
2224  int ret;
2225 
2226  if (make_replacement(ec) == -1)
2227  return -1;
2228 
2230  if (ret == -1)
2231  return -1;
2232 
2233  return 0;
2234 }
2235 
2236 #if 1
2237 #define hash_fallback rb_hash_aref
2238 
2239 static VALUE
2241 {
2242  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2243 }
2244 
2245 static VALUE
2247 {
2248  return rb_method_call(1, &c, fallback);
2249 }
2250 
2251 static VALUE
2253 {
2254  return rb_funcall3(fallback, sym_aref, 1, &c);
2255 }
2256 
2257 static void
2258 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2259  const unsigned char *in_stop, unsigned char *out_stop,
2260  VALUE destination,
2261  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2262  const char *src_encoding,
2263  const char *dst_encoding,
2264  int ecflags,
2265  VALUE ecopts)
2266 {
2267  rb_econv_t *ec;
2268  rb_transcoding *last_tc;
2269  rb_econv_result_t ret;
2270  unsigned char *out_start = *out_pos;
2271  int max_output;
2272  VALUE exc;
2273  VALUE fallback = Qnil;
2274  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2275 
2276  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2277  if (!ec)
2278  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2279 
2280  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2281  fallback = rb_hash_aref(ecopts, sym_fallback);
2282  if (RB_TYPE_P(fallback, T_HASH)) {
2283  fallback_func = hash_fallback;
2284  }
2285  else if (rb_obj_is_proc(fallback)) {
2286  fallback_func = proc_fallback;
2287  }
2288  else if (rb_obj_is_method(fallback)) {
2289  fallback_func = method_fallback;
2290  }
2291  else {
2292  fallback_func = aref_fallback;
2293  }
2294  }
2295  last_tc = ec->last_tc;
2296  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2297 
2298  resume:
2299  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2300 
2301  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2302  VALUE rep = rb_enc_str_new(
2303  (const char *)ec->last_error.error_bytes_start,
2306  rep = (*fallback_func)(fallback, rep);
2307  if (rep != Qundef && !NIL_P(rep)) {
2308  StringValue(rep);
2309  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2310  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2311  if ((int)ret == -1) {
2312  rb_raise(rb_eArgError, "too big fallback string");
2313  }
2314  goto resume;
2315  }
2316  }
2317 
2318  if (ret == econv_invalid_byte_sequence ||
2319  ret == econv_incomplete_input ||
2320  ret == econv_undefined_conversion) {
2321  exc = make_econv_exception(ec);
2322  rb_econv_close(ec);
2323  rb_exc_raise(exc);
2324  }
2325 
2326  if (ret == econv_destination_buffer_full) {
2327  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2328  goto resume;
2329  }
2330 
2331  rb_econv_close(ec);
2332  return;
2333 }
2334 #else
2335 /* sample transcode_loop implementation in byte-by-byte stream style */
2336 static void
2337 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2338  const unsigned char *in_stop, unsigned char *out_stop,
2339  VALUE destination,
2340  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2341  const char *src_encoding,
2342  const char *dst_encoding,
2343  int ecflags,
2344  VALUE ecopts)
2345 {
2346  rb_econv_t *ec;
2347  rb_transcoding *last_tc;
2348  rb_econv_result_t ret;
2349  unsigned char *out_start = *out_pos;
2350  const unsigned char *ptr;
2351  int max_output;
2352  VALUE exc;
2353 
2354  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2355  if (!ec)
2356  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2357 
2358  last_tc = ec->last_tc;
2359  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2360 
2362  ptr = *in_pos;
2363  while (ret != econv_finished) {
2364  unsigned char input_byte;
2365  const unsigned char *p = &input_byte;
2366 
2367  if (ret == econv_source_buffer_empty) {
2368  if (ptr < in_stop) {
2369  input_byte = *ptr;
2370  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2371  }
2372  else {
2373  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2374  }
2375  }
2376  else {
2377  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2378  }
2379  if (&input_byte != p)
2380  ptr += p - &input_byte;
2381  switch (ret) {
2385  exc = make_econv_exception(ec);
2386  rb_econv_close(ec);
2387  rb_exc_raise(exc);
2388  break;
2389 
2391  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2392  break;
2393 
2395  break;
2396 
2397  case econv_finished:
2398  break;
2399  }
2400  }
2401  rb_econv_close(ec);
2402  *in_pos = in_stop;
2403  return;
2404 }
2405 #endif
2406 
2407 
2408 /*
2409  * String-specific code
2410  */
2411 
2412 static unsigned char *
2413 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2414 {
2415  rb_str_resize(destination, new_len);
2416  return (unsigned char *)RSTRING_PTR(destination);
2417 }
2418 
/*
 * Fold the conversion options stored in the hash +opt+ into +ecflags+ and
 * return the updated flag word.  Looks up sym_invalid, sym_undef,
 * sym_replace, sym_xml, sym_newline and the per-decorator newline keys.
 * Raises ArgumentError for option values it does not recognise.
 *
 * NOTE(review): several lines of this function are absent from this listing
 * (flagged below); confirm against the real transcode.c before relying on
 * the branches that appear empty here.
 */
2419 static int
2420 econv_opts(VALUE opt, int ecflags)
2421 {
2422  VALUE v;
2423 
2424  v = rb_hash_aref(opt, sym_invalid);
2425  if (NIL_P(v)) {
2426  }
2427  else if (v==sym_replace) {
2428  ecflags |= ECONV_INVALID_REPLACE;
2429  }
2430  else {
2431  rb_raise(rb_eArgError, "unknown value for invalid character option");
2432  }
2433 
2434  v = rb_hash_aref(opt, sym_undef);
2435  if (NIL_P(v)) {
2436  }
2437  else if (v==sym_replace) {
2438  ecflags |= ECONV_UNDEF_REPLACE;
2439  }
2440  else {
2441  rb_raise(rb_eArgError, "unknown value for undefined character option");
2442  }
2443 
 /* A replacement given without invalid-replace already set turns on
  * replacement of undefined characters. */
2444  v = rb_hash_aref(opt, sym_replace);
2445  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2446  ecflags |= ECONV_UNDEF_REPLACE;
2447  }
2448 
2449  v = rb_hash_aref(opt, sym_xml);
2450  if (!NIL_P(v)) {
2451  if (v==sym_text) {
 /* NOTE(review): branch body missing from this listing */
2453  }
2454  else if (v==sym_attr) {
 /* NOTE(review): branch body missing from this listing */
2456  }
2457  else if (RB_TYPE_P(v, T_SYMBOL)) {
2458  rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
2459  }
2460  else {
2461  rb_raise(rb_eArgError, "unexpected value for xml option");
2462  }
2463  }
2464 
2465 #ifdef ENABLE_ECONV_NEWLINE_OPTION
 /* An explicit newline option replaces any newline decorator bits that
  * were already present in ecflags. */
2466  v = rb_hash_aref(opt, sym_newline);
2467  if (!NIL_P(v)) {
2468  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2469  if (v == sym_universal) {
 /* NOTE(review): branch body missing from this listing */
2471  }
2472  else if (v == sym_crlf) {
2473  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2474  }
2475  else if (v == sym_cr) {
2476  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2477  }
2478  else if (v == sym_lf) {
2479  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2480  }
2481  else if (SYMBOL_P(v)) {
2482  rb_raise(rb_eArgError, "unexpected value for newline option: %s",
2483  rb_id2name(SYM2ID(v)));
2484  }
2485  else {
2486  rb_raise(rb_eArgError, "unexpected value for newline option");
2487  }
2488  }
2489  else
2490 #endif
2491  {
 /* setflags collects the decorators requested with a true value;
  * newlineflag records whether any of the keys was present (non-nil),
  * so an explicit false still clears the inherited decorator bits. */
2492  int setflags = 0, newlineflag = 0;
2493 
 /* NOTE(review): the lookup and the statement guarded by the next `if`
  * are missing from this listing */
2495  if (RTEST(v))
2497  newlineflag |= !NIL_P(v);
2498 
2499  v = rb_hash_aref(opt, sym_crlf_newline);
2500  if (RTEST(v))
2501  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2502  newlineflag |= !NIL_P(v);
2503 
2504  v = rb_hash_aref(opt, sym_cr_newline);
2505  if (RTEST(v))
2506  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2507  newlineflag |= !NIL_P(v);
2508 
2509  if (newlineflag) {
2510  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2511  ecflags |= setflags;
2512  }
2513  }
2514 
2515  return ecflags;
2516 }
2517 
/*
 * Translate the user-supplied option hash +opthash+ into converter flags
 * (returned, starting from +ecflags+) plus a frozen auxiliary hash stored
 * in *opts.  *opts is nil when +opthash+ is nil or when neither a
 * replacement nor a fallback ends up being recorded.
 *
 * NOTE(review): two lines are absent from this listing (flagged below).
 */
2518 int
2519 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2520 {
2521  VALUE newhash = Qnil;
2522  VALUE v;
2523 
2524  if (NIL_P(opthash)) {
2525  *opts = Qnil;
2526  return ecflags;
2527  }
2528  ecflags = econv_opts(opthash, ecflags);
2529 
2530  v = rb_hash_aref(opthash, sym_replace);
2531  if (!NIL_P(v)) {
2532  StringValue(v);
 /* NOTE(review): the condition opening the error branch below is missing
  * from this listing; judging by the message it rejects a broken
  * replacement string - confirm. */
2534  VALUE dumped = rb_str_dump(v);
2535  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2536  StringValueCStr(dumped),
2537  rb_enc_name(rb_enc_get(v)));
2538  }
 /* store a frozen copy of the replacement string */
2539  v = rb_str_new_frozen(v);
2540  newhash = rb_hash_new();
2541  rb_hash_aset(newhash, sym_replace, v);
2542  }
2543 
2544  v = rb_hash_aref(opthash, sym_fallback);
2545  if (!NIL_P(v)) {
2546  VALUE h = rb_check_hash_type(v);
 /* NOTE(review): the middle operand of this conditional expression is
  * missing from this listing.  When v converts to a Hash, the converted
  * hash replaces v and the fallback is recorded. */
2547  if (NIL_P(h)
2549  : (v = h, 1)) {
2550  if (NIL_P(newhash))
2551  newhash = rb_hash_new();
2552  rb_hash_aset(newhash, sym_fallback, v);
2553  }
2554  }
2555 
2556  if (!NIL_P(newhash))
2557  rb_hash_freeze(newhash);
2558  *opts = newhash;
2559 
2560  return ecflags;
2561 }
2562 
/*
 * Convenience wrapper: rb_econv_prepare_options with no initial flags.
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses parameters named opthash and opts, and callers in this file invoke
 * rb_econv_prepare_opts(opt, &ecopts) - presumably this function; confirm.
 */
2563 int
2565 {
2566  return rb_econv_prepare_options(opthash, opts, 0);
2567 }
2568 
2569 rb_econv_t *
2570 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2571 {
2572  rb_econv_t *ec;
2573  VALUE replacement;
2574 
2575  if (NIL_P(opthash)) {
2576  replacement = Qnil;
2577  }
2578  else {
2579  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2580  rb_bug("rb_econv_open_opts called with invalid opthash");
2581  replacement = rb_hash_aref(opthash, sym_replace);
2582  }
2583 
2584  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2585  if (!ec)
2586  return ec;
2587 
2588  if (!NIL_P(replacement)) {
2589  int ret;
2590  rb_encoding *enc = rb_enc_get(replacement);
2591 
2592  ret = rb_econv_set_replacement(ec,
2593  (const unsigned char *)RSTRING_PTR(replacement),
2594  RSTRING_LEN(replacement),
2595  rb_enc_name(enc));
2596  if (ret == -1) {
2597  rb_econv_close(ec);
2598  return NULL;
2599  }
2600  }
2601  return ec;
2602 }
2603 
2604 static int
2605 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
2606 {
2607  rb_encoding *enc;
2608  const char *n;
2609  int encidx;
2610  VALUE encval;
2611 
2612  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2613  !(enc = rb_enc_from_index(encidx))) {
2614  enc = NULL;
2615  encidx = 0;
2616  n = StringValueCStr(*arg);
2617  }
2618  else {
2619  n = rb_enc_name(enc);
2620  }
2621 
2622  *name_p = n;
2623  *enc_p = enc;
2624 
2625  return encidx;
2626 }
2627 
2628 static int
2629 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
2630  const char **sname_p, rb_encoding **senc_p,
2631  const char **dname_p, rb_encoding **denc_p)
2632 {
2633  rb_encoding *senc, *denc;
2634  const char *sname, *dname;
2635  int sencidx, dencidx;
2636 
2637  dencidx = enc_arg(arg1, &dname, &denc);
2638 
2639  if (NIL_P(*arg2)) {
2640  sencidx = rb_enc_get_index(str);
2641  senc = rb_enc_from_index(sencidx);
2642  sname = rb_enc_name(senc);
2643  }
2644  else {
2645  sencidx = enc_arg(arg2, &sname, &senc);
2646  }
2647 
2648  *sname_p = sname;
2649  *senc_p = senc;
2650  *dname_p = dname;
2651  *denc_p = denc;
2652  return dencidx;
2653 }
2654 
/*
 * Worker behind the string transcoding entry points.
 *
 * argv[0] (or the default internal encoding when argc is 0) names the
 * destination, argv[1] optionally names the source.  Returns -1 when no
 * conversion is to be performed and the caller should keep the receiver
 * unchanged; otherwise returns the destination encoding index.  When a
 * conversion actually ran, *self is replaced by the freshly built string.
 *
 * NOTE(review): several lines are absent from this listing (flagged below).
 */
2655 static int
2656 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2657 {
2658  VALUE dest;
2659  VALUE str = *self;
2660  volatile VALUE arg1, arg2;
2661  long blen, slen;
2662  unsigned char *buf, *bp, *sp;
2663  const unsigned char *fromp;
2664  rb_encoding *senc, *denc;
2665  const char *sname, *dname;
2666  int dencidx;
2667 
2668  rb_check_arity(argc, 0, 2);
2669 
2670  if (argc == 0) {
2671  arg1 = rb_enc_default_internal();
2672  if (NIL_P(arg1)) {
 /* no default internal encoding and no flags: nothing to do */
2673  if (!ecflags) return -1;
2674  arg1 = rb_obj_encoding(str);
2675  }
 /* NOTE(review): one statement is missing from this listing here */
2677  }
2678  else {
2679  arg1 = argv[0];
2680  }
2681  arg2 = argc<=1 ? Qnil : argv[1];
2682  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2683 
 /* NOTE(review): the remainder of this condition (and the start of the
  * block it opens) is missing from this listing */
2684  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2688  if (senc && senc == denc) {
2689  return NIL_P(arg2) ? -1 : dencidx;
2690  }
2691  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
 /* NOTE(review): the inner condition guarding this return is missing */
2693  return dencidx;
2694  }
2695  }
2696  if (encoding_equal(sname, dname)) {
2697  return NIL_P(arg2) ? -1 : dencidx;
2698  }
2699  }
2700  else {
 /* same source and destination: pass empty names to transcode_loop */
2701  if (encoding_equal(sname, dname)) {
2702  sname = "";
2703  dname = "";
2704  }
2705  }
2706 
2707  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2708  slen = RSTRING_LEN(str);
2709  blen = slen + 30; /* len + margin */
2710  dest = rb_str_tmp_new(blen);
2711  bp = (unsigned char *)RSTRING_PTR(dest);
2712 
2713  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
 /* the loop must have consumed the whole source */
2714  if (fromp != sp+slen) {
2715  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2716  }
 /* re-read the buffer start (the resize callback may have moved it),
  * terminate the output and trim dest to the bytes written */
2717  buf = (unsigned char *)RSTRING_PTR(dest);
2718  *bp = '\0';
2719  rb_str_set_len(dest, bp - buf);
2720 
2721  /* set encoding */
2722  if (!denc) {
2723  dencidx = rb_define_dummy_encoding(dname);
2724  }
2725  *self = dest;
2726 
2727  return dencidx;
2728 }
2729 
/*
 * Argument-parsing front end for str_transcode0: accepts up to two
 * positional arguments plus an optional trailing option hash, converts the
 * hash into flags/options, and delegates.
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses argc, argv and self, and callers in this file invoke
 * str_transcode(argc, argv, &newstr) - presumably this function; confirm.
 */
2730 static int
2732 {
2733  VALUE opt;
2734  int ecflags = 0;
2735  VALUE ecopts = Qnil;
2736 
2737  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2738  if (!NIL_P(opt)) {
2739  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2740  }
2741  return str_transcode0(argc, argv, self, ecflags, ecopts);
2742 }
2743 
/*
 * Attach encoding index +encidx+ to +str+ and set its cached code range.
 * Returns +str+.
 */
2744 static inline VALUE
2745 str_encode_associate(VALUE str, int encidx)
2746 {
2747  int cr = 0;
2748 
2749  rb_enc_associate_index(str, encidx);
2750 
2751  /* A transcoded string is never broken. */
2752  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
 /* NOTE(review): the body of this branch is missing from this listing;
  * as shown, cr keeps its initial value 0 here - confirm. */
2754  }
2755  else {
2756  cr = ENC_CODERANGE_VALID;
2757  }
2758  ENC_CODERANGE_SET(str, cr);
2759  return str;
2760 }
2761 
2762 /*
2763  * call-seq:
2764  * str.encode!(encoding [, options] ) -> str
2765  * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2766  *
2767  * The first form transcodes the contents of <i>str</i> from
2768  * str.encoding to +encoding+.
2769  * The second form transcodes the contents of <i>str</i> from
2770  * src_encoding to dst_encoding.
2771  * The options Hash gives details for conversion. See String#encode
2772  * for details.
2773  * Returns the string even if no changes were made.
2774  */
2775 
/*
 * In-place transcoding (see the call-seq comment above).
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses argc, argv and str.
 */
2776 static VALUE
2778 {
2779  VALUE newstr;
2780  int encidx;
2781 
2782  rb_check_frozen(str);
2783 
2784  newstr = str;
2785  encidx = str_transcode(argc, argv, &newstr);
2786 
 /* negative index: no conversion requested, receiver is left untouched */
2787  if (encidx < 0) return str;
 /* no new string was produced: only retag the receiver */
2788  if (newstr == str) {
2789  rb_enc_associate_index(str, encidx);
2790  return str;
2791  }
 /* replace the receiver's contents with the converted string */
2792  rb_str_shared_replace(str, newstr);
2793  return str_encode_associate(str, encidx);
2794 }
2795 
2796 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2797 
2798 /*
2799  * call-seq:
2800  * str.encode(encoding [, options] ) -> str
2801  * str.encode(dst_encoding, src_encoding [, options] ) -> str
2802  * str.encode([options]) -> str
2803  *
2804  * The first form returns a copy of +str+ transcoded
2805  * to encoding +encoding+.
2806  * The second form returns a copy of +str+ transcoded
2807  * from src_encoding to dst_encoding.
2808  * The last form returns a copy of +str+ transcoded to
2809  * <tt>Encoding.default_internal</tt>.
2810  *
2811  * By default, the first and second form raise
2812  * Encoding::UndefinedConversionError for characters that are
2813  * undefined in the destination encoding, and
2814  * Encoding::InvalidByteSequenceError for invalid byte sequences
2815  * in the source encoding. The last form by default does not raise
2816  * exceptions but uses replacement strings.
2817  *
2818  * Please note that conversion from an encoding +enc+ to the
2819  * same encoding +enc+ is a no-op, i.e. the receiver is returned without
2820  * any changes, and no exceptions are raised, even if there are invalid bytes.
2821  *
2822  * The +options+ Hash gives details for conversion and can have the following
2823  * keys:
2824  *
2825  * :invalid ::
2826  * If the value is +:replace+, #encode replaces invalid byte sequences in
2827  * +str+ with the replacement character. The default is to raise the
2828  * Encoding::InvalidByteSequenceError exception
2829  * :undef ::
2830  * If the value is +:replace+, #encode replaces characters which are
2831  * undefined in the destination encoding with the replacement character.
2832  * The default is to raise the Encoding::UndefinedConversionError.
2833  * :replace ::
2834  * Sets the replacement string to the given value. The default replacement
2835  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2836  * :fallback ::
2837  * Sets the replacement string by the given object for undefined
2838  * character. The object should be a Hash, a Proc, a Method, or an
2839  * object which has [] method.
2840  * Its key is an undefined character encoded in the source encoding
2841  * of the current transcoder. Its value can be in any encoding as long as it
2842  * can be converted into the destination encoding of the transcoder.
2843  * :xml ::
2844  * The value must be +:text+ or +:attr+.
2845  * If the value is +:text+ #encode replaces undefined characters with their
2846  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2847  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2848  * If the value is +:attr+, #encode also quotes the replacement result
2849  * (using '"'), and replaces '"' with "&quot;".
2850  * :cr_newline ::
2851  * Replaces LF ("\n") with CR ("\r") if value is true.
2852  * :crlf_newline ::
2853  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2854  * :universal_newline ::
2855  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2856  */
2857 
/*
 * Non-destructive transcoding (see the call-seq comment above): runs
 * str_transcode and lets encoded_dup build the returned string.
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses argc, argv and str.
 */
2858 static VALUE
2860 {
2861  VALUE newstr = str;
2862  int encidx = str_transcode(argc, argv, &newstr);
2863  return encoded_dup(newstr, str, encidx);
2864 }
2865 
2866 VALUE
2867 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2868 {
2869  int argc = 1;
2870  VALUE *argv = &to;
2871  VALUE newstr = str;
2872  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2873  return encoded_dup(newstr, str, encidx);
2874 }
2875 
2876 static VALUE
2877 encoded_dup(VALUE newstr, VALUE str, int encidx)
2878 {
2879  if (encidx < 0) return rb_str_dup(str);
2880  if (newstr == str) {
2881  newstr = rb_str_dup(str);
2882  rb_enc_associate_index(newstr, encidx);
2883  return newstr;
2884  }
2885  else {
2886  RBASIC(newstr)->klass = rb_obj_class(str);
2887  }
2888  return str_encode_associate(newstr, encidx);
2889 }
2890 
2891 static void
2892 econv_free(void *ptr)
2893 {
2894  rb_econv_t *ec = ptr;
2895  rb_econv_close(ec);
2896 }
2897 
2898 static size_t
2899 econv_memsize(const void *ptr)
2900 {
2901  return ptr ? sizeof(rb_econv_t) : 0;
2902 }
2903 
/* NOTE(review): fragment of an initializer whose declaration line and
 * remaining members are absent from this listing; presumably the
 * econv_data_type descriptor referenced elsewhere in this file - confirm
 * against the real transcode.c. */
2905  "econv",
2907 };
2908 
/*
 * Allocator: wraps a NULL data pointer in a typed-data object of +klass+,
 * i.e. the object starts out without a converter attached.
 * NOTE(review): the declarator line is missing from this listing.
 */
2909 static VALUE
2911 {
2912  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2913 }
2914 
/*
 * Define a dummy encoding called +name+ and return its rb_encoding.
 * NOTE(review): the declarator line is missing from this listing; callers
 * in this file invoke make_dummy_encoding(name) - presumably this
 * function; confirm.
 */
2915 static rb_encoding *
2917 {
2918  rb_encoding *enc;
2919  int idx;
2920  idx = rb_define_dummy_encoding(name);
2921  enc = rb_enc_from_index(idx);
2922  return enc;
2923 }
2924 
2925 static rb_encoding *
2926 make_encoding(const char *name)
2927 {
2928  rb_encoding *enc;
2929  enc = rb_enc_find(name);
2930  if (!enc)
2931  enc = make_dummy_encoding(name);
2932  return enc;
2933 }
2934 
2935 static VALUE
2936 make_encobj(const char *name)
2937 {
2938  return rb_enc_from_encoding(make_encoding(name));
2939 }
2940 
2941 /*
2942  * call-seq:
2943  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2944  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2945  *
2946  * Returns the corresponding ASCII compatible encoding.
2947  *
2948  * Returns nil if the argument is an ASCII compatible encoding.
2949  *
2950  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2951  * can represent exactly the same characters as the given ASCII incompatible encoding.
2952  * So, no conversion undefined error occurs when converting between the two encodings.
2953  *
2954  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2955  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2956  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2957  *
2958  */
/*
 * Implementation of the class method documented above: resolves the
 * argument to an encoding name, asks rb_econv_asciicompat_encoding for the
 * counterpart, and returns it as an Encoding object (nil when there is
 * none).
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses a parameter named arg.
 */
2959 static VALUE
2961 {
2962  const char *arg_name, *result_name;
2963  rb_encoding *arg_enc, *result_enc;
2964 
2965  enc_arg(&arg, &arg_name, &arg_enc);
2966 
2967  result_name = rb_econv_asciicompat_encoding(arg_name);
2968 
2969  if (result_name == NULL)
2970  return Qnil;
2971 
2972  result_enc = make_encoding(result_name);
2973 
2974  return rb_enc_from_encoding(result_enc);
2975 }
2976 
/*
 * Parse the (source, destination [, flags] [, option hash]) argument list
 * shared by the converter entry points in this file.
 *
 * Outputs: the raw source/destination values, their names, their
 * rb_encoding pointers (NULL when the value is not a known encoding), the
 * flag word and the prepared option hash.  Supplying both an integer flags
 * argument and an option hash is reported as an arity error.
 *
 * NOTE(review): the line carrying the function name and leading parameters
 * is missing from this listing; the body uses argc and argv.
 */
2977 static void
2979  volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
2980  const char **sname_p, const char **dname_p,
2981  rb_encoding **senc_p, rb_encoding **denc_p,
2982  int *ecflags_p,
2983  VALUE *ecopts_p)
2984 {
2985  VALUE opt, flags_v, ecopts;
2986  int sidx, didx;
2987  const char *sname, *dname;
2988  rb_encoding *senc, *denc;
2989  int ecflags;
2990 
2991  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
2992 
2993  if (!NIL_P(flags_v)) {
 /* integer form: an option hash may not be given as well */
2994  if (!NIL_P(opt)) {
2995  rb_error_arity(argc + 1, 2, 3);
2996  }
2997  ecflags = NUM2INT(rb_to_int(flags_v));
2998  ecopts = Qnil;
2999  }
3000  else if (!NIL_P(opt)) {
3001  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3002  }
3003  else {
3004  ecflags = 0;
3005  ecopts = Qnil;
3006  }
3007 
 /* known encodings are resolved to rb_encoding; anything else must be
  * string-like and is used by name */
3008  senc = NULL;
3009  sidx = rb_to_encoding_index(*snamev_p);
3010  if (0 <= sidx) {
3011  senc = rb_enc_from_index(sidx);
3012  }
3013  else {
3014  StringValue(*snamev_p);
3015  }
3016 
3017  denc = NULL;
3018  didx = rb_to_encoding_index(*dnamev_p);
3019  if (0 <= didx) {
3020  denc = rb_enc_from_index(didx);
3021  }
3022  else {
3023  StringValue(*dnamev_p);
3024  }
3025 
3026  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3027  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3028 
3029  *sname_p = sname;
3030  *dname_p = dname;
3031  *senc_p = senc;
3032  *denc_p = denc;
3033  *ecflags_p = ecflags;
3034  *ecopts_p = ecopts;
3035 }
3036 
/*
 * Insert the decorator names implied by +ecflags+ into the conversion path
 * array +convpath+.  Returns 0 on success and -1 when decorator_names
 * fails or the transcoder for the last pair cannot be loaded.
 *
 * NOTE(review): one line of the condition deciding where the decorators go
 * is missing from this listing (flagged below).
 */
3037 static int
3038 decorate_convpath(VALUE convpath, int ecflags)
3039 {
3040  int num_decorators;
3041  const char *decorators[MAX_ECFLAGS_DECORATORS];
3042  int i;
3043  int n, len;
3044 
3045  num_decorators = decorator_names(ecflags, decorators);
3046  if (num_decorators == -1)
3047  return -1;
3048 
 /* n is the slot where the first decorator will be stored; it starts at
  * the end of the path and may be moved before the last element */
3049  len = n = RARRAY_LENINT(convpath);
3050  if (n != 0) {
3051  VALUE pair = RARRAY_PTR(convpath)[n-1];
3052  if (RB_TYPE_P(pair, T_ARRAY)) {
3053  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
3054  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
3055  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3056  const rb_transcoder *tr = load_transcoder_entry(entry);
3057  if (!tr)
3058  return -1;
 /* NOTE(review): the second half of this condition is missing from
  * this listing */
3059  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
 /* move the last pair to the final slot so the decorators precede it */
3061  n--;
3062  rb_ary_store(convpath, len + num_decorators - 1, pair);
3063  }
3064  }
3065  else {
3066  rb_ary_store(convpath, len + num_decorators - 1, pair);
3067  }
3068  }
3069 
3070  for (i = 0; i < num_decorators; i++)
3071  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3072 
3073  return 0;
3074 }
3075 
3076 static void
3077 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3078 {
3079  VALUE *ary_p = arg;
3080  VALUE v;
3081 
3082  if (*ary_p == Qnil) {
3083  *ary_p = rb_ary_new();
3084  }
3085 
3086  if (DECORATOR_P(sname, dname)) {
3087  v = rb_str_new_cstr(dname);
3088  }
3089  else {
3090  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3091  }
3092  rb_ary_store(*ary_p, depth, v);
3093 }
3094 
3095 /*
3096  * call-seq:
3097  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3098  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3099  *
3100  * Returns a conversion path.
3101  *
3102  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3103  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3104  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3105  *
3106  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3107  * or
3108  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3109  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3110  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3111  * # "universal_newline"]
3112  *
3113  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3114  * or
3115  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3116  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3117  * # "universal_newline",
3118  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3119  */
/*
 * Implementation of the class method documented above: parses the
 * arguments, searches a conversion path, adds the decorators implied by
 * the flags and returns the resulting array.  Raises the exception built
 * by rb_econv_open_exc when no path exists or decoration fails.
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses argc and argv.
 */
3120 static VALUE
3122 {
3123  volatile VALUE snamev, dnamev;
3124  const char *sname, *dname;
3125  rb_encoding *senc, *denc;
3126  int ecflags;
3127  VALUE ecopts;
3128  VALUE convpath;
3129 
3130  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3131 
 /* convpath stays nil unless the search callback is invoked */
3132  convpath = Qnil;
3133  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3134 
3135  if (NIL_P(convpath))
3136  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3137 
3138  if (decorate_convpath(convpath, ecflags) == -1)
3139  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3140 
3141  return convpath;
3142 }
3143 
3144 /*
3145  * Check the existence of a conversion path.
3146  * Returns the number of converters in the conversion path.
3147  * result: >=0:success -1:failure
3148  */
3149 int
3150 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3151 {
3152  VALUE convpath = Qnil;
3153  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3154  &convpath);
3155  return RTEST(convpath);
3156 }
3157 
/* NOTE(review): tail of a struct definition whose opening lines are
 * missing from this listing; presumably struct rb_econv_init_by_convpath_t,
 * which the callback below also accesses through an `ec` member - confirm. */
3160  int index; /* position handed to rb_econv_add_converter */
3161  int ret; /* last status; -1 stops further additions */
3162 };
3163 
3164 static void
3165 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3166 {
3167  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3168  int ret;
3169 
3170  if (a->ret == -1)
3171  return;
3172 
3173  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3174 
3175  a->ret = ret;
3176  return;
3177 }
3178 
/*
 * Build a converter from an explicit conversion path array.
 *
 * Each element of +convpath+ is either a two-element array naming a
 * source and destination encoding, or a string naming a decorator.
 * Raises ArgumentError for malformed elements or when a step cannot be
 * added.  On return the output parameters describe the source of the first
 * encoding conversion and the destination of the last one (empty names and
 * NULL encodings when the path holds decorators only).
 *
 * NOTE(review): the line carrying the function name and leading parameters
 * is missing from this listing; the body uses self and convpath.
 */
3179 static rb_econv_t *
3181  const char **sname_p, const char **dname_p,
3182  rb_encoding **senc_p, rb_encoding**denc_p)
3183 {
3184  rb_econv_t *ec;
3185  long i;
3186  int ret, first=1;
3187  VALUE elt;
3188  rb_encoding *senc = 0, *denc = 0;
3189  const char *sname, *dname;
3190 
 /* attach the converter to self before the loop, which may raise */
3191  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3192  DATA_PTR(self) = ec;
3193 
3194  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3195  volatile VALUE snamev, dnamev;
3196  VALUE pair;
3197  elt = rb_ary_entry(convpath, i);
3198  if (!NIL_P(pair = rb_check_array_type(elt))) {
3199  if (RARRAY_LEN(pair) != 2)
3200  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3201  snamev = rb_ary_entry(pair, 0);
3202  enc_arg(&snamev, &sname, &senc);
3203  dnamev = rb_ary_entry(pair, 1);
3204  enc_arg(&dnamev, &dname, &denc);
3205  }
3206  else {
 /* non-array element: treated as a decorator name */
3207  sname = "";
3208  dname = StringValueCStr(elt);
3209  }
3210  if (DECORATOR_P(sname, dname)) {
3211  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3212  if (ret == -1)
3213  rb_raise(rb_eArgError, "decoration failed: %s", dname);
3214  }
3215  else {
 /* j remembers where this step's converters start */
3216  int j = ec->num_trans;
3217  struct rb_econv_init_by_convpath_t arg;
3218  arg.ec = ec;
3219  arg.index = ec->num_trans;
3220  arg.ret = 0;
3221  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3222  if (ret == -1 || arg.ret == -1)
3223  rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
 /* source is taken from the first encoding step only; destination
  * is overwritten by every encoding step so the last one wins */
3224  if (first) {
3225  first = 0;
3226  *senc_p = senc;
3227  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3228  }
3229  *denc_p = denc;
3230  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3231  }
3232  }
3233 
 /* no encoding conversion in the path at all */
3234  if (first) {
3235  *senc_p = NULL;
3236  *denc_p = NULL;
3237  *sname_p = "";
3238  *dname_p = "";
3239  }
3240 
3241  ec->source_encoding_name = *sname_p;
3242  ec->destination_encoding_name = *dname_p;
3243 
3244  return ec;
3245 }
3246 
3247 /*
3248  * call-seq:
3249  * Encoding::Converter.new(source_encoding, destination_encoding)
3250  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3251  * Encoding::Converter.new(convpath)
3252  *
3253  * possible options elements:
3254  * hash form:
3255  * :invalid => nil # raise error on invalid byte sequence (default)
3256  * :invalid => :replace # replace invalid byte sequence
3257  * :undef => nil # raise error on undefined conversion (default)
3258  * :undef => :replace # replace undefined conversion
3259  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3260  * :newline => :universal # decorator for converting CRLF and CR to LF
3261  * :newline => :crlf # decorator for converting LF to CRLF
3262  * :newline => :cr # decorator for converting LF to CR
3263  * :universal_newline => true # decorator for converting CRLF and CR to LF
3264  * :crlf_newline => true # decorator for converting LF to CRLF
3265  * :cr_newline => true # decorator for converting LF to CR
3266  * :xml => :text # escape as XML CharData.
3267  * :xml => :attr # escape as XML AttValue
3268  * integer form:
3269  * Encoding::Converter::INVALID_REPLACE
3270  * Encoding::Converter::UNDEF_REPLACE
3271  * Encoding::Converter::UNDEF_HEX_CHARREF
3272  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3273  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3274  * Encoding::Converter::CR_NEWLINE_DECORATOR
3275  * Encoding::Converter::XML_TEXT_DECORATOR
3276  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3277  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3278  *
3279  * Encoding::Converter.new creates an instance of Encoding::Converter.
3280  *
3281  * Source_encoding and destination_encoding should be a string or
3282  * Encoding object.
3283  *
3284  * opt should be nil, a hash or an integer.
3285  *
3286  * convpath should be an array.
3287  * convpath may contain
3288  * - two-element arrays which contain encodings or encoding names, or
3289  * - strings representing decorator names.
3290  *
3291  * Encoding::Converter.new optionally takes an option.
3292  * The option should be a hash or an integer.
3293  * The option hash can contain :invalid => nil, etc.
3294  * The option integer should be logical-or of constants such as
3295  * Encoding::Converter::INVALID_REPLACE, etc.
3296  *
3297  * [:invalid => nil]
3298  * Raise error on invalid byte sequence. This is a default behavior.
3299  * [:invalid => :replace]
3300  * Replace invalid byte sequence by replacement string.
3301  * [:undef => nil]
3302  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3303  * This is a default behavior.
3304  * [:undef => :replace]
3305  * Replace undefined character in destination_encoding with replacement string.
3306  * [:replace => string]
3307  * Specify the replacement string.
3308  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3309  * [:universal_newline => true]
3310  * Convert CRLF and CR to LF.
3311  * [:crlf_newline => true]
3312  * Convert LF to CRLF.
3313  * [:cr_newline => true]
3314  * Convert LF to CR.
3315  * [:xml => :text]
3316  * Escape as XML CharData.
3317  * This form can be used as a HTML 4.0 #PCDATA.
3318  * - '&' -> '&amp;'
3319  * - '<' -> '&lt;'
3320  * - '>' -> '&gt;'
3321  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3322  * [:xml => :attr]
3323  * Escape as XML AttValue.
3324  * The converted result is quoted as "...".
3325  * This form can be used as a HTML 4.0 attribute value.
3326  * - '&' -> '&amp;'
3327  * - '<' -> '&lt;'
3328  * - '>' -> '&gt;'
3329  * - '"' -> '&quot;'
3330  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3331  *
3332  * Examples:
3333  * # UTF-16BE to UTF-8
3334  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3335  *
3336  * # Usually, decorators such as newline conversion are inserted last.
3337  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3338  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3339  * # "universal_newline"]
3340  *
3341  * # But, if the last encoding is ASCII incompatible,
3342  * # decorators are inserted before the last conversion.
3343  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3344  * p ec.convpath #=> ["crlf_newline",
3345  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3346  *
3347  * # Conversion path can be specified directly.
3348  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3349  * p ec.convpath #=> ["universal_newline",
3350  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3351  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3352  */
/*
 * Initializer documented by the comment above.  Accepts either a single
 * convpath array or (source, destination [, opt]); raises TypeError when
 * the object already carries a converter and the rb_econv_open_exc
 * exception when no converter could be opened.
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses argc, argv and self.
 */
3353 static VALUE
3355 {
3356  VALUE ecopts;
3357  volatile VALUE snamev, dnamev;
3358  const char *sname, *dname;
3359  rb_encoding *senc, *denc;
3360  rb_econv_t *ec;
3361  int ecflags;
3362  VALUE convpath;
3363 
 /* a non-NULL data pointer means a converter is already attached */
3364  if (rb_check_typeddata(self, &econv_data_type)) {
3365  rb_raise(rb_eTypeError, "already initialized");
3366  }
3367 
3368  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3369  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3370  ecflags = 0;
3371  ecopts = Qnil;
3372  }
3373  else {
3374  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3375  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3376  }
3377 
3378  if (!ec) {
3379  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3380  }
3381 
 /* names that are not known encodings get dummy encodings, unless the
  * pair denotes a decorator */
3382  if (!DECORATOR_P(sname, dname)) {
3383  if (!senc)
3384  senc = make_dummy_encoding(sname);
3385  if (!denc)
3386  denc = make_dummy_encoding(dname);
3387  }
3388 
3389  ec->source_encoding = senc;
3390  ec->destination_encoding = denc;
3391 
3392  DATA_PTR(self) = ec;
3393 
3394  return self;
3395 }
3396 
3397 /*
3398  * call-seq:
3399  * ec.inspect -> string
3400  *
3401  * Returns a printable version of <i>ec</i>
3402  *
3403  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3404  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3405  *
3406  */
/*
 * Builds the inspect string documented above; an object without an
 * attached converter is rendered as "#<Class: uninitialized>".
 * NOTE(review): the declarator line is missing from this listing; the body
 * uses self.
 */
3407 static VALUE
3409 {
3410  const char *cname = rb_obj_classname(self);
3411  rb_econv_t *ec;
3412 
3413  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3414  if (!ec)
3415  return rb_sprintf("#<%s: uninitialized>", cname);
3416  else {
3417  const char *sname = ec->source_encoding_name;
3418  const char *dname = ec->destination_encoding_name;
3419  VALUE str;
3420  str = rb_sprintf("#<%s: ", cname);
 /* econv_description is handed str together with the names and flags */
3421  econv_description(sname, dname, ec->flags, str);
3422  rb_str_cat2(str, ">");
3423  return str;
3424  }
3425 }
3426 
/*
 * Fetch the rb_econv_t attached to self, raising TypeError when the object
 * has not been initialized (NULL data pointer).
 * NOTE(review): the declarator line is missing from this listing; callers
 * in this file invoke check_econv(self) - presumably this function.
 */
3427 static rb_econv_t *
3429 {
3430  rb_econv_t *ec;
3431 
3432  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3433  if (!ec) {
3434  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3435  }
3436  return ec;
3437 }
3438 
3439 /*
3440  * call-seq:
3441  * ec.source_encoding -> encoding
3442  *
3443  * Returns the source encoding as an Encoding object.
3444  */
/*
 * Accessor documented above; returns nil when the converter has no source
 * encoding recorded.
 * NOTE(review): the declarator line and the final return statement are
 * missing from this listing.
 */
3445 static VALUE
3447 {
3448  rb_econv_t *ec = check_econv(self);
3449  if (!ec->source_encoding)
3450  return Qnil;
3452 }
3453 
3454 /*
3455  * call-seq:
3456  * ec.destination_encoding -> encoding
3457  *
3458  * Returns the destination encoding as an Encoding object.
3459  */
/*
 * Accessor documented above; returns nil when the converter has no
 * destination encoding recorded.
 * NOTE(review): the declarator line and the final return statement are
 * missing from this listing.
 */
3460 static VALUE
3462 {
3463  rb_econv_t *ec = check_econv(self);
3464  if (!ec->destination_encoding)
3465  return Qnil;
3467 }
3468 
3469 /*
3470  * call-seq:
3471  * ec.convpath -> ary
3472  *
3473  * Returns the conversion path of ec.
3474  *
3475  * The result is an array of conversions.
3476  *
3477  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3478  * p ec.convpath
3479  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3480  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3481  * # "crlf_newline"]
3482  *
3483  * Each element of the array is a pair of encodings or a string.
3484  * A pair means an encoding conversion.
3485  * A string means a decorator.
3486  *
3487  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3488  * a converter from ISO-8859-1 to UTF-8.
3489  * "crlf_newline" means newline converter from LF to CRLF.
3490  */
/*
 * Builds the array documented above by walking the converter's transcoder
 * elements in order; decorators are emitted as strings of their name.
 * NOTE(review): the declarator line and the statement of the `else` branch
 * are missing from this listing.
 */
3491 static VALUE
3493 {
3494  rb_econv_t *ec = check_econv(self);
3495  VALUE result;
3496  int i;
3497 
3498  result = rb_ary_new();
3499  for (i = 0; i < ec->num_trans; i++) {
3500  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3501  VALUE v;
3502  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3503  v = rb_str_new_cstr(tr->dst_encoding);
3504  else
3506  rb_ary_push(result, v);
3507  }
3508  return result;
3509 }
3510 
3511 /*
3512  * call-seq:
3513  * ec == other -> true or false
3514  */
/*
 * Equality documented above.  Returns nil when +other+ is not a converter
 * object, false when +other+ is uninitialized or any compared field or
 * transcoder differs, true otherwise.
 * NOTE(review): the declarator line and parts of two comparisons are
 * missing from this listing (flagged below).
 */
3515 static VALUE
3517 {
3518  rb_econv_t *ec1 = check_econv(self);
3519  rb_econv_t *ec2;
3520  int i;
3521 
3522  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3523  return Qnil;
3524  }
3525  ec2 = DATA_PTR(other);
3526  if (!ec2) return Qfalse;
 /* pointer comparison first, strcmp only when the pointers differ */
3527  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3528  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3529  return Qfalse;
 /* NOTE(review): the condition belonging to the next return is missing
  * from this listing */
3532  return Qfalse;
3533  if (ec1->flags != ec2->flags) return Qfalse;
3534  if (ec1->replacement_enc != ec2->replacement_enc &&
3535  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3536  return Qfalse;
3537  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
 /* NOTE(review): the second half of this condition is missing from this
  * listing */
3538  if (ec1->replacement_str != ec2->replacement_str &&
3540  return Qfalse;
3541 
3542  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3543  for (i = 0; i < ec1->num_trans; i++) {
3544  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3545  return Qfalse;
3546  }
3547  return Qtrue;
3548 }
3549 
3550 static VALUE
3552 {
     /* Maps a conversion result code (res) to its Ruby symbol.
      * NOTE(review): the declarator line (listing line 3551) and the case
      * labels on listing lines 3554-3558 are absent from this extract. */
3553  switch (res) {
3559  case econv_finished: return sym_finished;
3560  case econv_after_output: return sym_after_output;
     /* Codes with no case label are returned as an Integer instead. */
3561  default: return INT2NUM(res); /* should not be reached */
3562  }
3563 }
3564 
3565 /*
3566  * call-seq:
3567  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3568  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3569  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3570  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3571  *
3572  * possible opt elements:
3573  * hash form:
3574  * :partial_input => true # source buffer may be part of larger source
3575  * :after_output => true # stop conversion after output before input
3576  * integer form:
3577  * Encoding::Converter::PARTIAL_INPUT
3578  * Encoding::Converter::AFTER_OUTPUT
3579  *
3580  * possible results:
3581  * :invalid_byte_sequence
3582  * :incomplete_input
3583  * :undefined_conversion
3584  * :after_output
3585  * :destination_buffer_full
3586  * :source_buffer_empty
3587  * :finished
3588  *
3589  * primitive_convert converts source_buffer into destination_buffer.
3590  *
3591  * source_buffer should be a string or nil.
3592  * nil means an empty string.
3593  *
3594  * destination_buffer should be a string.
3595  *
3596  * destination_byteoffset should be an integer or nil.
3597  * nil means the end of destination_buffer.
3598  * If it is omitted, nil is assumed.
3599  *
3600  * destination_bytesize should be an integer or nil.
3601  * nil means unlimited.
3602  * If it is omitted, nil is assumed.
3603  *
3604  * opt should be nil, a hash or an integer.
3605  * nil means no flags.
3606  * If it is omitted, nil is assumed.
3607  *
3608  * primitive_convert converts the content of source_buffer from the beginning
3609  * and stores the result into destination_buffer.
3610  *
3611  * destination_byteoffset and destination_bytesize specify the region which
3612  * the converted result is stored.
3613  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3614  * If destination_byteoffset is nil,
3615  * destination_buffer.bytesize is used for appending the result.
3616  * destination_bytesize specifies maximum number of bytes.
3617  * If destination_bytesize is nil,
3618  * destination size is unlimited.
3619  * After conversion, destination_buffer is resized to
3620  * destination_byteoffset + actually produced number of bytes.
3621  * Also destination_buffer's encoding is set to destination_encoding.
3622  *
3623  * primitive_convert drops the converted part of source_buffer.
3624  * The dropped part is either converted into destination_buffer or
3625  * buffered in the Encoding::Converter object.
3626  *
3627  * primitive_convert stops conversion when one of the following conditions is met.
3628  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3629  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3630  * - unexpected end of source buffer (:incomplete_input)
3631  * this occurs only when :partial_input is not specified.
3632  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3633  * - character not representable in output encoding (:undefined_conversion)
3634  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3635  * - after some output is generated, before input is done (:after_output)
3636  * this occurs only when :after_output is specified.
3637  * - destination buffer is full (:destination_buffer_full)
3638  * this occurs only when destination_bytesize is non-nil.
3639  * - source buffer is empty (:source_buffer_empty)
3640  * this occurs only when :partial_input is specified.
3641  * - conversion is finished (:finished)
3642  *
3643  * example:
3644  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3645  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3646  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3647  *
3648  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3649  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3650  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3651  * ret = ec.primitive_convert(src, dst="", nil, 1)
3652  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3653  * ret = ec.primitive_convert(src, dst="", nil, 1)
3654  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3655  * ret = ec.primitive_convert(src, dst="", nil, 1)
3656  * p [ret, src, dst] #=> [:finished, "", "i"]
3657  *
3658  */
3659 static VALUE
3661 {
     /* NOTE(review): the declarator line (listing line 3660) and the body
      * of the final if (listing line 3763) are absent from this extract. */
3662  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3663  rb_econv_t *ec = check_econv(self);
3664  rb_econv_result_t res;
3665  const unsigned char *ip, *is;
3666  unsigned char *op, *os;
3667  long output_byteoffset, output_bytesize;
3668  unsigned long output_byteend;
3669  int flags;
3670 
     /* "23:" = two required and up to three optional positional
      * arguments, plus an optional trailing option hash stored in opt. */
3671  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3672 
     /* nil offset/size get placeholder values here; the real values are
      * computed further down. */
3673  if (NIL_P(output_byteoffset_v))
3674  output_byteoffset = 0; /* dummy */
3675  else
3676  output_byteoffset = NUM2LONG(output_byteoffset_v);
3677 
3678  if (NIL_P(output_bytesize_v))
3679  output_bytesize = 0; /* dummy */
3680  else
3681  output_bytesize = NUM2LONG(output_bytesize_v);
3682 
     /* Flags come either from the fifth positional argument (integer
      * form) or from the option hash; supplying both is reported as an
      * arity error. */
3683  if (!NIL_P(flags_v)) {
3684  if (!NIL_P(opt)) {
3685  rb_error_arity(argc + 1, 2, 5);
3686  }
3687  flags = NUM2INT(rb_to_int(flags_v));
3688  }
3689  else if (!NIL_P(opt)) {
3690  VALUE v;
3691  flags = 0;
3692  v = rb_hash_aref(opt, sym_partial_input);
3693  if (RTEST(v))
3694  flags |= ECONV_PARTIAL_INPUT;
3695  v = rb_hash_aref(opt, sym_after_output);
3696  if (RTEST(v))
3697  flags |= ECONV_AFTER_OUTPUT;
3698  }
3699  else {
3700  flags = 0;
3701  }
3702 
3703  StringValue(output);
3704  if (!NIL_P(input))
3705  StringValue(input);
3706  rb_str_modify(output);
3707 
     /* Unlimited size: start from RSTRING_EMBED_LEN_MAX or the input
      * length, whichever is larger; the size is doubled below whenever
      * the destination fills up. */
3708  if (NIL_P(output_bytesize_v)) {
3709  output_bytesize = RSTRING_EMBED_LEN_MAX;
3710  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3711  output_bytesize = RSTRING_LEN(input);
3712  }
3713 
3714  retry:
3715 
     /* A nil offset means "append": it is recomputed from the current
      * length of output on every pass through retry. */
3716  if (NIL_P(output_byteoffset_v))
3717  output_byteoffset = RSTRING_LEN(output);
3718 
3719  if (output_byteoffset < 0)
3720  rb_raise(rb_eArgError, "negative output_byteoffset");
3721 
3722  if (RSTRING_LEN(output) < output_byteoffset)
3723  rb_raise(rb_eArgError, "output_byteoffset too big");
3724 
3725  if (output_bytesize < 0)
3726  rb_raise(rb_eArgError, "negative output_bytesize");
3727 
     /* The end position is summed in unsigned long so that wrap-around
      * and results above LONG_MAX can both be detected. */
3728  output_byteend = (unsigned long)output_byteoffset +
3729  (unsigned long)output_bytesize;
3730 
3731  if (output_byteend < (unsigned long)output_byteoffset ||
3732  LONG_MAX < output_byteend)
3733  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3734 
3735  if (rb_str_capacity(output) < output_byteend)
3736  rb_str_resize(output, output_byteend);
3737 
     /* ip/is delimit the source bytes (both NULL for a nil input);
      * op/os delimit the writable window inside output. */
3738  if (NIL_P(input)) {
3739  ip = is = NULL;
3740  }
3741  else {
3742  ip = (const unsigned char *)RSTRING_PTR(input);
3743  is = ip + RSTRING_LEN(input);
3744  }
3745 
3746  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3747  os = op + output_bytesize;
3748 
3749  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
     /* Set output's length to the final write position and drop the
      * bytes up to the final read position from input. */
3750  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3751  if (!NIL_P(input))
3752  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3753 
     /* With an unlimited size, a full destination is not returned to the
      * caller: the window is doubled and conversion retried. Resetting
      * output_byteoffset_v to nil makes the next pass append after what
      * has already been produced. */
3754  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3755  if (LONG_MAX / 2 < output_bytesize)
3756  rb_raise(rb_eArgError, "too long conversion result");
3757  output_bytesize *= 2;
3758  output_byteoffset_v = Qnil;
3759  goto retry;
3760  }
3761 
3762  if (ec->destination_encoding) {
3764  }
3765 
3766  return econv_result_to_symbol(res);
3767 }
3768 
3769 /*
3770  * call-seq:
3771  * ec.convert(source_string) -> destination_string
3772  *
3773  * Convert source_string and return destination_string.
3774  *
3775  * source_string is assumed as a part of source.
3776  * i.e. :partial_input=>true is specified internally.
3777  * finish method should be used last.
3778  *
3779  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3780  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3781  * puts ec.finish.dump #=> ""
3782  *
3783  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3784  * puts ec.convert("\xA4").dump #=> ""
3785  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3786  * puts ec.finish.dump #=> ""
3787  *
3788  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3789  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3790  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3791  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3792  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3793  *
3794  * If a conversion error occurs,
3795  * Encoding::UndefinedConversionError or
3796  * Encoding::InvalidByteSequenceError is raised.
3797  * Encoding::Converter#convert doesn't supply methods to recover or restart
3798  * from these exceptions.
3799  * When you want to handle these conversion errors,
3800  * use Encoding::Converter#primitive_convert.
3801  *
3802  */
3803 static VALUE
3804 econv_convert(VALUE self, VALUE source_string)
3805 {
3806  VALUE ret, dst;
3807  VALUE av[5];
3808  int ac;
3809  rb_econv_t *ec = check_econv(self);
3810 
3811  StringValue(source_string);
3812 
3813  dst = rb_str_new(NULL, 0);
3814 
3815  av[0] = rb_str_dup(source_string);
3816  av[1] = dst;
3817  av[2] = Qnil;
3818  av[3] = Qnil;
3819  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3820  ac = 5;
3821 
3822  ret = econv_primitive_convert(ac, av, self);
3823 
3824  if (ret == sym_invalid_byte_sequence ||
3825  ret == sym_undefined_conversion ||
3826  ret == sym_incomplete_input) {
3827  VALUE exc = make_econv_exception(ec);
3828  rb_exc_raise(exc);
3829  }
3830 
3831  if (ret == sym_finished) {
3832  rb_raise(rb_eArgError, "converter already finished");
3833  }
3834 
3835  if (ret != sym_source_buffer_empty) {
3836  rb_bug("unexpected result of econv_primitive_convert");
3837  }
3838 
3839  return dst;
3840 }
3841 
3842 /*
3843  * call-seq:
3844  * ec.finish -> string
3845  *
3846  * Finishes the converter.
3847  * It returns the last part of the converted string.
3848  *
3849  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3850  * p ec.convert("\u3042") #=> "\e$B$\""
3851  * p ec.finish #=> "\e(B"
3852  */
3853 static VALUE
3855 {
3856  VALUE ret, dst;
3857  VALUE av[5];
3858  int ac;
3859  rb_econv_t *ec = check_econv(self);
3860 
3861  dst = rb_str_new(NULL, 0);
3862 
     /* Call primitive_convert with a nil source and flags 0, i.e. without
      * ECONV_PARTIAL_INPUT, collecting the output in dst. */
3863  av[0] = Qnil;
3864  av[1] = dst;
3865  av[2] = Qnil;
3866  av[3] = Qnil;
3867  av[4] = INT2NUM(0);
3868  ac = 5;
3869 
3870  ret = econv_primitive_convert(ac, av, self);
3871 
     /* Error results are raised as the exception built from ec. */
3872  if (ret == sym_invalid_byte_sequence ||
3873  ret == sym_undefined_conversion ||
3874  ret == sym_incomplete_input) {
3875  VALUE exc = make_econv_exception(ec);
3876  rb_exc_raise(exc);
3877  }
3878 
     /* Any non-error result other than :finished is reported via rb_bug. */
3879  if (ret != sym_finished) {
3880  rb_bug("unexpected result of econv_primitive_convert");
3881  }
3882 
3883  return dst;
3884 }
3885 
3886 /*
3887  * call-seq:
3888  * ec.primitive_errinfo -> array
3889  *
3890  * primitive_errinfo returns important information regarding the last error
3891  * as a 5-element array:
3892  *
3893  * [result, enc1, enc2, error_bytes, readagain_bytes]
3894  *
3895  * result is the last result of primitive_convert.
3896  *
3897  * Other elements are only meaningful when result is
3898  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3899  *
3900  * enc1 and enc2 indicate a conversion step as a pair of strings.
3901  * For example, a converter from EUC-JP to ISO-8859-1 converts
3902  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3903  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3904  *
3905  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3906  * error_bytes is the discarded portion.
3907  * readagain_bytes is the buffered portion which is read again on the next conversion.
3908  *
3909  * Example:
3910  *
3911  * # \xff is invalid as EUC-JP.
3912  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3913  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3914  * p ec.primitive_errinfo
3915  * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3916  *
3917  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3918  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3919  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3920  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3921  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3922  * p ec.primitive_errinfo
3923  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3924  *
3925  * # partial character is invalid
3926  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3927  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3928  * p ec.primitive_errinfo
3929  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3930  *
3931  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3932  * # partial characters.
3933  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3934  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3935  * p ec.primitive_errinfo
3936  * #=> [:source_buffer_empty, nil, nil, nil, nil]
3937  *
3938  * # \xd8\x00\x00@ is invalid as UTF-16BE because
3939  * # there is no low surrogate after the high surrogate (\xd8\x00).
3940  * # It is detected by the 3rd byte (\x00) which is part of the next character.
3941  * # So the high surrogate (\xd8\x00) is discarded and
3942  * # the 3rd byte is read again later.
3943  * # Since the byte is buffered in ec, it is dropped from src.
3944  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3945  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3946  * p ec.primitive_errinfo
3947  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3948  * p src
3949  * #=> "@"
3950  *
3951  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3952  * # The problem is detected by 4th byte.
3953  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3954  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3955  * p ec.primitive_errinfo
3956  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3957  * p src
3958  * #=> ""
3959  *
3960  */
3961 static VALUE
3963 {
     /* NOTE(review): the declarator line and most of the rb_ary_store
      * calls (listing lines 3962, 3970, 3974, 3976-3977, 3980-3981) are
      * absent from this extract; only the skeleton is visible. */
3964  rb_econv_t *ec = check_econv(self);
3965 
3966  VALUE ary;
3967 
     /* Array allocated with capacity 5. */
3968  ary = rb_ary_new2(5);
3969 
     /* Storing nil at index 4 up front makes the array five elements
      * long, whatever the branches below fill in. */
3971  rb_ary_store(ary, 4, Qnil);
3972 
3973  if (ec->last_error.source_encoding)
3975 
3978 
3979  if (ec->last_error.error_bytes_start) {
3982  }
3983 
3984  return ary;
3985 }
3986 
3987 /*
3988  * call-seq:
3989  * ec.insert_output(string) -> nil
3990  *
3991  * Inserts string into the encoding converter.
3992  * The string will be converted to the destination encoding and
3993  * output on later conversions.
3994  *
3995  * If the destination encoding is stateful,
3996  * string is converted according to the state and the state is updated.
3997  *
3998  * This method should be used only when a conversion error occurs.
3999  *
4000  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4001  * src = "HIRAGANA LETTER A is \u{3042}."
4002  * dst = ""
4003  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4004  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4005  * ec.insert_output("<err>")
4006  * p ec.primitive_convert(src, dst) #=> :finished
4007  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4008  *
4009  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4010  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4011  * dst = ""
4012  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4013  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4014  * ec.insert_output "?" # state change required to output "?".
4015  * p ec.primitive_convert(src, dst) #=> :finished
4016  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4017  *
4018  */
4019 static VALUE
4021 {
4022  const char *insert_enc;
4023 
4024  int ret;
4025 
4026  rb_econv_t *ec = check_econv(self);
4027 
4028  StringValue(string);
     /* The argument is first re-encoded into the encoding named by
      * rb_econv_encoding_to_insert_output(ec); the re-encoded bytes are
      * what gets handed to the converter. */
4029  insert_enc = rb_econv_encoding_to_insert_output(ec);
4030  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4031 
4032  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
     /* A -1 result is surfaced as ArgumentError. */
4033  if (ret == -1) {
4034  rb_raise(rb_eArgError, "too big string");
4035  }
4036 
4037  return Qnil;
4038 }
4039 
4040 /*
4041  * call-seq:
4042  * ec.putback -> string
4043  * ec.putback(max_numbytes) -> string
4044  *
4045  * Put back the bytes which will be converted.
4046  *
4047  * The bytes result from an invalid_byte_sequence error.
4048  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4049  * some bytes are buffered to be converted later.
4050  * The latter bytes can be put back.
4051  * It can be observed by
4052  * Encoding::InvalidByteSequenceError#readagain_bytes and
4053  * Encoding::Converter#primitive_errinfo.
4054  *
4055  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4056  * src = "\x00\xd8\x61\x00"
4057  * dst = ""
4058  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4059  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4060  * p ec.putback #=> "a\x00"
4061  * p ec.putback #=> "" # no more bytes to put back
4062  *
4063  */
4064 static VALUE
4066 {
     /* NOTE(review): the declarator line (listing line 4065) and the body
      * of the final if (listing line 4087) are absent from this extract. */
4067  rb_econv_t *ec = check_econv(self);
4068  int n;
4069  int putbackable;
4070  VALUE str, max;
4071 
     /* "01" = zero required, one optional argument (max). */
4072  rb_scan_args(argc, argv, "01", &max);
4073 
     /* Without max, take everything rb_econv_putbackable reports;
      * otherwise cap the requested count at that amount. */
4074  if (NIL_P(max))
4075  n = rb_econv_putbackable(ec);
4076  else {
4077  n = NUM2INT(max);
4078  putbackable = rb_econv_putbackable(ec);
4079  if (putbackable < n)
4080  n = putbackable;
4081  }
4082 
     /* Allocate an n-byte string and pass its buffer to rb_econv_putback
      * (presumably it is filled with the put-back bytes - confirm). */
4083  str = rb_str_new(NULL, n);
4084  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4085 
4086  if (ec->source_encoding) {
4088  }
4089 
4090  return str;
4091 }
4092 
4093 /*
4094  * call-seq:
4095  * ec.last_error -> exception or nil
4096  *
4097  * Returns an exception object for the last conversion.
4098  * Returns nil if the last conversion did not produce an error.
4099  *
4100  * "error" means
4101  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4102  * Encoding::Converter#convert, and the results
4103  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4104  * Encoding::Converter#primitive_convert.
4105  *
4106  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4107  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4108  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4109  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4110  * p ec.last_error #=> nil
4111  *
4112  */
4113 static VALUE
4115 {
4116  rb_econv_t *ec = check_econv(self);
4117  VALUE exc;
4118 
     /* Returns whatever make_econv_exception(ec) produces. The NIL_P
      * branch returns nil as well, so both paths yield the same value. */
4119  exc = make_econv_exception(ec);
4120  if (NIL_P(exc))
4121  return Qnil;
4122  return exc;
4123 }
4124 
4125 /*
4126  * call-seq:
4127  * ec.replacement -> string
4128  *
4129  * Returns the replacement string.
4130  *
4131  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4132  * p ec.replacement #=> "?"
4133  *
4134  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4135  * p ec.replacement #=> "\uFFFD"
4136  */
4137 static VALUE
4139 {
4140  rb_econv_t *ec = check_econv(self);
4141  int ret;
4142  rb_encoding *enc;
4143 
     /* make_replacement(ec) runs before the replacement fields are read;
      * a -1 result is raised as UndefinedConversionError.
      * NOTE(review): presumably it fills in ec->replacement_str/len/enc
      * when they are not yet set - confirm against its definition. */
4144  ret = make_replacement(ec);
4145  if (ret == -1) {
4146  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4147  }
4148 
     /* The result is a new string tagged with the encoding looked up by
      * name from ec->replacement_enc. */
4149  enc = rb_enc_find(ec->replacement_enc);
4150  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4151 }
4152 
4153 /*
4154  * call-seq:
4155  * ec.replacement = string
4156  *
4157  * Sets the replacement string.
4158  *
4159  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4160  * ec.replacement = "<undef>"
4161  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4162  */
4163 static VALUE
4165 {
4166  rb_econv_t *ec = check_econv(self);
4167  VALUE string = arg;
4168  int ret;
4169  rb_encoding *enc;
4170 
4171  StringValue(string);
4172  enc = rb_enc_get(string);
4173 
     /* Pass the string's bytes, byte length and encoding name to
      * rb_econv_set_replacement. */
4174  ret = rb_econv_set_replacement(ec,
4175  (const unsigned char *)RSTRING_PTR(string),
4176  RSTRING_LEN(string),
4177  rb_enc_name(enc));
4178 
4179  if (ret == -1) {
4180  /* xxx: rb_eInvalidByteSequenceError? */
4181  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4182  }
4183 
     /* Return the original argument, not the StringValue-checked local. */
4184  return arg;
4185 }
4186 
4187 VALUE
4189 {
4190  return make_econv_exception(ec);
4191 }
4192 
4193 void
4195 {
4196  VALUE exc;
4197 
4198  exc = make_econv_exception(ec);
4199  if (NIL_P(exc))
4200  return;
4201  rb_exc_raise(exc);
4202 }
4203 
4204 /*
4205  * call-seq:
4206  * ecerr.source_encoding_name -> string
4207  *
4208  * Returns the source encoding name as a string.
4209  */
4210 static VALUE
4212 {
     /* Reads the "source_encoding_name" attribute stored on self. */
4213  return rb_attr_get(self, rb_intern("source_encoding_name"));
4214 }
4215 
4216 /*
4217  * call-seq:
4218  * ecerr.source_encoding -> encoding
4219  *
4220  * Returns the source encoding as an encoding object.
4221  *
4222  * Note that the result may not be equal to the source encoding of
4223  * the encoding converter if the conversion has multiple steps.
4224  *
4225  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4226  * begin
4227  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4228  * rescue Encoding::UndefinedConversionError
4229  * p $!.source_encoding #=> #<Encoding:UTF-8>
4230  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4231  * p $!.source_encoding_name #=> "UTF-8"
4232  * p $!.destination_encoding_name #=> "EUC-JP"
4233  * end
4234  *
4235  */
4236 static VALUE
4238 {
     /* Reads the "source_encoding" attribute stored on self. */
4239  return rb_attr_get(self, rb_intern("source_encoding"));
4240 }
4241 
4242 /*
4243  * call-seq:
4244  * ecerr.destination_encoding_name -> string
4245  *
4246  * Returns the destination encoding name as a string.
4247  */
4248 static VALUE
4250 {
     /* Reads the "destination_encoding_name" attribute stored on self. */
4251  return rb_attr_get(self, rb_intern("destination_encoding_name"));
4252 }
4253 
4254 /*
4255  * call-seq:
4256  * ecerr.destination_encoding -> encoding
4257  *
4258  * Returns the destination encoding as an encoding object.
4259  */
4260 static VALUE
4262 {
     /* Reads the "destination_encoding" attribute stored on self. */
4263  return rb_attr_get(self, rb_intern("destination_encoding"));
4264 }
4265 
4266 /*
4267  * call-seq:
4268  * ecerr.error_char -> string
4269  *
4270  * Returns the one-character string which caused Encoding::UndefinedConversionError.
4271  *
4272  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4273  * begin
4274  * ec.convert("\xa0")
4275  * rescue Encoding::UndefinedConversionError
4276  * puts $!.error_char.dump #=> "\xC2\xA0"
4277  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4278  * end
4279  *
4280  */
4281 static VALUE
4283 {
     /* Reads the "error_char" attribute stored on self. */
4284  return rb_attr_get(self, rb_intern("error_char"));
4285 }
4286 
4287 /*
4288  * call-seq:
4289  * ecerr.error_bytes -> string
4290  *
4291  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4292  *
4293  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4294  * begin
4295  * ec.convert("abc\xA1\xFFdef")
4296  * rescue Encoding::InvalidByteSequenceError
4297  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4298  * puts $!.error_bytes.dump #=> "\xA1"
4299  * puts $!.readagain_bytes.dump #=> "\xFF"
4300  * end
4301  */
4302 static VALUE
4304 {
     /* Reads the "error_bytes" attribute stored on self. */
4305  return rb_attr_get(self, rb_intern("error_bytes"));
4306 }
4307 
4308 /*
4309  * call-seq:
4310  * ecerr.readagain_bytes -> string
4311  *
4312  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4313  */
4314 static VALUE
4316 {
     /* Reads the "readagain_bytes" attribute stored on self. */
4317  return rb_attr_get(self, rb_intern("readagain_bytes"));
4318 }
4319 
4320 /*
4321  * call-seq:
4322  * ecerr.incomplete_input? -> true or false
4323  *
4324  * Returns true if the invalid byte sequence error is caused by
4325  * premature end of string.
4326  *
4327  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4328  *
4329  * begin
4330  * ec.convert("abc\xA1z")
4331  * rescue Encoding::InvalidByteSequenceError
4332  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4333  * p $!.incomplete_input? #=> false
4334  * end
4335  *
4336  * begin
4337  * ec.convert("abc\xA1")
4338  * ec.finish
4339  * rescue Encoding::InvalidByteSequenceError
4340  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4341  * p $!.incomplete_input? #=> true
4342  * end
4343  */
4344 static VALUE
4346 {
     /* Reads the "incomplete_input" attribute stored on self. */
4347  return rb_attr_get(self, rb_intern("incomplete_input"));
4348 }
4349 
4350 /*
4351  * Document-class: Encoding::UndefinedConversionError
4352  *
4353  * Raised by Encoding and String methods when a transcoding operation
4354  * fails.
4355  */
4356 
4357 /*
4358  * Document-class: Encoding::InvalidByteSequenceError
4359  *
4360  * Raised by Encoding and String methods when the string being
4361  * transcoded contains a byte invalid for either the source or
4362  * target encoding.
4363  */
4364 
4365 /*
4366  * Document-class: Encoding::ConverterNotFoundError
4367  *
4368  * Raised by transcoding methods when a named encoding does not
4369  * correspond with a known converter.
4370  */
4371 
4372 void
4374 {
     /* NOTE(review): the declarator line and most registration statements
      * (class, method and constant definitions, including the statement
      * that should follow each Document-const comment below) are absent
      * from this extract. */
4378 
4379  transcoder_table = st_init_strcasetable();
4380 
     /* Intern the file-level sym_* symbols once, at initialisation. */
4381  sym_invalid = ID2SYM(rb_intern("invalid"));
4382  sym_undef = ID2SYM(rb_intern("undef"));
4383  sym_replace = ID2SYM(rb_intern("replace"));
4384  sym_fallback = ID2SYM(rb_intern("fallback"));
4385  sym_aref = ID2SYM(rb_intern("[]"));
4386  sym_xml = ID2SYM(rb_intern("xml"));
4387  sym_text = ID2SYM(rb_intern("text"));
4388  sym_attr = ID2SYM(rb_intern("attr"));
4389 
     /* This group includes the result symbols compared against by the
      * converter methods and the :partial_input / :after_output keys
      * looked up in primitive_convert's option hash. */
4390  sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4391  sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4392  sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4393  sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4394  sym_finished = ID2SYM(rb_intern("finished"));
4395  sym_after_output = ID2SYM(rb_intern("after_output"));
4396  sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4397  sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4398  sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4399  sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4400  sym_partial_input = ID2SYM(rb_intern("partial_input"));
4401 
4402 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4403  sym_newline = ID2SYM(rb_intern("newline"));
4404  sym_universal = ID2SYM(rb_intern("universal"));
4405  sym_crlf = ID2SYM(rb_intern("crlf"));
4406  sym_cr = ID2SYM(rb_intern("cr"));
4407  sym_lf = ID2SYM(rb_intern("lf"));
4408 #endif
4409 
     /* argc of -1 registers both String methods as variadic. */
4410  rb_define_method(rb_cString, "encode", str_encode, -1);
4411  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4412 
4432 
4433  /* Document-const: INVALID_MASK
4434  *
4435  * Mask for invalid byte sequences
4436  */
4438 
4439  /* Document-const: INVALID_REPLACE
4440  *
4441  * Replace invalid byte sequences
4442  */
4444 
4445  /* Document-const: UNDEF_MASK
4446  *
4447  * Mask for a valid character in the source encoding but no related
4448  * character(s) in destination encoding.
4449  */
4451 
4452  /* Document-const: UNDEF_REPLACE
4453  *
4454  * Replace byte sequences that are undefined in the destination encoding.
4455  */
4457 
4458  /* Document-const: UNDEF_HEX_CHARREF
4459  *
4460  * Replace byte sequences that are undefined in the destination encoding
4461  * with an XML hexadecimal character reference. This is valid for XML
4462  * conversion.
4463  */
4465 
4466  /* Document-const: PARTIAL_INPUT
4467  *
4468  * Indicates the source may be part of a larger string. See
4469  * primitive_convert for an example.
4470  */
4472 
4473  /* Document-const: AFTER_OUTPUT
4474  *
4475  * Stop converting after some output is complete but before all of the
4476  * input was consumed. See primitive_convert for an example.
4477  */
4479 
4480  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4481  *
4482  * Decorator for converting CRLF and CR to LF
4483  */
4485 
4486  /* Document-const: CRLF_NEWLINE_DECORATOR
4487  *
4488  * Decorator for converting LF to CRLF
4489  */
4491 
4492  /* Document-const: CR_NEWLINE_DECORATOR
4493  *
4494  * Decorator for converting LF to CR
4495  */
4497 
4498  /* Document-const: XML_TEXT_DECORATOR
4499  *
4500  * Escape as XML CharData
4501  */
4503 
4504  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4505  *
4506  * Escape as XML AttValue
4507  */
4509 
4510  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4511  *
4512  * Escape as XML AttValue
4513  */
4515 
4521 
4529 
4530  Init_newline();
4531 }
RUBY_EXTERN VALUE rb_cString
Definition: ruby.h:1456
#define BL_ACTION(byte)
#define FOURbt
static VALUE sym_replace
Definition: transcode.c:27
const char * ascii_incompat_name
Definition: transcode.c:1765
unsigned char ary[8]
Definition: transcode.c:67
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2564
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:324
#define T_SYMBOL
Definition: ruby.h:502
Definition: string.c:5075
#define FUNio
search_path_queue_t * queue
Definition: transcode.c:250
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:690
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4194
VALUE next_info
Definition: transcode.c:60
RUBY_EXTERN VALUE rb_cData
Definition: ruby.h:1433
static VALUE econv_destination_encoding(VALUE self)
Definition: transcode.c:3461
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:138
static VALUE sym_undefined_conversion
Definition: transcode.c:38
#define NOMAP
VALUE rb_eConverterNotFoundError
Definition: transcode.c:23
VALUE rb_ary_new4(long n, const VALUE *elts)
Definition: array.c:451
rb_econv_result_t
Definition: encoding.h:242
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1088
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:139
unsigned char * in_buf_end
Definition: transcode.c:126
const unsigned char * error_bytes_start
Definition: transcode.c:139
#define RARRAY_LEN(a)
Definition: ruby.h:899
union rb_transcoding::@120 writebuf
void rb_bug(const char *fmt,...)
Definition: error.c:295
rb_econv_result_t last_result
Definition: transcode.c:108
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:155
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4188
const char * dst_encoding
rb_econv_result_t result
Definition: transcode.c:135
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:232
static VALUE sym_invalid_byte_sequence
Definition: transcode.c:37
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1178
struct search_path_queue_tag search_path_queue_t
#define DECORATOR_P(sname, dname)
Definition: transcode.c:154
int i
Definition: win32ole.c:784
Definition: st.h:77
#define GB4bt
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1446
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2028
Definition: st.h:108
VALUE rb_cEncoding
Definition: encoding.c:40
#define NUM2INT(x)
Definition: ruby.h:622
static int max(int a, int b)
Definition: strftime.c:141
#define ZERObt
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1497
static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2258
VALUE rb_eInvalidByteSequenceError
Definition: transcode.c:22
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:325
static void econv_args(int argc, VALUE *argv, volatile VALUE *snamev_p, volatile VALUE *dnamev_p, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p, int *ecflags_p, VALUE *ecopts_p)
Definition: transcode.c:2978
#define getGB4bt1(a)
#define FL_TAINT
Definition: ruby.h:1115
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1934
ssize_t writebuf_len
Definition: transcode.c:72
static void rb_transcoding_close(rb_transcoding *tc)
Definition: transcode.c:822
rb_encoding * source_encoding
Definition: transcode.c:146
static VALUE sym_newline
Definition: transcode.c:33
#define Qtrue
Definition: ruby.h:434
unsigned char * out_data_start
Definition: transcode.c:105
static int decorate_convpath(VALUE convpath, int ecflags)
Definition: transcode.c:3038
static int enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
Definition: transcode.c:2605
static VALUE sym_crlf_newline
Definition: transcode.c:30
void Init_newline(void)
Definition: newline.c:183
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:1016
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1026
static size_t rb_transcoding_memsize(rb_transcoding *tc)
Definition: transcode.c:838
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:63
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:1030
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2519
unsigned char * in_data_start
Definition: transcode.c:124
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:307
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1917
static int str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, const char **sname_p, rb_encoding **senc_p, const char **dname_p, rb_encoding **denc_p)
Definition: transcode.c:2629
VALUE rb_method_call(int, VALUE *, VALUE)
Definition: proc.c:1504
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:194
#define getBT3(a)
rb_encoding * destination_encoding
Definition: transcode.c:147
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:328
struct rb_transcoding * tc
Definition: transcode.c:103
#define SUSPEND(ret, num)
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:103
static VALUE sym_cr_newline
Definition: transcode.c:31
VALUE rb_eTypeError
Definition: error.c:516
static int str_transcode(int argc, VALUE *argv, VALUE *self)
Definition: transcode.c:2731
static VALUE sym_aref
Definition: transcode.c:27
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1813
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:822
VALUE rb_eEncodingError
Definition: error.c:522
void st_free_table(st_table *)
Definition: st.c:334
static VALUE econv_last_error(VALUE self)
Definition: transcode.c:4114
#define SYM2ID(x)
Definition: ruby.h:364
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:904
#define UNDEF
struct rb_transcoding * error_tc
Definition: transcode.c:136
static rb_econv_t * rb_econv_alloc(int n_hint)
Definition: transcode.c:856
void rb_str_set_len(VALUE, long)
Definition: string.c:1838
int rb_enc_str_coderange(VALUE)
Definition: string.c:327
static rb_econv_t * rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
Definition: transcode.c:933
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:534
VALUE rb_to_int(VALUE)
Definition: object.c:2482
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:1788
ssize_t(* func_sio)(void *, const unsigned char *, size_t, VALUE, unsigned char *, size_t)
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:766
unsigned int conv_tree_start
static void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3165
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define T_HASH
Definition: ruby.h:493
const char * lib
Definition: transcode.c:159
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2867
#define THREEbt
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2570
#define STR1
#define DATA_PTR(dta)
Definition: ruby.h:985
const rb_transcoder * transcoder
Definition: transcode.c:160
#define next_info
static int output_replacement_character(rb_econv_t *ec)
Definition: transcode.c:2222
struct rb_econv_t::@121 last_error
#define T_ARRAY
Definition: ruby.h:492
VALUE(* func_ii)(void *, VALUE)
const char * dname
Definition: transcode.c:158
static rb_econv_result_t rb_trans_conv(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int *result_position_ptr)
Definition: transcode.c:1175
int(* state_init_func)(void *)
void callback(ffi_cif *cif, void *resp, void **args, void *ctx)
Definition: closure.c:53
ssize_t(* func_so)(void *, const unsigned char *, size_t, unsigned char *, size_t)
static rb_econv_result_t transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:757
static VALUE econv_finish(VALUE self)
Definition: transcode.c:3854
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1166
static VALUE econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
Definition: transcode.c:1970
static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:166
static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry)
Definition: transcode.c:362
VALUE rb_str_tmp_new(long)
VALUE(* func_si)(void *, const unsigned char *, size_t)
static int transcode_search_path(const char *sname, const char *dname, void(*callback)(const char *sname, const char *dname, int depth, void *arg), void *arg)
Definition: transcode.c:277
unsigned char * in_buf_start
Definition: transcode.c:123
static rb_econv_t * rb_econv_open0(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:976
static void econv_free(void *ptr)
Definition: transcode.c:2892
const char * enc
Definition: transcode.c:245
static VALUE sym_source_buffer_empty
Definition: transcode.c:40
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1753
#define FUNsio
#define ENC_CODERANGE_7BIT
Definition: encoding.h:58
size_t error_bytes_len
Definition: transcode.c:140
const char * rb_obj_classname(VALUE)
Definition: variable.c:396
#define getGB4bt2(a)
static VALUE sym_crlf
Definition: transcode.c:33
ssize_t(* finish_func)(void *, unsigned char *, size_t)
static VALUE econv_convert(VALUE self, VALUE source_string)
Definition: transcode.c:3804
static VALUE sym_partial_input
Definition: transcode.c:35
static const char transcoder_lib_prefix[]
Definition: transcode.c:230
static rb_econv_t * rb_econv_init_by_convpath(VALUE self, VALUE convpath, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p)
Definition: transcode.c:3180
Win32OLEIDispatch * p
Definition: win32ole.c:786
void rb_exc_raise(VALUE mesg)
Definition: eval.c:527
static unsigned char * output
Definition: nkf.c:32
static const char * get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
Definition: transcode.c:393
static VALUE str_encode_associate(VALUE str, int encidx)
Definition: transcode.c:2745
st_table * st_init_strcasetable(void)
Definition: st.c:296
#define FUNii
st_table * visited
Definition: transcode.c:249
#define RB_TYPE_P(obj, type)
Definition: ruby.h:1537
static VALUE ecerr_incomplete_input(VALUE self)
Definition: transcode.c:4345
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3150
#define fail()
#define FL_UNTRUSTED
Definition: ruby.h:1116
int st_lookup(st_table *, st_data_t, st_data_t *)
static unsigned char * str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
Definition: transcode.c:2413
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:146
ssize_t readagain_len
Definition: transcode.c:65
static VALUE econv_primitive_errinfo(VALUE self)
Definition: transcode.c:3962
unsigned int output_index
Definition: transcode.c:62
unsigned int input
Definition: nkf.c:4311
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:84
static size_t econv_memsize(const void *ptr)
Definition: transcode.c:2899
#define ALLOC_N(type, n)
Definition: ruby.h:1223
void Init_transcode(void)
Definition: transcode.c:4373
unsigned char * in_data_end
Definition: transcode.c:125
static VALUE str_encode_bang(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2777
Definition: transcode.c:156
static VALUE str_encode(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2859
int num_finished
Definition: transcode.c:130
const char * destination_encoding
Definition: transcode.c:138
static int rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
Definition: transcode.c:1894
#define val
int resume_position
Definition: transcode.c:58
#define ECONV_INVALID_MASK
Definition: encoding.h:309
VALUE rb_eRuntimeError
Definition: error.c:515
#define RSTRING_END(str)
Definition: ruby.h:870
struct rb_econv_t rb_econv_t
Definition: encoding.h:252
#define SUSPEND_AFTER_OUTPUT(num)
#define getGB4bt3(a)
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:478
VALUE rb_str_cat2(VALUE, const char *)
Definition: string.c:1986
#define ECONV_INVALID_REPLACE
Definition: encoding.h:310
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1702
VALUE rb_ary_new(void)
Definition: array.c:424
int(* state_fini_func)(void *)
#define dp(v)
Definition: vm_debug.h:23
static VALUE econv_get_replacement(VALUE self)
Definition: transcode.c:4138
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:339
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:340
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:446
void st_add_direct(st_table *, st_data_t, st_data_t)
Definition: st.c:629
static void more_output_buffer(VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), int max_output, unsigned char **out_start_ptr, unsigned char **out_pos, unsigned char **out_stop_ptr)
Definition: transcode.c:2140
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2204
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:719
static VALUE sym_attr
Definition: transcode.c:28
static VALUE econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
Definition: transcode.c:3121
#define OBJ_FROZEN(x)
Definition: ruby.h:1163
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1857
static st_table * transcoder_table
Definition: transcode.c:163
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1587
const char * sname
Definition: transcode.c:157
int argc
Definition: ruby.c:130
#define Qfalse
Definition: ruby.h:433
static VALUE make_econv_exception(rb_econv_t *ec)
Definition: transcode.c:2039
VALUE rb_cEncodingConverter
Definition: transcode.c:25
VALUE rb_require_safe(VALUE, int)
Definition: load.c:934
static const rb_data_type_t econv_data_type
Definition: transcode.c:2904
#define ALLOCA_N(type, n)
Definition: ruby.h:1227
static VALUE econv_set_replacement(VALUE self, VALUE arg)
Definition: transcode.c:4164
#define TRANSCODING_STATE(tc)
Definition: transcode.c:97
#define LONG_MAX
Definition: ruby.h:201
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1242
ssize_t(* func_io)(void *, VALUE, const unsigned char *, size_t)
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:60
static VALUE sym_fallback
Definition: transcode.c:27
char ary[sizeof(double) > sizeof(void *)?sizeof(double):sizeof(void *)]
Definition: transcode.c:80
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:748
int err
Definition: win32.c:87
#define OBJ_FREEZE(x)
Definition: ruby.h:1164
static VALUE method_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2246
rb_transcoder_asciicompat_type_t asciicompat_type
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:233
#define PRIdPTRDIFF
Definition: ruby.h:171
static VALUE econv_equal(VALUE self, VALUE other)
Definition: transcode.c:3516
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1900
#define ENC_CODERANGE_VALID
Definition: encoding.h:59
#define ECONV_UNDEF_MASK
Definition: encoding.h:312
#define ALLOC(type)
Definition: ruby.h:1224
#define SUSPEND_OBUF(num)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:1854
static int str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
Definition: transcode.c:2656
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:205
size_t rb_str_capacity(VALUE)
Definition: string.c:360
unsigned char * out_buf_start
Definition: transcode.c:104
static int transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:256
#define getGB4bt0(a)
static VALUE econv_putback(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:4065
ssize_t recognized_len
Definition: transcode.c:64
static VALUE sym_xml
Definition: transcode.c:28
int num_trans
Definition: transcode.c:129
#define FUNso
static void search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3077
#define RSTRING_LEN(str)
Definition: ruby.h:862
static rb_econv_t * check_econv(VALUE self)
Definition: transcode.c:3428
int num_additional
Definition: transcode.c:961
#define REALLOC_N(var, type, n)
Definition: ruby.h:1225
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:91
static VALUE econv_s_allocate(VALUE klass)
Definition: transcode.c:2910
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:251
VALUE rb_sprintf(const char *format,...)
Definition: sprintf.c:1275
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:898
static VALUE econv_insert_output(VALUE self, VALUE string)
Definition: transcode.c:4020
static VALUE ecerr_destination_encoding(VALUE self)
Definition: transcode.c:4261
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1742
#define rb_enc_name(enc)
Definition: encoding.h:124
#define RSTRING_EMBED_LEN_MAX
Definition: ruby.h:841
unsigned char * out_buf_end
Definition: transcode.c:107
static int decorator_names(int ecflags, const char **decorators_ret)
Definition: transcode.c:1029
unsigned char next_byte
Definition: transcode.c:61
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2190
struct rb_transcoding * last_tc
Definition: transcode.c:131
#define MEMMOVE(p1, p2, type, n)
Definition: ruby.h:1243
#define STR1_BYTEINDEX(w)
VALUE rb_hash_new(void)
Definition: hash.c:234
static VALUE aref_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2252
static VALUE make_encobj(const char *name)
Definition: transcode.c:2936
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1570
const char * base_enc
Definition: transcode.c:252
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1128
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:461
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4308
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:545
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:322
const char * source_encoding
Definition: transcode.c:137
#define Qnil
Definition: ruby.h:435
static VALUE sym_lf
Definition: transcode.c:33
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:400
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1863
static VALUE econv_init(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3354
unsigned long VALUE
Definition: ruby.h:104
static VALUE result
Definition: nkf.c:40
#define RBASIC(obj)
Definition: ruby.h:1094
static VALUE sym_universal_newline
Definition: transcode.c:29
union rb_transcoding::rb_transcoding_state_t state
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:317
const char * src_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:870
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:314
#define getBT1(a)
static void trans_open_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:965
#define rb_enc_asciicompat(enc)
Definition: encoding.h:184
static VALUE sym_universal
Definition: transcode.c:33
VALUE rb_str_new_cstr(const char *)
Definition: string.c:447
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
static VALUE ecerr_error_char(VALUE self)
Definition: transcode.c:4282
VALUE rb_str_dump(VALUE)
Definition: string.c:4634
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:571
const char * ascii_compat_name
Definition: transcode.c:1764
unsigned char * ptr
Definition: transcode.c:68
static rb_encoding * make_encoding(const char *name)
Definition: transcode.c:2926
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:323
#define RARRAY_LENINT(ary)
Definition: ruby.h:908
VALUE rb_str_dup(VALUE)
Definition: string.c:946
static VALUE econv_source_encoding(VALUE self)
Definition: transcode.c:3446
static VALUE proc_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2240
static VALUE sym_cr
Definition: transcode.c:33
static VALUE sym_finished
Definition: transcode.c:41
VALUE rb_funcall3(VALUE, ID, int, const VALUE *)
Calls a method.
Definition: vm_eval.c:819
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:36
#define FUNsi
void xfree(void *)
#define FL_UNSET(x, f)
Definition: ruby.h:1150
#define INVALID
#define BL_MIN_BYTE
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1598
#define StringValueCStr(v)
Definition: ruby.h:548
static int make_replacement(rb_econv_t *ec)
Definition: transcode.c:2156
#define RSTRING_PTR(str)
Definition: ruby.h:866
static rb_econv_result_t transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:432
#define ONEbt
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:313
void rb_str_modify(VALUE)
Definition: string.c:1369
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:772
static VALUE sym_after_output
Definition: transcode.c:42
int size
Definition: encoding.c:52
static VALUE econv_inspect(VALUE self)
Definition: transcode.c:3408
#define f
#define rb_check_arity(argc, min, max)
Definition: intern.h:277
#define INT2FIX(i)
Definition: ruby.h:241
static rb_transcoding * rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
Definition: transcode.c:783
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:488
VALUE rb_exc_new3(VALUE etype, VALUE str)
Definition: error.c:553
unsigned char * out_data_end
Definition: transcode.c:106
static rb_econv_result_t rb_econv_convert0(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1266
#define xmalloc
Definition: defines.h:64
#define SIZE_MAX
Definition: ruby.h:282
static int asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:1769
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1720
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1067
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:88
static const unsigned char * transcode_char_start(rb_transcoding *tc, const unsigned char *in_start, const unsigned char *inchar_start, const unsigned char *in_p, size_t *char_len_ptr)
Definition: transcode.c:412
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:557
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:570
void rb_error_arity(int argc, int min, int max)
#define RARRAY_PTR(a)
Definition: ruby.h:904
static VALUE ecerr_error_bytes(VALUE self)
Definition: transcode.c:4303
static rb_econv_result_t rb_transcoding_convert(rb_transcoding *tc, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:810
VALUE rb_str_catf(VALUE str, const char *format,...)
Definition: sprintf.c:1315
uint8_t key[16]
Definition: random.c:1370
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:857
#define RTEST(v)
Definition: ruby.h:445
static void declare_transcoder(const char *sname, const char *dname, const char *lib)
Definition: transcode.c:222
unsigned int next_table
Definition: transcode.c:59
size_t readagain_len
Definition: transcode.c:141
static int rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
Definition: transcode.c:1875
static VALUE sym_invalid
Definition: transcode.c:27
static int rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
Definition: transcode.c:894
v
Definition: win32ole.c:798
#define getBT2(a)
static VALUE econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
Definition: transcode.c:2960
int num_allocated
Definition: transcode.c:128
#define BYTE_ADDR(index)
const char * destination_encoding_name
Definition: transcode.c:114
static VALUE econv_convpath(VALUE self)
Definition: transcode.c:3492
static int trans_sweep(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int start)
Definition: transcode.c:1094
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1380
VALUE rb_ary_new2(long capa)
Definition: array.c:417
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1786
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:439
#define rb_safe_level()
Definition: tcltklib.c:94
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1503
static VALUE ecerr_source_encoding(VALUE self)
Definition: transcode.c:4237
static int output_hex_charref(rb_econv_t *ec)
Definition: transcode.c:1390
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1869
#define hash_fallback
Definition: transcode.c:2237
static VALUE ecerr_readagain_bytes(VALUE self)
Definition: transcode.c:4315
const char * name
Definition: nkf.c:208
#define xrealloc
Definition: defines.h:67
#define ID2SYM(x)
Definition: ruby.h:363
VALUE rb_eUndefinedConversionError
Definition: transcode.c:21
const char * rb_id2name(ID id)
Definition: ripper.c:17012
unsigned long st_data_t
Definition: st.h:35
int started
Definition: transcode.c:116
rb_econv_elem_t * elems
Definition: transcode.c:127
static VALUE sym_text
Definition: transcode.c:28
const char * replacement_enc
Definition: transcode.c:120
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:713
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:3351
const char * source_encoding_name
Definition: transcode.c:113
size_t replacement_len
Definition: transcode.c:119
int replacement_allocated
Definition: transcode.c:121
static VALUE sym_undef
Definition: transcode.c:27
#define BL_MAX_BYTE
struct search_path_queue_tag * next
Definition: transcode.c:244
int rb_enc_find_index(const char *name)
Definition: encoding.c:635
static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx)
Definition: transcode.c:2877
static int econv_opts(VALUE opt, int ecflags)
Definition: transcode.c:2420
#define rb_check_frozen(obj)
Definition: intern.h:258
static VALUE sym_destination_buffer_full
Definition: transcode.c:39
#define getBT0(a)
static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, unsigned char *caller_dst_buf, size_t caller_dst_bufsize, size_t *dst_len_ptr)
Definition: transcode.c:1519
const rb_transcoder * transcoder
Definition: transcode.c:54
static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:189
#define rb_intern(str)
ssize_t writebuf_off
Definition: transcode.c:71
VALUE rb_str_buf_new(long)
Definition: string.c:777
#define SYMBOL_P(x)
Definition: ruby.h:362
#define TWObt
#define NULL
Definition: _sdbm.c:102
struct rb_transcoding rb_transcoding
#define Qundef
Definition: ruby.h:436
VALUE rb_hash_aset(VALUE, VALUE, VALUE)
union rb_transcoding::@119 readbuf
st_index_t num_entries
Definition: st.h:93
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1344
int st_foreach(st_table *, int(*)(ANYARGS), st_data_t)
Definition: st.c:1006
const unsigned char * replacement_str
Definition: transcode.c:118
VALUE rb_str_new2(const char *)
#define bp()
Definition: vm_debug.h:27
#define STR1_LENGTH(byte_addr)
#define encoding_equal(enc1, enc2)
Definition: transcode.c:241
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:92
static rb_encoding * make_dummy_encoding(const char *name)
Definition: transcode.c:2916
VALUE rb_eArgError
Definition: error.c:517
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:321
#define writebuf_off
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:659
#define NUM2LONG(x)
Definition: ruby.h:592
transcoder_entry_t ** entries
Definition: transcode.c:960
static VALUE econv_result_to_symbol(rb_econv_result_t res)
Definition: transcode.c:3551
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1122
char ** argv
Definition: ruby.c:131
#define StringValue(v)
Definition: ruby.h:546
static VALUE ecerr_source_encoding_name(VALUE self)
Definition: transcode.c:4211
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:548
static VALUE econv_primitive_convert(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3660
VALUE rb_obj_class(VALUE)
Definition: object.c:194
VALUE rb_str_new(const char *, long)
Definition: string.c:425
static VALUE ecerr_destination_encoding_name(VALUE self)
Definition: transcode.c:4249
static VALUE sym_incomplete_input
Definition: transcode.c:43