/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes following definitions.
- sjis2eucj
- eucj2sjis
- utf2euc_emit_euc
- utf2euc_2
- utf2euc_3
- utf2euc_4
- utf2eucj
- jconv_ucs4_to_utf8
- eucj2utf_emit_utf
- eucj2utf
- jis_esc
- jis2eucj
- jis_ensure_state
- eucj2jis
- jis_reset
- pivot
- conv_name_match
- conv_name_find
- jconv_ident
- jconv_1tier
- jconv_2tier
- jconv_iconv
- jconv_iconv_reset
- jconv_open
- jconv_close
- jconv
- jconv_reset
1 /*
2 * jconv.c - alternative japanese code conversion routines
3 *
4 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: jconv.c,v 1.19 2004/09/15 00:37:12 shirok Exp $
34 */
35
36 /* Some iconv() implementations don't support japanese character encodings,
37 * or have problems handling them. This code provides an alternative way
38 * to convert these encodings.
39 */
40
41 /* This file handles conversion among UTF8, Shift-JIS, EUC_JP, and ISO2022JP.
42 * Shift-JIS and EUC_JP are based on JIS X 0213:2000. ISO2022JP partially
43 * handles ISO2022-JP-3 as well.
44 *
45 * EUC_JP is used as a 'pivot' encoding, for it can naturally handle
46 * JISX 0201, JISX 0208, JISX 0212 and JISx 0213 characters.
47 */
48
49 #include <ctype.h>
50 #include "charconv.h"
51
/* Leave the enclosing function with INPUT_NOT_ENOUGH unless at least
 * (n) input bytes are available.  Relies on a variable named `inroom'
 * being in scope at the point of use. */
#define INCHK(n)                                \
    do {                                        \
        if (inroom < (n)) {                     \
            return INPUT_NOT_ENOUGH;            \
        }                                       \
    } while (0)

/* Leave the enclosing function with OUTPUT_NOT_ENOUGH unless at least
 * (n) output bytes are available.  Relies on a variable named `outroom'
 * being in scope at the point of use. */
#define OUTCHK(n)                               \
    do {                                        \
        if (outroom < (n)) {                    \
            return OUTPUT_NOT_ENOUGH;           \
        }                                       \
    } while (0)

/* True if (n) is one of the three converter error codes.
 * Note that the argument is evaluated more than once. */
#define ERRP(n)                                 \
    ((n) == INPUT_NOT_ENOUGH                    \
     || (n) == OUTPUT_NOT_ENOUGH                \
     || (n) == ILLEGAL_SEQUENCE)
56
57 /* Substitution characters.
58 * Unrecognized 1-byte character is substituted by SUBST1_CHAR.
59 * It's common to all encodings.
60 * Unrecognized or unconvertible multibyte character is substituted
61 * by so-called 'Geta-sign'.
62 */
/* One-byte substitution character, shared by every encoding. */
#define SUBST1_CHAR '?'

/* Two-byte substitution character (geta mark) in each encoding.
 * The 7-bit JIS form is the EUC-JP form with the high bits cleared,
 * i.e. 0x22 0x2e; the former values 0x02 0x0e were control bytes. */
#define EUCJ_SUBST2_CHAR1 0xa2
#define EUCJ_SUBST2_CHAR2 0xae
#define JIS_SUBST2_CHAR1 0x22
#define JIS_SUBST2_CHAR2 0x2e
#define SJIS_SUBST2_CHAR1 0x81
#define SJIS_SUBST2_CHAR2 0xac

/* Geta mark as a three-byte UTF-8 sequence (U+3013). */
#define UTF8_SUBST2_CHAR1 0xe3
#define UTF8_SUBST2_CHAR2 0x80
#define UTF8_SUBST2_CHAR3 0x93
73
/* Store the two-byte EUC-JP substitution character at outptr and set
 * *outchars to 2.  Returns OUTPUT_NOT_ENOUGH from the enclosing function
 * (through OUTCHK) when fewer than two output bytes are available.
 * Expects `outptr', `outroom' and `outchars' in scope. */
#define EUCJ_SUBST                              \
    do {                                        \
        OUTCHK(2);                              \
        outptr[0] = EUCJ_SUBST2_CHAR1;          \
        outptr[1] = EUCJ_SUBST2_CHAR2;          \
        *outchars = 2;                          \
    } while (0)

/* Same as EUCJ_SUBST, but stores the Shift JIS substitution character. */
#define SJIS_SUBST                              \
    do {                                        \
        OUTCHK(2);                              \
        outptr[0] = SJIS_SUBST2_CHAR1;          \
        outptr[1] = SJIS_SUBST2_CHAR2;          \
        *outchars = 2;                          \
    } while (0)
85
/* Store the three-byte UTF-8 substitution character at outptr and set
 * *outchars to 3.  Returns OUTPUT_NOT_ENOUGH from the enclosing function
 * (through OUTCHK) when fewer than three output bytes are available.
 * Expects `outptr', `outroom' and `outchars' in scope.
 * The last byte must be UTF8_SUBST2_CHAR3; repeating CHAR2 would emit
 * E3 80 80 instead of the intended E3 80 93. */
#define UTF8_SUBST                              \
    do {                                        \
        OUTCHK(3);                              \
        outptr[0] = UTF8_SUBST2_CHAR1;          \
        outptr[1] = UTF8_SUBST2_CHAR2;          \
        outptr[2] = UTF8_SUBST2_CHAR3;          \
        *outchars = 3;                          \
    } while (0)
92
93 /*=================================================================
94 * Shift JIS
95 */
96
97 /* Shift_JISX0213 -> EUC-JP
98 *
99 * Mapping anomalies
100 *
101 * 0x5c, 0x7e : Shift_JISX0213 mapping table maps 0x5c to U+00A5
102 * (YEN SIGN) and 0x7e to U+203E (OVERLINE). But mapping so
103 * breaks the program code written in Shift JIS. I map them
104 * to the corresponding ASCII chars.
105 * 0xfd, 0xfe, 0xff : These are reserved bytes. Apple uses these
106 * bytes for vendor extension:
107 * 0xfd - U+00A9 COPYRIGHT SIGN |EUC A9A6 |JISX0213
108 * 0xfe - U+2122 TRADE MARK SIGN |EUC 8FA2EF|JISX0212
109 * 0xff - U+2026 HORIZONTAL ELLIPSIS|EUC A1C4 |JISX0208
110 * This is a one-direction mapping.
111 * 0x80, 0xa0 : These are reserved bytes. Replaced to the
112 * one-byte substitution character of destination encoding.
113 *
114 * Conversion scheme
115 * 0x00-0x7f : corresponding ASCII range.
116 * 0x80 : substitution character
117 * 0x81 -- 0x9f : first byte (s1) of double byte range for JIS X 0213 m=1
118 * 0xa0 : substitution character
119 * 0xa1 -- 0xdf : JISX 0201 kana = s1-0x80
120 * 0xe0 -- 0xef : first byte (s1) of double byte range for JIS X 0213 m=1
121 * 0xf0 -- 0xfc : first byte (s1) of double byte range for JIS X 0213 m=2
122 * 0xfd : U+00A9, EUC A9A6, JISX0213 (1, 0x09, 0x06)
123 * 0xfe : U+2122, EUC 8FA2EF, JISX0212
124 * 0xff : U+2026, EUC A1C4, JISX0208 (1, 0x01, 0x24)
125 *
126 * For double-byte character, second byte s2 must be in the range of
127 * 0x40 <= s2 <= 0x7e or 0x80 <= s2 <= 0xfc. Otherwise, double-byte
128 * substitution character is used.
129 *
130 * two bytes (s1, s2) maps to JIS X 0213 (m, k, t) by
131 * m = 1 if s1 <= 0xef, 2 otherwise
132 * k = (s1-0x80)*2 - ((s2 < 0x9f)? 1 : 0) if s1 <= 0x9f
133 * (s1-0xc0)*2 - ((s2 < 0x9f)? 1 : 0) if 0xe0 <= s1 <= 0xef
134 * (s1-0xcd)*2 - ((s2 < 0x9f)? 1 : 0) if s1 >= 0xf5
135 * otherwise, use the following table
136 * s1 k (s2<0x9f, s2>=0x9f)
137 * 0xf0 (0x01, 0x08)
138 * 0xf1 (0x03, 0x04)
139 * 0xf2 (0x05, 0x0c)
140 * 0xf3 (0x0d, 0x0e)
141 * 0xf4 (0x0f, 0x4e)
142 * t = s2-0x3f if s2 < 0x7f
143 * s2-0x40 if s2 < 0x9f
144 * s2-0x9e otherwise
145 *
146 * JIS X 0213 to EUC-JP is a straightforward conversion.
147 */
148
149 static size_t sjis2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
150 char *outptr, size_t outroom, size_t *outchars)
151 {
152 unsigned char s1, s2;
153 static unsigned char cvt[] = { 0xa1, 0xa8, 0xa3, 0xa4, 0xa5, 0xac, 0xae, 0xad, 0xaf, 0xee };
154
155 s1 = inptr[0];
156 if (s1 < 0x7f) {
157 *outptr = s1;
158 *outchars = 1;
159 return 1;
160 }
161 if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 <= 0xfc)) {
162 /* Double byte char */
163 unsigned char m, e1, e2;
164 INCHK(2);
165 s2 = inptr[1];
166 if (s2 < 0x40 || s2 > 0xfc) {
167 EUCJ_SUBST;
168 return 2;
169 }
170
171 if (s1 <= 0x9f) {
172 OUTCHK(2);
173 m = 1;
174 e1 = (s1-0x80)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
175 } else if (s1 <= 0xef) {
176 OUTCHK(2);
177 m = 1;
178 e1 = (s1-0xc0)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
179 } else if (s1 >= 0xf5) {
180 OUTCHK(3);
181 m = 2;
182 e1 = (s1-0xf5)*2 + 0x50 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
183 } else {
184 OUTCHK(3);
185 m = 2;
186 e1 = cvt[(s1-0xf0)*2+((s2 < 0x9f)? 1 : 0)];
187 }
188
189 if (s2 < 0x7f) {
190 e2 = s2 - 0x3f + 0xa0;
191 } else if (s2 < 0x9f) {
192 e2 = s2 - 0x40 + 0xa0;
193 } else {
194 e2 = s2 - 0x9e + 0xa0;
195 }
196 if (m == 1) {
197 outptr[0] = e1;
198 outptr[1] = e2;
199 *outchars = 2;
200 } else {
201 outptr[0] = 0x8f;
202 outptr[1] = e1;
203 outptr[2] = e2;
204 *outchars = 3;
205 }
206 return 2;
207 }
208 if (s1 >= 0xa1 && s1 <= 0xdf) {
209 /* JISX0201 KANA */
210 OUTCHK(2);
211 outptr[0] = 0x8e;
212 outptr[1] = s1;
213 *outchars = 2;
214 return 1;
215 }
216 if (s1 == 0xfd) {
217 /* copyright mark */
218 OUTCHK(2);
219 outptr[0] = 0xa9;
220 outptr[1] = 0xa6;
221 *outchars = 2;
222 return 1;
223 }
224 if (s1 == 0xfe) {
225 /* trademark sign. this is not in JISX0213, but in JISX0212. */
226 OUTCHK(3);
227 outptr[0] = 0x8f;
228 outptr[1] = 0xa2;
229 outptr[2] = 0xef;
230 *outchars = 3;
231 return 1;
232 }
233 if (s1 == 0xff) {
234 /* horizontal ellipsis. */
235 OUTCHK(2);
236 outptr[0] = 0xa1;
237 outptr[1] = 0xc4;
238 *outchars = 2;
239 return 1;
240 }
241
242 /* s1 == 0x80 or 0xa0 */
243 outptr[0] = SUBST1_CHAR;
244 *outchars = 1;
245 return 1;
246 }
247
248 /* EUC_JISX0213 -> Shift_JIS
249 *
250 * Mapping anomalies
251 *
252 * 0x80--0xa0 except 0x8e and 0x8f : C1 region.
253 * Doesn't have corresponding SJIS bytes,
254 * so mapped to substitution char.
255 * 0xff : reserved byte. mapped to substitution char.
256 *
257 * Conversion scheme
258 * 0x00-0x7f : corresponding ASCII range.
259 * 0x80--0x8d : substitution char.
260 * 0x8e : leading byte of JISX 0201 kana
261 * 0x8f : leading byte of JISX 0212 or JISX 0213 plane 2
262 * 0x90--0xa0 : substitution char.
263 * 0xa1--0xfe : first byte (e1) of JISX 0213 plane 1
264 * 0xff : substitution char
265 *
266 * For a double- or triple-byte character, each subsequent byte has to be in
267 * the range between 0xa1 and 0xfe inclusive. If not, the character is
268 * replaced by the substitution character.
269 *
270 * If the first byte is in the range of 0xa1--0xfe, two bytes (e1, e2)
271 * is mapped to SJIS (s1, s2) by:
272 *
273 * s1 = (e1 - 0xa0 + 0x101)/2 if 0xa1 <= e1 <= 0xde
274 * (e1 - 0xa0 + 0x181)/2 if 0xdf <= e1 <= 0xfe
275 * s2 = (e2 - 0xa0 + 0x3f) if odd?(e1) && 0xa1 <= e2 <= 0xdf
276 * (e2 - 0xa0 + 0x40) if odd?(e1) && 0xe0 <= e2 <= 0xfe
277 * (e2 - 0xa0 + 0x9e) if even?(e1)
278 *
279 * If the first byte is 0x8f, the second byte (e1) and the third byte
280 * (e2) is mapped to SJIS (s1, s2) by:
281 * if (0xee <= e1 <= 0xfe) s1 = (e1 - 0xa0 + 0x19b)/2
282 * otherwise, follow the table:
283 * e1 == 0xa1 or 0xa8 => s1 = 0xf0
284 * e1 == 0xa3 or 0xa4 => s1 = 0xf1
285 * e1 == 0xa5 or 0xac => s1 = 0xf2
286 * e1 == 0xae or 0xad => s1 = 0xf3
287 * e1 == 0xaf => s1 = 0xf4
288 * If e1 is other value, it is JISX0212; we use substitution char.
289 * s2 is mapped with the same rule above.
290 */
291
292 static size_t eucj2sjis(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
293 char *outptr, size_t outroom, size_t *outchars)
294 {
295 unsigned char e1, e2;
296 e1 = inptr[0];
297 if (e1 <= 0x7f) {
298 outptr[0] = e1;
299 *outchars = 1;
300 return 1;
301 }
302 if (e1 >= 0xa1 && e1 <= 0xfe) {
303 /* double byte char (JISX 0213 plane 1) */
304 unsigned char s1, s2;
305 INCHK(2);
306 e2 = inptr[1];
307 if (e2 < 0xa1 || e2 == 0xff) {
308 SJIS_SUBST;
309 return 2;
310 }
311 OUTCHK(2);
312 if (e1 <= 0xde) s1 = (e1 - 0xa0 + 0x101)/2;
313 else s1 = (e1 - 0xa0 + 0x181)/2;
314 if (e1%2 == 0) {
315 s2 = e2 - 0xa0 + 0x9e;
316 } else {
317 if (e2 <= 0xdf) s2 = e2 - 0xa0 + 0x3f;
318 else s2 = e2 - 0xa0 + 0x40;
319 }
320 outptr[0] = s1;
321 outptr[1] = s2;
322 *outchars = 2;
323 return 2;
324 }
325 if (e1 == 0x8e) {
326 /* JISX 0201 kana */
327 INCHK(2);
328 e2 = inptr[1];
329 if (e2 < 0xa1 || e2 == 0xff) {
330 outptr[0] = SUBST1_CHAR;
331 } else {
332 outptr[0] = e2;
333 }
334 *outchars = 1;
335 return 2;
336 }
337 if (e1 == 0x8f) {
338 /* triple byte char */
339 unsigned char s1, s2;
340 unsigned char cvt[] = { 0xf0, 0, 0xf1, 0xf1, 0xf2, 0, 0, 0xf0, 0, 0, 0, 0xf2, 0xf3, 0xf3, 0xf4 };
341
342 INCHK(3);
343 OUTCHK(2);
344 e1 = inptr[1];
345 e2 = inptr[2];
346 if (e1 < 0xa1 || e1 == 0xff || e2 < 0xa1 || e2 == 0xff) {
347 SJIS_SUBST;
348 return 3;
349 }
350 if (e1 >= 0xee) {
351 s1 = (e1 - 0xa0 + 0x19b)/2;
352 } else if (e1 >= 0xb0) {
353 SJIS_SUBST;
354 return 3;
355 } else {
356 s1 = cvt[e1-0xa1];
357 if (s1 == 0) {
358 SJIS_SUBST;
359 return 3;
360 }
361 }
362 if (e1%2 == 0) {
363 s2 = e2 - 0xa0 + 0x9e;
364 } else {
365 if (e2 < 0xdf) s2 = e2 - 0xa0 + 0x3f;
366 else s2 = e2 - 0xa0 + 0x40;
367 }
368 outptr[0] = s1;
369 outptr[1] = s2;
370 *outchars = 2;
371 return 3;
372 }
373 /* no corresponding char */
374 *outptr = SUBST1_CHAR;
375 *outchars = 1;
376 return 1;
377 }
378
379 /*=================================================================
380 * UTF8
381 */
382
383 /* Conversion between UTF8 and EUC_JP is based on the table found at
384 * http://isweb11.infoseek.co.jp/computer/wakaba/table/jis-note.ja.html
385 *
386 * There are some characters in JISX0213 that can't be represented
387 * in a single Unicode character, but can be with a combining character.
388 * In such case, EUC_JP to UTF8 conversion uses combining character,
389 * but UTF8 to EUC_JP conversion translates the combining character into
390 * another character. For example, a single JISX0213 katakana 'nga'
391 * (hiragana "ka" with han-dakuon mark) is translated to Unicode
392 * U+304B+309A (HIRAGANA LETTER KA + COMBINING KATAKANA-HIRAGANA SEMI-VOICED
393 * SOUND MARK). When this sequence is converted to EUC_JP again, it
394 * becomes EUCJ 0xA4AB + 0xA1AC. This is an implementation limitation,
395 * and should be removed in later release.
396 */
397
398 /* [UTF8 -> EUC_JP conversion]
399 *
400 * EUC-JP has the corresponding characters to the wide range of
401 * UCS characters.
402 *
403 * UCS4 character # of EUC_JP characters
404 * ---------------------------------------
405 * U+0000+0xxx 564
406 * U+0000+1xxx 6
407 * U+0000+2xxx 321
408 * U+0000+3xxx 422
409 * U+0000+4xxx 347
410 * U+0000+5xxx 1951
411 * U+0000+6xxx 2047
412 * U+0000+7xxx 1868
413 * U+0000+8xxx 1769
414 * U+0000+9xxx 1583
415 * U+0000+fxxx 241
416 * U+0002+xxxx 302
417 *
418 * It is so wide and so sparse that naive lookup table implementation from
419 * UCS to EUC can be space-wasting. I use hierarchical table with some
420 * ad-hoc heuristics. Since the hierarchical table is used, I directly
421 * translates UTF8 to EUC_JP, without converting it to UCS4.
422 *
423 * Strategy outline: say input consists of bytes named u0, u1, ....
424 *
425 * u0 <= 0x7f : ASCII range
426 * u0 in [0xc2-0xd1] : UTF8 uses 2 bytes. Some mappings within this range
427 * is either very regular or very small, and they are
428 * hardcoded. Other mappings uses table lookup.
429 * u0 == 0xe1 : UTF8 uses 3 bytes. There are only 6 characters in this
430 * range, and it is hardcoded.
431 * u0 in [0xe2-0xe9, 0xef] : Large number of characters are in this range.
432 * Two-level table of 64 entries each is used to dispatch the
433 * characters.
434 * u0 == 0xf0 : UTF8 uses 4 bytes. u1 is in [0xa0-0xaa]. u2 and u3 is
435 * used for dispatch table of 64 entries each.
436 *
437 * The final table entry is unsigned short. 0x0000 means no corresponding
438 * character is defined in EUC_JP. >=0x8000 is the EUC_JP character itself.
439 * < 0x8000 means the character is in G3 plane; 0x8f should be preceded,
440 * and 0x8000 must be added to the value.
441 */
442
443 #include "ucs2eucj.c"
444
445 /* Emit given euc char */
446 static inline size_t utf2euc_emit_euc(unsigned short euc, size_t inchars, char *outptr, size_t outroom, size_t *outchars)
447 {
448 if (euc == 0) {
449 EUCJ_SUBST;
450 } else if (euc < 0x8000) {
451 OUTCHK(3);
452 outptr[0] = 0x8f;
453 outptr[1] = (euc >> 8) + 0x80;
454 outptr[2] = euc & 0xff;
455 *outchars = 3;
456 } else {
457 OUTCHK(2);
458 outptr[0] = (euc >> 8);
459 outptr[1] = euc & 0xff;
460 *outchars = 2;
461 }
462 return inchars;
463 }
464
465 /* handle 2-byte UTF8 sequence. 0xc0 <= u0 <= 0xdf */
466 static inline size_t utf2euc_2(ScmConvInfo *cinfo, unsigned char u0,
467 const char *inptr, size_t inroom,
468 char *outptr, size_t outroom, size_t *outchars)
469 {
470 unsigned char u1;
471 unsigned short *etab = NULL;
472
473 INCHK(2);
474 u1 = (unsigned char)inptr[1];
475 if (u1 < 0x80 || u1 >= 0xc0) return ILLEGAL_SEQUENCE;
476
477 switch (u0) {
478 case 0xc2: etab = utf2euc_c2; break;
479 case 0xc3: etab = utf2euc_c3; break;
480 case 0xc4: etab = utf2euc_c4; break;
481 case 0xc5: etab = utf2euc_c5; break;
482 case 0xc6:
483 if (u1 == 0x93) { /* U+0193 -> euc ABA9 */
484 return utf2euc_emit_euc(0xaba9, 2, outptr, outroom, outchars);
485 } else break;
486 case 0xc7: etab = utf2euc_c7; break;
487 case 0xc9: etab = utf2euc_c9; break;
488 case 0xca: etab = utf2euc_ca; break;
489 case 0xcb: etab = utf2euc_cb; break;
490 case 0xcc: etab = utf2euc_cc; break;
491 case 0xcd:
492 if (u1 == 0xa1) { /* U+0361 -> euc ABD2 */
493 return utf2euc_emit_euc(0xabd2, 2, outptr, outroom, outchars);
494 } else break;
495 case 0xce: etab = utf2euc_ce; break;
496 case 0xcf: etab = utf2euc_cf; break;
497 case 0xd0: etab = utf2euc_d0; break;
498 case 0xd1: etab = utf2euc_d1; break;
499 default:
500 break;
501 }
502 if (etab != NULL) {
503 /* table lookup */
504 return utf2euc_emit_euc(etab[u1-0x80], 2, outptr, outroom, outchars);
505 }
506 EUCJ_SUBST;
507 return 2;
508 }
509
510 /* handle 3-byte UTF8 sequence. 0xe0 <= u0 <= 0xef */
511 static inline size_t utf2euc_3(ScmConvInfo *cinfo, unsigned char u0,
512 const char *inptr, size_t inroom,
513 char *outptr, size_t outroom, size_t *outchars)
514 {
515 unsigned char u1, u2;
516 unsigned char *tab1 = NULL;
517 unsigned short (*tab2)[64] = NULL;
518
519 INCHK(3);
520 u1 = (unsigned char)inptr[1];
521 u2 = (unsigned char)inptr[2];
522
523 switch (u0) {
524 case 0xe1: /* special case : there's only 6 chars */
525 {
526 unsigned short euc = 0;
527 if (u1 == 0xb8) {
528 if (u2 == 0xbe) euc = 0xa8f2;
529 else if (u2 == 0xbf) euc = 0xa8f3;
530 } else if (u1 == 0xbd) {
531 if (u2 == 0xb0) euc = 0xabc6;
532 else if (u2 == 0xb1) euc = 0xabc7;
533 else if (u2 == 0xb2) euc = 0xabd0;
534 else if (u2 == 0xb3) euc = 0xabd1;
535 }
536 return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
537 }
538 case 0xe2: tab1 = utf2euc_e2; tab2 = utf2euc_e2_xx; break;
539 case 0xe3: tab1 = utf2euc_e3; tab2 = utf2euc_e3_xx; break;
540 case 0xe4: tab1 = utf2euc_e4; tab2 = utf2euc_e4_xx; break;
541 case 0xe5: tab1 = utf2euc_e5; tab2 = utf2euc_e5_xx; break;
542 case 0xe6: tab1 = utf2euc_e6; tab2 = utf2euc_e6_xx; break;
543 case 0xe7: tab1 = utf2euc_e7; tab2 = utf2euc_e7_xx; break;
544 case 0xe8: tab1 = utf2euc_e8; tab2 = utf2euc_e8_xx; break;
545 case 0xe9: tab1 = utf2euc_e9; tab2 = utf2euc_e9_xx; break;
546 case 0xef: tab1 = utf2euc_ef; tab2 = utf2euc_ef_xx; break;
547 default:
548 break;
549 }
550 if (tab1 != NULL) {
551 unsigned char ind = tab1[u1-0x80];
552 if (ind != 0) {
553 return utf2euc_emit_euc(tab2[ind-1][u2-0x80], 3, outptr, outroom, outchars);
554 }
555 }
556 EUCJ_SUBST;
557 return 3;
558 }
559
560 /* handle 4-byte UTF8 sequence. u0 == 0xf0, 0xa0 <= u1 <= 0xaa */
561 static inline size_t utf2euc_4(ScmConvInfo *cinfo, unsigned char u0,
562 const char *inptr, size_t inroom,
563 char *outptr, size_t outroom, size_t *outchars)
564 {
565 unsigned char u1, u2, u3;
566 unsigned short *tab = NULL;
567
568 INCHK(4);
569 if (u0 != 0xf0) {
570 EUCJ_SUBST;
571 return 4;
572 }
573 u1 = (unsigned char)inptr[1];
574 u2 = (unsigned char)inptr[2];
575 u3 = (unsigned char)inptr[3];
576
577 switch (u1) {
578 case 0xa0: tab = utf2euc_f0_a0; break;
579 case 0xa1: tab = utf2euc_f0_a1; break;
580 case 0xa2: tab = utf2euc_f0_a2; break;
581 case 0xa3: tab = utf2euc_f0_a3; break;
582 case 0xa4: tab = utf2euc_f0_a4; break;
583 case 0xa5: tab = utf2euc_f0_a5; break;
584 case 0xa6: tab = utf2euc_f0_a6; break;
585 case 0xa7: tab = utf2euc_f0_a7; break;
586 case 0xa8: tab = utf2euc_f0_a8; break;
587 case 0xa9: tab = utf2euc_f0_a9; break;
588 case 0xaa: tab = utf2euc_f0_aa; break;
589 default:
590 break;
591 }
592 if (tab != NULL) {
593 int i;
594 unsigned short u2u3 = u2*256 + u3;
595 for (i=0; tab[i]; i+=2) {
596 if (tab[i] == u2u3) {
597 return utf2euc_emit_euc(tab[i+1], 4, outptr, outroom, outchars);
598 }
599 }
600 }
601 EUCJ_SUBST;
602 return 4;
603 }
604
605 /* Body of UTF8 -> EUC_JP conversion */
606 static size_t utf2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
607 char *outptr, size_t outroom, size_t *outchars)
608 {
609 unsigned char u0;
610
611 u0 = (unsigned char)inptr[0];
612 if (u0 <= 0x7f) {
613 *outptr = u0;
614 *outchars = 1;
615 return 1;
616 }
617 if (u0 <= 0xbf) {
618 /* invalid UTF8 sequence */
619 return ILLEGAL_SEQUENCE;
620 }
621 if (u0 <= 0xdf) {
622 /* 2-byte UTF8 sequence */
623 return utf2euc_2(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
624 }
625 if (u0 <= 0xef) {
626 /* 3-byte UTF8 sequence */
627 return utf2euc_3(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
628 }
629 if (u0 <= 0xf7) {
630 /* 4-byte UTF8 sequence */
631 return utf2euc_4(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
632 }
633 if (u0 <= 0xfb) {
634 /* 5-byte UTF8 sequence */
635 INCHK(5);
636 EUCJ_SUBST;
637 return 5;
638 }
639 if (u0 <= 0xfd) {
640 /* 6-byte UTF8 sequence */
641 INCHK(6);
642 EUCJ_SUBST;
643 return 6;
644 }
645 return ILLEGAL_SEQUENCE;
646 }
647
648 /* [EUC_JP -> UTF8 conversion]
649 *
650 * Conversion strategy:
651 * If euc0 is in ASCII range, or C1 range except 0x8e or 0x8f, map it as is.
652 * If euc0 is 0x8e, use JISX0201-KANA table.
653 * If euc0 is 0x8f, use JISX0213 plane 2 table.
654 * If euc0 is in [0xa1-0xfe], use JISX0213 plane1 table.
655 * If euc0 is 0xa0 or 0xff, return ILLEGAL_SEQUENCE.
656 *
657 * JISX0213 plane2 table consists of a 2-level tree. The first level
658 * returns an index to the second-level table by (euc1 - 0xa1). Only the
659 * range of JISX0213 defined region is converted; JISX0212 region will be
660 * mapped to the substitution char.
661 */
662
663 #include "eucj2ucs.c"
664
/* UTF8 utility.  Similar stuff is included in gauche/char_utf_8.h
   if the native encoding is UTF8, but not otherwise.
   So I include them here as well. */

/* Store the UTF8 encoding of ucs at cp.
 *
 * Writes 1 to 6 bytes depending on the magnitude of ucs; no terminator
 * is written and no length is returned, so the caller must know the
 * length in advance and provide enough room. */
void jconv_ucs4_to_utf8(unsigned int ucs, char *cp)
{
    int nbytes;                 /* total length of the sequence */
    unsigned int lead_tag;      /* fixed high bits of the lead byte */
    unsigned int lead_mask;     /* payload bits kept in the lead byte */
    int i;

    if (ucs < 0x80) {
        *cp = ucs;
        return;
    }
    if (ucs < 0x800) {
        nbytes = 2; lead_tag = 0xc0; lead_mask = 0x1f;
    } else if (ucs < 0x10000) {
        nbytes = 3; lead_tag = 0xe0; lead_mask = 0x0f;
    } else if (ucs < 0x200000) {
        nbytes = 4; lead_tag = 0xf0; lead_mask = 0x07;
    } else if (ucs < 0x4000000) {
        nbytes = 5; lead_tag = 0xf8; lead_mask = 0x03;
    } else {
        nbytes = 6; lead_tag = 0xfc; lead_mask = 0x01;
    }

    /* fill the continuation bytes from the last one backwards, six
       payload bits each; what remains goes into the lead byte */
    for (i = nbytes - 1; i > 0; i--) {
        cp[i] = (ucs & 0x3f) | 0x80;
        ucs >>= 6;
    }
    cp[0] = (ucs & lead_mask) | lead_tag;
}
704
705 /* Given 'encoded' ucs, emit utf8. 'Encoded' ucs is the entry of the
706 conversion table. If ucs >= 0x100000, it is composed by two UCS2
707 character. Otherwise, it is one UCS4 character. */
708 static inline size_t eucj2utf_emit_utf(unsigned int ucs, size_t inchars,
709 char *outptr, size_t outroom,
710 size_t *outchars)
711 {
712 if (ucs == 0) {
713 UTF8_SUBST;
714 } else if (ucs < 0x100000) {
715 int outreq = UCS2UTF_NBYTES(ucs);
716 OUTCHK(outreq);
717 jconv_ucs4_to_utf8(ucs, outptr);
718 *outchars = outreq;
719 } else {
720 /* we need two UCS characters */
721 unsigned int ucs0 = (ucs >> 16) & 0xffff;
722 unsigned int ucs1 = ucs & 0xfff;
723 int outreq0 = UCS2UTF_NBYTES(ucs0);
724 int outreq1 = UCS2UTF_NBYTES(ucs1);
725 OUTCHK(outreq0+outreq1);
726 jconv_ucs4_to_utf8(ucs0, outptr);
727 jconv_ucs4_to_utf8(ucs1, outptr+outreq0);
728 *outchars = outreq0+outreq1;
729 }
730 return inchars;
731 }
732
733 static size_t eucj2utf(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
734 char *outptr, size_t outroom, size_t *outchars)
735 {
736 unsigned char e0, e1, e2;
737 unsigned int ucs;
738
739 e0 = (unsigned char)inptr[0];
740 if (e0 < 0xa0) {
741 if (e0 == 0x8e) {
742 /* JIS X 0201 KANA */
743 INCHK(2);
744 e1 = (unsigned char)inptr[1];
745 if (e1 < 0xa1 || e1 > 0xdf) return ILLEGAL_SEQUENCE;
746 ucs = 0xff61 + (e1 - 0xa1);
747 return eucj2utf_emit_utf(ucs, 2, outptr, outroom, outchars);
748 }
749 else if (e0 == 0x8f) {
750 /* JIS X 0213 plane 2 */
751 int index;
752
753 INCHK(3);
754 e1 = (unsigned char)inptr[1];
755 e2 = (unsigned char)inptr[2];
756 if (e1 < 0xa1 || e1 > 0xfe || e2 < 0xa1 || e2 > 0xfe) {
757 return ILLEGAL_SEQUENCE;
758 }
759 index = euc_jisx0213_2_index[e1 - 0xa1];
760 if (index < 0) {
761 UTF8_SUBST;
762 return 3;
763 }
764 ucs = euc_jisx0213_2_to_ucs2[index][e2 - 0xa1];
765 return eucj2utf_emit_utf(ucs, 3, outptr, outroom, outchars);
766 }
767 else {
768 /* ASCII or C1 region */
769 outptr[0] = e0;
770 *outchars = 1;
771 return 1;
772 }
773 }
774 if (e0 > 0xa0 && e0 < 0xff) {
775 /* JIS X 0213 plane 1 */
776 INCHK(2);
777 e1 = (unsigned char)inptr[1];
778 if (e1 < 0xa1 || e1 > 0xfe) return ILLEGAL_SEQUENCE;
779 ucs = euc_jisx0213_1_to_ucs2[e0 - 0xa1][e1 - 0xa1];
780 return eucj2utf_emit_utf(ucs, 2, outptr, outroom, outchars);
781 }
782 return ILLEGAL_SEQUENCE;
783 }
784
785 /*=================================================================
786 * ISO2022-JP
787 */
788
789 /* ISO2022-JP{-1(,2),3} -> EUC_JP
790 * Strategy: accepts as many possibilities as possible.
791 * The following escape sequence is recognized:
792 * (See Lunde, CJKV information processing, O'Reilly, pp.155--158)
793 *
794 * <ESC> ( B ASCII
795 * <ESC> ( J JIS-Roman
796 * <ESC> ( H JIS-Roman (for compatibility)
797 * <ESC> ( I Half-width katakana (JIS X 0201 kana)
798 * <ESC> $ @ JIS C 6226-1978 (78JIS)
799 * <ESC> $ B JIS X 0208-1983 (83JIS)
800 * <ESC> $ ( D JIS X 0212-1990
801 * <ESC> $ ( O JIS X 0213:2000 plane 1
802 * <ESC> $ ( P JIS X 0213:2000 plane 2
803 * <ESC> & @ <ESC> $ B JIS X 0208-1990, JIS X 0208:1997
804 * 0x0e JIS7 half-width katakana shift-out
805 * 0x0f JIS7 half-width katakana shift-in
806 *
807 * The state is reset to ASCII whenever newline character is read.
808 *
809 * The following escape sequences defined in ISO2022-JP-2 are recognized,
810 * but all the characters within the sequence will be replaced by '?'.
811 *
812 * <ESC> $ A (GB2312-80) unsupported
813 * <ESC> $ ( C (KS X 1001:1992) unsupported
814 * <ESC> . A (ISO8859-1:1998) unsupported
815 * <ESC> . F (ISO8859-7:1998) unsupported
816 *
817 * If other escape sequence is seen, the converter returns ILLEGAL_SEQUENCE.
818 *
819 * JIS8 kana is allowed.
820 */
821
/* ISO2022-JP shift states.  The values are stored in cinfo->istate by
 * the decoder and in cinfo->ostate by the encoder. */
enum {
    JIS_ASCII  = 0,             /* <ESC> ( B */
    JIS_ROMAN  = 1,             /* <ESC> ( J  or  <ESC> ( H */
    JIS_KANA   = 2,             /* <ESC> ( I */
    JIS_78     = 3,             /* <ESC> $ @ */
    JIS_0212   = 4,             /* <ESC> $ ( D */
    JIS_0213_1 = 5,             /* <ESC> $ B  or  <ESC> $ ( O */
    JIS_0213_2 = 6,             /* <ESC> $ ( P */
    JIS_UNKNOWN = 7             /* recognized but unsupported charset */
};
833
834 /* deal with escape sequence. escape byte itself is already consumed.
835 returns # of input bytes consumed by the escape sequence,
836 or an error code. cinfo->istate is updated accordingly. */
837 static size_t jis_esc(ScmConvInfo *cinfo, const char *inptr, size_t inroom)
838 {
839 unsigned char j1, j2;
840 INCHK(2);
841 j1 = inptr[0];
842 j2 = inptr[1];
843 switch (j1) {
844 case '(':
845 switch (j2) {
846 case 'B': cinfo->istate = JIS_ASCII; break;
847 case 'J': cinfo->istate = JIS_ROMAN; break;
848 case 'H': cinfo->istate = JIS_ROMAN; break;
849 case 'I': cinfo->istate = JIS_KANA; break;
850 default: return ILLEGAL_SEQUENCE;
851 }
852 return 2;
853 case '$':
854 switch (j2) {
855 case '@': cinfo->istate = JIS_78; break;
856 case 'B': cinfo->istate = JIS_0213_1; break;
857 case 'A': cinfo->istate = JIS_UNKNOWN; break;
858 case '(':
859 {
860 INCHK(3);
861 switch (inptr[2]) {
862 case 'D': cinfo->istate = JIS_0212; break;
863 case 'O': cinfo->istate = JIS_0213_1; break;
864 case 'P': cinfo->istate = JIS_0213_2; break;
865 case 'C': cinfo->istate = JIS_UNKNOWN; break;
866 default: return ILLEGAL_SEQUENCE;
867 }
868 return 3;
869 break;
870 }
871 default: return ILLEGAL_SEQUENCE;
872 }
873 return 2;
874 case '&':
875 {
876 INCHK(6);
877 if (inptr[2] == '@' && inptr[3] == 0x1b && inptr[4] == '$'
878 && inptr[5] == 'B') {
879 cinfo->istate = JIS_0213_1;
880 return 5;
881 } else {
882 return ILLEGAL_SEQUENCE;
883 }
884 }
885 case '.':
886 switch (inptr[2]) {
887 case 'A':/*fallthrough*/;
888 case 'F': cinfo->istate = JIS_UNKNOWN; break;
889 default: return ILLEGAL_SEQUENCE;
890 }
891 return 2;
892 default: return ILLEGAL_SEQUENCE;
893 }
894 }
895
896 /* main routine for iso2022-jp -> euc_jp */
897 static size_t jis2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
898 char *outptr, size_t outroom, size_t *outchars)
899 {
900 unsigned char j0, j1;
901 size_t inoffset = 0, r;
902
903 j0 = inptr[inoffset];
904 /* skip escape sequence */
905 while (j0 == 0x1b) {
906 inoffset++;
907 r = jis_esc(cinfo, inptr+inoffset, inroom-inoffset);
908 if (ERRP(r)) return r;
909 inoffset += r;
910 if (inoffset >= inroom) {
911 *outchars = 0;
912 return inoffset;
913 }
914 j0 = inptr[inoffset];
915 }
916
917 if (j0 == '\n' || j0 == '\r') {
918 cinfo->istate = JIS_ASCII;
919 outptr[0] = j0;
920 *outchars = 1;
921 return 1+inoffset;
922 } else if (j0 < 0x20) {
923 outptr[0] = j0;
924 *outchars = 1;
925 return 1+inoffset;
926 } else if (j0 >= 0xa1 && j0 <= 0xdf) {
927 /* JIS8 kana */
928 OUTCHK(2);
929 outptr[0] = 0x8e;
930 outptr[1] = j0;
931 *outchars = 2;
932 return 1+inoffset;
933 } else {
934 switch (cinfo->istate) {
935 case JIS_ROMAN:
936 /* jis-roman and ascii differs on 0x5c and 0x7e -- for now,
937 I ignore the difference. */
938 /* FALLTHROUGH */
939 case JIS_ASCII:
940 outptr[0] = j0;
941 *outchars = 1;
942 return 1+inoffset;
943 case JIS_KANA:
944 OUTCHK(2);
945 outptr[0] = 0x8e;
946 outptr[1] = j0 + 0x80;
947 *outchars = 2;
948 return 1+inoffset;
949 case JIS_78:
950 /* for now, I ignore the difference between JIS78 and JIS83 */
951 /* FALLTHROUGH */
952 case JIS_0213_1:
953 INCHK(inoffset+2);
954 OUTCHK(2);
955 j1 = inptr[inoffset+1];
956 outptr[0] = j0 + 0x80;
957 outptr[1] = j1 + 0x80;
958 *outchars = 2;
959 return 2+inoffset;
960 case JIS_0212:
961 /* jis x 0212 and jis x 0213 plane 2 are different character sets,
962 but uses the same conversion scheme. */
963 /* FALLTHROUGH */
964 case JIS_0213_2:
965 INCHK(inoffset+2);
966 OUTCHK(3);
967 j1 = inptr[inoffset+1];
968 outptr[0] = 0x8f;
969 outptr[1] = j0 + 0x80;
970 outptr[2] = j1 + 0x80;
971 *outchars = 3;
972 return 2+inoffset;
973 case JIS_UNKNOWN:
974 outptr[0] = SUBST1_CHAR;
975 *outchars = 1;
976 return 1+inoffset;
977 default:
978 Scm_Error("internal state of ISO2022-JP -> EUC_JP got messed up (%d). Implementation error?", cinfo->istate);
979 }
980 }
981 return ILLEGAL_SEQUENCE;
982 }
983
984 /* EUC_JP -> ISO2022JP(-3)
985 *
986 * For now, I follow the strategy of iso2022jp-3-compatible behavior.
987 */
988
989 /* ensure the current state is newstate. returns # of output chars.
990 may return OUTPUT_NOT_ENOUGH. */
991 static size_t jis_ensure_state(ScmConvInfo *cinfo, int newstate, size_t outbytes,
992 char *outptr, size_t outroom)
993 {
994 const char *escseq = NULL;
995 size_t esclen = 0;
996
997 if (cinfo->ostate == newstate) {
998 OUTCHK(outbytes);
999 return 0;
1000 }
1001 switch (newstate) {
1002 case JIS_ASCII:
1003 escseq = "\033(B"; esclen = 3; break;
1004 case JIS_KANA:
1005 escseq = "\033(I"; esclen = 3; break;
1006 case JIS_0213_1:
1007 escseq = "\033$B"; esclen = 3; break;
1008 case JIS_0213_2:
1009 escseq = "\033$(P"; esclen = 4; break;
1010 case JIS_0212:
1011 escseq = "\033$(D"; esclen = 4; break;
1012 default:
1013 Scm_Error("something wrong in jis_ensure_state: implementation error?");
1014 return 0; /* dummy */
1015 }
1016 OUTCHK(esclen + outbytes);
1017 memcpy(outptr, escseq, esclen);
1018 cinfo->ostate = newstate;
1019 return esclen;
1020 }
1021
1022 static size_t eucj2jis(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
1023 char *outptr, size_t outroom, size_t *outchars)
1024 {
1025 unsigned char e0, e1;
1026 size_t outoffset = 0;
1027 e0 = inptr[0];
1028 if (e0 < 0x80) {
1029 outoffset = jis_ensure_state(cinfo, JIS_ASCII, 1, outptr, outroom);
1030 if (ERRP(outoffset)) return outoffset;
1031 outptr[outoffset] = e0;
1032 *outchars = outoffset+1;
1033 return 1;
1034 } else if (e0 == 0x8e) {
1035 INCHK(2);
1036 e1 = inptr[1];
1037 if (e1 > 0xa0 && e1 < 0xff) {
1038 outoffset = jis_ensure_state(cinfo, JIS_KANA, 1, outptr, outroom);
1039 if (ERRP(outoffset)) return outoffset;
1040 outptr[outoffset] = e1 - 0x80;
1041 *outchars = outoffset+1;
1042 return 2;
1043 }
1044 } else if (e0 == 0x8f) {
1045 INCHK(3);
1046 e0 = inptr[1];
1047 e1 = inptr[2];
1048 if (e0 > 0xa0 && e0 < 0xff && e1 > 0xa0 && e1 < 0xff) {
1049 int newstate = JIS_0212;
1050 switch (e0) {
1051 case 0xa1:; case 0xa3:; case 0xa4:; case 0xa5:;
1052 case 0xa8:; case 0xac:; case 0xad:; case 0xae:; case 0xaf:;
1053 newstate = JIS_0213_2; break;
1054 default:
1055 if (e0 >= 0xee) newstate = JIS_0213_2;
1056 }
1057 outoffset = jis_ensure_state(cinfo, newstate, 2, outptr, outroom);
1058 outptr[outoffset] = e0 - 0x80;
1059 outptr[outoffset+1] = e1 - 0x80;
1060 *outchars = outoffset+1;
1061 return 3;
1062 }
1063 } else if (e0 > 0xa0 && e0 < 0xff) {
1064 INCHK(2);
1065 e1 = inptr[1];
1066 if (e1 > 0xa0 && e1 < 0xff) {
1067 outoffset = jis_ensure_state(cinfo, JIS_0213_1, 2, outptr, outroom);
1068 if (ERRP(outoffset)) return outoffset;
1069 outptr[outoffset] = e0 - 0x80;
1070 outptr[outoffset+1] = e1 - 0x80;
1071 *outchars = outoffset+2;
1072 return 2;
1073 }
1074 }
1075 return ILLEGAL_SEQUENCE;
1076 }
1077
1078 /* reset proc */
1079 static size_t jis_reset(ScmConvInfo *cinfo, char *outptr, size_t outroom)
1080 {
1081 if (outptr == NULL) {
1082 /* just reset */
1083 cinfo->ostate = JIS_ASCII;
1084 return 0;
1085 } else {
1086 if (cinfo->ostate == JIS_ASCII) return 0;
1087 if (outroom < 3) return OUTPUT_NOT_ENOUGH;
1088 outptr[0] = 0x1b;
1089 outptr[1] = '(';
1090 outptr[2] = 'B';
1091 cinfo->ostate = JIS_ASCII;
1092 return 3;
1093 }
1094 }
1095
1096 /*=================================================================
1097 * EUC_JP
1098 */
1099
1100 /* EUC_JP is a pivot code, so we don't need to convert. This function
1101 is just a placeholder. */
1102 static size_t pivot(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
1103 char *outptr, size_t outroom, size_t *outchars)
1104 {
1105 return 0;
1106 }
1107
1108 /*=================================================================
1109 * JCONV - the entry
1110 */
1111
/* Canonical code designators.  The values double as indices into the
   conv_converter table, so they are spelled out explicitly. */
enum {
    JCODE_EUCJ      = 0,
    JCODE_SJIS      = 1,
    JCODE_UTF8      = 2,
    JCODE_ISO2022JP = 3,
    JCODE_NONE      = 4,    /* a special entry standing for byte stream */
    /* placeholders only; disabled and not valid identifiers as written */
#if 0
    JCODE_ISO2022JP-2,
    JCODE_ISO2022JP-3
#endif
};
1124
1125 /* map canonical code designator to inconv and outconv. the order of
1126 entry must match with the above designators. */
1127 static struct conv_converter_rec {
1128 ScmConvProc inconv;
1129 ScmConvProc outconv;
1130 ScmConvReset reset;
1131 } conv_converter[] = {
1132 { pivot, pivot, NULL }, /* EUCJ */
1133 { sjis2eucj, eucj2sjis, NULL }, /* SJIS */
1134 { utf2eucj, eucj2utf, NULL }, /* UTF8 */
1135 { jis2eucj, eucj2jis, jis_reset }, /* ISO2022JP */
1136 { pivot, pivot, NULL }, /* NONE */
1137 };
1138
1139 /* map convesion name to the canonical code */
1140 static struct conv_support_rec {
1141 const char *name;
1142 int code;
1143 } conv_supports[] = {
1144 { "euc_jp", JCODE_EUCJ },
1145 { "eucjp", JCODE_EUCJ },
1146 { "eucj", JCODE_EUCJ },
1147 { "euc_jisx0213", JCODE_EUCJ },
1148 { "shift_jis", JCODE_SJIS },
1149 { "shiftjis", JCODE_SJIS },
1150 { "sjis", JCODE_SJIS },
1151 { "utf-8", JCODE_UTF8 },
1152 { "utf8", JCODE_UTF8 },
1153 { "iso2022jp", JCODE_ISO2022JP },
1154 { "iso2022-jp", JCODE_ISO2022JP },
1155 { "iso-2022-jp", JCODE_ISO2022JP },
1156 { "csiso2022jp", JCODE_ISO2022JP },
1157 { "iso2022jp-1", JCODE_ISO2022JP },
1158 { "iso-2022jp-1", JCODE_ISO2022JP },
1159 { "iso2022jp-2", JCODE_ISO2022JP },
1160 { "iso-2022jp-2", JCODE_ISO2022JP },
1161 { "iso2022jp-3", JCODE_ISO2022JP },
1162 { "iso-2022jp-3", JCODE_ISO2022JP },
1163 { "none", JCODE_NONE },
1164 { NULL, 0 }
1165 };
1166
1167 static int conv_name_match(const char *s, const char *t)
1168 {
1169 const char *p, *q;
1170 for (p=s, q=t; *p && *q; p++, q++) {
1171 if (*p == '-' || *p == '_') {
1172 if (*q != '-' && *q != '_') return FALSE;
1173 } else {
1174 if (tolower(*p) != tolower(*q)) return FALSE;
1175 }
1176 }
1177 if (*p || *q) return FALSE;
1178 return TRUE;
1179 }
1180
1181 static int conv_name_find(const char *name)
1182 {
1183 struct conv_support_rec *cvtab = conv_supports;
1184 for (; cvtab->name; cvtab++) {
1185 if (conv_name_match(name, cvtab->name)) {
1186 return cvtab->code;
1187 }
1188 }
1189 return -1;
1190 }
1191
/* Internal conversion handler.
   There are five cases to handle:
   (1) fromCode === toCode
     jconv just copies input to output.  Speed is favored over safety:
     the input is not checked for conformance to fromCode.
   (2) fromCode === pivot, toCode =/= pivot, and pivot->toCode supported.
   (3) fromCode =/= pivot, toCode === pivot, and fromCode->pivot supported.
     In cases (2) and (3) we need just one conversion subroutine.
   (4) fromCode =/= pivot, toCode =/= pivot, and fromCode->pivot->toCode
     supported.  We use two conversion subroutines, cascaded.
   (5) other cases;
     we delegate the job to iconv.
 */
1205
1206 /* case (1) */
1207 static size_t jconv_ident(ScmConvInfo *info, const char **iptr,
1208 size_t *iroom, char **optr, size_t *oroom)
1209 {
1210 size_t inroom = *iroom, outroom = *oroom;
1211 #ifdef JCONV_DEBUG
1212 fprintf(stderr, "jconv_ident %s->%s\n", info->fromCode, info->toCode);
1213 #endif
1214 if (inroom <= outroom) {
1215 memcpy(*optr, *iptr, inroom);
1216 *optr += inroom;
1217 *iptr += inroom;
1218 *iroom = 0;
1219 *oroom -= inroom;
1220 return inroom;
1221 } else {
1222 memcpy(*optr, *iptr, outroom);
1223 *optr += outroom;
1224 *iptr += outroom;
1225 *iroom -= outroom;
1226 *oroom = 0;
1227 return OUTPUT_NOT_ENOUGH;
1228 }
1229 }
1230
1231 /* case (2) or (3) */
1232 static size_t jconv_1tier(ScmConvInfo *info, const char **iptr,
1233 size_t *iroom, char **optr, size_t *oroom)
1234 {
1235 ScmConvProc cvt = info->convproc[0];
1236 const char *inp = *iptr;
1237 char *outp = *optr;
1238 int inr = *iroom, outr = *oroom;
1239 size_t outchars, inchars, converted = 0;
1240
1241 #ifdef JCONV_DEBUG
1242 fprintf(stderr, "jconv_1tier %s->%s\n", info->fromCode, info->toCode);
1243 #endif
1244 SCM_ASSERT(cvt != NULL);
1245 while (inr > 0 && outr > 0) {
1246 inchars = cvt(info, inp, inr, outp, outr, &outchars);
1247 if (ERRP(inchars)) {
1248 converted = inchars;
1249 break;
1250 } else {
1251 converted += inchars;
1252 inp += inchars;
1253 inr -= inchars;
1254 outp += outchars;
1255 outr -= outchars;
1256 }
1257 }
1258 *iptr = inp;
1259 *iroom = inr;
1260 *optr = outp;
1261 *oroom = outr;
1262 return converted;
1263 }
1264
1265 /* case (4) */
1266 #define INTBUFSIZ 20 /* intermediate buffer size */
1267 static size_t jconv_2tier(ScmConvInfo *info, const char **iptr, size_t *iroom,
1268 char **optr, size_t *oroom)
1269 {
1270 char buf[INTBUFSIZ];
1271 ScmConvProc icvt = info->convproc[0];
1272 ScmConvProc ocvt = info->convproc[1];
1273 const char *inp = *iptr;
1274 char *outp = *optr;
1275 int inr = *iroom, outr = *oroom;
1276 size_t outchars, inchars, bufchars, converted = 0;
1277
1278 #ifdef JCONV_DEBUG
1279 fprintf(stderr, "jconv_2tier %s->%s\n", info->fromCode, info->toCode);
1280 #endif
1281 while (inr > 0 && outr > 0) {
1282 inchars = icvt(info, inp, inr, buf, INTBUFSIZ, &bufchars);
1283 if (ERRP(inchars)) {
1284 converted = inchars;
1285 break;
1286 }
1287 if (bufchars == 0) {
1288 outchars = 0;
1289 } else {
1290 bufchars = ocvt(info, buf, bufchars, outp, outr, &outchars);
1291 if (ERRP(bufchars)) {
1292 converted = bufchars;
1293 break;
1294 }
1295 }
1296 converted += inchars;
1297 inp += inchars;
1298 inr -= inchars;
1299 outp += outchars;
1300 outr -= outchars;
1301 }
1302 *iptr = inp;
1303 *iroom = inr;
1304 *optr = outp;
1305 *oroom = outr;
1306 return converted;
1307 }
1308
1309 /* case (5) */
1310 #ifdef HAVE_ICONV_H
/* NB: although iconv manages its own state, we need to keep track of
 * whether the output is known to be in the default state (JIS_ASCII) or
 * not (we use JIS_UNKNOWN for the latter).
 * This is because jconv_iconv_reset will be called twice if there is any
 * reset sequence; the first call should emit the sequence, but the second
 * call shouldn't.
 */
1317 static size_t jconv_iconv(ScmConvInfo *info, const char **iptr, size_t *iroom,
1318 char **optr, size_t *oroom)
1319 {
1320 size_t r;
1321 #ifdef JCONV_DEBUG
1322 fprintf(stderr, "jconv_iconv %s->%s\n", info->fromCode, info->toCode);
1323 #endif
1324 r = iconv(info->handle, (char **)iptr, iroom, optr, oroom);
1325 info->ostate = JIS_UNKNOWN;
1326 if (r == (size_t)-1) {
1327 if (errno == EINVAL) return INPUT_NOT_ENOUGH;
1328 if (errno == E2BIG) return OUTPUT_NOT_ENOUGH;
1329 return ILLEGAL_SEQUENCE;
1330 } else {
1331 return (int)r;
1332 }
1333 }
1334
1335 /* reset routine for iconv */
1336 static size_t jconv_iconv_reset(ScmConvInfo *info, char *optr, size_t oroom)
1337 {
1338 size_t oroom_prev = oroom;
1339 size_t r;
1340 if (info->ostate == JIS_ASCII) return 0;
1341 r = iconv(info->handle, NULL, 0, &optr, &oroom);
1342 if (r == (size_t)-1) {
1343 if (errno == E2BIG) return OUTPUT_NOT_ENOUGH;
1344 Scm_Panic("jconv_iconv_reset: unknown error number %d\n", errno);
1345 }
1346 info->ostate = JIS_ASCII;
1347 return oroom_prev - oroom;
1348 }
1349 #endif /*HAVE_ICONV_H*/
1350
1351 /*------------------------------------------------------------------
1352 * JCONV_OPEN
1353 * Returns ScmConvInfo, setting up some fields.
1354 * If no conversion is possible, returns NULL.
1355 */
1356 ScmConvInfo *jconv_open(const char *toCode, const char *fromCode)
1357 {
1358 ScmConvInfo *info;
1359 ScmConvHandler handler = NULL;
1360 int incode, outcode;
1361 ScmConvProc convproc[2];
1362 ScmConvReset reset;
1363 iconv_t handle = (iconv_t)-1;
1364
1365 incode = conv_name_find(fromCode);
1366 outcode = conv_name_find(toCode);
1367
1368 if (incode == JCODE_NONE || outcode == JCODE_NONE) {
1369 /* conversion to/from none means no conversion */
1370 handler = jconv_ident;
1371 convproc[0] = convproc[1] = NULL;
1372 reset = NULL;
1373 } else if (incode < 0 || outcode < 0) {
1374 #ifdef HAVE_ICONV_H
1375 /* try iconv */
1376 handle = iconv_open(toCode, fromCode);
1377 if (handle == (iconv_t)-1) return NULL;
1378 handler = jconv_iconv;
1379 convproc[0] = convproc[1] = NULL;
1380 reset = jconv_iconv_reset;
1381 #else /*!HAVE_ICONV_H*/
1382 return NULL;
1383 #endif
1384 } else if (incode == outcode) {
1385 /* pattern (1) */
1386 handler = jconv_ident;
1387 convproc[0] = convproc[1] = NULL;
1388 reset = NULL;
1389 } else if (incode == JCODE_EUCJ) {
1390 /* pattern (2) */
1391 handler = jconv_1tier;
1392 convproc[0] = conv_converter[outcode].outconv;
1393 convproc[1] = NULL;
1394 reset = conv_converter[outcode].reset;
1395 } else if (outcode == JCODE_EUCJ) {
1396 /* pattern (3) */
1397 handler = jconv_1tier;
1398 convproc[0] = conv_converter[incode].inconv;
1399 convproc[1] = NULL;
1400 reset = NULL;
1401 } else {
1402 /* pattern (4) */
1403 handler = jconv_2tier;
1404 convproc[0] = conv_converter[incode].inconv;
1405 convproc[1] = conv_converter[outcode].outconv;
1406 reset = conv_converter[outcode].reset;
1407 }
1408 info = SCM_NEW(ScmConvInfo);
1409 info->jconv = handler;
1410 info->convproc[0] = convproc[0];
1411 info->convproc[1] = convproc[1];
1412 info->reset = reset;
1413 info->handle = handle;
1414 info->toCode = toCode;
1415 info->istate = info->ostate = JIS_ASCII;
1416 info->fromCode = fromCode;
1417 return info;
1418 }
1419
1420 /*------------------------------------------------------------------
1421 * JCONV_CLOSE
1422 */
1423 int jconv_close(ScmConvInfo *info)
1424 {
1425 int r = 0;
1426 #ifdef HAVE_ICONV_H
1427 if (info->handle != (iconv_t)-1) {
1428 r = iconv_close(info->handle);
1429 info->handle = (iconv_t)-1;
1430 }
1431 #endif /*HAVE_ICONV_H*/
1432 return r;
1433 }
1434
1435 /*------------------------------------------------------------------
1436 * JCONV - main conversion routine
1437 */
1438 size_t jconv(ScmConvInfo *info,
1439 const char **inptr, size_t *inroom,
1440 char **outptr, size_t *outroom)
1441 {
1442 SCM_ASSERT(info->jconv != NULL);
1443 return info->jconv(info, inptr, inroom, outptr, outroom);
1444 }
1445
1446 /*------------------------------------------------------------------
1447 * JCONV_RESET - reset
1448 */
1449 size_t jconv_reset(ScmConvInfo *info, char *outptr, size_t outroom)
1450 {
1451 if (info->reset) {
1452 return info->reset(info, outptr, outroom);
1453 } else {
1454 return 0;
1455 }
1456 }