root/ext/charconv/jconv.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sjis2eucj
  2. eucj2sjis
  3. utf2euc_emit_euc
  4. utf2euc_2
  5. utf2euc_3
  6. utf2euc_4
  7. utf2eucj
  8. jconv_ucs4_to_utf8
  9. eucj2utf_emit_utf
  10. eucj2utf
  11. jis_esc
  12. jis2eucj
  13. jis_ensure_state
  14. eucj2jis
  15. jis_reset
  16. pivot
  17. conv_name_match
  18. conv_name_find
  19. jconv_ident
  20. jconv_1tier
  21. jconv_2tier
  22. jconv_iconv
  23. jconv_iconv_reset
  24. jconv_open
  25. jconv_close
  26. jconv
  27. jconv_reset

   1 /*
   2  * jconv.c - alternative japanese code conversion routines
   3  *
   4  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: jconv.c,v 1.19 2004/09/15 00:37:12 shirok Exp $
  34  */
  35 
  36 /* Some iconv() implementations don't support japanese character encodings,
  37  * or have problems handling them.  This code provides an alternative way
  38  * to convert these encodings.
  39  */
  40 
  41 /* This file handles conversion among UTF8, Shift-JIS, EUC_JP, and ISO2022JP.
  42  * Shift-JIS and EUC_JP are based on JIS X 0213:2000.  ISO2022JP partially
  43  * handles ISO2022-JP-3 as well.
  44  *
  45  * EUC_JP is used as a 'pivot' encoding, for it can naturally handle
  46  * JISX 0201, JISX 0208, JISX 0212 and JISx 0213 characters.
  47  */
  48 
  49 #include <ctype.h>
  50 #include "charconv.h"
  51 
  52 #define INCHK(n)   do{if (inroom < (n)) return INPUT_NOT_ENOUGH;}while(0)
  53 #define OUTCHK(n)  do{if (outroom < (n)) return OUTPUT_NOT_ENOUGH;}while(0)
  54 
  55 #define ERRP(n)    ((n)==INPUT_NOT_ENOUGH||(n)==OUTPUT_NOT_ENOUGH||(n)==ILLEGAL_SEQUENCE)
  56 
  57 /* Substitution characters.
  58  *  Unrecognized 1-byte character is substituted by SUBST1_CHAR.
  59  *  It's common to all encodings.
  60  *  Unrecognized or uncovertable multibyte character is substituted
  61  *  by so-called 'Geta-sign'.
  62  */
  63 #define SUBST1_CHAR   '?'
  64 #define EUCJ_SUBST2_CHAR1  0xa2
  65 #define EUCJ_SUBST2_CHAR2  0xae
  66 #define JIS_SUBST2_CHAR1   0x02
  67 #define JIS_SUBST2_CHAR2   0x0e
  68 #define SJIS_SUBST2_CHAR1  0x81
  69 #define SJIS_SUBST2_CHAR2  0xac
  70 #define UTF8_SUBST2_CHAR1   0xe3
  71 #define UTF8_SUBST2_CHAR2   0x80
  72 #define UTF8_SUBST2_CHAR3   0x93
  73 
  74 #define EUCJ_SUBST                              \
  75   do { OUTCHK(2);                               \
  76        outptr[0] = EUCJ_SUBST2_CHAR1;           \
  77        outptr[1] = EUCJ_SUBST2_CHAR2;           \
  78        *outchars = 2; } while (0)
  79 
  80 #define SJIS_SUBST                              \
  81   do { OUTCHK(2);                               \
  82        outptr[0] = SJIS_SUBST2_CHAR1;           \
  83        outptr[1] = SJIS_SUBST2_CHAR2;           \
  84        *outchars = 2; } while (0)
  85 
  86 #define UTF8_SUBST                              \
  87   do { OUTCHK(3);                               \
  88        outptr[0] = UTF8_SUBST2_CHAR1;           \
  89        outptr[1] = UTF8_SUBST2_CHAR2;           \
  90        outptr[2] = UTF8_SUBST2_CHAR2;           \
  91        *outchars = 3; } while (0)
  92 
  93 /*=================================================================
  94  * Shift JIS
  95  */
  96 
  97 /* Shift_JISX0213 -> EUC-JP
  98  * 
  99  * Mapping anormalities
 100  *
 101  *   0x5c, 0x7e : Shift_JISX0213 mapping table maps 0x5c to U+00A5
 102  *       (YEN SIGN) and 0x7e to U+203E (OVERLINE).  But mapping so
 103  *       breaks the program code written in Shift JIS.   I map them
 104  *       to the corresponding ASCII chars.
 105  *   0xfd, 0xfe, 0xff : These are reserved bytes.  Apple uses these
 106  *       bytes for vendor extension:
 107  *        0xfd - U+00A9 COPYRIGHT SIGN     |EUC A9A6  |JISX0213
 108  *        0xfe - U+2122 TRADE MARK SIGN    |EUC 8FA2EF|JISX0212
 109  *        0xff - U+2026 HORIZONTAL ELLIPSIS|EUC A1C4  |JISX0208
 110  *       This is a one-direction mapping.
 111  *   0x80, 0xa0 : These are reserved bytes.  Replaced to the
 112  *       one-byte substitution character of destination encoding.
 113  *
 114  * Conversion scheme
 115  *   0x00-0x7f : corresponding ASCII range.
 116  *   0x80      : substitution character
 117  *   0x81 -- 0x9f : first byte (s1) of double byte range for JIS X 0213 m=1
 118  *   0xa0      : substitution character
 119  *   0xa1 -- 0xdf : JISX 0201 kana = s1-0x80
 120  *   0xe0 -- 0xef : first byte (s1) of double byte range for JIS X 0213 m=1
 121  *   0xf0 -- 0xfc : first byte (s1) of double byte range for JIS X 0213 m=2
 122  *   0xfd : U+00A9, EUC A9A6, JISX0213 (1, 0x09, 0x06)
 123  *   0xfe : U+2122, EUC 8FA2EF, JISX0212
 124  *   0xff : U+2026, EUC A1C4, JISX0208 (1, 0x01, 0x24)
 125  *
 126  *   For double-byte character, second byte s2 must be in the range of
 127  *   0x40 <= s2 <= 0x7e or 0x80 <= s2 <= 0xfc.  Otherwise, double-byte
 128  *   substitution character is used.
 129  *
 130  *     two bytes (s1, s2) maps to JIS X 0213 (m, k, t) by
 131  *        m = 1 if s1 <= 0xef, 2 otherwise
 132  *        k = (s1-0x80)*2 - ((s2 < 0x9f)? 1 : 0)  if s1 <= 0x9f
 133  *            (s1-0xc0)*2 - ((s2 < 0x9f)? 1 : 0)  if 0xe0 <= s1 <= 0xef
 134  *            (s1-0x9e)*2 - ((s2 < 0x89)? 1 : 0)  if s1 >= 0xf5
 135  *            otherwise, use the following table
 136  *               s1   k (s2>=0x80, s2<0x80)
 137  *              0xf0   (0x01, 0x08)
 138  *              0xf1   (0x03, 0x04)
 139  *              0xf2   (0x05, 0x0c)
 140  *              0xf3   (0x0e, 0x0d)
 141  *              0xf4   (0x0f, 0x4e)
 142  *        t = s2-0x3f if s2 < 0x7f
 143  *            s2-0x40 if s2 < 0x9f
 144  *            s2-0x9e otherwise
 145  *
 146  *     JIS X 0213 to EUC-JP is a straightfoward conversion.
 147  */
 148 
 149 static size_t sjis2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
 150                         char *outptr, size_t outroom, size_t *outchars)
 151 {
 152     unsigned char s1, s2;
 153     static unsigned char cvt[] = { 0xa1, 0xa8, 0xa3, 0xa4, 0xa5, 0xac, 0xae, 0xad, 0xaf, 0xee };
 154 
 155     s1 = inptr[0];
 156     if (s1 < 0x7f) {
 157         *outptr = s1;
 158         *outchars = 1;
 159         return 1;
 160     }
 161     if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 <= 0xfc)) {
 162         /* Double byte char */
 163         unsigned char m, e1, e2;
 164         INCHK(2);
 165         s2 = inptr[1];
 166         if (s2 < 0x40 || s2 > 0xfc) {
 167             EUCJ_SUBST;
 168             return 2;
 169         }
 170 
 171         if (s1 <= 0x9f) {
 172             OUTCHK(2);
 173             m = 1;
 174             e1 = (s1-0x80)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
 175         } else if (s1 <= 0xef) {
 176             OUTCHK(2);
 177             m = 1;
 178             e1 = (s1-0xc0)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
 179         } else if (s1 >= 0xf5) {
 180             OUTCHK(3);
 181             m = 2;
 182             e1 = (s1-0xf5)*2 + 0x50 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
 183         } else {
 184             OUTCHK(3);
 185             m = 2;
 186             e1 = cvt[(s1-0xf0)*2+((s2 < 0x9f)? 1 : 0)];
 187         }
 188         
 189         if (s2 < 0x7f) {
 190             e2 = s2 - 0x3f + 0xa0;
 191         } else if (s2 < 0x9f) {
 192             e2 = s2 - 0x40 + 0xa0;
 193         } else {
 194             e2 = s2 - 0x9e + 0xa0;
 195         }
 196         if (m == 1) {
 197             outptr[0] = e1;
 198             outptr[1] = e2;
 199             *outchars = 2;
 200         } else {
 201             outptr[0] = 0x8f;
 202             outptr[1] = e1;
 203             outptr[2] = e2;
 204             *outchars = 3;
 205         }
 206         return 2;
 207     }
 208     if (s1 >= 0xa1 && s1 <= 0xdf) {
 209         /* JISX0201 KANA */
 210         OUTCHK(2);
 211         outptr[0] = 0x8e;
 212         outptr[1] = s1;
 213         *outchars = 2;
 214         return 1;
 215     }
 216     if (s1 == 0xfd) {
 217         /* copyright mark */
 218         OUTCHK(2);
 219         outptr[0] = 0xa9;
 220         outptr[1] = 0xa6;
 221         *outchars = 2;
 222         return 1;
 223     }
 224     if (s1 == 0xfe) {
 225         /* trademark sign.  this is not in JISX0213, but in JISX0212. */
 226         OUTCHK(3);
 227         outptr[0] = 0x8f;
 228         outptr[1] = 0xa2;
 229         outptr[2] = 0xef;
 230         *outchars = 3;
 231         return 1;
 232     }
 233     if (s1 == 0xff) {
 234         /* horizontal ellipsis. */
 235         OUTCHK(2);
 236         outptr[0] = 0xa1;
 237         outptr[1] = 0xc4;
 238         *outchars = 2;
 239         return 1;
 240     }
 241     
 242     /* s1 == 0x80 or 0xa0 */
 243     outptr[0] = SUBST1_CHAR;
 244     *outchars = 1;
 245     return 1;
 246 }
 247 
 248 /* EUC_JISX0213 -> Shift_JIS
 249  * 
  250  * Mapping anomalies
 251  *
 252  *   0x80--0xa0 except 0x8e and 0x8f : C1 region.
 253  *          Doesn't have corresponding SJIS bytes,
 254  *          so mapped to substitution char.
 255  *   0xff : reserved byte.  mapped to substitution char.
 256  *
 257  * Conversion scheme
 258  *   0x00-0x7f : corresponding ASCII range.
 259  *   0x80--0x8d : substitution char.
 260  *   0x8e : leading byte of JISX 0201 kana
 261  *   0x8f : leading byte of JISX 0212 or JISX 0213 plane 2
 262  *   0x90--0xa0 : substitution char.
 263  *   0xa1--0xfe : first byte (e1) of JISX 0213 plane 1
 264  *   0xff : substitution char
 265  *
  266  *   For a double- or triple-byte character, each subsequent byte has to be
  267  *   in the range between 0xa1 and 0xfe inclusive.  If not, the character
  268  *   is replaced by the substitution character.
 269  *   
 270  *   If the first byte is in the range of 0xa1--0xfe, two bytes (e1, e2)
 271  *   is mapped to SJIS (s1, s2) by:
 272  *
 273  *     s1 = (e1 - 0xa0 + 0x101)/2 if 0xa1 <= e1 <= 0xde
 274  *          (e1 - 0xa0 + 0x181)/2 if 0xdf <= e1 <= 0xfe
 275  *     s2 = (e2 - 0xa0 + 0x3f) if odd?(e1) && 0xa1 <= e2 <= 0xdf
 276  *          (e2 - 0xa0 + 0x40) if odd?(e1) && 0xe0 <= e2 <= 0xfe
 277  *          (e2 - 0xa0 + 0x9e) if even?(e1)
 278  *
 279  *   If the first byte is 0x8f, the second byte (e1) and the third byte
 280  *   (e2) is mapped to SJIS (s1, s2) by:
 281  *     if (0xee <= e1 <= 0xfe)  s1 = (e1 - 0xa0 + 0x19b)/2
 282  *     otherwise, follow the table:
 283  *       e1 == 0xa1 or 0xa8  => s1 = 0xf0
 284  *       e1 == 0xa3 or 0xa4  => s1 = 0xf1
 285  *       e1 == 0xa5 or 0xac  => s1 = 0xf2
 286  *       e1 == 0xae or 0xad  => s1 = 0xf3
 287  *       e1 == 0xaf          => s1 = 0xf4
 288  *     If e1 is other value, it is JISX0212; we use substitution char.
 289  *     s2 is mapped with the same rule above.
 290  */
 291 
 292 static size_t eucj2sjis(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
 293                         char *outptr, size_t outroom, size_t *outchars)
 294 {
 295     unsigned char e1, e2;
 296     e1 = inptr[0];
 297     if (e1 <= 0x7f) {
 298         outptr[0] = e1;
 299         *outchars = 1;
 300         return 1;
 301     }
 302     if (e1 >= 0xa1 && e1 <= 0xfe) {
 303         /* double byte char (JISX 0213 plane 1) */
 304         unsigned char s1, s2;
 305         INCHK(2);
 306         e2 = inptr[1];
 307         if (e2 < 0xa1 || e2 == 0xff) {
 308             SJIS_SUBST;
 309             return 2;
 310         }
 311         OUTCHK(2);
 312         if (e1 <= 0xde) s1 = (e1 - 0xa0 + 0x101)/2;
 313         else            s1 = (e1 - 0xa0 + 0x181)/2;
 314         if (e1%2 == 0) {
 315             s2 = e2 - 0xa0 + 0x9e;
 316         } else {
 317             if (e2 <= 0xdf) s2 = e2 - 0xa0 + 0x3f;
 318             else            s2 = e2 - 0xa0 + 0x40;
 319         }
 320         outptr[0] = s1;
 321         outptr[1] = s2;
 322         *outchars = 2;
 323         return 2;
 324     }
 325     if (e1 == 0x8e) {
 326         /* JISX 0201 kana */
 327         INCHK(2);
 328         e2 = inptr[1];
 329         if (e2 < 0xa1 || e2 == 0xff) {
 330             outptr[0] = SUBST1_CHAR;
 331         } else {
 332             outptr[0] = e2;
 333         }
 334         *outchars = 1;
 335         return 2;
 336     }
 337     if (e1 == 0x8f) {
 338         /* triple byte char */
 339         unsigned char s1, s2;
 340         unsigned char cvt[] = { 0xf0, 0, 0xf1, 0xf1, 0xf2, 0, 0, 0xf0, 0, 0, 0, 0xf2, 0xf3, 0xf3, 0xf4 };
 341         
 342         INCHK(3);
 343         OUTCHK(2);
 344         e1 = inptr[1];
 345         e2 = inptr[2];
 346         if (e1 < 0xa1 || e1 == 0xff || e2 < 0xa1 || e2 == 0xff) {
 347             SJIS_SUBST;
 348             return 3;
 349         }
 350         if (e1 >= 0xee) {
 351             s1 = (e1 - 0xa0 + 0x19b)/2;
 352         } else if (e1 >= 0xb0) {
 353             SJIS_SUBST;
 354             return 3;
 355         } else {
 356             s1 = cvt[e1-0xa1];
 357             if (s1 == 0) {
 358                 SJIS_SUBST;
 359                 return 3;
 360             }
 361         }
 362         if (e1%2 == 0) {
 363             s2 = e2 - 0xa0 + 0x9e;
 364         } else {
 365             if (e2 < 0xdf) s2 = e2 - 0xa0 + 0x3f;
 366             else           s2 = e2 - 0xa0 + 0x40;
 367         }
 368         outptr[0] = s1;
 369         outptr[1] = s2;
 370         *outchars = 2;
 371         return 3;
 372     }
 373     /* no corresponding char */
 374     *outptr = SUBST1_CHAR;
 375     *outchars = 1;
 376     return 1;
 377 }
 378 
 379 /*=================================================================
 380  * UTF8
 381  */
 382 
 383 /* Conversion between UTF8 and EUC_JP is based on the table found at
 384  * http://isweb11.infoseek.co.jp/computer/wakaba/table/jis-note.ja.html
 385  *
 386  * There are some characters in JISX0213 that can't be represented
 387  * in a single Unicode character, but can be with a combining character.
 388  * In such case, EUC_JP to UTF8 conversion uses combining character,
 389  * but UTF8 to EUC_JP conversion translates the combining character into
  390  * another character.  For example, a single JISX0213 kana 'nga'
  391  * (hiragana "ka" with han-dakuon mark) is translated to Unicode
  392  * U+304B+309A (HIRAGANA LETTER KA + COMBINING KATAKANA-HIRAGANA SEMI-VOICED
  393  * SOUND MARK).  When this sequence is converted to EUC_JP again, it
  394  * becomes EUCJ 0xA4AB + 0xA1AC.  This is an implementation limitation,
  395  * and should be removed in a later release.
 396  */
 397 
 398 /* [UTF8 -> EUC_JP conversion]
 399  *
 400  * EUC-JP has the corresponding characters to the wide range of
 401  * UCS characters.
 402  *
 403  *   UCS4 character   # of EUC_JP characters
 404  *   ---------------------------------------
 405  *     U+0000+0xxx    564
 406  *     U+0000+1xxx      6
 407  *     U+0000+2xxx    321
 408  *     U+0000+3xxx    422
 409  *     U+0000+4xxx    347
 410  *     U+0000+5xxx   1951
 411  *     U+0000+6xxx   2047
 412  *     U+0000+7xxx   1868
 413  *     U+0000+8xxx   1769
 414  *     U+0000+9xxx   1583
 415  *     U+0000+fxxx    241
 416  *     U+0002+xxxx    302
 417  *
  418  * It is so wide and so sparse that a naive lookup table implementation from
  419  * UCS to EUC would waste space.  I use a hierarchical table with some
  420  * ad-hoc heuristics.   Since the hierarchical table is used, I directly
  421  * translate UTF8 to EUC_JP, without converting it to UCS4.
 422  *
 423  * Strategy outline: say input consists of bytes named u0, u1, ....
 424  *
 425  *  u0 <= 0x7f  : ASCII range
 426  *  u0 in [0xc2-0xd1] : UTF8 uses 2 bytes.  Some mappings within this range
 427  *         is either very regular or very small, and they are
 428  *         hardcoded.   Other mappings uses table lookup.
 429  *  u0 == 0xe1  : UTF8 uses 3 bytes.  There are only 6 characters in this
 430  *         range, and it is hardcoded.
 431  *  u0 in [0xe2-0xe9, 0xef] : Large number of characters are in this range.
 432  *         Two-level table of 64 entries each is used to dispatch the
 433  *         characters.
 434  *  u0 == 0xf0  : UTF8 uses 4 bytes.  u1 is in [0xa0-0xaa].  u2 and u3 is
 435  *         used for dispatch table of 64 entries each.
 436  *
 437  * The final table entry is unsigned short.  0x0000 means no corresponding
 438  * character is defined in EUC_JP.  >=0x8000 is the EUC_JP character itself.
 439  * < 0x8000 means the character is in G3 plane; 0x8f should be preceded,
 440  * and 0x8000 must be added to the value.
 441  */
 442 
 443 #include "ucs2eucj.c"
 444 
 445 /* Emit given euc char */
 446 static inline size_t utf2euc_emit_euc(unsigned short euc, size_t inchars, char *outptr, size_t outroom, size_t *outchars)
 447 {
 448     if (euc == 0) {
 449         EUCJ_SUBST;
 450     } else if (euc < 0x8000) {
 451         OUTCHK(3);
 452         outptr[0] = 0x8f;
 453         outptr[1] = (euc >> 8) + 0x80;
 454         outptr[2] = euc & 0xff;
 455         *outchars = 3;
 456     } else {
 457         OUTCHK(2);
 458         outptr[0] = (euc >> 8);
 459         outptr[1] = euc & 0xff;
 460         *outchars = 2;
 461     }
 462     return inchars;
 463 }
 464 
 465 /* handle 2-byte UTF8 sequence.  0xc0 <= u0 <= 0xdf */
 466 static inline size_t utf2euc_2(ScmConvInfo *cinfo, unsigned char u0,
 467                                const char *inptr, size_t inroom,
 468                                char *outptr, size_t outroom, size_t *outchars)
 469 {
 470     unsigned char u1;
 471     unsigned short *etab = NULL;
 472     
 473     INCHK(2);
 474     u1 = (unsigned char)inptr[1];
 475     if (u1 < 0x80 || u1 >= 0xc0) return ILLEGAL_SEQUENCE;
 476 
 477     switch (u0) {
 478     case 0xc2: etab = utf2euc_c2; break;
 479     case 0xc3: etab = utf2euc_c3; break;
 480     case 0xc4: etab = utf2euc_c4; break;
 481     case 0xc5: etab = utf2euc_c5; break;
 482     case 0xc6:
 483         if (u1 == 0x93) { /* U+0193 -> euc ABA9 */
 484             return utf2euc_emit_euc(0xaba9, 2, outptr, outroom, outchars);
 485         } else break;
 486     case 0xc7: etab = utf2euc_c7; break;
 487     case 0xc9: etab = utf2euc_c9; break;
 488     case 0xca: etab = utf2euc_ca; break;
 489     case 0xcb: etab = utf2euc_cb; break;
 490     case 0xcc: etab = utf2euc_cc; break;
 491     case 0xcd:
 492         if (u1 == 0xa1) { /* U+0361 -> euc ABD2 */
 493             return utf2euc_emit_euc(0xabd2, 2, outptr, outroom, outchars);
 494         } else break;
 495     case 0xce: etab = utf2euc_ce; break;
 496     case 0xcf: etab = utf2euc_cf; break;
 497     case 0xd0: etab = utf2euc_d0; break;
 498     case 0xd1: etab = utf2euc_d1; break;
 499     default:
 500         break;
 501     }
 502     if (etab != NULL) {
 503         /* table lookup */
 504         return utf2euc_emit_euc(etab[u1-0x80], 2, outptr, outroom, outchars);
 505     }
 506     EUCJ_SUBST;
 507     return 2;
 508 }
 509 
 510 /* handle 3-byte UTF8 sequence.  0xe0 <= u0 <= 0xef */
 511 static inline size_t utf2euc_3(ScmConvInfo *cinfo, unsigned char u0,
 512                                const char *inptr, size_t inroom,
 513                                char *outptr, size_t outroom, size_t *outchars)
 514 {
 515     unsigned char u1, u2;
 516     unsigned char *tab1 = NULL;
 517     unsigned short (*tab2)[64] = NULL;
 518 
 519     INCHK(3);
 520     u1 = (unsigned char)inptr[1];
 521     u2 = (unsigned char)inptr[2];
 522     
 523     switch (u0) {
 524     case 0xe1: /* special case : there's only 6 chars */
 525         {
 526             unsigned short euc = 0;
 527             if (u1 == 0xb8) {
 528                 if (u2 == 0xbe)      euc = 0xa8f2;
 529                 else if (u2 == 0xbf) euc = 0xa8f3;
 530             } else if (u1 == 0xbd) {
 531                 if (u2 == 0xb0)      euc = 0xabc6;
 532                 else if (u2 == 0xb1) euc = 0xabc7;
 533                 else if (u2 == 0xb2) euc = 0xabd0;
 534                 else if (u2 == 0xb3) euc = 0xabd1;
 535             }
 536             return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
 537         }
 538     case 0xe2: tab1 = utf2euc_e2; tab2 = utf2euc_e2_xx; break;
 539     case 0xe3: tab1 = utf2euc_e3; tab2 = utf2euc_e3_xx; break;
 540     case 0xe4: tab1 = utf2euc_e4; tab2 = utf2euc_e4_xx; break;
 541     case 0xe5: tab1 = utf2euc_e5; tab2 = utf2euc_e5_xx; break;
 542     case 0xe6: tab1 = utf2euc_e6; tab2 = utf2euc_e6_xx; break;
 543     case 0xe7: tab1 = utf2euc_e7; tab2 = utf2euc_e7_xx; break;
 544     case 0xe8: tab1 = utf2euc_e8; tab2 = utf2euc_e8_xx; break;
 545     case 0xe9: tab1 = utf2euc_e9; tab2 = utf2euc_e9_xx; break;
 546     case 0xef: tab1 = utf2euc_ef; tab2 = utf2euc_ef_xx; break;
 547     default:
 548         break;
 549     }
 550     if (tab1 != NULL) {
 551         unsigned char ind = tab1[u1-0x80];
 552         if (ind != 0) {
 553             return utf2euc_emit_euc(tab2[ind-1][u2-0x80], 3, outptr, outroom, outchars);
 554         }
 555     }
 556     EUCJ_SUBST;
 557     return 3;
 558 }
 559 
 560 /* handle 4-byte UTF8 sequence.  u0 == 0xf0, 0xa0 <= u1 <= 0xaa */
 561 static inline size_t utf2euc_4(ScmConvInfo *cinfo, unsigned char u0,
 562                                const char *inptr, size_t inroom,
 563                                char *outptr, size_t outroom, size_t *outchars)
 564 {
 565     unsigned char u1, u2, u3;
 566     unsigned short *tab = NULL;
 567 
 568     INCHK(4);
 569     if (u0 != 0xf0) {
 570         EUCJ_SUBST;
 571         return 4;
 572     }
 573     u1 = (unsigned char)inptr[1];
 574     u2 = (unsigned char)inptr[2];
 575     u3 = (unsigned char)inptr[3];
 576     
 577     switch (u1) {
 578     case 0xa0: tab = utf2euc_f0_a0; break;
 579     case 0xa1: tab = utf2euc_f0_a1; break;
 580     case 0xa2: tab = utf2euc_f0_a2; break;
 581     case 0xa3: tab = utf2euc_f0_a3; break;
 582     case 0xa4: tab = utf2euc_f0_a4; break;
 583     case 0xa5: tab = utf2euc_f0_a5; break;
 584     case 0xa6: tab = utf2euc_f0_a6; break;
 585     case 0xa7: tab = utf2euc_f0_a7; break;
 586     case 0xa8: tab = utf2euc_f0_a8; break;
 587     case 0xa9: tab = utf2euc_f0_a9; break;
 588     case 0xaa: tab = utf2euc_f0_aa; break;
 589     default:
 590         break;
 591     }
 592     if (tab != NULL) {
 593         int i;
 594         unsigned short u2u3 = u2*256 + u3;
 595         for (i=0; tab[i]; i+=2) {
 596             if (tab[i] == u2u3) {
 597                 return utf2euc_emit_euc(tab[i+1], 4, outptr, outroom, outchars);
 598             }
 599         }
 600     }
 601     EUCJ_SUBST;
 602     return 4;
 603 }
 604 
 605 /* Body of UTF8 -> EUC_JP conversion */
 606 static size_t utf2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
 607                        char *outptr, size_t outroom, size_t *outchars)
 608 {
 609     unsigned char u0;
 610     
 611     u0 = (unsigned char)inptr[0];
 612     if (u0 <= 0x7f) {
 613         *outptr = u0;
 614         *outchars = 1;
 615         return 1;
 616     }
 617     if (u0 <= 0xbf) {
 618         /* invalid UTF8 sequence */
 619         return ILLEGAL_SEQUENCE;
 620     }
 621     if (u0 <= 0xdf) {
 622         /* 2-byte UTF8 sequence */
 623         return utf2euc_2(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
 624     }
 625     if (u0 <= 0xef) {
 626         /* 3-byte UTF8 sequence */
 627         return utf2euc_3(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
 628     }
 629     if (u0 <= 0xf7) {
 630         /* 4-byte UTF8 sequence */
 631         return utf2euc_4(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
 632     }
 633     if (u0 <= 0xfb) {
 634         /* 5-byte UTF8 sequence */
 635         INCHK(5);
 636         EUCJ_SUBST;
 637         return 5;
 638     }
 639     if (u0 <= 0xfd) {
 640         /* 6-byte UTF8 sequence */
 641         INCHK(6);
 642         EUCJ_SUBST;
 643         return 6;
 644     }
 645     return ILLEGAL_SEQUENCE;
 646 }
 647 
 648 /* [EUC_JP -> UTF8 conversion]
 649  *
 650  * Conversion strategy:
 651  *   If euc0 is in ASCII range, or C1 range except 0x8e or 0x8f, map it as is.
 652  *   If euc0 is 0x8e, use JISX0201-KANA table.
 653  *   If euc0 is 0x8f, use JISX0213 plane 2 table.
 654  *   If euc0 is in [0xa1-0xfe], use JISX0213 plane1 table.
 655  *   If euc0 is 0xa0 or 0xff, return ILLEGAL_SEQUENCE.
 656  *
  657  * The JISX0213 plane 2 table consists of a 2-level tree.  The first level
  658  * returns an index into the second-level table, looked up by (euc1 - 0xa1).
  659  * Only the region defined by JISX0213 is converted; the JISX0212 region is
  660  * mapped to the substitution char.
 661  */
 662 
 663 #include "eucj2ucs.c"
 664 
 665 /* UTF8 utility.  Similar stuff is included in gauche/char_utf_8.h
 666    if the native encoding is UTF8, but not otherwise.
 667    So I include them here as well. */
 668 
 669 void jconv_ucs4_to_utf8(unsigned int ucs, char *cp)
 670 {
 671     if (ucs < 0x80) {
 672         *cp = ucs;
 673     }
 674     else if (ucs < 0x800) {
 675         *cp++ = ((ucs>>6)&0x1f) | 0xc0;
 676         *cp = (ucs&0x3f) | 0x80;
 677     }
 678     else if (ucs < 0x10000) {
 679         *cp++ = ((ucs>>12)&0x0f) | 0xe0;
 680         *cp++ = ((ucs>>6)&0x3f) | 0x80;
 681         *cp = (ucs&0x3f) | 0x80;
 682     }
 683     else if (ucs < 0x200000) {
 684         *cp++ = ((ucs>>18)&0x07) | 0xf0;
 685         *cp++ = ((ucs>>12)&0x3f) | 0x80;
 686         *cp++ = ((ucs>>6)&0x3f) | 0x80;
 687         *cp = (ucs&0x3f) | 0x80;
 688     }
 689     else if (ucs < 0x4000000) {
 690         *cp++ = ((ucs>>24)&0x03) | 0xf8;
 691         *cp++ = ((ucs>>18)&0x3f) | 0x80;
 692         *cp++ = ((ucs>>12)&0x3f) | 0x80;
 693         *cp++ = ((ucs>>6)&0x3f) | 0x80;
 694         *cp = (ucs&0x3f) | 0x80;
 695     } else {
 696         *cp++ = ((ucs>>30)&0x1) | 0xfc;
 697         *cp++ = ((ucs>>24)&0x3f) | 0x80;
 698         *cp++ = ((ucs>>18)&0x3f) | 0x80;
 699         *cp++ = ((ucs>>12)&0x3f) | 0x80;
 700         *cp++ = ((ucs>>6)&0x3f) | 0x80;
 701         *cp++ = (ucs&0x3f) | 0x80;
 702     }
 703 }
 704 
 705 /* Given 'encoded' ucs, emit utf8.  'Encoded' ucs is the entry of the
 706    conversion table.  If ucs >= 0x100000, it is composed by two UCS2
 707    character.  Otherwise, it is one UCS4 character. */
 708 static inline size_t eucj2utf_emit_utf(unsigned int ucs, size_t inchars,
 709                                        char *outptr, size_t outroom,
 710                                        size_t *outchars)
 711 {
 712     if (ucs == 0) {
 713         UTF8_SUBST;
 714     } else if (ucs < 0x100000) {
 715         int outreq = UCS2UTF_NBYTES(ucs);
 716         OUTCHK(outreq);
 717         jconv_ucs4_to_utf8(ucs, outptr);
 718         *outchars = outreq;
 719     } else {
 720         /* we need two UCS characters */
 721         unsigned int ucs0 = (ucs >> 16) & 0xffff;
 722         unsigned int ucs1 = ucs & 0xfff;
 723         int outreq0 = UCS2UTF_NBYTES(ucs0);
 724         int outreq1 = UCS2UTF_NBYTES(ucs1);
 725         OUTCHK(outreq0+outreq1);
 726         jconv_ucs4_to_utf8(ucs0, outptr);
 727         jconv_ucs4_to_utf8(ucs1, outptr+outreq0);
 728         *outchars = outreq0+outreq1;
 729     }
 730     return inchars;
 731 }
 732 
 733 static size_t eucj2utf(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
 734                        char *outptr, size_t outroom, size_t *outchars)
 735 {
 736     unsigned char e0, e1, e2;
 737     unsigned int ucs;
 738     
 739     e0 = (unsigned char)inptr[0];
 740     if (e0 < 0xa0) {
 741         if (e0 == 0x8e) {
 742             /* JIS X 0201 KANA */
 743             INCHK(2);
 744             e1 = (unsigned char)inptr[1];
 745             if (e1 < 0xa1 || e1 > 0xdf) return ILLEGAL_SEQUENCE;
 746             ucs = 0xff61 + (e1 - 0xa1);
 747             return eucj2utf_emit_utf(ucs, 2, outptr, outroom, outchars);
 748         }
 749         else if (e0 == 0x8f) {
 750             /* JIS X 0213 plane 2 */
 751             int index;
 752             
 753             INCHK(3);
 754             e1 = (unsigned char)inptr[1];
 755             e2 = (unsigned char)inptr[2];
 756             if (e1 < 0xa1 || e1 > 0xfe || e2 < 0xa1 || e2 > 0xfe) {
 757                 return ILLEGAL_SEQUENCE;
 758             }
 759             index = euc_jisx0213_2_index[e1 - 0xa1];
 760             if (index < 0) {
 761                 UTF8_SUBST;
 762                 return 3;
 763             }
 764             ucs = euc_jisx0213_2_to_ucs2[index][e2 - 0xa1];
 765             return eucj2utf_emit_utf(ucs, 3, outptr, outroom, outchars);
 766         }
 767         else {
 768             /* ASCII or C1 region */
 769             outptr[0] = e0;
 770             *outchars = 1;
 771             return 1;
 772         }
 773     }
 774     if (e0 > 0xa0 && e0 < 0xff) {
 775         /* JIS X 0213 plane 1 */
 776         INCHK(2);
 777         e1 = (unsigned char)inptr[1];
 778         if (e1 < 0xa1 || e1 > 0xfe) return ILLEGAL_SEQUENCE;
 779         ucs = euc_jisx0213_1_to_ucs2[e0 - 0xa1][e1 - 0xa1];
 780         return eucj2utf_emit_utf(ucs, 2, outptr, outroom, outchars);
 781     }
 782     return ILLEGAL_SEQUENCE;
 783 }
 784 
 785 /*=================================================================
 786  * ISO2022-JP
 787  */
 788 
 789 /* ISO2022-JP{-1(,2),3} -> EUC_JP
 790  * Strategy: accepts as many possibilities as possible.
 791  * The following escape sequence is recognized:
 792  * (See Lunde, CJKV information processing, O'Reilly, pp.155--158)
 793  *
 794  *  <ESC> ( B     ASCII
 795  *  <ESC> ( J     JIS-Roman
 796  *  <ESC> ( H     JIS-Roman (for compatibility)
 797  *  <ESC> ( I     Half-width katakana (JIS X 0201 kana)
 798  *  <ESC> $ @     JIS C 6226-1978 (78JIS)
 799  *  <ESC> $ B     JIS X 0208-1983 (83JIS)
 800  *  <ESC> $ ( D   JIS X 0212-1990
 801  *  <ESC> $ ( O   JIS X 0213:2000 plane 1
 802  *  <ESC> $ ( P   JIS X 0213:2000 plane 2
 803  *  <ESC> & @ <ESC> $ B   JIS X 0208-1990, JIS X 0208:1997
 804  *  0x0e          JIS7 half-width katakana shift-out
 805  *  0x0f          JIS7 half-width katakana shift-in
 806  *
 807  * The state is reset to ASCII whenever newline character is read.
 808  *
 809  * The following escape sequences defined in ISO2022-JP-2 are recognized,
 810  * but all the characters within the sequence will be replaced by '?'.
 811  *
 812  *  <ESC> $ A     (GB2312-80) unsupported
 813  *  <ESC> $ ( C   (KS X 1001:1992) unsupported
 814  *  <ESC> . A     (ISO8859-1:1998) unsupported
 815  *  <ESC> . F     (ISO8859-7:1998) unsupported
 816  * 
 817  * If any other escape sequence is seen, the converter returns ILLEGAL_SEQUENCE.
 818  *
 819  * JIS8 kana is allowed.
 820  */
 821 
 822 /* input states */
 823 enum {
 824     JIS_ASCII,
 825     JIS_ROMAN,
 826     JIS_KANA,
 827     JIS_78,
 828     JIS_0212,
 829     JIS_0213_1,
 830     JIS_0213_2,
 831     JIS_UNKNOWN,
 832 };
 833 
 834 /* deal with escape sequence.  escape byte itself is already consumed.
 835    returns # of input bytes consumed by the escape sequence,
 836    or an error code.  cinfo->istate is updated accordingly. */
 837 static size_t jis_esc(ScmConvInfo *cinfo, const char *inptr, size_t inroom)
 838 {
 839     unsigned char j1, j2;
 840     INCHK(2);
 841     j1 = inptr[0];
 842     j2 = inptr[1];
 843     switch (j1) {
 844     case '(':
 845         switch (j2) {
 846         case 'B': cinfo->istate = JIS_ASCII; break;
 847         case 'J': cinfo->istate = JIS_ROMAN; break;
 848         case 'H': cinfo->istate = JIS_ROMAN; break;
 849         case 'I': cinfo->istate = JIS_KANA;  break;
 850         default: return ILLEGAL_SEQUENCE;
 851         }
 852         return 2;
 853     case '$':
 854         switch (j2) {
 855         case '@': cinfo->istate = JIS_78; break;
 856         case 'B': cinfo->istate =  JIS_0213_1; break;
 857         case 'A': cinfo->istate =  JIS_UNKNOWN; break;
 858         case '(':
 859             {
 860                 INCHK(3);
 861                 switch (inptr[2]) {
 862                 case 'D': cinfo->istate = JIS_0212; break;
 863                 case 'O': cinfo->istate = JIS_0213_1; break;
 864                 case 'P': cinfo->istate = JIS_0213_2; break;
 865                 case 'C': cinfo->istate = JIS_UNKNOWN; break;
 866                 default:  return ILLEGAL_SEQUENCE;
 867                 }
 868                 return 3;
 869                 break;
 870             }
 871         default: return ILLEGAL_SEQUENCE;
 872         }
 873         return 2;
 874     case '&':
 875         {
 876             INCHK(6);
 877             if (inptr[2] == '@' && inptr[3] == 0x1b && inptr[4] == '$'
 878                 && inptr[5] == 'B') {
 879                 cinfo->istate = JIS_0213_1;
 880                 return 5;
 881             } else {
 882                 return ILLEGAL_SEQUENCE;
 883             }
 884         }
 885     case '.':
 886         switch (inptr[2]) {
 887         case 'A':/*fallthrough*/;
 888         case 'F':   cinfo->istate = JIS_UNKNOWN; break;
 889         default:    return ILLEGAL_SEQUENCE;
 890         }
 891         return 2;
 892     default: return ILLEGAL_SEQUENCE;
 893     }
 894 }
 895 
 896 /* main routine for iso2022-jp -> euc_jp */
 897 static size_t jis2eucj(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
 898                        char *outptr, size_t outroom, size_t *outchars)
 899 {
 900     unsigned char j0, j1;
 901     size_t inoffset = 0, r;
 902     
 903     j0 = inptr[inoffset];
 904     /* skip escape sequence */
 905     while (j0 == 0x1b) {
 906         inoffset++;
 907         r = jis_esc(cinfo, inptr+inoffset, inroom-inoffset);
 908         if (ERRP(r)) return r;
 909         inoffset += r;
 910         if (inoffset >= inroom) {
 911             *outchars = 0;
 912             return inoffset;
 913         }
 914         j0 = inptr[inoffset];
 915     }
 916     
 917     if (j0 == '\n' || j0 == '\r') {
 918         cinfo->istate = JIS_ASCII;
 919         outptr[0] = j0;
 920         *outchars = 1;
 921         return 1+inoffset;
 922     } else if (j0 < 0x20) {
 923         outptr[0] = j0;
 924         *outchars = 1;
 925         return 1+inoffset;
 926     } else if (j0 >= 0xa1 && j0 <= 0xdf) {
 927         /* JIS8 kana */
 928         OUTCHK(2);
 929         outptr[0] = 0x8e;
 930         outptr[1] = j0;
 931         *outchars = 2;
 932         return 1+inoffset;
 933     } else {
 934         switch (cinfo->istate) {
 935         case JIS_ROMAN:
 936             /* jis-roman and ascii differs on 0x5c and 0x7e -- for now,
 937                I ignore the difference. */
 938             /* FALLTHROUGH */
 939         case JIS_ASCII:
 940             outptr[0] = j0;
 941             *outchars = 1;
 942             return 1+inoffset;
 943         case JIS_KANA:
 944             OUTCHK(2);
 945             outptr[0] = 0x8e;
 946             outptr[1] = j0 + 0x80;
 947             *outchars = 2;
 948             return 1+inoffset;
 949         case JIS_78:
 950             /* for now, I ignore the difference between JIS78 and JIS83 */
 951             /* FALLTHROUGH */
 952         case JIS_0213_1:
 953             INCHK(inoffset+2);
 954             OUTCHK(2);
 955             j1 = inptr[inoffset+1];
 956             outptr[0] = j0 + 0x80;
 957             outptr[1] = j1 + 0x80;
 958             *outchars = 2;
 959             return 2+inoffset;
 960         case JIS_0212:
 961             /* jis x 0212 and jis x 0213 plane 2 are different character sets,
 962                but uses the same conversion scheme. */
 963             /* FALLTHROUGH */
 964         case JIS_0213_2:
 965             INCHK(inoffset+2);
 966             OUTCHK(3);
 967             j1 = inptr[inoffset+1];
 968             outptr[0] = 0x8f;
 969             outptr[1] = j0 + 0x80;
 970             outptr[2] = j1 + 0x80;
 971             *outchars = 3;
 972             return 2+inoffset;
 973         case JIS_UNKNOWN:
 974             outptr[0] = SUBST1_CHAR;
 975             *outchars = 1;
 976             return 1+inoffset;
 977         default:
 978             Scm_Error("internal state of ISO2022-JP -> EUC_JP got messed up (%d).  Implementation error?", cinfo->istate);
 979         }
 980     }
 981     return ILLEGAL_SEQUENCE;
 982 }
 983 
 984 /* EUC_JP -> ISO2022JP(-3)
 985  *
 986  * For now, I follow the strategy of iso2022jp-3-compatible behavior.
 987  */
 988 
 989 /* ensure the current state is newstate.  returns # of output chars.
 990    may return OUTPUT_NOT_ENOUGH. */
 991 static size_t jis_ensure_state(ScmConvInfo *cinfo, int newstate, size_t outbytes,
 992                                char *outptr, size_t outroom)
 993 {
 994     const char *escseq = NULL;
 995     size_t esclen = 0;
 996 
 997     if (cinfo->ostate == newstate) {
 998         OUTCHK(outbytes);
 999         return 0;
1000     }
1001     switch (newstate) {
1002     case JIS_ASCII:
1003         escseq = "\033(B";  esclen = 3; break;
1004     case JIS_KANA:
1005         escseq = "\033(I";  esclen = 3; break;
1006     case JIS_0213_1:
1007         escseq = "\033$B";  esclen = 3; break;
1008     case JIS_0213_2:
1009         escseq = "\033$(P"; esclen = 4; break;
1010     case JIS_0212:
1011         escseq = "\033$(D"; esclen = 4; break;
1012     default:
1013         Scm_Error("something wrong in jis_ensure_state: implementation error?");
1014         return 0;               /* dummy */
1015     }
1016     OUTCHK(esclen + outbytes);
1017     memcpy(outptr, escseq, esclen);
1018     cinfo->ostate = newstate;
1019     return esclen;
1020 }
1021 
1022 static size_t eucj2jis(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
1023                        char *outptr, size_t outroom, size_t *outchars)
1024 {
1025     unsigned char e0, e1;
1026     size_t outoffset = 0;
1027     e0 = inptr[0];
1028     if (e0 < 0x80) {
1029         outoffset = jis_ensure_state(cinfo, JIS_ASCII, 1, outptr, outroom);
1030         if (ERRP(outoffset)) return outoffset;
1031         outptr[outoffset] = e0;
1032         *outchars = outoffset+1;
1033         return 1;
1034     } else if (e0 == 0x8e) {
1035         INCHK(2);
1036         e1 = inptr[1];
1037         if (e1 > 0xa0 && e1 < 0xff) {
1038             outoffset = jis_ensure_state(cinfo, JIS_KANA, 1, outptr, outroom);
1039             if (ERRP(outoffset)) return outoffset;
1040             outptr[outoffset] = e1 - 0x80;
1041             *outchars = outoffset+1;
1042             return 2;
1043         }
1044     } else if (e0 == 0x8f) {
1045         INCHK(3);
1046         e0 = inptr[1];
1047         e1 = inptr[2];
1048         if (e0 > 0xa0 && e0 < 0xff && e1 > 0xa0 && e1 < 0xff) {
1049             int newstate = JIS_0212;
1050             switch (e0) {
1051             case 0xa1:; case 0xa3:; case 0xa4:; case 0xa5:;
1052             case 0xa8:; case 0xac:; case 0xad:; case 0xae:; case 0xaf:;
1053                 newstate = JIS_0213_2; break;
1054             default:
1055                 if (e0 >= 0xee) newstate = JIS_0213_2;
1056             }
1057             outoffset = jis_ensure_state(cinfo, newstate, 2, outptr, outroom);
1058             outptr[outoffset] = e0 - 0x80;
1059             outptr[outoffset+1] = e1 - 0x80;
1060             *outchars = outoffset+1;
1061             return 3;
1062         }
1063     } else if (e0 > 0xa0 && e0 < 0xff) {
1064         INCHK(2);
1065         e1 = inptr[1];
1066         if (e1 > 0xa0 && e1 < 0xff) {
1067             outoffset = jis_ensure_state(cinfo, JIS_0213_1, 2, outptr, outroom);
1068             if (ERRP(outoffset)) return outoffset;
1069             outptr[outoffset] = e0 - 0x80;
1070             outptr[outoffset+1] = e1 - 0x80;
1071             *outchars = outoffset+2;
1072             return 2;
1073         }
1074     }
1075     return ILLEGAL_SEQUENCE;
1076 }
1077 
1078 /* reset proc */
1079 static size_t jis_reset(ScmConvInfo *cinfo, char *outptr, size_t outroom)
1080 {
1081     if (outptr == NULL) {
1082         /* just reset */
1083         cinfo->ostate = JIS_ASCII;
1084         return 0;
1085     } else {
1086         if (cinfo->ostate == JIS_ASCII) return 0;
1087         if (outroom < 3) return OUTPUT_NOT_ENOUGH;
1088         outptr[0] = 0x1b;
1089         outptr[1] = '(';
1090         outptr[2] = 'B';
1091         cinfo->ostate = JIS_ASCII;
1092         return 3;
1093     }
1094 }
1095 
1096 /*=================================================================
1097  * EUC_JP
1098  */
1099 
1100 /* EUC_JP is a pivot code, so we don't need to convert.  This function
1101    is just a placeholder. */
1102 static size_t pivot(ScmConvInfo *cinfo, const char *inptr, size_t inroom,
1103                     char *outptr, size_t outroom, size_t *outchars)
1104 {
1105     return 0;
1106 }
1107 
1108 /*=================================================================
1109  * JCONV - the entry
1110  */
1111 
/* Canonical code designators.  The values are used as indexes into
   conv_converter[], so the order of the enabled entries matters. */
enum {
    JCODE_EUCJ,
    JCODE_SJIS,
    JCODE_UTF8,
    JCODE_ISO2022JP,
    JCODE_NONE,    /* a special entry standing for byte stream */
#if 0
    /* Reserved for future use.  ('-' can't appear in an identifier.) */
    JCODE_ISO2022JP_2,
    JCODE_ISO2022JP_3
#endif
};
1124 
1125 /* map canonical code designator to inconv and outconv.  the order of
1126    entry must match with the above designators. */
1127 static struct conv_converter_rec {
1128     ScmConvProc inconv;
1129     ScmConvProc outconv;
1130     ScmConvReset reset;
1131 } conv_converter[] = {
1132     { pivot, pivot, NULL },              /* EUCJ */
1133     { sjis2eucj, eucj2sjis, NULL },      /* SJIS */
1134     { utf2eucj,  eucj2utf,  NULL },      /* UTF8 */
1135     { jis2eucj,  eucj2jis,  jis_reset }, /* ISO2022JP */
1136     { pivot, pivot, NULL },              /* NONE */
1137 };
1138 
1139 /* map convesion name to the canonical code */
1140 static struct conv_support_rec {
1141     const char *name;
1142     int code;
1143 } conv_supports[] = {
1144     { "euc_jp",       JCODE_EUCJ },
1145     { "eucjp",        JCODE_EUCJ },
1146     { "eucj",         JCODE_EUCJ },
1147     { "euc_jisx0213", JCODE_EUCJ },
1148     { "shift_jis",    JCODE_SJIS },
1149     { "shiftjis",     JCODE_SJIS },
1150     { "sjis",         JCODE_SJIS },
1151     { "utf-8",        JCODE_UTF8 },
1152     { "utf8",         JCODE_UTF8 },
1153     { "iso2022jp",    JCODE_ISO2022JP },
1154     { "iso2022-jp",   JCODE_ISO2022JP },
1155     { "iso-2022-jp",  JCODE_ISO2022JP },
1156     { "csiso2022jp",  JCODE_ISO2022JP },
1157     { "iso2022jp-1",  JCODE_ISO2022JP },
1158     { "iso-2022jp-1", JCODE_ISO2022JP },
1159     { "iso2022jp-2",  JCODE_ISO2022JP },
1160     { "iso-2022jp-2", JCODE_ISO2022JP },
1161     { "iso2022jp-3",  JCODE_ISO2022JP },
1162     { "iso-2022jp-3", JCODE_ISO2022JP },
1163     { "none",         JCODE_NONE },
1164     { NULL, 0 }
1165 };
1166 
1167 static int conv_name_match(const char *s, const char *t)
1168 {
1169     const char *p, *q;
1170     for (p=s, q=t; *p && *q; p++, q++) {
1171         if (*p == '-' || *p == '_') {
1172             if (*q != '-' && *q != '_') return FALSE;
1173         } else {
1174             if (tolower(*p) != tolower(*q)) return FALSE;
1175         }
1176     }
1177     if (*p || *q) return FALSE;
1178     return TRUE;
1179 }
1180 
1181 static int conv_name_find(const char *name)
1182 {
1183     struct conv_support_rec *cvtab = conv_supports;
1184     for (; cvtab->name; cvtab++) {
1185         if (conv_name_match(name, cvtab->name)) {
1186             return cvtab->code;
1187         }
1188     }
1189     return -1;
1190 }
1191 
1192 /* Internal conversion handler.
1193    There are five cases to handle:
1194    (1) fromCode === toCode
1195      jconv just copies input to output.  Speed is favored over safety:
1196      the input is not checked for conformance to fromCode.
1197    (2) fromCode === pivot, toCode =/= pivot, and pivot->toCode supported.
1198    (3) fromCode =/= pivot, toCode === pivot, and fromCode->pivot supported.
1199      we just need one conversion subroutine.
1200    (4) fromCode =/= pivot, toCode =/= pivot, and fromCode->pivot->toCode
1201      supported.  we cascade two conversion subroutines.
1202    (5) other cases;
1203      we delegate the job to iconv.
1204 */
1205 
1206 /* case (1) */
1207 static size_t jconv_ident(ScmConvInfo *info, const char **iptr,
1208                           size_t *iroom, char **optr, size_t *oroom)
1209 {
1210     size_t inroom = *iroom, outroom = *oroom;
1211 #ifdef JCONV_DEBUG
1212     fprintf(stderr, "jconv_ident %s->%s\n", info->fromCode, info->toCode);
1213 #endif
1214     if (inroom <= outroom) {
1215         memcpy(*optr, *iptr, inroom);
1216         *optr += inroom;
1217         *iptr += inroom;
1218         *iroom = 0;
1219         *oroom -= inroom;
1220         return inroom;
1221     } else {
1222         memcpy(*optr, *iptr, outroom);
1223         *optr += outroom;
1224         *iptr += outroom;
1225         *iroom -= outroom;
1226         *oroom = 0;
1227         return OUTPUT_NOT_ENOUGH;
1228     }
1229 }
1230    
1231 /* case (2) or (3) */
1232 static size_t jconv_1tier(ScmConvInfo *info, const char **iptr,
1233                           size_t *iroom, char **optr, size_t *oroom)
1234 {
1235     ScmConvProc cvt = info->convproc[0];
1236     const char *inp = *iptr;
1237     char *outp = *optr;
1238     int inr = *iroom, outr = *oroom; 
1239     size_t outchars, inchars, converted = 0;
1240 
1241 #ifdef JCONV_DEBUG
1242     fprintf(stderr, "jconv_1tier %s->%s\n", info->fromCode, info->toCode);
1243 #endif
1244     SCM_ASSERT(cvt != NULL);
1245     while (inr > 0 && outr > 0) {
1246         inchars = cvt(info, inp, inr, outp, outr, &outchars);
1247         if (ERRP(inchars)) {
1248             converted = inchars;
1249             break;
1250         } else {
1251             converted += inchars;
1252             inp += inchars;
1253             inr -= inchars;
1254             outp += outchars;
1255             outr -= outchars;
1256         }
1257     }
1258     *iptr = inp;
1259     *iroom = inr;
1260     *optr = outp;
1261     *oroom = outr;
1262     return converted;
1263 }
1264    
1265 /* case (4) */
1266 #define INTBUFSIZ 20            /* intermediate buffer size */
1267 static size_t jconv_2tier(ScmConvInfo *info, const char **iptr, size_t *iroom,
1268                           char **optr, size_t *oroom)
1269 {
1270     char buf[INTBUFSIZ];
1271     ScmConvProc icvt = info->convproc[0];
1272     ScmConvProc ocvt = info->convproc[1];
1273     const char *inp = *iptr;
1274     char *outp = *optr;
1275     int inr = *iroom, outr = *oroom;
1276     size_t outchars, inchars, bufchars, converted = 0;
1277 
1278 #ifdef JCONV_DEBUG
1279     fprintf(stderr, "jconv_2tier %s->%s\n", info->fromCode, info->toCode);
1280 #endif
1281     while (inr > 0 && outr > 0) {
1282         inchars  = icvt(info, inp, inr, buf, INTBUFSIZ, &bufchars);
1283         if (ERRP(inchars)) {
1284             converted = inchars;
1285             break;
1286         }
1287         if (bufchars == 0) {
1288             outchars = 0;
1289         } else {
1290             bufchars = ocvt(info, buf, bufchars, outp, outr, &outchars);
1291             if (ERRP(bufchars)) {
1292                 converted = bufchars;
1293                 break;
1294             }
1295         }
1296         converted += inchars;
1297         inp += inchars;
1298         inr -= inchars;
1299         outp += outchars;
1300         outr -= outchars;
1301     }
1302     *iptr = inp;
1303     *iroom = inr;
1304     *optr = outp;
1305     *oroom = outr;
1306     return converted;
1307 }
1308 
1309 /* case (5) */
1310 #ifdef HAVE_ICONV_H
1311 /* NB: although iconv manages states, we need to keep track of whether
1312  * we're sure in default status (JIS_ASCII) or not (we use JIS_UNKNOWN for it).
1313  * It's because jconv_iconv_reset will be called twice if there is any
1314  * reset sequence; the first call should emit the sequence, but the second
1315  * call shouldn't.
1316  */
1317 static size_t jconv_iconv(ScmConvInfo *info, const char **iptr, size_t *iroom,
1318                           char **optr, size_t *oroom)
1319 {
1320     size_t r;
1321 #ifdef JCONV_DEBUG
1322     fprintf(stderr, "jconv_iconv %s->%s\n", info->fromCode, info->toCode);
1323 #endif
1324     r = iconv(info->handle, (char **)iptr, iroom, optr, oroom);
1325     info->ostate = JIS_UNKNOWN;
1326     if (r == (size_t)-1) {
1327         if (errno == EINVAL) return INPUT_NOT_ENOUGH;
1328         if (errno == E2BIG)  return OUTPUT_NOT_ENOUGH;
1329         return ILLEGAL_SEQUENCE;
1330     } else {
1331         return (int)r;
1332     }
1333 }
1334 
1335 /* reset routine for iconv */
1336 static size_t jconv_iconv_reset(ScmConvInfo *info, char *optr, size_t oroom)
1337 {
1338     size_t oroom_prev = oroom;
1339     size_t r;
1340     if (info->ostate == JIS_ASCII) return 0;
1341     r = iconv(info->handle, NULL, 0, &optr, &oroom);
1342     if (r == (size_t)-1) {
1343         if (errno == E2BIG)  return OUTPUT_NOT_ENOUGH;
1344         Scm_Panic("jconv_iconv_reset: unknown error number %d\n", errno);
1345     }
1346     info->ostate = JIS_ASCII;
1347     return oroom_prev - oroom;
1348 }
1349 #endif /*HAVE_ICONV_H*/
1350 
1351 /*------------------------------------------------------------------
1352  * JCONV_OPEN
1353  *  Returns ScmConvInfo, setting up some fields.
1354  *  If no conversion is possible, returns NULL.
1355  */
1356 ScmConvInfo *jconv_open(const char *toCode, const char *fromCode)
1357 {
1358     ScmConvInfo *info;
1359     ScmConvHandler handler = NULL;
1360     int incode, outcode;
1361     ScmConvProc convproc[2];
1362     ScmConvReset reset;
1363     iconv_t handle = (iconv_t)-1;
1364 
1365     incode  = conv_name_find(fromCode);
1366     outcode = conv_name_find(toCode);
1367 
1368     if (incode == JCODE_NONE || outcode == JCODE_NONE) {
1369         /* conversion to/from none means no conversion */
1370         handler = jconv_ident;
1371         convproc[0] = convproc[1] = NULL;
1372         reset = NULL;
1373     } else if (incode < 0 || outcode < 0) {
1374 #ifdef HAVE_ICONV_H        
1375         /* try iconv */
1376         handle = iconv_open(toCode, fromCode);
1377         if (handle == (iconv_t)-1) return NULL;
1378         handler = jconv_iconv;
1379         convproc[0] = convproc[1] = NULL;
1380         reset = jconv_iconv_reset;
1381 #else /*!HAVE_ICONV_H*/
1382         return NULL;
1383 #endif
1384     } else if (incode == outcode) {
1385         /* pattern (1) */
1386         handler = jconv_ident;
1387         convproc[0] = convproc[1] = NULL;
1388         reset = NULL;
1389     } else if (incode == JCODE_EUCJ) {
1390         /* pattern (2) */
1391         handler = jconv_1tier;
1392         convproc[0] = conv_converter[outcode].outconv;
1393         convproc[1] = NULL;
1394         reset = conv_converter[outcode].reset;
1395     } else if (outcode == JCODE_EUCJ) {
1396         /* pattern (3) */
1397         handler = jconv_1tier;
1398         convproc[0] = conv_converter[incode].inconv;
1399         convproc[1] = NULL;
1400         reset = NULL;
1401     } else {
1402         /* pattern (4) */
1403         handler = jconv_2tier;
1404         convproc[0] = conv_converter[incode].inconv;
1405         convproc[1] = conv_converter[outcode].outconv;
1406         reset = conv_converter[outcode].reset;
1407     }
1408     info = SCM_NEW(ScmConvInfo);
1409     info->jconv = handler;
1410     info->convproc[0] = convproc[0];
1411     info->convproc[1] = convproc[1];
1412     info->reset = reset;
1413     info->handle = handle;
1414     info->toCode = toCode;
1415     info->istate = info->ostate = JIS_ASCII;
1416     info->fromCode = fromCode;
1417     return info;
1418 }
1419 
1420 /*------------------------------------------------------------------
1421  * JCONV_CLOSE
1422  */
1423 int jconv_close(ScmConvInfo *info)
1424 {
1425     int r = 0;
1426 #ifdef HAVE_ICONV_H
1427     if (info->handle != (iconv_t)-1) {
1428         r = iconv_close(info->handle);
1429         info->handle = (iconv_t)-1;
1430     }
1431 #endif /*HAVE_ICONV_H*/
1432     return r;
1433 }
1434 
1435 /*------------------------------------------------------------------
1436  * JCONV - main conversion routine
1437  */
1438 size_t jconv(ScmConvInfo *info,
1439              const char **inptr, size_t *inroom,
1440              char **outptr, size_t *outroom)
1441 {
1442     SCM_ASSERT(info->jconv != NULL);
1443     return info->jconv(info, inptr, inroom, outptr, outroom);
1444 }
1445 
1446 /*------------------------------------------------------------------
1447  * JCONV_RESET - reset
1448  */
1449 size_t jconv_reset(ScmConvInfo *info, char *outptr, size_t outroom)
1450 {
1451     if (info->reset) {
1452         return info->reset(info, outptr, outroom);
1453     } else {
1454         return 0;
1455     }
1456 }

/* [<][>][^][v][top][bottom][index][help] */