root/ext/charconv/charconv.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. conv_guess
  2. Scm_GetCESName
  3. Scm_ConversionSupportedP
  4. Scm_RegisterCodeGuessingProc
  5. findGuessingProc
  6. conv_fileno
  7. conv_ready
  8. conv_name
  9. conv_input_filler
  10. conv_input_closer
  11. Scm_MakeInputConversionPort
  12. coding_aware_conv
  13. conv_output_closer
  14. conv_output_flusher
  15. Scm_MakeOutputConversionPort
  16. Scm_GuessCES
  17. ucstochar
  18. chartoucs
  19. Scm_Init_libcharconv

   1 /*
   2  * charconv.c - character code conversion library
   3  *
   4  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: charconv.c,v 1.53 2005/11/02 06:03:26 shirok Exp $
  34  */
  35 
  36 #include <string.h>
  37 #include <errno.h>
  38 #include <gauche.h>
  39 #include <gauche/extend.h>
  40 #include "charconv.h"
  41 
  42 #define DEFAULT_CONVERSION_BUFFER_SIZE 1024
  43 #define MINIMUM_CONVERSION_BUFFER_SIZE 16
  44 
  45 typedef struct conv_guess_rec {
  46     const char *codeName;
  47     ScmCodeGuessingProc proc;
  48     void *data;
  49     struct conv_guess_rec *next;
  50 } conv_guess;
  51 
  52 /* anchor of the chain of conversion guessing procedures */
  53 static struct {
  54     int dummy;                  /* trick to place this in .data section */
  55     conv_guess *procs;
  56     ScmInternalMutex mutex;
  57 } guess = { 1 };
  58 
  59 /* anchor of the conversion context used for UCS -> internal char routine */
  60 static struct {
  61     int dummy;                  /* trick to place this in .data section */
  62     ScmConvInfo *ucs2char;
  63     ScmConvInfo *char2ucs;
  64     ScmInternalMutex mutex;
  65 } ucsconv = { 1 };
  66 
  67 /*------------------------------------------------------------
  68  * Query
  69  */
  70 
  71 /* Auxiliary function */
  72 const char* Scm_GetCESName(ScmObj code, const char *argname)
  73 {
  74     const char *c = NULL;
  75     if (SCM_UNBOUNDP(code) || SCM_FALSEP(code)) {
  76         c = Scm_SupportedCharacterEncodings()[0];
  77     } else if (SCM_STRINGP(code)) {
  78         c = Scm_GetStringConst(SCM_STRING(code));
  79     } else if (SCM_SYMBOLP(code)) {
  80         c = Scm_GetStringConst(SCM_SYMBOL_NAME(code));
  81     } else {
  82         Scm_Error("string, symbol or #f is required for %s, but got %S",
  83                   argname, code);
  84     }
  85     return c;
  86 }
  87 
  88 int Scm_ConversionSupportedP(const char *from, const char *to)
  89 {
  90     ScmConvInfo *info = jconv_open(to, from);
  91     if (info == NULL) return FALSE;
  92     jconv_close(info);
  93     return TRUE;
  94 }
  95 
  96 void Scm_RegisterCodeGuessingProc(const char *code,
  97                                   ScmCodeGuessingProc proc,
  98                                   void *data)
  99 {
 100     conv_guess *rec = SCM_NEW(conv_guess);
 101     rec->codeName = code;
 102     rec->proc = proc;
 103     rec->data = data;
 104     (void)SCM_INTERNAL_MUTEX_LOCK(guess.mutex);
 105     rec->next = guess.procs;
 106     guess.procs = rec;
 107     (void)SCM_INTERNAL_MUTEX_UNLOCK(guess.mutex);
 108 }
 109 
 110 static conv_guess *findGuessingProc(const char *code)
 111 {
 112     conv_guess *rec;
 113     (void)SCM_INTERNAL_MUTEX_LOCK(guess.mutex);
 114     for (rec = guess.procs; rec; rec = rec->next) {
 115         if (strcasecmp(rec->codeName, code) == 0) break;
 116     }
 117     (void)SCM_INTERNAL_MUTEX_UNLOCK(guess.mutex);
 118     return rec;
 119 }
 120 
 121 static int conv_fileno(ScmPort *port)
 122 {
 123     ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
 124     return Scm_PortFileNo(info->remote);
 125 }
 126 
 127 static int conv_ready(ScmPort *port)
 128 {
 129     ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
 130     /* This isn't accurate, but for now ... */
 131     return Scm_CharReady(info->remote);
 132 }
 133 
 134 static ScmObj conv_name(int dir, ScmPort *remote, const char *from, const char *to)
 135 {
 136     ScmObj out = Scm_MakeOutputStringPort(TRUE);
 137     Scm_Printf(SCM_PORT(out), "[conv(%s->%s) %s %S]",
 138                from, to, (dir == SCM_PORT_INPUT? "from" : "to"),
 139                Scm_PortName(remote));
 140     return Scm_GetOutputStringUnsafe(SCM_PORT(out));
 141 }
 142 
 143 /*------------------------------------------------------------
 144  * Input conversion
 145  *
 146  *  <-- Buffered port <--- filler <--(info->buf)--- getz(remote)
 147  */
 148 
 149 static int conv_input_filler(ScmPort *port, int mincnt)
 150 {
 151     ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
 152     size_t insize, inroom, outroom, result;
 153     int nread;
 154     const char *inbuf = info->buf;
 155     char *outbuf = port->src.buf.end;
 156 
 157     if (info->remoteClosed) return 0;
 158     
 159     /* Fill the input buffer.  There may be some remaining bytes in the
 160        inbuf from the last conversion (insize), so we try to fill the
 161        rest. */
 162     insize = info->ptr - info->buf;
 163     nread = Scm_Getz(info->ptr, info->bufsiz - insize, info->remote);
 164     if (nread <= 0) {
 165         /* input reached EOF.  finish the output state */
 166         if (insize == 0) {
 167             outroom = SCM_PORT_BUFFER_ROOM(port);
 168             result = jconv_reset(info, outbuf, outroom);
 169             if (result < 0) {
 170                 /* The port buffer doesn't have enough space to contain the
 171                    finishing sequence.  Its unusual, for the port buffer
 172                    must be almost empty at this time, and the finishing
 173                    sequence is usually just a few bytes.
 174                    We signal an error. */
 175                 Scm_Error("couldn't flush the ending escape sequence in the character encoding conversion port (%s -> %s).  possibly an implementation error",
 176                           info->fromCode, info->toCode);
 177                 }
 178             if (info->ownerp) {
 179                 Scm_ClosePort(info->remote);
 180                 info->remoteClosed = TRUE;
 181             }
 182 #ifdef JCONV_DEBUG
 183             fprintf(stderr, "<= r=%d (reset), out(%p)%d\n",
 184                     result, outbuf, outroom);
 185 #endif
 186             return result;
 187         }
 188     } else {
 189         insize += nread;
 190     }
 191 
 192     /* Conversion. */
 193     inroom = insize;
 194     outroom = SCM_PORT_BUFFER_ROOM(port);
 195 
 196 #ifdef JCONV_DEBUG
 197     fprintf(stderr, "=> in(%p)%d out(%p)%d\n", inbuf, insize, outbuf, outroom);
 198 #endif
 199     result = jconv(info, &inbuf, &inroom, &outbuf, &outroom);
 200 #ifdef JCONV_DEBUG
 201     fprintf(stderr, "<= r=%d, in(%p)%d out(%p)%d\n",
 202             result, inbuf, inroom, outbuf, outroom);
 203 #endif
 204     /* we've got an error. */
 205     if (result == INPUT_NOT_ENOUGH || result == OUTPUT_NOT_ENOUGH) {
 206         /* Conversion stopped due to an incomplete character at the
 207            end of the input buffer, or the output buffer is full.
 208            We shift the unconverted bytes to the beginning of input
 209            buffer. */
 210         memmove(info->buf, info->buf+insize-inroom, inroom);
 211         info->ptr = info->buf + inroom;
 212         return info->bufsiz - outroom;
 213     } else if (result == ILLEGAL_SEQUENCE) {
 214         /* it's likely that the input contains invalid sequence. */
 215         int cnt = inroom >= 6 ? 6 : inroom;
 216         ScmObj s = Scm_MakeString(info->buf+insize-inroom, cnt, cnt,
 217                                   SCM_MAKSTR_COPYING|SCM_MAKSTR_INCOMPLETE);
 218         Scm_Error("invalid character sequence in the input stream: %S ...", s);
 219     }
 220 
 221     /* Conversion is done completely. */
 222     /* NB: There are cases that some bytes are left in the input buffer
 223        even iconv returns positive value.  We need to shift those bytes. */
 224     if (inroom > 0) {
 225         memmove(info->buf, info->buf+insize-inroom, inroom);
 226         info->ptr = info->buf + inroom;
 227         return info->bufsiz - outroom;
 228     } else {
 229         info->ptr = info->buf;
 230         return info->bufsiz - outroom;
 231     }
 232 }
 233 
 234 static void conv_input_closer(ScmPort *p)
 235 {
 236     ScmConvInfo *info = (ScmConvInfo*)p->src.buf.data;
 237     jconv_close(info);
 238 }
 239 
 240 ScmObj Scm_MakeInputConversionPort(ScmPort *fromPort,
 241                                    const char *fromCode,
 242                                    const char *toCode,
 243                                    ScmObj handler,
 244                                    int bufsiz,
 245                                    int ownerp)
 246 {
 247     ScmConvInfo *cinfo;
 248     conv_guess *guess;
 249     char *inbuf = NULL;
 250     int preread = 0;
 251     ScmPortBuffer bufrec;
 252     ScmObj name;
 253 
 254     if (!SCM_IPORTP(fromPort))
 255         Scm_Error("input port required, but got %S", fromPort);
 256 
 257     if (bufsiz <= 0) bufsiz = DEFAULT_CONVERSION_BUFFER_SIZE;
 258     if (bufsiz <= MINIMUM_CONVERSION_BUFFER_SIZE) {
 259         bufsiz = MINIMUM_CONVERSION_BUFFER_SIZE;
 260     }
 261     guess = findGuessingProc(fromCode);
 262     if (guess) {
 263         const char *guessed;
 264         
 265         inbuf = SCM_NEW_ATOMIC2(char *, bufsiz);
 266         preread = Scm_Getz(inbuf, bufsiz, fromPort);
 267         if (preread <= 0) {
 268             /* Input buffer is already empty or unreadable.
 269                Determining character code is not necessary.
 270                We just return a dummy empty port. */
 271             return Scm_MakeInputStringPort(SCM_STRING(SCM_MAKE_STR("")), FALSE);
 272         }
 273         guessed = guess->proc(inbuf, preread, guess->data);
 274         if (guessed == NULL)
 275             Scm_Error("%s: failed to guess input encoding", fromCode);
 276         fromCode = guessed;
 277     }
 278 
 279     cinfo = jconv_open(toCode, fromCode);
 280     if (cinfo == NULL) {
 281         Scm_Error("conversion from code %s to code %s is not supported",
 282                   fromCode, toCode);
 283     }
 284     cinfo->remote = fromPort;
 285     cinfo->ownerp = ownerp;
 286     cinfo->bufsiz = bufsiz;
 287     cinfo->remoteClosed = FALSE;
 288     if (preread > 0) {
 289         cinfo->buf = inbuf;
 290         cinfo->ptr = inbuf + preread;
 291     } else {
 292         cinfo->buf = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
 293         cinfo->ptr = cinfo->buf;
 294     }
 295 
 296     memset(&bufrec, 0, sizeof(bufrec));
 297     bufrec.size = cinfo->bufsiz;
 298     bufrec.buffer = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
 299     bufrec.mode = SCM_PORT_BUFFER_FULL;
 300     bufrec.filler = conv_input_filler;
 301     bufrec.flusher = NULL;
 302     bufrec.closer = conv_input_closer;
 303     bufrec.ready = conv_ready;
 304     bufrec.filenum = conv_fileno;
 305     bufrec.data = (void*)cinfo;
 306 
 307     name = conv_name(SCM_PORT_INPUT, fromPort, fromCode, toCode);
 308     return Scm_MakeBufferedPort(SCM_CLASS_PORT, name, SCM_PORT_INPUT, TRUE, &bufrec);
 309 }
 310 
 311 /* a special case of input conversion port --- coding-aware port coversion.
 312    this function is called via Scm_CodingAwarePortHook from
 313    src/port.c */
 314 
 315 static ScmPort *coding_aware_conv(ScmPort *src, const char *encoding)
 316 {
 317     return SCM_PORT(Scm_MakeInputConversionPort(src,
 318                                                 encoding,
 319                                                 Scm_SupportedCharacterEncodings()[0],
 320                                                 SCM_FALSE,
 321                                                 0, TRUE));
 322 }
 323 
 324 /*------------------------------------------------------------
 325  * Output conversion
 326  *
 327  *   Buffered port ----> flusher -->(info->buf)--> putz(remote)
 328  */
 329 
 330 /* NB: Glibc-2.1.2's iconv() has a bug in SJIS handling.  If output
 331  * is in SJIS and output buffer overflows in the middle of two-byte
 332  * sequence, it leaves the first byte in the output buffer as if
 333  * it were valid converted character, while the input buffer pointer
 334  * stops just before the unconverted character, as supposed.
 335  * There's no way to detect that unless I scan the output by myself
 336  * to see the last byte of conversion is invalid or not.
 337  *
  338  * As a workaround, we flush the output buffer more frequently than
  339  * needed, so that the output buffer never overflows.
  340  * Hopefully the bug will be fixed in a future release of glibc.
 341  */
 342 
 343 #define GLIBC_2_1_ICONV_BUG
 344 
 345 static void conv_output_closer(ScmPort *port)
 346 {
 347     ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
 348     int r;
 349 
 350     /* if there's remaining bytes in buf, send them to the remote port. */
 351     if (info->ptr > info->buf) {
 352         Scm_Putz(info->buf, info->ptr - info->buf, info->remote);
 353         info->ptr = info->buf;
 354     }
 355     /* sends out the closing sequence, if any */
 356     r = jconv_reset(info, info->buf, info->bufsiz);
 357 #ifdef JCONV_DEBUG
 358     fprintf(stderr, "<= r=%d(reset), buf(%p)\n",
 359             r, info->buf);
 360 #endif
 361     if (r < 0) {
 362         Scm_Error("something wrong in resetting output character encoding conversion (%s -> %s).  possibly an implementation error.",
 363                   info->fromCode, info->toCode);
 364     }
 365     if (r > 0) {
 366         Scm_Putz(info->buf, r, info->remote);
 367     }
 368     /* flush remove port */
 369     Scm_Flush(info->remote);
 370     if (info->ownerp) {
 371         Scm_ClosePort(info->remote);
 372         info->remoteClosed = TRUE;
 373     }
 374     jconv_close(info);
 375 }
 376 
 377 static int conv_output_flusher(ScmPort *port, int cnt, int forcep)
 378 {
 379     ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
 380     size_t outsize, inroom, outroom, result, len;
 381     const char *inbuf;
 382     char *outbuf;
 383 
 384     inbuf = port->src.buf.buffer;
 385     inroom = len = SCM_PORT_BUFFER_AVAIL(port);
 386     for (;;) {
 387         /* Conversion. */
 388         outbuf = info->ptr;
 389         outsize = info->bufsiz - (info->ptr - info->buf);
 390         outroom = outsize;
 391 #ifdef JCONV_DEBUG
 392         fprintf(stderr, "=> in(%p,%p)%d out(%p,%p)%d\n",
 393                 inbuf, len, inroom,
 394                 info->buf, info->ptr, outroom);
 395 #endif
 396         result = jconv(info, &inbuf, &inroom, &outbuf, &outroom);
 397 #ifdef JCONV_DEBUG
 398         fprintf(stderr, "<= r=%d, in(%p)%d out(%p)%d\n",
 399                 result, inbuf, inroom, outbuf, outroom);
 400 #endif
 401         if (result == INPUT_NOT_ENOUGH) {
 402 #ifndef GLIBC_2_1_ICONV_BUG
 403             /* Conversion stopped due to an incomplete character at the
 404                end of the input buffer.  We just return # of bytes
 405                flushed.  (Shifting unconverted characters is done by
 406                buffered port routine) */
 407             info->ptr = outbuf;
 408 #else
 409             /* See the above notes.  We always flush the output buffer
 410                here, so that we can avoid output buffer overrun. */
 411             Scm_Putz(info->buf, outbuf - info->buf, info->remote);
 412             info->ptr = info->buf;
 413 #endif
 414             return len - inroom;
 415         } else if (result == OUTPUT_NOT_ENOUGH) {
 416             /* Output buffer got full.  Flush it, and continue
 417                conversion. */
 418             Scm_Putz(info->buf, outbuf - info->buf, info->remote);
 419             info->ptr = info->buf;
 420             continue;
 421         } else if (result == ILLEGAL_SEQUENCE) {
 422             /* it's likely that input contains invalid sequence.
 423                TODO: we should handle this case gracefully. */
 424             Scm_Error("invalid character sequence in the input stream");
 425             return 0;           /* dummy */
 426         } else {
 427 #ifndef GLIBC_2_1_ICONV_BUG
 428             /* Conversion is done completely.  Update outptr. */
 429             info->ptr = outbuf;
 430 #else
 431             /* See the above notes.  We always flush the output buffer here,
 432                so that we can avoid output buffer overrun. */
 433             Scm_Putz(info->buf, outbuf - info->buf, info->remote);
 434             info->ptr = info->buf;
 435 #endif
 436             if (forcep && len - inroom != cnt) continue;
 437             return len - inroom;
 438         }
 439     }
 440 }
 441 
 442 ScmObj Scm_MakeOutputConversionPort(ScmPort *toPort,
 443                                     const char *toCode,
 444                                     const char *fromCode,
 445                                     int bufsiz, int ownerp)
 446 {
 447     ScmConvInfo *cinfo;
 448     ScmPortBuffer bufrec;
 449     ScmObj name;
 450     
 451     if (!SCM_OPORTP(toPort))
 452         Scm_Error("output port required, but got %S", toPort);
 453 
 454     if (bufsiz <= 0) bufsiz = DEFAULT_CONVERSION_BUFFER_SIZE;
 455     if (bufsiz <= MINIMUM_CONVERSION_BUFFER_SIZE) {
 456         bufsiz = MINIMUM_CONVERSION_BUFFER_SIZE;
 457     }
 458     
 459     cinfo = jconv_open(toCode, fromCode);
 460     if (cinfo == NULL) {
 461         Scm_Error("conversion from code %s to code %s is not supported",
 462                   fromCode, toCode);
 463     }
 464     cinfo->remote = toPort;
 465     cinfo->ownerp = ownerp;
 466     cinfo->bufsiz = (bufsiz > 0)? bufsiz : DEFAULT_CONVERSION_BUFFER_SIZE;
 467     cinfo->remoteClosed = FALSE;
 468     cinfo->buf = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
 469     cinfo->ptr = cinfo->buf;
 470 
 471     memset(&bufrec, 0, sizeof(bufrec));
 472     bufrec.size = cinfo->bufsiz;
 473     bufrec.buffer = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
 474     bufrec.mode = SCM_PORT_BUFFER_FULL;
 475     bufrec.filler = NULL;
 476     bufrec.flusher = conv_output_flusher;
 477     bufrec.closer = conv_output_closer;
 478     bufrec.ready = conv_ready;
 479     bufrec.filenum = conv_fileno;
 480     bufrec.data = (void*)cinfo;
 481     
 482     name = conv_name(SCM_PORT_OUTPUT, toPort, fromCode, toCode);
 483     return Scm_MakeBufferedPort(SCM_CLASS_PORT, name, SCM_PORT_OUTPUT, TRUE, &bufrec);
 484 }
 485 
 486 /*------------------------------------------------------------
 487  * Direct interface for code guessing
 488  */
 489 const char *Scm_GuessCES(const char *code, const char *buf, int buflen)
 490 {
 491     conv_guess *guess = findGuessingProc(code);
 492     if (guess == NULL)
 493         Scm_Error("unknown code guessing scheme: %s", code);
 494     return guess->proc(buf, buflen, guess->data);
 495 }
 496 
 497 /*------------------------------------------------------------
 498  * UCS4 <-> internal character routine
 499  *
 500  * These routines are called when the literal character is given by
 501  * unicode notation (#\uXXXx, #\UXXXXXXXX or \uXXXX, \UXXXXXXXX inside 
 502  * string), or unicode->char routine is called.
  503  * For this purpose, we keep two global conversion contexts.
  504  * Since internal encodings are stateless, we can reuse those
  505  * contexts, instead of calling jconv_open every time.
 506  */
 507 
 508 static ScmChar ucstochar(int ucs4)
 509 {
 510 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
 511     return (ScmChar)ucs4;
 512 #else  /*!GAUCHE_CHAR_ENCODING_UTF_8*/
 513     char inbuf[6], outbuf[6];
 514     const char *inb = inbuf;
 515     char *outb = outbuf;
 516     size_t inroom, outroom, r;
 517     
 518     if (ucsconv.ucs2char == NULL) return SCM_CHAR_INVALID;
 519     inroom = UCS2UTF_NBYTES(ucs4);
 520     outroom = 6;
 521     jconv_ucs4_to_utf8(ucs4, inbuf);
 522     (void)SCM_INTERNAL_MUTEX_LOCK(ucsconv.mutex);
 523     r = jconv(ucsconv.ucs2char, &inb, &inroom, &outb, &outroom);
 524     (void)SCM_INTERNAL_MUTEX_UNLOCK(ucsconv.mutex);
 525     if (r == INPUT_NOT_ENOUGH || r == OUTPUT_NOT_ENOUGH) {
 526         Scm_Error("can't convert UCS4 code %d to a character: implementation problem?", ucs4);
 527     }
 528     if (r == ILLEGAL_SEQUENCE) {
 529         return SCM_CHAR_INVALID;
 530     } else {
 531         ScmChar out;
 532         SCM_CHAR_GET(outbuf, out);
 533         return out;
 534     }
 535 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
 536 }
 537 
 538 static int chartoucs(ScmChar ch)
 539 {
 540 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
 541     if (ch == SCM_CHAR_INVALID) return -1;
 542     return (int)ch;
 543 #else  /*!GAUCHE_CHAR_ENCODING_UTF_8*/
 544     char inbuf[6], outbuf[6];
 545     const char *inb = inbuf;
 546     char *outb = outbuf;
 547     size_t inroom, outroom, r;
 548 
 549     if (ch == SCM_CHAR_INVALID) return -1;
 550     if (ucsconv.char2ucs == NULL) return -1;
 551     inroom = SCM_CHAR_NBYTES(ch);
 552     outroom = 6;
 553     SCM_CHAR_PUT(inbuf, ch);
 554     (void)SCM_INTERNAL_MUTEX_LOCK(ucsconv.mutex);
 555     r = jconv(ucsconv.char2ucs, &inb, &inroom, &outb, &outroom);
 556     (void)SCM_INTERNAL_MUTEX_UNLOCK(ucsconv.mutex);
 557     if (r == INPUT_NOT_ENOUGH || r == OUTPUT_NOT_ENOUGH) {
 558         Scm_Error("can't convert character %u to UCS4 code: implementation problem?", ch);
 559     }
 560     if (r == ILLEGAL_SEQUENCE) {
 561         return -1;
 562     } else {
 563         unsigned char *ucp = (unsigned char*)outbuf;
 564         if (ucp[0] < 0x80) return (int)ucp[0];
 565         if (ucp[0] < 0xe0) {
 566             return ((ucp[0]&0x1f)<<6) + (ucp[1]&0x3f);
 567         }
 568         if (ucp[0] < 0xf0) {
 569             return ((ucp[0]&0x0f)<<12)
 570                    + ((ucp[1]&0x3f)<<6)
 571                    + (ucp[2]&0x3f);
 572         }
 573         if (ucp[0] < 0xf8) {
 574             return ((ucp[0]&0x07)<<18)
 575                    + ((ucp[1]&0x3f)<<12)
 576                    + ((ucp[2]&0x3f)<<6)
 577                    + (ucp[3]&0x3f);
 578         }
 579         if (ucp[0] < 0xfc) {
 580             return ((ucp[0]&0x03)<<24)
 581                    + ((ucp[1]&0x3f)<<18)
 582                    + ((ucp[2]&0x3f)<<12)
 583                    + ((ucp[3]&0x3f)<<6)
 584                    + (ucp[4]&0x3f);
 585         }
 586         if (ucp[0] < 0xfe) {
 587             return ((ucp[0]&0x01)<<30)
 588                    + ((ucp[1]&0x3f)<<24)
 589                    + ((ucp[2]&0x3f)<<18)
 590                    + ((ucp[3]&0x3f)<<12)
 591                    + ((ucp[4]&0x3f)<<6)
 592                    + (ucp[5]&0x3f);
 593         }
 594         return -1;
 595     }
 596 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
 597 }
 598 
 599 /*====================================================================
 600  * Initialization
 601  */
 602 extern void Scm_Init_convlib(ScmModule *module);
 603 extern void Scm_Init_convaux(void);
 604 extern void Scm_Init_convguess(void);
 605 SCM_EXTERN ScmChar (*Scm_UcsToCharHook)(int ucs4);
 606 SCM_EXTERN int (*Scm_CharToUcsHook)(ScmChar ch);
 607 SCM_EXTERN ScmPort *(*Scm_CodingAwarePortHook)(ScmPort *src,
 608                                                const char *encoding);
 609 
 610 void Scm_Init_libcharconv(void)
 611 {
 612     ScmModule *mod;
 613     SCM_INIT_EXTENSION(charconv);
 614     mod = SCM_FIND_MODULE("gauche.charconv", SCM_FIND_MODULE_CREATE);
 615     guess.procs = NULL;
 616     (void)SCM_INTERNAL_MUTEX_INIT(guess.mutex);
 617 #if   defined(GAUCHE_CHAR_ENCODING_UTF_8)
 618     ucsconv.ucs2char = ucsconv.char2ucs = NULL;
 619 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP)
 620     ucsconv.ucs2char = jconv_open("EUCJP", "UTF-8");
 621     ucsconv.char2ucs = jconv_open("UTF-8", "EUCJP");
 622 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
 623     ucsconv.ucs2char = jconv_open("SJIS", "UTF-8");
 624     ucsconv.char2ucs = jconv_open("UTF-8", "SJIS");
 625 #else
 626     ucsconv.ucs2char = ucsconv.char2ucs = NULL;
 627 #endif
 628     (void)SCM_INTERNAL_MUTEX_INIT(ucsconv.mutex);
 629     Scm_Init_convguess();
 630     Scm_Init_convlib(mod);
 631     Scm_Init_convaux();
 632     Scm_UcsToCharHook = ucstochar;
 633     Scm_CharToUcsHook = chartoucs;
 634     Scm_CodingAwarePortHook = coding_aware_conv;
 635 }

/* [<][>][^][v][top][bottom][index][help] */