/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- conv_guess
- Scm_GetCESName
- Scm_ConversionSupportedP
- Scm_RegisterCodeGuessingProc
- findGuessingProc
- conv_fileno
- conv_ready
- conv_name
- conv_input_filler
- conv_input_closer
- Scm_MakeInputConversionPort
- coding_aware_conv
- conv_output_closer
- conv_output_flusher
- Scm_MakeOutputConversionPort
- Scm_GuessCES
- ucstochar
- chartoucs
- Scm_Init_libcharconv
1 /*
2 * charconv.c - character code conversion library
3 *
4 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: charconv.c,v 1.53 2005/11/02 06:03:26 shirok Exp $
34 */
35
36 #include <string.h>
37 #include <errno.h>
38 #include <gauche.h>
39 #include <gauche/extend.h>
40 #include "charconv.h"
41
42 #define DEFAULT_CONVERSION_BUFFER_SIZE 1024
43 #define MINIMUM_CONVERSION_BUFFER_SIZE 16
44
45 typedef struct conv_guess_rec {
46 const char *codeName;
47 ScmCodeGuessingProc proc;
48 void *data;
49 struct conv_guess_rec *next;
50 } conv_guess;
51
52 /* anchor of the chain of conversion guessing procedures */
53 static struct {
54 int dummy; /* trick to place this in .data section */
55 conv_guess *procs;
56 ScmInternalMutex mutex;
57 } guess = { 1 };
58
59 /* anchor of the conversion context used for UCS -> internal char routine */
60 static struct {
61 int dummy; /* trick to place this in .data section */
62 ScmConvInfo *ucs2char;
63 ScmConvInfo *char2ucs;
64 ScmInternalMutex mutex;
65 } ucsconv = { 1 };
66
67 /*------------------------------------------------------------
68 * Query
69 */
70
71 /* Auxiliary function */
72 const char* Scm_GetCESName(ScmObj code, const char *argname)
73 {
74 const char *c = NULL;
75 if (SCM_UNBOUNDP(code) || SCM_FALSEP(code)) {
76 c = Scm_SupportedCharacterEncodings()[0];
77 } else if (SCM_STRINGP(code)) {
78 c = Scm_GetStringConst(SCM_STRING(code));
79 } else if (SCM_SYMBOLP(code)) {
80 c = Scm_GetStringConst(SCM_SYMBOL_NAME(code));
81 } else {
82 Scm_Error("string, symbol or #f is required for %s, but got %S",
83 argname, code);
84 }
85 return c;
86 }
87
88 int Scm_ConversionSupportedP(const char *from, const char *to)
89 {
90 ScmConvInfo *info = jconv_open(to, from);
91 if (info == NULL) return FALSE;
92 jconv_close(info);
93 return TRUE;
94 }
95
96 void Scm_RegisterCodeGuessingProc(const char *code,
97 ScmCodeGuessingProc proc,
98 void *data)
99 {
100 conv_guess *rec = SCM_NEW(conv_guess);
101 rec->codeName = code;
102 rec->proc = proc;
103 rec->data = data;
104 (void)SCM_INTERNAL_MUTEX_LOCK(guess.mutex);
105 rec->next = guess.procs;
106 guess.procs = rec;
107 (void)SCM_INTERNAL_MUTEX_UNLOCK(guess.mutex);
108 }
109
110 static conv_guess *findGuessingProc(const char *code)
111 {
112 conv_guess *rec;
113 (void)SCM_INTERNAL_MUTEX_LOCK(guess.mutex);
114 for (rec = guess.procs; rec; rec = rec->next) {
115 if (strcasecmp(rec->codeName, code) == 0) break;
116 }
117 (void)SCM_INTERNAL_MUTEX_UNLOCK(guess.mutex);
118 return rec;
119 }
120
121 static int conv_fileno(ScmPort *port)
122 {
123 ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
124 return Scm_PortFileNo(info->remote);
125 }
126
127 static int conv_ready(ScmPort *port)
128 {
129 ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
130 /* This isn't accurate, but for now ... */
131 return Scm_CharReady(info->remote);
132 }
133
134 static ScmObj conv_name(int dir, ScmPort *remote, const char *from, const char *to)
135 {
136 ScmObj out = Scm_MakeOutputStringPort(TRUE);
137 Scm_Printf(SCM_PORT(out), "[conv(%s->%s) %s %S]",
138 from, to, (dir == SCM_PORT_INPUT? "from" : "to"),
139 Scm_PortName(remote));
140 return Scm_GetOutputStringUnsafe(SCM_PORT(out));
141 }
142
143 /*------------------------------------------------------------
144 * Input conversion
145 *
146 * <-- Buffered port <--- filler <--(info->buf)--- getz(remote)
147 */
148
149 static int conv_input_filler(ScmPort *port, int mincnt)
150 {
151 ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
152 size_t insize, inroom, outroom, result;
153 int nread;
154 const char *inbuf = info->buf;
155 char *outbuf = port->src.buf.end;
156
157 if (info->remoteClosed) return 0;
158
159 /* Fill the input buffer. There may be some remaining bytes in the
160 inbuf from the last conversion (insize), so we try to fill the
161 rest. */
162 insize = info->ptr - info->buf;
163 nread = Scm_Getz(info->ptr, info->bufsiz - insize, info->remote);
164 if (nread <= 0) {
165 /* input reached EOF. finish the output state */
166 if (insize == 0) {
167 outroom = SCM_PORT_BUFFER_ROOM(port);
168 result = jconv_reset(info, outbuf, outroom);
169 if (result < 0) {
170 /* The port buffer doesn't have enough space to contain the
171 finishing sequence. Its unusual, for the port buffer
172 must be almost empty at this time, and the finishing
173 sequence is usually just a few bytes.
174 We signal an error. */
175 Scm_Error("couldn't flush the ending escape sequence in the character encoding conversion port (%s -> %s). possibly an implementation error",
176 info->fromCode, info->toCode);
177 }
178 if (info->ownerp) {
179 Scm_ClosePort(info->remote);
180 info->remoteClosed = TRUE;
181 }
182 #ifdef JCONV_DEBUG
183 fprintf(stderr, "<= r=%d (reset), out(%p)%d\n",
184 result, outbuf, outroom);
185 #endif
186 return result;
187 }
188 } else {
189 insize += nread;
190 }
191
192 /* Conversion. */
193 inroom = insize;
194 outroom = SCM_PORT_BUFFER_ROOM(port);
195
196 #ifdef JCONV_DEBUG
197 fprintf(stderr, "=> in(%p)%d out(%p)%d\n", inbuf, insize, outbuf, outroom);
198 #endif
199 result = jconv(info, &inbuf, &inroom, &outbuf, &outroom);
200 #ifdef JCONV_DEBUG
201 fprintf(stderr, "<= r=%d, in(%p)%d out(%p)%d\n",
202 result, inbuf, inroom, outbuf, outroom);
203 #endif
204 /* we've got an error. */
205 if (result == INPUT_NOT_ENOUGH || result == OUTPUT_NOT_ENOUGH) {
206 /* Conversion stopped due to an incomplete character at the
207 end of the input buffer, or the output buffer is full.
208 We shift the unconverted bytes to the beginning of input
209 buffer. */
210 memmove(info->buf, info->buf+insize-inroom, inroom);
211 info->ptr = info->buf + inroom;
212 return info->bufsiz - outroom;
213 } else if (result == ILLEGAL_SEQUENCE) {
214 /* it's likely that the input contains invalid sequence. */
215 int cnt = inroom >= 6 ? 6 : inroom;
216 ScmObj s = Scm_MakeString(info->buf+insize-inroom, cnt, cnt,
217 SCM_MAKSTR_COPYING|SCM_MAKSTR_INCOMPLETE);
218 Scm_Error("invalid character sequence in the input stream: %S ...", s);
219 }
220
221 /* Conversion is done completely. */
222 /* NB: There are cases that some bytes are left in the input buffer
223 even iconv returns positive value. We need to shift those bytes. */
224 if (inroom > 0) {
225 memmove(info->buf, info->buf+insize-inroom, inroom);
226 info->ptr = info->buf + inroom;
227 return info->bufsiz - outroom;
228 } else {
229 info->ptr = info->buf;
230 return info->bufsiz - outroom;
231 }
232 }
233
234 static void conv_input_closer(ScmPort *p)
235 {
236 ScmConvInfo *info = (ScmConvInfo*)p->src.buf.data;
237 jconv_close(info);
238 }
239
240 ScmObj Scm_MakeInputConversionPort(ScmPort *fromPort,
241 const char *fromCode,
242 const char *toCode,
243 ScmObj handler,
244 int bufsiz,
245 int ownerp)
246 {
247 ScmConvInfo *cinfo;
248 conv_guess *guess;
249 char *inbuf = NULL;
250 int preread = 0;
251 ScmPortBuffer bufrec;
252 ScmObj name;
253
254 if (!SCM_IPORTP(fromPort))
255 Scm_Error("input port required, but got %S", fromPort);
256
257 if (bufsiz <= 0) bufsiz = DEFAULT_CONVERSION_BUFFER_SIZE;
258 if (bufsiz <= MINIMUM_CONVERSION_BUFFER_SIZE) {
259 bufsiz = MINIMUM_CONVERSION_BUFFER_SIZE;
260 }
261 guess = findGuessingProc(fromCode);
262 if (guess) {
263 const char *guessed;
264
265 inbuf = SCM_NEW_ATOMIC2(char *, bufsiz);
266 preread = Scm_Getz(inbuf, bufsiz, fromPort);
267 if (preread <= 0) {
268 /* Input buffer is already empty or unreadable.
269 Determining character code is not necessary.
270 We just return a dummy empty port. */
271 return Scm_MakeInputStringPort(SCM_STRING(SCM_MAKE_STR("")), FALSE);
272 }
273 guessed = guess->proc(inbuf, preread, guess->data);
274 if (guessed == NULL)
275 Scm_Error("%s: failed to guess input encoding", fromCode);
276 fromCode = guessed;
277 }
278
279 cinfo = jconv_open(toCode, fromCode);
280 if (cinfo == NULL) {
281 Scm_Error("conversion from code %s to code %s is not supported",
282 fromCode, toCode);
283 }
284 cinfo->remote = fromPort;
285 cinfo->ownerp = ownerp;
286 cinfo->bufsiz = bufsiz;
287 cinfo->remoteClosed = FALSE;
288 if (preread > 0) {
289 cinfo->buf = inbuf;
290 cinfo->ptr = inbuf + preread;
291 } else {
292 cinfo->buf = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
293 cinfo->ptr = cinfo->buf;
294 }
295
296 memset(&bufrec, 0, sizeof(bufrec));
297 bufrec.size = cinfo->bufsiz;
298 bufrec.buffer = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
299 bufrec.mode = SCM_PORT_BUFFER_FULL;
300 bufrec.filler = conv_input_filler;
301 bufrec.flusher = NULL;
302 bufrec.closer = conv_input_closer;
303 bufrec.ready = conv_ready;
304 bufrec.filenum = conv_fileno;
305 bufrec.data = (void*)cinfo;
306
307 name = conv_name(SCM_PORT_INPUT, fromPort, fromCode, toCode);
308 return Scm_MakeBufferedPort(SCM_CLASS_PORT, name, SCM_PORT_INPUT, TRUE, &bufrec);
309 }
310
311 /* a special case of input conversion port --- coding-aware port coversion.
312 this function is called via Scm_CodingAwarePortHook from
313 src/port.c */
314
315 static ScmPort *coding_aware_conv(ScmPort *src, const char *encoding)
316 {
317 return SCM_PORT(Scm_MakeInputConversionPort(src,
318 encoding,
319 Scm_SupportedCharacterEncodings()[0],
320 SCM_FALSE,
321 0, TRUE));
322 }
323
324 /*------------------------------------------------------------
325 * Output conversion
326 *
327 * Buffered port ----> flusher -->(info->buf)--> putz(remote)
328 */
329
330 /* NB: Glibc-2.1.2's iconv() has a bug in SJIS handling. If output
331 * is in SJIS and output buffer overflows in the middle of two-byte
332 * sequence, it leaves the first byte in the output buffer as if
333 * it were valid converted character, while the input buffer pointer
334 * stops just before the unconverted character, as supposed.
335 * There's no way to detect that unless I scan the output by myself
336 * to see the last byte of conversion is invalid or not.
337 *
338 * As a workaround, I flush the output buffer more frequently than
339 * needed, avoiding the situation where the output buffer overflows.
340 * Hopefully the bug will be fixed in a future release of glibc.
341 */
342
343 #define GLIBC_2_1_ICONV_BUG
344
345 static void conv_output_closer(ScmPort *port)
346 {
347 ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
348 int r;
349
350 /* if there's remaining bytes in buf, send them to the remote port. */
351 if (info->ptr > info->buf) {
352 Scm_Putz(info->buf, info->ptr - info->buf, info->remote);
353 info->ptr = info->buf;
354 }
355 /* sends out the closing sequence, if any */
356 r = jconv_reset(info, info->buf, info->bufsiz);
357 #ifdef JCONV_DEBUG
358 fprintf(stderr, "<= r=%d(reset), buf(%p)\n",
359 r, info->buf);
360 #endif
361 if (r < 0) {
362 Scm_Error("something wrong in resetting output character encoding conversion (%s -> %s). possibly an implementation error.",
363 info->fromCode, info->toCode);
364 }
365 if (r > 0) {
366 Scm_Putz(info->buf, r, info->remote);
367 }
368 /* flush remove port */
369 Scm_Flush(info->remote);
370 if (info->ownerp) {
371 Scm_ClosePort(info->remote);
372 info->remoteClosed = TRUE;
373 }
374 jconv_close(info);
375 }
376
377 static int conv_output_flusher(ScmPort *port, int cnt, int forcep)
378 {
379 ScmConvInfo *info = (ScmConvInfo*)port->src.buf.data;
380 size_t outsize, inroom, outroom, result, len;
381 const char *inbuf;
382 char *outbuf;
383
384 inbuf = port->src.buf.buffer;
385 inroom = len = SCM_PORT_BUFFER_AVAIL(port);
386 for (;;) {
387 /* Conversion. */
388 outbuf = info->ptr;
389 outsize = info->bufsiz - (info->ptr - info->buf);
390 outroom = outsize;
391 #ifdef JCONV_DEBUG
392 fprintf(stderr, "=> in(%p,%p)%d out(%p,%p)%d\n",
393 inbuf, len, inroom,
394 info->buf, info->ptr, outroom);
395 #endif
396 result = jconv(info, &inbuf, &inroom, &outbuf, &outroom);
397 #ifdef JCONV_DEBUG
398 fprintf(stderr, "<= r=%d, in(%p)%d out(%p)%d\n",
399 result, inbuf, inroom, outbuf, outroom);
400 #endif
401 if (result == INPUT_NOT_ENOUGH) {
402 #ifndef GLIBC_2_1_ICONV_BUG
403 /* Conversion stopped due to an incomplete character at the
404 end of the input buffer. We just return # of bytes
405 flushed. (Shifting unconverted characters is done by
406 buffered port routine) */
407 info->ptr = outbuf;
408 #else
409 /* See the above notes. We always flush the output buffer
410 here, so that we can avoid output buffer overrun. */
411 Scm_Putz(info->buf, outbuf - info->buf, info->remote);
412 info->ptr = info->buf;
413 #endif
414 return len - inroom;
415 } else if (result == OUTPUT_NOT_ENOUGH) {
416 /* Output buffer got full. Flush it, and continue
417 conversion. */
418 Scm_Putz(info->buf, outbuf - info->buf, info->remote);
419 info->ptr = info->buf;
420 continue;
421 } else if (result == ILLEGAL_SEQUENCE) {
422 /* it's likely that input contains invalid sequence.
423 TODO: we should handle this case gracefully. */
424 Scm_Error("invalid character sequence in the input stream");
425 return 0; /* dummy */
426 } else {
427 #ifndef GLIBC_2_1_ICONV_BUG
428 /* Conversion is done completely. Update outptr. */
429 info->ptr = outbuf;
430 #else
431 /* See the above notes. We always flush the output buffer here,
432 so that we can avoid output buffer overrun. */
433 Scm_Putz(info->buf, outbuf - info->buf, info->remote);
434 info->ptr = info->buf;
435 #endif
436 if (forcep && len - inroom != cnt) continue;
437 return len - inroom;
438 }
439 }
440 }
441
442 ScmObj Scm_MakeOutputConversionPort(ScmPort *toPort,
443 const char *toCode,
444 const char *fromCode,
445 int bufsiz, int ownerp)
446 {
447 ScmConvInfo *cinfo;
448 ScmPortBuffer bufrec;
449 ScmObj name;
450
451 if (!SCM_OPORTP(toPort))
452 Scm_Error("output port required, but got %S", toPort);
453
454 if (bufsiz <= 0) bufsiz = DEFAULT_CONVERSION_BUFFER_SIZE;
455 if (bufsiz <= MINIMUM_CONVERSION_BUFFER_SIZE) {
456 bufsiz = MINIMUM_CONVERSION_BUFFER_SIZE;
457 }
458
459 cinfo = jconv_open(toCode, fromCode);
460 if (cinfo == NULL) {
461 Scm_Error("conversion from code %s to code %s is not supported",
462 fromCode, toCode);
463 }
464 cinfo->remote = toPort;
465 cinfo->ownerp = ownerp;
466 cinfo->bufsiz = (bufsiz > 0)? bufsiz : DEFAULT_CONVERSION_BUFFER_SIZE;
467 cinfo->remoteClosed = FALSE;
468 cinfo->buf = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
469 cinfo->ptr = cinfo->buf;
470
471 memset(&bufrec, 0, sizeof(bufrec));
472 bufrec.size = cinfo->bufsiz;
473 bufrec.buffer = SCM_NEW_ATOMIC2(char *, cinfo->bufsiz);
474 bufrec.mode = SCM_PORT_BUFFER_FULL;
475 bufrec.filler = NULL;
476 bufrec.flusher = conv_output_flusher;
477 bufrec.closer = conv_output_closer;
478 bufrec.ready = conv_ready;
479 bufrec.filenum = conv_fileno;
480 bufrec.data = (void*)cinfo;
481
482 name = conv_name(SCM_PORT_OUTPUT, toPort, fromCode, toCode);
483 return Scm_MakeBufferedPort(SCM_CLASS_PORT, name, SCM_PORT_OUTPUT, TRUE, &bufrec);
484 }
485
486 /*------------------------------------------------------------
487 * Direct interface for code guessing
488 */
489 const char *Scm_GuessCES(const char *code, const char *buf, int buflen)
490 {
491 conv_guess *guess = findGuessingProc(code);
492 if (guess == NULL)
493 Scm_Error("unknown code guessing scheme: %s", code);
494 return guess->proc(buf, buflen, guess->data);
495 }
496
497 /*------------------------------------------------------------
498 * UCS4 <-> internal character routine
499 *
500 * These routines are called when the literal character is given by
501 * unicode notation (#\uXXXx, #\UXXXXXXXX or \uXXXX, \UXXXXXXXX inside
502 * string), or unicode->char routine is called.
503 * For this purpose, we keep two global conversion context.
504 * Since internal encodings are stateless, we can reuse those
505 * context, instead of calling jconv_open every time.
506 */
507
508 static ScmChar ucstochar(int ucs4)
509 {
510 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
511 return (ScmChar)ucs4;
512 #else /*!GAUCHE_CHAR_ENCODING_UTF_8*/
513 char inbuf[6], outbuf[6];
514 const char *inb = inbuf;
515 char *outb = outbuf;
516 size_t inroom, outroom, r;
517
518 if (ucsconv.ucs2char == NULL) return SCM_CHAR_INVALID;
519 inroom = UCS2UTF_NBYTES(ucs4);
520 outroom = 6;
521 jconv_ucs4_to_utf8(ucs4, inbuf);
522 (void)SCM_INTERNAL_MUTEX_LOCK(ucsconv.mutex);
523 r = jconv(ucsconv.ucs2char, &inb, &inroom, &outb, &outroom);
524 (void)SCM_INTERNAL_MUTEX_UNLOCK(ucsconv.mutex);
525 if (r == INPUT_NOT_ENOUGH || r == OUTPUT_NOT_ENOUGH) {
526 Scm_Error("can't convert UCS4 code %d to a character: implementation problem?", ucs4);
527 }
528 if (r == ILLEGAL_SEQUENCE) {
529 return SCM_CHAR_INVALID;
530 } else {
531 ScmChar out;
532 SCM_CHAR_GET(outbuf, out);
533 return out;
534 }
535 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
536 }
537
538 static int chartoucs(ScmChar ch)
539 {
540 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
541 if (ch == SCM_CHAR_INVALID) return -1;
542 return (int)ch;
543 #else /*!GAUCHE_CHAR_ENCODING_UTF_8*/
544 char inbuf[6], outbuf[6];
545 const char *inb = inbuf;
546 char *outb = outbuf;
547 size_t inroom, outroom, r;
548
549 if (ch == SCM_CHAR_INVALID) return -1;
550 if (ucsconv.char2ucs == NULL) return -1;
551 inroom = SCM_CHAR_NBYTES(ch);
552 outroom = 6;
553 SCM_CHAR_PUT(inbuf, ch);
554 (void)SCM_INTERNAL_MUTEX_LOCK(ucsconv.mutex);
555 r = jconv(ucsconv.char2ucs, &inb, &inroom, &outb, &outroom);
556 (void)SCM_INTERNAL_MUTEX_UNLOCK(ucsconv.mutex);
557 if (r == INPUT_NOT_ENOUGH || r == OUTPUT_NOT_ENOUGH) {
558 Scm_Error("can't convert character %u to UCS4 code: implementation problem?", ch);
559 }
560 if (r == ILLEGAL_SEQUENCE) {
561 return -1;
562 } else {
563 unsigned char *ucp = (unsigned char*)outbuf;
564 if (ucp[0] < 0x80) return (int)ucp[0];
565 if (ucp[0] < 0xe0) {
566 return ((ucp[0]&0x1f)<<6) + (ucp[1]&0x3f);
567 }
568 if (ucp[0] < 0xf0) {
569 return ((ucp[0]&0x0f)<<12)
570 + ((ucp[1]&0x3f)<<6)
571 + (ucp[2]&0x3f);
572 }
573 if (ucp[0] < 0xf8) {
574 return ((ucp[0]&0x07)<<18)
575 + ((ucp[1]&0x3f)<<12)
576 + ((ucp[2]&0x3f)<<6)
577 + (ucp[3]&0x3f);
578 }
579 if (ucp[0] < 0xfc) {
580 return ((ucp[0]&0x03)<<24)
581 + ((ucp[1]&0x3f)<<18)
582 + ((ucp[2]&0x3f)<<12)
583 + ((ucp[3]&0x3f)<<6)
584 + (ucp[4]&0x3f);
585 }
586 if (ucp[0] < 0xfe) {
587 return ((ucp[0]&0x01)<<30)
588 + ((ucp[1]&0x3f)<<24)
589 + ((ucp[2]&0x3f)<<18)
590 + ((ucp[3]&0x3f)<<12)
591 + ((ucp[4]&0x3f)<<6)
592 + (ucp[5]&0x3f);
593 }
594 return -1;
595 }
596 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
597 }
598
599 /*====================================================================
600 * Initialization
601 */
602 extern void Scm_Init_convlib(ScmModule *module);
603 extern void Scm_Init_convaux(void);
604 extern void Scm_Init_convguess(void);
605 SCM_EXTERN ScmChar (*Scm_UcsToCharHook)(int ucs4);
606 SCM_EXTERN int (*Scm_CharToUcsHook)(ScmChar ch);
607 SCM_EXTERN ScmPort *(*Scm_CodingAwarePortHook)(ScmPort *src,
608 const char *encoding);
609
610 void Scm_Init_libcharconv(void)
611 {
612 ScmModule *mod;
613 SCM_INIT_EXTENSION(charconv);
614 mod = SCM_FIND_MODULE("gauche.charconv", SCM_FIND_MODULE_CREATE);
615 guess.procs = NULL;
616 (void)SCM_INTERNAL_MUTEX_INIT(guess.mutex);
617 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
618 ucsconv.ucs2char = ucsconv.char2ucs = NULL;
619 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP)
620 ucsconv.ucs2char = jconv_open("EUCJP", "UTF-8");
621 ucsconv.char2ucs = jconv_open("UTF-8", "EUCJP");
622 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
623 ucsconv.ucs2char = jconv_open("SJIS", "UTF-8");
624 ucsconv.char2ucs = jconv_open("UTF-8", "SJIS");
625 #else
626 ucsconv.ucs2char = ucsconv.char2ucs = NULL;
627 #endif
628 (void)SCM_INTERNAL_MUTEX_INIT(ucsconv.mutex);
629 Scm_Init_convguess();
630 Scm_Init_convlib(mod);
631 Scm_Init_convaux();
632 Scm_UcsToCharHook = ucstochar;
633 Scm_CharToUcsHook = chartoucs;
634 Scm_CodingAwarePortHook = coding_aware_conv;
635 }