/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes following definitions.
- Scm_CharEncodingName
- Scm_SupportedCharacterEncodings
- Scm_SupportedCharacterEncodingP
- Scm_DigitToInt
- Scm_IntToDigit
- Scm_UcsToChar
- Scm_CharToUcs
- charset_print_ch
- charset_print
- make_charset
- Scm_MakeEmptyCharSet
- Scm_CopyCharSet
- Scm_ReadXdigitsFromString
- Scm_ReadXdigitsFromPort
- charset_compare
- Scm_CharSetEq
- Scm_CharSetLE
- newrange
- Scm_CharSetAddRange
- Scm_CharSetAdd
- Scm_CharSetComplement
- Scm_CharSetCaseFold
- Scm_CharSetContains
- Scm_CharSetRanges
- Scm_CharSetDump
- read_charset_xdigits
- Scm_CharSetRead
- read_predef_charset
- install_charsets
- Scm_GetStandardCharSet
- Scm__InitChar
1 /*
2 * char.c - character and character set operations
3 *
4 * Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: char.c,v 1.43 2005/10/28 02:53:10 shirok Exp $
34 */
35
36 #include <ctype.h>
37 #define LIBGAUCHE_BODY
38 #include "gauche.h"
39
40 /*=======================================================================
41 * Character functions
42 */
43
44 ScmObj Scm_CharEncodingName(void)
45 {
46 return SCM_INTERN(SCM_CHAR_ENCODING_NAME);
47 }
48
49 /* includes encoding-specific auxiliary functions */
50 #define SCM_CHAR_ENCODING_BODY
51 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP)
52 #include "gauche/char_euc_jp.h"
53 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
54 #include "gauche/char_utf_8.h"
55 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
56 #include "gauche/char_sjis.h"
57 #else
58 #include "gauche/char_none.h"
59 #endif
60
61 const char **Scm_SupportedCharacterEncodings(void)
62 {
63 return supportedCharacterEncodings;
64 }
65
66 int Scm_SupportedCharacterEncodingP(const char *encoding)
67 {
68 const char **cs = supportedCharacterEncodings;
69 for (;*cs;cs++) {
70 const char *p = *cs;
71 const char *q = encoding;
72 for (;*p && *q; p++, q++) {
73 if (tolower(*p) != tolower(*q)) break;
74 }
75 if (*p == '\0' && *q == '\0') return TRUE;
76 }
77 return FALSE;
78 }
79
80 /* '0' -> 0, 'a' -> 10, etc.
81 Radix is assumed in the range [2, 36] */
82 int Scm_DigitToInt(ScmChar ch, int radix)
83 {
84 if (ch < '0') return -1;
85 if (radix <= 10) {
86 if (ch <= '0' + radix) return (ch - '0');
87 } else {
88 if (ch <= '9') return (ch - '0');
89 if (ch < 'A') return -1;
90 if (ch < 'A' + radix - 10) return (ch - 'A' + 10);
91 if (ch < 'a') return -1;
92 if (ch < 'a' + radix - 10) return (ch - 'a' + 10);
93 }
94 return -1;
95 }
96
97 ScmChar Scm_IntToDigit(int n, int radix)
98 {
99 if (n < 0) return SCM_CHAR_INVALID;
100 if (radix <= 10) {
101 if (n < radix) return (ScmChar)(n + '0');
102 else return SCM_CHAR_INVALID;
103 } else {
104 if (n < 10) return (ScmChar)(n + '0');
105 if (n < radix) return (ScmChar)(n - 10 + 'a');
106 else return SCM_CHAR_INVALID;
107 }
108 }
109
110 /*
111 * Convert UCS4 code <-> character
112 * If the native encoding is not utf-8, gauche.charconv module is loaded.
113 */
114 ScmChar (*Scm_UcsToCharHook)(int ucs4) = NULL; /* filled by ext/charconv */
115 int (*Scm_CharToUcsHook)(ScmChar ch) = NULL; /* filled by ext/charconv */
116
117 ScmChar Scm_UcsToChar(int n)
118 {
119 if (n < 0) Scm_Error("bad character code: %d", n);
120 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
121 return (ScmChar)n;
122 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
123 if (n < 0x80) return (ScmChar)n; /*ASCII range*/
124 if (Scm_UcsToCharHook == NULL) {
125 /* NB: we don't need mutex here, for the loading of gauche.charconv
126 is serialized in Scm_Require. */
127 Scm_Require(SCM_MAKE_STR("gauche/charconv"));
128 if (Scm_UcsToCharHook == NULL) {
129 Scm_Error("couldn't autoload gauche.charconv");
130 }
131 }
132 return Scm_UcsToCharHook(n);
133 #else
134 if (n < 0x100) return (ScmChar)n; /* ISO8859-1 */
135 else return SCM_CHAR_INVALID;
136 #endif
137 }
138
139 int Scm_CharToUcs(ScmChar ch)
140 {
141 if (ch == SCM_CHAR_INVALID) Scm_Error("bad character");
142 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
143 return (int)ch;
144 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
145 if (ch < 0x80) return (int)ch; /*ASCII range*/
146 if (Scm_CharToUcsHook == NULL) {
147 /* NB: we don't need mutex here, for the loading of gauche.charconv
148 is serialized in Scm_Require. */
149 Scm_Require(SCM_MAKE_STR("gauche/charconv"));
150 if (Scm_CharToUcsHook == NULL) {
151 Scm_Error("couldn't autoload gauche.charconv");
152 }
153 }
154 return Scm_CharToUcsHook(ch);
155 #else
156 return (int)ch; /* ISO8859-1 */
157 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
158 }
159
160 /*=======================================================================
161 * Character set (cf. SRFI-14)
162 */
163 /* NB: operations on charset are not very optimized, for I don't see
164 * the immediate needs to do so, except Scm_CharSetContains.
165 */
166
167 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext*);
168 static int charset_compare(ScmObj x, ScmObj y, int equalp);
169 SCM_DEFINE_BUILTIN_CLASS(Scm_CharSetClass,
170 charset_print, charset_compare, NULL, NULL,
171 SCM_CLASS_DEFAULT_CPL);
172
173 /* masks */
174 #if SIZEOF_LONG == 4
175 #define MASK_BIT_SHIFT 5
176 #define MASK_BIT_MASK 0x1f
177 #elif SIZEOF_LONG == 8
178 #define MASK_BIT_SHIFT 6
179 #define MASK_BIT_MASK 0x3f
180 #elif SIZEOF_LONG == 16 /* maybe, in some future ... */
181 #define MASK_BIT_SHIFT 7
182 #define MASK_BIT_MASK 0x7f
183 #else
184 #error need to set SIZEOF_LONG
185 #endif
186
187 #define MASK_INDEX(ch) ((ch) >> MASK_BIT_SHIFT)
188 #define MASK_BIT(ch) (1L << ((ch) & MASK_BIT_MASK))
189 #define MASK_ISSET(cs, ch) (!!(cs->mask[MASK_INDEX(ch)] & MASK_BIT(ch)))
190 #define MASK_SET(cs, ch) (cs->mask[MASK_INDEX(ch)] |= MASK_BIT(ch))
191 #define MASK_RESET(cs, ch) (cs->mask[MASK_INDEX(ch)] &= ~MASK_BIT(ch))
192
193 /*----------------------------------------------------------------------
194 * Printer
195 */
196 static void charset_print_ch(ScmPort *out, ScmChar ch)
197 {
198 if (ch < 0x20 || ch == 0x7f) {
199 Scm_Printf(out, "\\x%02x", ch);
200 } else {
201 char chbuf[SCM_CHAR_MAX_BYTES];
202 int i;
203 SCM_CHAR_PUT(chbuf, ch);
204 for (i=0; i<SCM_CHAR_NBYTES(ch); i++) {
205 Scm_Printf(out, "%c", chbuf[i]);
206 }
207 }
208 }
209
210 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext *ctx)
211 {
212 int prev, code;
213 ScmCharSet *cs = SCM_CHARSET(obj);
214 struct ScmCharSetRange *r;
215
216 Scm_Printf(out, "#[");
217 for (prev = -1, code = 0; code < SCM_CHARSET_MASK_CHARS; code++) {
218 if (MASK_ISSET(cs, code) && prev < 0) {
219 charset_print_ch(out, code);
220 prev = code;
221 }
222 else if (!MASK_ISSET(cs, code) && prev >= 0) {
223 if (code - prev > 1) {
224 if (code - prev > 2) Scm_Printf(out, "-");
225 charset_print_ch(out, code-1);
226 }
227 prev = -1;
228 }
229 }
230 if (prev >= 0) {
231 if (code - prev > 1) {
232 if (prev < 0x7e) Scm_Printf(out, "-");
233 charset_print_ch(out, code-1);
234 }
235 }
236 for (r = cs->ranges; r; r = r->next) {
237 charset_print_ch(out, r->lo);
238 if (r->hi == r->lo) continue;
239 if (r->hi - r->lo > 2) Scm_Printf(out, "-");
240 charset_print_ch(out, r->hi);
241 }
242 Scm_Printf(out, "]", obj);
243 }
244
245 /*-----------------------------------------------------------------
246 * Constructors
247 */
248 static ScmCharSet *make_charset(void)
249 {
250 ScmCharSet *cs = SCM_NEW(ScmCharSet);
251 int i;
252 SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
253 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++) cs->mask[i] = 0;
254 cs->ranges = NULL;
255 return cs;
256 }
257
258 ScmObj Scm_MakeEmptyCharSet(void)
259 {
260 return SCM_OBJ(make_charset());
261 }
262
263 ScmObj Scm_CopyCharSet(ScmCharSet *src)
264 {
265 ScmCharSet *dst = make_charset();
266 struct ScmCharSetRange *rs, *rd = dst->ranges;
267 int i;
268
269 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++) dst->mask[i] = src->mask[i];
270 for (rs = src->ranges; rs; rs = rs->next) {
271 if (rd == NULL) {
272 rd = dst->ranges = SCM_NEW(struct ScmCharSetRange);
273 } else {
274 rd->next = SCM_NEW(struct ScmCharSetRange);
275 rd = rd->next;
276 }
277 rd->lo = rs->lo;
278 rd->hi = rs->hi;
279 }
280 if (rd) rd->next = NULL;
281 return SCM_OBJ(dst);
282 }
283
284 /* Helper functions to read the escaped character code sequence, such as
285 \xXX, \uXXXX, or \UXXXXXXXX.
286 Scm_ReadXdigitsFromString reads from char* buffer (note that hex digits
287 consist of single-byte characters in any encoding, we don't need to
288 do the cumbersome multibyte handling). Scm_ReadXdigitsFromPort reads
289 from the port. Both should be called after the prefix 'x', 'u' or 'U'
290 char is read. NDIGITS specifies either exact number of digits to be
291 expected or maximum number of digits. */
292
293 /* If nextbuf == NULL, ndigits specifies exact # of digits. Returns
294 SCM_CHAR_INVALID if there are less digits. Otherwise, ndigis specifies
295 max # of digits, and the ptr to the next char is stored in nextbuf. */
296 ScmChar Scm_ReadXdigitsFromString(const char *buf, int ndigits,
297 const char **nextbuf)
298 {
299 int i, val = 0;
300 for (i=0; i<ndigits; i++) {
301 if (!isxdigit(buf[i])) {
302 if (nextbuf == NULL) return SCM_CHAR_INVALID;
303 else {
304 *nextbuf = buf;
305 return val;
306 }
307 }
308 val = val * 16 + Scm_DigitToInt(buf[i], 16);
309 }
310 return (ScmChar)val;
311 }
312
313 /* ndigits specifies exact # of digits. read chars are stored in buf
314 so that they can be used in the error message. Caller must provide
315 a sufficient space for buf. */
316 ScmChar Scm_ReadXdigitsFromPort(ScmPort *port, int ndigits,
317 char *buf, int *nread)
318 {
319 int i, c, val = 0, dig;
320
321 for (i = 0; i < ndigits; i++) {
322 SCM_GETC(c, port);
323 if (c == EOF) break;
324 dig = Scm_DigitToInt(c, 16);
325 if (dig < 0) {
326 SCM_UNGETC(c, port);
327 break;
328 }
329 buf[i] = (char)c; /* we know c is single byte char here. */
330 val = val * 16 + dig;
331 }
332 *nread = i;
333 if (i < ndigits) { /* error */
334 return SCM_CHAR_INVALID;
335 } else {
336 return (ScmChar)val;
337 }
338 }
339
340 /*-----------------------------------------------------------------
341 * Comparison
342 */
343 static int charset_compare(ScmObj x, ScmObj y, int equalp)
344 {
345 ScmCharSet *xx = SCM_CHARSET(x);
346 ScmCharSet *yy = SCM_CHARSET(y);
347
348 if (equalp) {
349 return (Scm_CharSetEq(xx, yy)? 0 : 1);
350 } else {
351 if (Scm_CharSetEq(xx, yy)) return 0;
352 if (Scm_CharSetLE(xx, yy)) return -1;
353 if (Scm_CharSetLE(yy, xx)) return 1;
354 Scm_Error("cannot compare char-sets: %S vs %S", x, y);
355 return 0; /* dummy */
356 }
357 }
358
359 int Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y)
360 {
361 int i;
362 struct ScmCharSetRange *rx, *ry;
363 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
364 if (x->mask[i] != y->mask[i]) return FALSE;
365 for (rx=x->ranges, ry=y->ranges; rx && ry; rx=rx->next, ry=ry->next) {
366 if (rx->lo != ry->lo || rx->hi != ry->hi) return FALSE;
367 }
368 if (rx || ry) return FALSE;
369 return TRUE;
370 }
371
372 /* whether x <= y */
373 int Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y)
374 {
375 int i;
376 struct ScmCharSetRange *rx, *ry;
377 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
378 if ((x->mask[i] | y->mask[i]) != y->mask[i]) return FALSE;
379 rx = x->ranges;
380 ry = y->ranges;
381 while (rx && ry) {
382 if (rx->lo < ry->lo) return FALSE;
383 if (rx->lo > ry->hi) { ry = ry->next; continue; }
384 if (rx->hi > ry->hi) return FALSE;
385 rx = rx->next;
386 }
387 if (rx) return FALSE;
388 return TRUE;
389 }
390
391 /*-----------------------------------------------------------------
392 * Modification
393 */
394
395 static struct ScmCharSetRange *newrange(int lo, int hi,
396 struct ScmCharSetRange *next)
397 {
398 struct ScmCharSetRange *n = SCM_NEW(struct ScmCharSetRange);
399 n->next = next;
400 n->lo = lo;
401 n->hi = hi;
402 return n;
403 }
404
405 ScmObj Scm_CharSetAddRange(ScmCharSet *cs, ScmChar from, ScmChar to)
406 {
407 int i;
408 struct ScmCharSetRange *lo, *lop, *hi;
409
410 if (to < from) return SCM_OBJ(cs);
411 if (from < SCM_CHARSET_MASK_CHARS) {
412 if (to < SCM_CHARSET_MASK_CHARS) {
413 for (i=from; i<=to; i++) MASK_SET(cs, i);
414 return SCM_OBJ(cs);
415 }
416 for (i=from; i<SCM_CHARSET_MASK_CHARS; i++) MASK_SET(cs, i);
417 from = SCM_CHARSET_MASK_CHARS;
418 }
419 if (cs->ranges == NULL) {
420 cs->ranges = newrange(from, to, NULL);
421 return SCM_OBJ(cs);
422 }
423 /* Add range. Ranges are chained from lower character code to higher,
424 without any overlap. */
425 /* First, we scan the ranges so that we'll get...
426 - if FROM is in a range, lo points to it.
427 - if FROM is out of any ranges, lo points to the closest range that
428 is higher than FROM.
429 - if TO is in a range, hi points to the range.
430 - if TO is out of any ranges, hi points to the closest range that
431 is higher than TO. */
432 for (lop = NULL, lo = cs->ranges; lo; lop = lo, lo = lo->next) {
433 if (from <= lo->hi+1) break;
434 }
435 if (!lo) {
436 lop->next = newrange(from, to, NULL);
437 return SCM_OBJ(cs);
438 }
439 for (hi = lo; hi; hi = hi->next) {
440 if (to <= hi->hi) break;
441 }
442 /* Then we insert, extend and/or merge the ranges accordingly. */
443 if (from < lo->lo) { /* FROM extends the LO */
444 if (lo == hi) {
445 if (to < hi->lo-1) {
446 if (lop == NULL) cs->ranges = newrange(from, to, lo);
447 else lop->next = newrange(from, to, lo);
448 } else {
449 lo->lo = from;
450 }
451 } else if (hi == NULL || to < hi->lo-1) {
452 lo->lo = from;
453 lo->hi = to;
454 lo->next = hi;
455 } else {
456 lo->lo = from;
457 lo->hi = hi->hi;
458 lo->next = hi->next;
459 }
460 } else { /* FROM included in LO */
461 if (lo != hi) {
462 if (hi == NULL || to < hi->lo-1) {
463 lo->hi = to;
464 lo->next = hi;
465 } else {
466 lo->hi = hi->hi;
467 lo->next = hi->next;
468 }
469 }
470 }
471 /* WRITE ME */
472 return SCM_OBJ(cs);
473 }
474
475 ScmObj Scm_CharSetAdd(ScmCharSet *dst, ScmCharSet *src)
476 {
477 int i;
478 struct ScmCharSetRange *r;
479 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
480 dst->mask[i] |= src->mask[i];
481 for (r = src->ranges; r; r = r->next) {
482 Scm_CharSetAddRange(dst, r->lo, r->hi);
483 }
484 return SCM_OBJ(dst);
485 }
486
487 ScmObj Scm_CharSetComplement(ScmCharSet *cs)
488 {
489 int i, last;
490 struct ScmCharSetRange *r, *p;
491 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
492 cs->mask[i] = ~cs->mask[i];
493 last = SCM_CHARSET_MASK_CHARS;
494 for (p = NULL, r = cs->ranges; r; p = r, r = r->next) {
495 int hi = r->hi+1;
496 if (r->lo != SCM_CHARSET_MASK_CHARS) {
497 r->hi = r->lo - 1;
498 r->lo = last;
499 } else {
500 cs->ranges = r->next;
501 }
502 last = hi;
503 }
504 if (last < SCM_CHAR_MAX) {
505 if (!p) cs->ranges = newrange(last, SCM_CHAR_MAX, NULL);
506 else p->next = newrange(last, SCM_CHAR_MAX, NULL);
507 }
508 return SCM_OBJ(cs);
509 }
510
511 /* Make charset case-insensitive. For now, we only deal with
512 ASCII range. */
513 ScmObj Scm_CharSetCaseFold(ScmCharSet *cs)
514 {
515 int ch;
516 for (ch='a'; ch<='z'; ch++) {
517 if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) {
518 MASK_SET(cs, ch);
519 MASK_SET(cs, (ch-('a'-'A')));
520 }
521 }
522 return SCM_OBJ(cs);
523 }
524
525 /*-----------------------------------------------------------------
526 * Query
527 */
528
529 int Scm_CharSetContains(ScmCharSet *cs, ScmChar c)
530 {
531 if (c < 0) return FALSE;
532 if (c < SCM_CHARSET_MASK_CHARS) return MASK_ISSET(cs, c);
533 else {
534 struct ScmCharSetRange *r;
535 for (r = cs->ranges; r; r = r->next) {
536 if (r->lo <= c && c <= r->hi) return TRUE;
537 }
538 return FALSE;
539 }
540 }
541
542 /*-----------------------------------------------------------------
543 * Inspection
544 */
545
546 /* returns a list of ranges contained in the charset */
547 ScmObj Scm_CharSetRanges(ScmCharSet *cs)
548 {
549 ScmObj h = SCM_NIL, t = SCM_NIL, cell;
550 int ind, begin = 0, prev = FALSE;
551 struct ScmCharSetRange *r;
552
553 for (ind = 0; ind < SCM_CHARSET_MASK_CHARS; ind++) {
554 int bit = MASK_ISSET(cs, ind);
555 if (!prev && bit) begin = ind;
556 if (prev && !bit) {
557 cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
558 SCM_APPEND1(h, t, cell);
559 }
560 prev = bit;
561 }
562 if (prev) {
563 if (!cs->ranges || cs->ranges->lo != SCM_CHARSET_MASK_CHARS) {
564 cell = Scm_Cons(SCM_MAKE_INT(begin),
565 SCM_MAKE_INT(SCM_CHARSET_MASK_CHARS-1));
566 SCM_APPEND1(h, t, cell);
567 r = cs->ranges;
568 } else {
569 cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(cs->ranges->hi));
570 SCM_APPEND1(h, t, cell);
571 r = cs->ranges->next;
572 }
573 } else {
574 r = cs->ranges;
575 }
576 for (; r; r = r->next) {
577 cell = Scm_Cons(SCM_MAKE_INT(r->lo), SCM_MAKE_INT(r->hi));
578 SCM_APPEND1(h, t, cell);
579 }
580 return h;
581 }
582
583 #if SCM_DEBUG_HELPER
584 void Scm_CharSetDump(ScmCharSet *cs, ScmPort *port)
585 {
586 int i;
587 struct ScmCharSetRange *r;
588 Scm_Printf(port, "CharSet %p\nmask:", cs);
589 for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
590 Scm_Printf(port, "[%08x]", cs->mask[i]);
591 Scm_Printf(port, "\nranges:");
592 for (r=cs->ranges; r; r=r->next)
593 Scm_Printf(port, "(%d-%d)", r->lo, r->hi);
594 Scm_Printf(port, "\n");
595 }
596 #endif /* SCM_DEBUG_HELPER */
597
598 /*-----------------------------------------------------------------
599 * Reader
600 */
601
602 /* Read \x, \u, \U escape sequence in the charset spec. */
603 static ScmChar read_charset_xdigits(ScmPort *port, int ndigs, int key)
604 {
605 char buf[8];
606 int nread;
607 ScmChar r;
608 SCM_ASSERT(ndigs <= 8);
609 r = Scm_ReadXdigitsFromPort(port, ndigs, buf, &nread);
610 if (r == SCM_CHAR_INVALID) {
611 ScmDString ds;
612 int c, i;
613 /* skip chars to the end of regexp, so that the reader will read
614 after the erroneous string */
615 for (;;) {
616 SCM_GETC(c, port);
617 if (c == EOF || c == ']') break;
618 if (c == '\\') SCM_GETC(c, port);
619 }
620 /* construct an error message */
621 Scm_DStringInit(&ds);
622 Scm_DStringPutc(&ds, '\\');
623 Scm_DStringPutc(&ds, key);
624 for (i=0; i<nread; i++) Scm_DStringPutc(&ds, (unsigned char)buf[i]);
625 Scm_Error("Bad '\\%c' escape sequence in a char-set literal: %s",
626 key, Scm_DStringGetz(&ds));
627 }
628 return r;
629 }
630
631 /* Parse regexp-style character set specification (e.g. [a-zA-Z]).
632 Assumes the opening bracket is already read.
633 Always return a fresh charset, that can be modified afterwards.
634
635 If the input syntax is invalid, either signals an error or returns
636 #f, depending error_p flag.
637
638 If bracket_syntax is TRUE, the first closing bracket ']' in the
639 charset (except the complimenting caret) is taken as a literal
640 character, instead of terminating the charset. It should be TRUE
641 during reading the regexp syntax for compatibility to POSIX regexp.
642
643 If complement_p is not NULL, the location get a boolean value of
644 whether complement character (caret in the beginning) appeared or not.
645 In that case, the returned charset is not complemented. */
646
647 static ScmObj read_predef_charset(ScmPort*, ScmObj*);
648
649 ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p,
650 int error_p, int bracket_syntax)
651 {
652 #define REAL_BEGIN 1
653 #define CARET_BEGIN 2
654 int begin = REAL_BEGIN, complement = FALSE;
655 int lastchar = -1, inrange = FALSE, moreset_complement = FALSE;
656 ScmCharSet *set = SCM_CHARSET(Scm_MakeEmptyCharSet());
657 ScmObj moreset;
658 ScmObj chars = SCM_NIL;
659 ScmChar ch = 0;
660
661 for (;;) {
662 SCM_GETC(ch, input);
663 if (ch == EOF) goto err;
664 chars = Scm_Cons(SCM_MAKE_CHAR(ch), chars);
665
666 if (begin == REAL_BEGIN && ch == '^') {
667 complement = TRUE;
668 begin = CARET_BEGIN;
669 continue;
670 }
671 if (bracket_syntax && begin && ch == ']') {
672 Scm_CharSetAddRange(set, ch, ch);
673 lastchar = ch;
674 begin = FALSE;
675 continue;
676 }
677 begin = FALSE;
678
679 switch (ch) {
680 case '-':
681 if (inrange) goto ordchar;
682 inrange = TRUE;
683 continue;
684 case ']':
685 if (inrange) {
686 if (lastchar >= 0) {
687 Scm_CharSetAddRange(set, lastchar, lastchar);
688 Scm_CharSetAddRange(set, '-', '-');
689 } else {
690 Scm_CharSetAddRange(set, '-', '-');
691 }
692 }
693 break;
694 case '\\':
695 SCM_GETC(ch, input);
696 if (ch == SCM_CHAR_INVALID) goto err;
697 chars = Scm_Cons(SCM_MAKE_CHAR(ch), chars);
698 switch (ch) {
699 case 'a': ch = 7; goto ordchar;
700 case 'b': ch = 8; goto ordchar;
701 case 'n': ch = '\n'; goto ordchar;
702 case 'r': ch = '\r'; goto ordchar;
703 case 't': ch = '\t'; goto ordchar;
704 case 'f': ch = '\f'; goto ordchar;
705 case 'e': ch = 0x1b; goto ordchar;
706 case 'x':
707 ch = read_charset_xdigits(input, 2, 'x'); goto ordchar;
708 case 'u':
709 ch = Scm_UcsToChar(read_charset_xdigits(input, 4, 'u'));
710 goto ordchar;
711 case 'U':
712 ch = Scm_UcsToChar(read_charset_xdigits(input, 8, 'U'));
713 goto ordchar;
714 case 'd':
715 moreset_complement = FALSE;
716 moreset = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
717 break;
718 case 'D':
719 moreset_complement = TRUE;
720 moreset = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
721 break;
722 case 's':
723 moreset_complement = FALSE;
724 moreset = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
725 break;
726 case 'S':
727 moreset_complement = TRUE;
728 moreset = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
729 break;
730 case 'w':
731 moreset_complement = FALSE;
732 moreset = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
733 break;
734 case 'W':
735 moreset_complement = TRUE;
736 moreset = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
737 break;
738 default:
739 goto ordchar;
740 }
741 if (moreset_complement) {
742 moreset = Scm_CharSetComplement(SCM_CHARSET(Scm_CopyCharSet(SCM_CHARSET(moreset))));
743 }
744 Scm_CharSetAdd(set, SCM_CHARSET(moreset));
745 continue;
746 case '[':
747 moreset = read_predef_charset(input, &chars);
748 if (!SCM_CHARSETP(moreset)) goto err;
749 Scm_CharSetAdd(set, SCM_CHARSET(moreset));
750 continue;
751 ordchar:
752 default:
753 if (inrange) {
754 if (lastchar < 0) {
755 Scm_CharSetAddRange(set, '-', '-');
756 Scm_CharSetAddRange(set, ch, ch);
757 lastchar = ch;
758 } else {
759 Scm_CharSetAddRange(set, lastchar, ch);
760 lastchar = -1;
761 }
762 inrange = FALSE;
763 } else {
764 Scm_CharSetAddRange(set, ch, ch);
765 lastchar = ch;
766 }
767 continue;
768 }
769 break;
770 }
771 if (complement_p) {
772 *complement_p = complement;
773 return SCM_OBJ(set);
774 } else {
775 if (complement) Scm_CharSetComplement(set);
776 return SCM_OBJ(set);
777 }
778 err:
779 if (error_p)
780 Scm_Error("Unclosed bracket in charset syntax [%A",
781 Scm_ListToString(Scm_ReverseX(chars)));
782 return SCM_FALSE;
783 }
784
785 /* Read posix [:alpha:] etc. The first '[' is already read.
786 Return #f on error. Set reverse list of read chars in *chars */
787 #define MAX_CHARSET_NAME_LEN 10
788 ScmObj read_predef_charset(ScmPort *input, ScmObj *chars)
789 {
790 int i;
791 char name[MAX_CHARSET_NAME_LEN];
792 ScmChar ch;
793 for (i=0; i<MAX_CHARSET_NAME_LEN; i++) {
794 SCM_GETC(ch, input);
795 if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
796 *chars = Scm_Cons(SCM_MAKE_CHAR(ch), *chars);
797 if (!SCM_CHAR_ASCII_P(ch)) break;
798 if (ch != ']') {
799 name[i] = ch;
800 continue;
801 }
802 if (strncmp(name, ":alnum:", 7) == 0) {
803 return Scm_GetStandardCharSet(SCM_CHARSET_ALNUM);
804 } else if (strncmp(name, ":alpha:", 7) == 0) {
805 return Scm_GetStandardCharSet(SCM_CHARSET_ALPHA);
806 } else if (strncmp(name, ":blank:", 7) == 0) {
807 return Scm_GetStandardCharSet(SCM_CHARSET_BLANK);
808 } else if (strncmp(name, ":cntrl:", 7) == 0) {
809 return Scm_GetStandardCharSet(SCM_CHARSET_CNTRL);
810 } else if (strncmp(name, ":digit:", 7) == 0) {
811 return Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
812 } else if (strncmp(name, ":graph:", 7) == 0) {
813 return Scm_GetStandardCharSet(SCM_CHARSET_GRAPH);
814 } else if (strncmp(name, ":lower:", 7) == 0) {
815 return Scm_GetStandardCharSet(SCM_CHARSET_LOWER);
816 } else if (strncmp(name, ":print:", 7) == 0) {
817 return Scm_GetStandardCharSet(SCM_CHARSET_PRINT);
818 } else if (strncmp(name, ":punct:", 7) == 0) {
819 return Scm_GetStandardCharSet(SCM_CHARSET_PUNCT);
820 } else if (strncmp(name, ":space:", 7) == 0) {
821 return Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
822 } else if (strncmp(name, ":upper:", 7) == 0) {
823 return Scm_GetStandardCharSet(SCM_CHARSET_UPPER);
824 } else if (strncmp(name, ":xdigit:", 8) == 0) {
825 return Scm_GetStandardCharSet(SCM_CHARSET_XDIGIT);
826 } else break;
827 }
828 /* here we got invalid charset name */
829 name[i] = '\0';
830 Scm_Error("invalid or unsupported POSIX charset '[%s]'", name);
831 return SCM_FALSE;
832 }
833
834 /*-----------------------------------------------------------------
835 * Pre-defined charset
836 */
837 /* TODO: We need different definitions of character classes for different
838 * character sets. For now, I prepare the predefined table only for
839 * ASCII range, that all character sets agree on.
840 */
841
842 static ScmCharSet *predef_charsets[SCM_CHARSET_NUM_PREDEFINED_SETS] = {NULL};
843 static ScmInternalMutex predef_charsets_mutex;
844
845 static void install_charsets(void)
846 {
847 int i, code;
848
849 SCM_INTERNAL_MUTEX_LOCK(predef_charsets_mutex);
850
851 #define CS(n) predef_charsets[n]
852 for (i = 0; i < SCM_CHARSET_NUM_PREDEFINED_SETS; i++) {
853 CS(i) = SCM_CHARSET(Scm_MakeEmptyCharSet());
854 }
855 for (code = 0; code < SCM_CHARSET_MASK_CHARS; code++) {
856 if (isalnum(code)) MASK_SET(CS(SCM_CHARSET_ALNUM), code);
857 if (isalpha(code)) MASK_SET(CS(SCM_CHARSET_ALPHA), code);
858 if (iscntrl(code)) MASK_SET(CS(SCM_CHARSET_CNTRL), code);
859 if (isdigit(code)) MASK_SET(CS(SCM_CHARSET_DIGIT), code);
860 if (isgraph(code)) MASK_SET(CS(SCM_CHARSET_GRAPH), code);
861 if (islower(code)) MASK_SET(CS(SCM_CHARSET_LOWER), code);
862 if (isprint(code)) MASK_SET(CS(SCM_CHARSET_PRINT), code);
863 if (ispunct(code)) MASK_SET(CS(SCM_CHARSET_PUNCT), code);
864 if (isspace(code)) MASK_SET(CS(SCM_CHARSET_SPACE), code);
865 if (isupper(code)) MASK_SET(CS(SCM_CHARSET_UPPER), code);
866 if (isxdigit(code)) MASK_SET(CS(SCM_CHARSET_XDIGIT), code);
867 /* Default word constituent chars #[\w]. NB: in future versions,
868 a parameter might be introduced to customize this set. */
869 if (isalnum(code)||code=='_')
870 MASK_SET(CS(SCM_CHARSET_WORD), code);
871 /* isblank() is not in posix. for now, I hardcode it. */
872 if (code == ' ' || code == '\t')
873 MASK_SET(CS(SCM_CHARSET_BLANK), code);
874 }
875 #undef CS
876 SCM_INTERNAL_MUTEX_UNLOCK(predef_charsets_mutex);
877 }
878
879 ScmObj Scm_GetStandardCharSet(int id)
880 {
881 if (id < 0 || id >= SCM_CHARSET_NUM_PREDEFINED_SETS)
882 Scm_Error("bad id for predefined charset index: %d", id);
883 if (predef_charsets[id] == NULL) {
884 install_charsets();
885 }
886 return SCM_OBJ(predef_charsets[id]);
887 }
888
889 void Scm__InitChar(void)
890 {
891 SCM_INTERNAL_MUTEX_INIT(predef_charsets_mutex);
892 }