src/char.c

root/src/char.c

/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS

This source file includes following definitions.
Scm_CharEncodingName
Scm_SupportedCharacterEncodings
Scm_SupportedCharacterEncodingP
Scm_DigitToInt
Scm_IntToDigit
Scm_UcsToChar
Scm_CharToUcs
charset_print_ch
charset_print
make_charset
Scm_MakeEmptyCharSet
Scm_CopyCharSet
Scm_ReadXdigitsFromString
Scm_ReadXdigitsFromPort
charset_compare
Scm_CharSetEq
Scm_CharSetLE
newrange
Scm_CharSetAddRange
Scm_CharSetAdd
Scm_CharSetComplement
Scm_CharSetCaseFold
Scm_CharSetContains
Scm_CharSetRanges
Scm_CharSetDump
read_charset_xdigits
Scm_CharSetRead
read_predef_charset
install_charsets
Scm_GetStandardCharSet
Scm__InitChar
   1 /*
   2  * char.c - character and character set operations
   3  *
   4  *   Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: char.c,v 1.43 2005/10/28 02:53:10 shirok Exp $
  34  */
  35 
  36 #include <ctype.h>
  37 #define LIBGAUCHE_BODY
  38 #include "gauche.h"
  39 
  40 /*=======================================================================
  41  * Character functions
  42  */
  43 
  44 ScmObj Scm_CharEncodingName(void)
  45 {
  46     return SCM_INTERN(SCM_CHAR_ENCODING_NAME);
  47 }
  48 
  49 /* includes encoding-specific auxiliary functions */
  50 #define SCM_CHAR_ENCODING_BODY
  51 #if   defined(GAUCHE_CHAR_ENCODING_EUC_JP)
  52 #include "gauche/char_euc_jp.h"
  53 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
  54 #include "gauche/char_utf_8.h"
  55 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
  56 #include "gauche/char_sjis.h"
  57 #else
  58 #include "gauche/char_none.h"
  59 #endif
  60 
  61 const char **Scm_SupportedCharacterEncodings(void)
  62 {
  63     return supportedCharacterEncodings;
  64 }
  65 
  66 int Scm_SupportedCharacterEncodingP(const char *encoding)
  67 {
  68     const char **cs = supportedCharacterEncodings;
  69     for (;*cs;cs++) {
  70         const char *p = *cs;
  71         const char *q = encoding;
  72         for (;*p && *q; p++, q++) {
  73             if (tolower(*p) != tolower(*q)) break;
  74         }
  75         if (*p == '\0' && *q == '\0') return TRUE;
  76     }
  77     return FALSE;
  78 }
  79 
  80 /* '0' -> 0, 'a' -> 10, etc.
  81    Radix is assumed in the range [2, 36] */
  82 int Scm_DigitToInt(ScmChar ch, int radix)
  83 {
  84     if (ch < '0') return -1;
  85     if (radix <= 10) {
  86         if (ch <= '0' + radix) return (ch - '0');
  87     } else {
  88         if (ch <= '9') return (ch - '0');
  89         if (ch < 'A') return -1;
  90         if (ch < 'A' + radix - 10) return (ch - 'A' + 10);
  91         if (ch < 'a') return -1;
  92         if (ch < 'a' + radix - 10) return (ch - 'a' + 10);
  93     }
  94     return -1;
  95 }
  96 
  97 ScmChar Scm_IntToDigit(int n, int radix)
  98 {
  99     if (n < 0) return SCM_CHAR_INVALID;
 100     if (radix <= 10) {
 101         if (n < radix) return (ScmChar)(n + '0');
 102         else return SCM_CHAR_INVALID;
 103     } else {
 104         if (n < 10) return (ScmChar)(n + '0');
 105         if (n < radix) return (ScmChar)(n - 10 + 'a');
 106         else return SCM_CHAR_INVALID;
 107     }
 108 }
 109 
 110 /*
 111  * Convert UCS4 code <-> character
 112  * If the native encoding is not utf-8, gauche.charconv module is loaded.
 113  */
 114 ScmChar (*Scm_UcsToCharHook)(int ucs4) = NULL;  /* filled by ext/charconv */
 115 int (*Scm_CharToUcsHook)(ScmChar ch) = NULL;    /* filled by ext/charconv */
 116 
 117 ScmChar Scm_UcsToChar(int n)
 118 {
 119     if (n < 0) Scm_Error("bad character code: %d", n);
 120 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
 121     return (ScmChar)n;
 122 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
 123     if (n < 0x80) return (ScmChar)n; /*ASCII range*/
 124     if (Scm_UcsToCharHook == NULL) {
 125         /* NB: we don't need mutex here, for the loading of gauche.charconv
 126            is serialized in Scm_Require. */
 127         Scm_Require(SCM_MAKE_STR("gauche/charconv"));
 128         if (Scm_UcsToCharHook == NULL) {
 129             Scm_Error("couldn't autoload gauche.charconv");
 130         }
 131     }
 132     return Scm_UcsToCharHook(n);
 133 #else
 134     if (n < 0x100) return (ScmChar)n; /* ISO8859-1 */
 135     else return SCM_CHAR_INVALID;
 136 #endif
 137 }
 138 
 139 int Scm_CharToUcs(ScmChar ch)
 140 {
 141     if (ch == SCM_CHAR_INVALID) Scm_Error("bad character");
 142 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
 143     return (int)ch;
 144 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
 145     if (ch < 0x80) return (int)ch; /*ASCII range*/
 146     if (Scm_CharToUcsHook == NULL) {
 147         /* NB: we don't need mutex here, for the loading of gauche.charconv
 148            is serialized in Scm_Require. */
 149         Scm_Require(SCM_MAKE_STR("gauche/charconv"));
 150         if (Scm_CharToUcsHook == NULL) {
 151             Scm_Error("couldn't autoload gauche.charconv");
 152         }
 153     }
 154     return Scm_CharToUcsHook(ch);
 155 #else
 156     return (int)ch;             /* ISO8859-1 */
 157 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
 158 }
 159 
 160 /*=======================================================================
 161  * Character set (cf. SRFI-14)
 162  */
 163 /* NB: operations on charset are not very optimized, for I don't see
 164  * the immediate needs to do so, except Scm_CharSetContains.
 165  */
 166 
 167 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext*);
 168 static int charset_compare(ScmObj x, ScmObj y, int equalp);
 169 SCM_DEFINE_BUILTIN_CLASS(Scm_CharSetClass,
 170                          charset_print, charset_compare, NULL, NULL,
 171                          SCM_CLASS_DEFAULT_CPL);
 172 
 173 /* masks */
 174 #if SIZEOF_LONG == 4
 175 #define MASK_BIT_SHIFT  5
 176 #define MASK_BIT_MASK   0x1f
 177 #elif SIZEOF_LONG == 8
 178 #define MASK_BIT_SHIFT  6
 179 #define MASK_BIT_MASK   0x3f
 180 #elif SIZEOF_LONG == 16    /* maybe, in some future ... */
 181 #define MASK_BIT_SHIFT  7
 182 #define MASK_BIT_MASK   0x7f
 183 #else
 184 #error need to set SIZEOF_LONG
 185 #endif
 186 
 187 #define MASK_INDEX(ch)       ((ch) >> MASK_BIT_SHIFT)
 188 #define MASK_BIT(ch)         (1L << ((ch) & MASK_BIT_MASK))
 189 #define MASK_ISSET(cs, ch)   (!!(cs->mask[MASK_INDEX(ch)] & MASK_BIT(ch)))
 190 #define MASK_SET(cs, ch)     (cs->mask[MASK_INDEX(ch)] |= MASK_BIT(ch))
 191 #define MASK_RESET(cs, ch)   (cs->mask[MASK_INDEX(ch)] &= ~MASK_BIT(ch))
 192 
 193 /*----------------------------------------------------------------------
 194  * Printer
 195  */
 196 static void charset_print_ch(ScmPort *out, ScmChar ch)
 197 {
 198     if (ch < 0x20 || ch == 0x7f) {
 199         Scm_Printf(out, "\\x%02x", ch);
 200     } else {
 201         char chbuf[SCM_CHAR_MAX_BYTES];
 202         int i;
 203         SCM_CHAR_PUT(chbuf, ch);
 204         for (i=0; i<SCM_CHAR_NBYTES(ch); i++) {
 205             Scm_Printf(out, "%c", chbuf[i]);
 206         }
 207     }
 208 }
 209 
 210 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext *ctx)
 211 {
 212     int prev, code;
 213     ScmCharSet *cs = SCM_CHARSET(obj);
 214     struct ScmCharSetRange *r;
 215 
 216     Scm_Printf(out, "#[");
 217     for (prev = -1, code = 0; code < SCM_CHARSET_MASK_CHARS; code++) {
 218         if (MASK_ISSET(cs, code) && prev < 0) {
 219             charset_print_ch(out, code);
 220             prev = code;
 221         } 
 222         else if (!MASK_ISSET(cs, code) && prev >= 0) {
 223             if (code - prev > 1) {
 224                 if (code - prev > 2) Scm_Printf(out, "-");
 225                 charset_print_ch(out, code-1);
 226             }
 227             prev = -1;
 228         }
 229     }
 230     if (prev >= 0) {
 231         if (code - prev > 1) {
 232             if (prev < 0x7e) Scm_Printf(out, "-");
 233             charset_print_ch(out, code-1);
 234         }
 235     }
 236     for (r = cs->ranges; r; r = r->next) {
 237         charset_print_ch(out, r->lo);
 238         if (r->hi == r->lo) continue;
 239         if (r->hi - r->lo > 2) Scm_Printf(out, "-");
 240         charset_print_ch(out, r->hi);
 241     }
 242     Scm_Printf(out, "]", obj);
 243 }
 244 
 245 /*-----------------------------------------------------------------
 246  * Constructors
 247  */
 248 static ScmCharSet *make_charset(void)
 249 {
 250     ScmCharSet *cs = SCM_NEW(ScmCharSet);
 251     int i;
 252     SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
 253     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++) cs->mask[i] = 0;
 254     cs->ranges = NULL;
 255     return cs;
 256 }
 257 
 258 ScmObj Scm_MakeEmptyCharSet(void)
 259 {
 260     return SCM_OBJ(make_charset());
 261 }
 262 
 263 ScmObj Scm_CopyCharSet(ScmCharSet *src)
 264 {
 265     ScmCharSet *dst = make_charset();
 266     struct ScmCharSetRange *rs, *rd = dst->ranges;
 267     int i;
 268     
 269     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++) dst->mask[i] = src->mask[i];
 270     for (rs = src->ranges; rs; rs = rs->next) {
 271         if (rd == NULL) {
 272             rd = dst->ranges = SCM_NEW(struct ScmCharSetRange);
 273         } else {
 274             rd->next = SCM_NEW(struct ScmCharSetRange);
 275             rd = rd->next;
 276         }
 277         rd->lo = rs->lo;
 278         rd->hi = rs->hi;
 279     }
 280     if (rd) rd->next = NULL;
 281     return SCM_OBJ(dst);
 282 }
 283 
 284 /* Helper functions to read the escaped character code sequence, such as
 285    \xXX, \uXXXX, or \UXXXXXXXX.
 286    Scm_ReadXdigitsFromString reads from char* buffer (note that hex digits
 287    consist of single-byte characters in any encoding, we don't need to
 288    do the cumbersome multibyte handling).  Scm_ReadXdigitsFromPort reads
 289    from the port.  Both should be called after the prefix 'x', 'u' or 'U'
 290    char is read.  NDIGITS specifies either exact number of digits to be
 291    expected or maximum number of digits. */
 292 
 293 /* If nextbuf == NULL, ndigits specifies exact # of digits.  Returns
 294    SCM_CHAR_INVALID if there are less digits.  Otherwise, ndigis specifies
 295    max # of digits, and the ptr to the next char is stored in nextbuf. */
 296 ScmChar Scm_ReadXdigitsFromString(const char *buf, int ndigits,
 297                                   const char **nextbuf)
 298 {
 299     int i, val = 0;
 300     for (i=0; i<ndigits; i++) {
 301         if (!isxdigit(buf[i])) {
 302             if (nextbuf == NULL) return SCM_CHAR_INVALID;
 303             else {
 304                 *nextbuf = buf;
 305                 return val;
 306             }
 307         }
 308         val = val * 16 + Scm_DigitToInt(buf[i], 16);
 309     }
 310     return (ScmChar)val;
 311 }
 312 
 313 /* ndigits specifies exact # of digits.  read chars are stored in buf
 314    so that they can be used in the error message.  Caller must provide
 315    a sufficient space for buf. */
 316 ScmChar Scm_ReadXdigitsFromPort(ScmPort *port, int ndigits,
 317                                 char *buf, int *nread)
 318 {
 319     int i, c, val = 0, dig;
 320     
 321     for (i = 0; i < ndigits; i++) {
 322         SCM_GETC(c, port);
 323         if (c == EOF) break;
 324         dig = Scm_DigitToInt(c, 16);
 325         if (dig < 0) {
 326             SCM_UNGETC(c, port);
 327             break;
 328         }
 329         buf[i] = (char)c;       /* we know c is single byte char here. */
 330         val = val * 16 + dig;
 331     }
 332     *nread = i;
 333     if (i < ndigits) { /* error */
 334         return SCM_CHAR_INVALID;
 335     } else {
 336         return (ScmChar)val;
 337     }
 338 }
 339 
 340 /*-----------------------------------------------------------------
 341  * Comparison
 342  */
 343 static int charset_compare(ScmObj x, ScmObj y, int equalp)
 344 {
 345     ScmCharSet *xx = SCM_CHARSET(x);
 346     ScmCharSet *yy = SCM_CHARSET(y);
 347     
 348     if (equalp) {
 349         return (Scm_CharSetEq(xx, yy)? 0 : 1);
 350     } else {
 351         if (Scm_CharSetEq(xx, yy)) return 0;
 352         if (Scm_CharSetLE(xx, yy)) return -1;
 353         if (Scm_CharSetLE(yy, xx)) return 1;
 354         Scm_Error("cannot compare char-sets: %S vs %S", x, y);
 355         return 0;               /* dummy */
 356     }
 357 }
 358 
 359 int Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y)
 360 {
 361     int i;
 362     struct ScmCharSetRange *rx, *ry;
 363     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
 364         if (x->mask[i] != y->mask[i]) return FALSE;
 365     for (rx=x->ranges, ry=y->ranges; rx && ry; rx=rx->next, ry=ry->next) {
 366         if (rx->lo != ry->lo || rx->hi != ry->hi) return FALSE;
 367     }
 368     if (rx || ry) return FALSE;
 369     return TRUE;
 370 }
 371 
 372 /* whether x <= y */
 373 int Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y)
 374 {
 375     int i;
 376     struct ScmCharSetRange *rx, *ry;
 377     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
 378         if ((x->mask[i] | y->mask[i]) != y->mask[i]) return FALSE;
 379     rx = x->ranges;
 380     ry = y->ranges;
 381     while (rx && ry) {
 382         if (rx->lo < ry->lo) return FALSE;
 383         if (rx->lo > ry->hi) { ry = ry->next; continue; }
 384         if (rx->hi > ry->hi) return FALSE;
 385         rx = rx->next;
 386     }
 387     if (rx) return FALSE;
 388     return TRUE;
 389 }
 390 
 391 /*-----------------------------------------------------------------
 392  * Modification
 393  */
 394 
 395 static struct ScmCharSetRange *newrange(int lo, int hi,
 396                                         struct ScmCharSetRange *next)
 397 {
 398     struct ScmCharSetRange *n = SCM_NEW(struct ScmCharSetRange);
 399     n->next = next;
 400     n->lo = lo;
 401     n->hi = hi;
 402     return n;
 403 }
 404 
 405 ScmObj Scm_CharSetAddRange(ScmCharSet *cs, ScmChar from, ScmChar to)
 406 {
 407     int i;
 408     struct ScmCharSetRange *lo, *lop, *hi;
 409     
 410     if (to < from) return SCM_OBJ(cs);
 411     if (from < SCM_CHARSET_MASK_CHARS) {
 412         if (to < SCM_CHARSET_MASK_CHARS) {
 413             for (i=from; i<=to; i++) MASK_SET(cs, i);
 414             return SCM_OBJ(cs);
 415         }
 416         for (i=from; i<SCM_CHARSET_MASK_CHARS; i++)  MASK_SET(cs, i);
 417         from = SCM_CHARSET_MASK_CHARS;
 418     }
 419     if (cs->ranges == NULL) {
 420         cs->ranges = newrange(from, to, NULL);
 421         return SCM_OBJ(cs);
 422     }
 423     /* Add range.  Ranges are chained from lower character code to higher,
 424        without any overlap. */
 425     /* First, we scan the ranges so that we'll get...
 426         - if FROM is in a range, lo points to it.
 427         - if FROM is out of any ranges, lo points to the closest range that
 428           is higher than FROM.
 429         - if TO is in a range, hi points to the range.
 430         - if TO is out of any ranges, hi points to the closest range that
 431           is higher than TO. */
 432     for (lop = NULL, lo = cs->ranges; lo; lop = lo, lo = lo->next) {
 433         if (from <= lo->hi+1) break;
 434     }
 435     if (!lo) {
 436         lop->next = newrange(from, to, NULL);
 437         return SCM_OBJ(cs);
 438     }
 439     for (hi = lo; hi; hi = hi->next) {
 440         if (to <= hi->hi) break;
 441     }
 442     /* Then we insert, extend and/or merge the ranges accordingly. */
 443     if (from < lo->lo) { /* FROM extends the LO */
 444         if (lo == hi) {
 445             if (to < hi->lo-1) {
 446                 if (lop == NULL) cs->ranges = newrange(from, to, lo);
 447                 else             lop->next = newrange(from, to, lo);
 448             } else {
 449                 lo->lo = from;
 450             }
 451         } else if (hi == NULL || to < hi->lo-1) {
 452             lo->lo = from;
 453             lo->hi = to;
 454             lo->next = hi;
 455         } else {
 456             lo->lo = from;
 457             lo->hi = hi->hi;
 458             lo->next = hi->next;
 459         }
 460     } else { /* FROM included in LO */
 461         if (lo != hi) {
 462             if (hi == NULL || to < hi->lo-1) {
 463                 lo->hi = to;
 464                 lo->next = hi;
 465             } else {
 466                 lo->hi = hi->hi;
 467                 lo->next = hi->next;
 468             }
 469         }
 470     }
 471     /* WRITE ME */
 472     return SCM_OBJ(cs);
 473 }
 474 
 475 ScmObj Scm_CharSetAdd(ScmCharSet *dst, ScmCharSet *src)
 476 {
 477     int i;
 478     struct ScmCharSetRange *r;
 479     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
 480         dst->mask[i] |= src->mask[i];
 481     for (r = src->ranges; r; r = r->next) {
 482         Scm_CharSetAddRange(dst, r->lo, r->hi);
 483     }
 484     return SCM_OBJ(dst);
 485 }
 486 
 487 ScmObj Scm_CharSetComplement(ScmCharSet *cs)
 488 {
 489     int i, last;
 490     struct ScmCharSetRange *r, *p;
 491     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
 492         cs->mask[i] = ~cs->mask[i];
 493     last = SCM_CHARSET_MASK_CHARS;
 494     for (p = NULL, r = cs->ranges; r; p = r, r = r->next) {
 495         int hi = r->hi+1;
 496         if (r->lo != SCM_CHARSET_MASK_CHARS) {
 497             r->hi = r->lo - 1;
 498             r->lo = last;
 499         } else {
 500             cs->ranges = r->next;
 501         }
 502         last = hi;
 503     }
 504     if (last < SCM_CHAR_MAX) {
 505         if (!p) cs->ranges = newrange(last, SCM_CHAR_MAX, NULL);
 506         else    p->next = newrange(last, SCM_CHAR_MAX, NULL);
 507     }
 508     return SCM_OBJ(cs);
 509 }
 510 
 511 /* Make charset case-insensitive.  For now, we only deal with
 512    ASCII range. */
 513 ScmObj Scm_CharSetCaseFold(ScmCharSet *cs)
 514 {
 515     int ch;
 516     for (ch='a'; ch<='z'; ch++) {
 517         if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) {
 518             MASK_SET(cs, ch);
 519             MASK_SET(cs, (ch-('a'-'A')));
 520         }
 521     }
 522     return SCM_OBJ(cs);
 523 }
 524 
 525 /*-----------------------------------------------------------------
 526  * Query
 527  */
 528 
 529 int Scm_CharSetContains(ScmCharSet *cs, ScmChar c)
 530 {
 531     if (c < 0) return FALSE;
 532     if (c < SCM_CHARSET_MASK_CHARS) return MASK_ISSET(cs, c);
 533     else {
 534         struct ScmCharSetRange *r;
 535         for (r = cs->ranges; r; r = r->next) {
 536             if (r->lo <= c && c <= r->hi) return TRUE;
 537         }
 538         return FALSE;
 539     }
 540 }
 541 
 542 /*-----------------------------------------------------------------
 543  * Inspection
 544  */
 545 
 546 /* returns a list of ranges contained in the charset */
 547 ScmObj Scm_CharSetRanges(ScmCharSet *cs)
 548 {
 549     ScmObj h = SCM_NIL, t = SCM_NIL, cell;
 550     int ind, begin = 0, prev = FALSE;
 551     struct ScmCharSetRange *r;
 552     
 553     for (ind = 0; ind < SCM_CHARSET_MASK_CHARS; ind++) {
 554         int bit = MASK_ISSET(cs, ind);
 555         if (!prev && bit) begin = ind;
 556         if (prev && !bit) {
 557             cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
 558             SCM_APPEND1(h, t, cell);
 559         }
 560         prev = bit;
 561     }
 562     if (prev) {
 563         if (!cs->ranges || cs->ranges->lo != SCM_CHARSET_MASK_CHARS) {
 564             cell = Scm_Cons(SCM_MAKE_INT(begin),
 565                             SCM_MAKE_INT(SCM_CHARSET_MASK_CHARS-1));
 566             SCM_APPEND1(h, t, cell);
 567             r = cs->ranges;
 568         } else {
 569             cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(cs->ranges->hi));
 570             SCM_APPEND1(h, t, cell);
 571             r = cs->ranges->next;
 572         }
 573     } else {
 574         r = cs->ranges;
 575     }
 576     for (; r; r = r->next) {
 577         cell = Scm_Cons(SCM_MAKE_INT(r->lo), SCM_MAKE_INT(r->hi));
 578         SCM_APPEND1(h, t, cell);
 579     }
 580     return h;
 581 }
 582 
 583 #if SCM_DEBUG_HELPER
 584 void Scm_CharSetDump(ScmCharSet *cs, ScmPort *port)
 585 {
 586     int i;
 587     struct ScmCharSetRange *r;
 588     Scm_Printf(port, "CharSet %p\nmask:", cs);
 589     for (i=0; i<SCM_CHARSET_MASK_SIZE; i++)
 590         Scm_Printf(port, "[%08x]", cs->mask[i]);
 591     Scm_Printf(port, "\nranges:");
 592     for (r=cs->ranges; r; r=r->next)
 593         Scm_Printf(port, "(%d-%d)", r->lo, r->hi);
 594     Scm_Printf(port, "\n");
 595 }
 596 #endif /* SCM_DEBUG_HELPER */
 597 
 598 /*-----------------------------------------------------------------
 599  * Reader
 600  */
 601 
 602 /* Read \x, \u, \U escape sequence in the charset spec. */
 603 static ScmChar read_charset_xdigits(ScmPort *port, int ndigs, int key)
 604 {
 605     char buf[8];
 606     int nread;
 607     ScmChar r;
 608     SCM_ASSERT(ndigs <= 8);
 609     r = Scm_ReadXdigitsFromPort(port, ndigs, buf, &nread);
 610     if (r == SCM_CHAR_INVALID) {
 611         ScmDString ds;
 612         int c, i;
 613         /* skip chars to the end of regexp, so that the reader will read
 614            after the erroneous string */
 615         for (;;) {
 616             SCM_GETC(c, port);
 617             if (c == EOF || c == ']') break;
 618             if (c == '\\') SCM_GETC(c, port);
 619         }
 620         /* construct an error message */
 621         Scm_DStringInit(&ds);
 622         Scm_DStringPutc(&ds, '\\');
 623         Scm_DStringPutc(&ds, key);
 624         for (i=0; i<nread; i++) Scm_DStringPutc(&ds, (unsigned char)buf[i]);
 625         Scm_Error("Bad '\\%c' escape sequence in a char-set literal: %s",
 626                   key, Scm_DStringGetz(&ds));
 627     }
 628     return r;
 629 }
 630 
 631 /* Parse regexp-style character set specification (e.g. [a-zA-Z]).
 632    Assumes the opening bracket is already read.
 633    Always return a fresh charset, that can be modified afterwards.
 634 
 635    If the input syntax is invalid, either signals an error or returns
 636    #f, depending error_p flag.
 637 
 638    If bracket_syntax is TRUE, the first closing bracket ']' in the
 639    charset (except the complimenting caret) is taken as a literal
 640    character, instead of terminating the charset.  It should be TRUE
 641    during reading the regexp syntax for compatibility to POSIX regexp.
 642    
 643    If complement_p is not NULL, the location get a boolean value of
 644    whether complement character (caret in the beginning) appeared or not.
 645    In that case, the returned charset is not complemented. */
 646 
 647 static ScmObj read_predef_charset(ScmPort*, ScmObj*);
 648 
 649 ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p,
 650                        int error_p, int bracket_syntax)
 651 {
 652 #define REAL_BEGIN 1
 653 #define CARET_BEGIN 2
 654     int begin = REAL_BEGIN, complement = FALSE;
 655     int lastchar = -1, inrange = FALSE, moreset_complement = FALSE;
 656     ScmCharSet *set = SCM_CHARSET(Scm_MakeEmptyCharSet());
 657     ScmObj moreset;
 658     ScmObj chars = SCM_NIL;
 659     ScmChar ch = 0;
 660 
 661     for (;;) {
 662         SCM_GETC(ch, input);
 663         if (ch == EOF) goto err;
 664         chars = Scm_Cons(SCM_MAKE_CHAR(ch), chars);
 665 
 666         if (begin == REAL_BEGIN && ch == '^') {
 667             complement = TRUE;
 668             begin = CARET_BEGIN;
 669             continue;
 670         }
 671         if (bracket_syntax && begin && ch == ']') {
 672             Scm_CharSetAddRange(set, ch, ch);
 673             lastchar = ch;
 674             begin = FALSE;
 675             continue;
 676         }
 677         begin = FALSE;
 678 
 679         switch (ch) {
 680         case '-':
 681             if (inrange) goto ordchar;
 682             inrange = TRUE;
 683             continue;
 684         case ']':
 685             if (inrange) {
 686                 if (lastchar >= 0) {
 687                     Scm_CharSetAddRange(set, lastchar, lastchar);
 688                     Scm_CharSetAddRange(set, '-', '-');
 689                 } else {
 690                     Scm_CharSetAddRange(set, '-', '-');
 691                 }
 692             }
 693             break;
 694         case '\\':
 695             SCM_GETC(ch, input);
 696             if (ch == SCM_CHAR_INVALID) goto err;
 697             chars = Scm_Cons(SCM_MAKE_CHAR(ch), chars);
 698             switch (ch) {
 699             case 'a': ch = 7; goto ordchar;
 700             case 'b': ch = 8; goto ordchar;
 701             case 'n': ch = '\n'; goto ordchar;
 702             case 'r': ch = '\r'; goto ordchar;
 703             case 't': ch = '\t'; goto ordchar;
 704             case 'f': ch = '\f'; goto ordchar;
 705             case 'e': ch = 0x1b; goto ordchar;
 706             case 'x':
 707                 ch = read_charset_xdigits(input, 2, 'x'); goto ordchar;
 708             case 'u':
 709                 ch = Scm_UcsToChar(read_charset_xdigits(input, 4, 'u'));
 710                 goto ordchar;
 711             case 'U':
 712                 ch = Scm_UcsToChar(read_charset_xdigits(input, 8, 'U'));
 713                 goto ordchar;
 714             case 'd':
 715                 moreset_complement = FALSE;
 716                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
 717                 break;
 718             case 'D':
 719                 moreset_complement = TRUE;
 720                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
 721                 break;
 722             case 's':
 723                 moreset_complement = FALSE;
 724                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
 725                 break;
 726             case 'S':
 727                 moreset_complement = TRUE;
 728                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
 729                 break;
 730             case 'w':
 731                 moreset_complement = FALSE;
 732                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
 733                 break;
 734             case 'W':
 735                 moreset_complement = TRUE;
 736                 moreset = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
 737                 break;
 738             default:
 739                 goto ordchar;
 740             }
 741             if (moreset_complement) {
 742                 moreset = Scm_CharSetComplement(SCM_CHARSET(Scm_CopyCharSet(SCM_CHARSET(moreset))));
 743             }
 744             Scm_CharSetAdd(set, SCM_CHARSET(moreset));
 745             continue;
 746         case '[':
 747             moreset = read_predef_charset(input, &chars);
 748             if (!SCM_CHARSETP(moreset)) goto err;
 749             Scm_CharSetAdd(set, SCM_CHARSET(moreset));
 750             continue;
 751         ordchar:
 752         default:
 753             if (inrange) {
 754                 if (lastchar < 0) {
 755                     Scm_CharSetAddRange(set, '-', '-');
 756                     Scm_CharSetAddRange(set, ch, ch);
 757                     lastchar = ch;
 758                 } else {
 759                     Scm_CharSetAddRange(set, lastchar, ch);
 760                     lastchar = -1;
 761                 }
 762                 inrange = FALSE;
 763             } else {
 764                 Scm_CharSetAddRange(set, ch, ch);
 765                 lastchar = ch;
 766             }
 767             continue;
 768         }
 769         break;
 770     }
 771     if (complement_p) {
 772         *complement_p = complement;
 773         return SCM_OBJ(set);
 774     } else {
 775         if (complement) Scm_CharSetComplement(set);
 776         return SCM_OBJ(set);
 777     }
 778   err:
 779     if (error_p)
 780         Scm_Error("Unclosed bracket in charset syntax [%A",
 781                   Scm_ListToString(Scm_ReverseX(chars)));
 782     return SCM_FALSE;
 783 }
 784 
 785 /* Read posix [:alpha:] etc.  The first '[' is already read.
 786    Return #f on error.  Set reverse list of read chars in *chars */
 787 #define MAX_CHARSET_NAME_LEN  10
 788 ScmObj read_predef_charset(ScmPort *input, ScmObj *chars)
 789 {
 790     int i;
 791     char name[MAX_CHARSET_NAME_LEN];
 792     ScmChar ch;
 793     for (i=0; i<MAX_CHARSET_NAME_LEN; i++) {
 794         SCM_GETC(ch, input);
 795         if (ch == SCM_CHAR_INVALID) return SCM_FALSE;
 796         *chars = Scm_Cons(SCM_MAKE_CHAR(ch), *chars);
 797         if (!SCM_CHAR_ASCII_P(ch)) break;
 798         if (ch != ']') {
 799             name[i] = ch;
 800             continue;
 801         }
 802         if (strncmp(name, ":alnum:", 7) == 0) {
 803             return Scm_GetStandardCharSet(SCM_CHARSET_ALNUM);
 804         } else if (strncmp(name, ":alpha:", 7) == 0) {
 805             return Scm_GetStandardCharSet(SCM_CHARSET_ALPHA);
 806         } else if (strncmp(name, ":blank:", 7) == 0) {
 807             return Scm_GetStandardCharSet(SCM_CHARSET_BLANK);
 808         } else if (strncmp(name, ":cntrl:", 7) == 0) {
 809             return Scm_GetStandardCharSet(SCM_CHARSET_CNTRL);
 810         } else if (strncmp(name, ":digit:", 7) == 0) {
 811             return Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
 812         } else if (strncmp(name, ":graph:", 7) == 0) {
 813             return Scm_GetStandardCharSet(SCM_CHARSET_GRAPH);
 814         } else if (strncmp(name, ":lower:", 7) == 0) {
 815             return Scm_GetStandardCharSet(SCM_CHARSET_LOWER);
 816         } else if (strncmp(name, ":print:", 7) == 0) {
 817             return Scm_GetStandardCharSet(SCM_CHARSET_PRINT);
 818         } else if (strncmp(name, ":punct:", 7) == 0) {
 819             return Scm_GetStandardCharSet(SCM_CHARSET_PUNCT);
 820         } else if (strncmp(name, ":space:", 7) == 0) {
 821             return Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
 822         } else if (strncmp(name, ":upper:", 7) == 0) {
 823             return Scm_GetStandardCharSet(SCM_CHARSET_UPPER);
 824         } else if (strncmp(name, ":xdigit:", 8) == 0) {
 825             return Scm_GetStandardCharSet(SCM_CHARSET_XDIGIT);
 826         } else break;
 827     }
 828     /* here we got invalid charset name */
 829     name[i] = '\0';
 830     Scm_Error("invalid or unsupported POSIX charset '[%s]'", name);
 831     return SCM_FALSE;
 832 }
 833 
 834 /*-----------------------------------------------------------------
 835  * Pre-defined charset
 836  */
 837 /* TODO: We need different definitions of character classes for different
 838  * character sets.  For now, I prepare the predefined table only for
 839  * ASCII range, that all character sets agree on.
 840  */
 841 
 842 static ScmCharSet *predef_charsets[SCM_CHARSET_NUM_PREDEFINED_SETS] = {NULL};
 843 static ScmInternalMutex predef_charsets_mutex;
 844 
 845 static void install_charsets(void)
 846 {
 847     int i, code;
 848 
 849     SCM_INTERNAL_MUTEX_LOCK(predef_charsets_mutex);
 850 
 851 #define CS(n)  predef_charsets[n]
 852     for (i = 0; i < SCM_CHARSET_NUM_PREDEFINED_SETS; i++) {
 853         CS(i) = SCM_CHARSET(Scm_MakeEmptyCharSet());
 854     }
 855     for (code = 0; code < SCM_CHARSET_MASK_CHARS; code++) {
 856         if (isalnum(code)) MASK_SET(CS(SCM_CHARSET_ALNUM), code);
 857         if (isalpha(code)) MASK_SET(CS(SCM_CHARSET_ALPHA), code);
 858         if (iscntrl(code)) MASK_SET(CS(SCM_CHARSET_CNTRL), code);
 859         if (isdigit(code)) MASK_SET(CS(SCM_CHARSET_DIGIT), code);
 860         if (isgraph(code)) MASK_SET(CS(SCM_CHARSET_GRAPH), code);
 861         if (islower(code)) MASK_SET(CS(SCM_CHARSET_LOWER), code);
 862         if (isprint(code)) MASK_SET(CS(SCM_CHARSET_PRINT), code);
 863         if (ispunct(code)) MASK_SET(CS(SCM_CHARSET_PUNCT), code);
 864         if (isspace(code)) MASK_SET(CS(SCM_CHARSET_SPACE), code);
 865         if (isupper(code)) MASK_SET(CS(SCM_CHARSET_UPPER), code);
 866         if (isxdigit(code)) MASK_SET(CS(SCM_CHARSET_XDIGIT), code);
 867         /* Default word constituent chars #[\w].  NB: in future versions,
 868            a parameter might be introduced to customize this set. */
 869         if (isalnum(code)||code=='_')
 870             MASK_SET(CS(SCM_CHARSET_WORD), code);
 871         /* isblank() is not in posix.  for now, I hardcode it. */
 872         if (code == ' ' || code == '\t')
 873             MASK_SET(CS(SCM_CHARSET_BLANK), code);
 874     }
 875 #undef CS
 876     SCM_INTERNAL_MUTEX_UNLOCK(predef_charsets_mutex);
 877 }
 878 
 879 ScmObj Scm_GetStandardCharSet(int id)
 880 {
 881     if (id < 0 || id >= SCM_CHARSET_NUM_PREDEFINED_SETS)
 882         Scm_Error("bad id for predefined charset index: %d", id);
 883     if (predef_charsets[id] == NULL) {
 884         install_charsets();
 885     }
 886     return SCM_OBJ(predef_charsets[id]);
 887 }
 888 
 889 void Scm__InitChar(void)
 890 {
 891     SCM_INTERNAL_MUTEX_INIT(predef_charsets_mutex);
 892 }
/* [<][>][^][v][top][bottom][index][help] */