root/src/regexp.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. make_regexp
  2. regexp_print
  3. regexp_compare
  4. regcomp_ctx
  5. rc_ctx_init
  6. rc1_lex
  7. rc1_lex_xdigits
  8. rc1_maybe_lazy
  9. rc1_lex_open_paren
  10. rc1_lex_minmax
  11. rc1_fold_alts
  12. rc1_parse
  13. rc1
  14. rc_charset
  15. rc_register_charset
  16. rc_setup_charsets
  17. rc2_optimize_seq
  18. rc2_optimize
  19. is_distinct
  20. Scm_RegOptimizeAST
  21. rc3_charset_index
  22. rc3_emit
  23. rc3_emit_offset
  24. rc3_fill_offset
  25. rc3_seq
  26. rc3_rec
  27. is_bol_anchored
  28. rc3
  29. Scm_RegDump
  30. rc_setup_context
  31. rc_setup_context_seq
  32. Scm_RegComp
  33. Scm_RegCompFromAST
  34. push_match
  35. match_ci
  36. is_word_constituent
  37. is_word_boundary
  38. rex_rec
  39. make_match
  40. rex
  41. Scm_RegExec
  42. Scm_RegMatchSubstr
  43. Scm_RegMatchStart
  44. Scm_RegMatchEnd
  45. Scm_RegMatchBefore
  46. Scm_RegMatchAfter
  47. Scm_RegMatchDump
  48. Scm__InitRegexp

   1 /*
   2  * regexp.c - regular expression
   3  *
   4  *   Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: regexp.c,v 1.56 2005/11/02 10:49:19 shirok Exp $
  34  */
  35 
  36 #include <setjmp.h>
  37 #include <ctype.h>
  38 #define LIBGAUCHE_BODY
  39 #include "gauche.h"
  40 #include "gauche/class.h"
  41 #include "gauche/builtin-syms.h"
  42 
  43 /* I don't like to reinvent wheels, so I looked for a regexp implementation
  44  * that can handle multibyte encodings and not bound to Unicode.
  45  * Without assuming Unicode it'll be difficult to define character classes
  46  * correctly, but there are domains that you don't want to do native
  47  * charset <-> UTF-8 each time for regexp match, trading correctness of
  48  * character classes.
  49  *
  50  * The most recent version of famous Henry Spencer's regex is found in Tcl
  51  * 8.3, that supports wide characters (the state machine seems to work
  52  * with UCS-4, but the internal tables seem to be set up for UCS-2 only).
  53  * Tcl does UTF-8 <-> UCS-2 conversion in order to do regexp match.
  54  *
  55  * Lots of variants of Spencer's old regex code is floating around, such
  56  * as http://arglist.com/regex/ and the one in BSD.   They don't support
  57  * multibyte strings, as far as I know.
  58  *
  59  * Another popular package is PCRE.  PCRE 3.4 has UTF-8 support, but only
  60  * experimentally.
  61  *
  62  * None seems to satisfy my criteria.
  63  *
  64  * So I reluctantly started to write my own.  I don't think I can beat
  65  * those guys, and am willing to grab someone's code anytime if it's suitable
  66  * for my purpose and under a license like BSD one.  
  67  */
  68 
  69 /*
  70  * The idea here is to match string without converting mb <-> char as
  71  * much as possible.  Actually, the converion is done only when we see
  72  * large character sets.
  73  *
  74  * The engine is a sort of NFA, by keeping state information for backtrack
  75  * in C stack.  It'll bust the C stack if you try to match something like
  76  * (..)* with a long input string (there's a code to check the stack size
  77  * and aborts matching when the recursion goes too deep).
  78  * A possible fix is to check if recursion level exceeds some limit,
  79  * then save the C stack into heap (as in the C-stack-copying continuation
  80  * does) and reuse the stack area.
  81  */
  82 
  83 /* Instructions */
  84 enum {
  85     RE_MATCH1,                  /* followed by 1 byte to match */
  86     RE_MATCH,                   /* followed by length, and bytes to match */
  87     RE_MATCH1_CI,               /* case insenstive match */
  88     RE_MATCH_CI,                /* case insenstive match */
  89     RE_ANY,                     /* match any char */
  90     RE_TRY,                     /* followed by offset (2 bytes). try matching
  91                                    the following sequence, and if fails,
  92                                    jump to offset. */
  93     RE_SET,                     /* followed by charset #.  match any char in
  94                                    the charset. */
  95     RE_NSET,                    /* followed by charset #.  mathc any char but
  96                                    in the charset */
  97     RE_SET1,                    /* followed by charset #.  match any char in
  98                                    the charset.  guaranteed that the charset
  99                                    holds only range 0-127 */
 100     RE_NSET1,                   /* followed by charset #.  match any char
 101                                    but the ones in the charset.  guaranteed
 102                                    that the charset holds only range 0-127. */
 103     RE_JUMP,                    /* followed by offset (2 bytes).  jump to that
 104                                    bytecode. */
 105     RE_FAIL,                    /* fail */
 106     RE_SUCCESS,                 /* success */
 107     RE_BEGIN,                   /* followed by a group number.  start the
 108                                    group. */
 109     RE_END,                     /* followed by a group number.  end the
 110                                    group. */
 111     RE_BOL,                     /* beginning of line assertion */
 112     RE_EOL,                     /* end of line assertion */
 113     RE_WB,                      /* word boundary assertion */
 114     RE_NWB,                     /* negative word boundary assertion */
 115     RE_ASSERT,                  /* positive look-ahead assertion. followed by
 116                                    offset (2 bytes). */
 117     RE_NASSERT,                 /* negative look-ahead assertion. followed by
 118                                  * offset (2 bytes). */
 119     /* The following instructions are not necessary to implement the basic
 120        engine, but used in the optimized code */
 121     RE_MATCH1B,                 /* (match 1 byte or branch)
 122                                    followed by a byte, and offset.
 123                                    if the next byte matches the input,
 124                                    proceed.  otherwise, jump to the offset. */
 125     RE_SET1R,                   /* (1-byte set match repeat)
 126                                    followed by charset #.  Consumes all input
 127                                    that matches the given set. */
 128     RE_NSET1R,                  /* (1-byte negative set match repeat)
 129                                    followed by charset #.  Consumes all input
 130                                    that don't match the given set. */
 131     RE_SETR,                    /* (set match repeat)
 132                                    followed by charset #.  Consumes all input
 133                                    that matches the given set. */
 134     RE_NSETR,                   /* (negative set match repeat)
 135                                    followed by charset #.  Consumes all input
 136                                    that don't match the given set. */
 137     RE_NUM_INSN
 138 };
 139 
 140 /* maximum # of {n,m}-type limited repeat count */
 141 #define MAX_LIMITED_REPEAT 255
 142 
 143 /* internal regexp flag. */
 144 #define SCM_REGEXP_BOL_ANCHORED   (1L<<2)
 145 
 146 /* AST - the first pass of regexp compiler creates intermediate AST.
 147  * Alternatively, you can provide AST directly to the regexp compiler,
 148  * using Scm_RegCompFromAST().
 149  *
 150  *  <ast> : (<element> ...)
 151  *  
 152  *  <element> : <clause>   ; special clause
 153  *         | <item>        ; matches <item>
 154  *
 155  *  <item> : <char>       ; matches char
 156  *         | <char-set>   ; matches char set
 157  *         | (comp . <char-set>) ; matches complement of char set
 158  *         | any          ; matches any char
 159  *         | bol | eol    ; beginning/end of line assertion
 160  *         | wb | nwb     ; word-boundary/negative word boundary assertion
 161  *
 162  *  <clause> : (seq . <ast>)      ; sequence
 163  *           | (seq-uncase . <ast>) ; sequence (case insensitive match)
 164  *           | (seq-case . <ast>) ; sequence (case sensitive match)
 165  *           | (alt . <ast>)      ; alternative
 166  *           | (rep . <ast>)      ; 0 or more repetition of <ast> (greedy)
 167  *           | (rep-min . <ast>)  ; 0 or more repetition of <ast> (lazy)
 168  *           | (rep-bound <n> . <ast>) ; repetition up to <n> (greedy)
 169  *           | (rep-bound-min <n> . <ast>) ; repetition up to <n> (lazy)
 170  *           | (rep-while . <ast>) ; like rep, but no backtrack
 171  *           | (<integer> . <ast>) ; capturing group 
 172  *           | (assert . <ast>)   ; positive look-ahead assertion
 173  *           | (nassert . <ast>)  ; negative look-ahead assertion
 174  * 
 175  * For seq-uncase, items inside <ast> has to be prepared for case-insensitive
 176  * match, i.e. chars have to be downcased and char-sets have to be 
 177  * case-folded.
 178  */
 179    
 180 static void regexp_print(ScmObj obj, ScmPort *port, ScmWriteContext *c);
 181 static int  regexp_compare(ScmObj x, ScmObj y, int equalp);
 182 
 183 SCM_DEFINE_BUILTIN_CLASS(Scm_RegexpClass,
 184                          regexp_print, regexp_compare, NULL, NULL,
 185                          SCM_CLASS_DEFAULT_CPL);
 186 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_RegMatchClass, NULL);
 187 
 188 static ScmRegexp *make_regexp(void)
 189 {
 190     ScmRegexp *rx = SCM_NEW(ScmRegexp);
 191     SCM_SET_CLASS(rx, SCM_CLASS_REGEXP);
 192     rx->code = NULL;
 193     rx->numCodes = 0;
 194     rx->numGroups = 0;
 195     rx->numSets = 0;
 196     rx->sets = NULL;
 197     rx->mustMatch = NULL;
 198     rx->flags = 0;
 199     rx->pattern = NULL;
 200     return rx;
 201 }
 202 
 203 static void regexp_print(ScmObj rx, ScmPort *out, ScmWriteContext *ctx)
 204 {
 205     if (SCM_REGEXP(rx)->pattern) {
 206         Scm_Printf(out, "#/%A/", SCM_REGEXP(rx)->pattern);
 207     } else {
 208         /* fail safe */
 209         Scm_Printf(out, "#<regexp %p>", rx);
 210     }
 211 }
 212 
 213 static int regexp_compare(ScmObj x, ScmObj y, int equalp)
 214 {
 215     if (!equalp) {
 216         Scm_Error("cannot compare regexps: %S and %S", x, y);
 217     }
 218     return !(SCM_REGEXP(x)->pattern
 219              && SCM_REGEXP(y)->pattern
 220              && Scm_StringEqual(SCM_STRING(SCM_REGEXP(x)->pattern),
 221                                 SCM_STRING(SCM_REGEXP(y)->pattern))
 222              && ((SCM_REGEXP(x)->flags&SCM_REGEXP_CASE_FOLD)
 223                  == (SCM_REGEXP(y)->flags&SCM_REGEXP_CASE_FOLD)));
 224 }
 225 
 226 #ifndef CHAR_MAX
 227 #define CHAR_MAX 256
 228 #endif
 229 
 230 #define REGEXP_OFFSET_MAX 65535
 231 
 232 /*=======================================================================
 233  * Compiler
 234  */
 235 
 236 /* 3-pass compiler.
 237  *
 238  *  pass 1: parses the pattern and creates an AST.
 239  *  pass 2: optimize on AST.
 240  *  pass 3: byte code generation.
 241  */
 242 
 243 /* compiler state information */
 244 typedef struct regcomp_ctx_rec {
 245     ScmRegexp *rx;              /* the building regexp */
 246     ScmString *pattern;         /* original pattern */
 247     int casefoldp;              /* TRUE if case-folding match */
 248     ScmPort *ipat;              /* [pass1] string port for pattern */
 249     ScmObj sets;                /* [pass1] list of charsets */
 250     int grpcount;               /* [pass1] group count */
 251     unsigned char *code;        /* [pass3] code being built */
 252     int codep;                  /* [pass3] front of code generation */
 253     int emitp;                  /* [pass3] am I generating code? */
 254     int codemax;                /* [pass3] max codep */
 255 } regcomp_ctx;
 256 
 257 static void rc_ctx_init(regcomp_ctx *ctx, ScmRegexp *rx)
 258 {
 259     ctx->rx = rx;
 260     ctx->pattern = rx->pattern;
 261     ctx->casefoldp = FALSE;
 262     if (rx->pattern) {
 263         ctx->ipat = SCM_PORT(Scm_MakeInputStringPort(rx->pattern, FALSE));
 264     } else {
 265         ctx->ipat = NULL;
 266     }
 267     ctx->sets = SCM_NIL;
 268     ctx->grpcount = 0;
 269     ctx->code = NULL;
 270     ctx->codep = 0;
 271     ctx->emitp = FALSE;
 272     ctx->codemax = 1;
 273 }
 274 
 275 static ScmObj rc_charset(regcomp_ctx *ctx);
 276 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs);
 277 static ScmObj rc1_maybe_lazy(regcomp_ctx *ctx, ScmObj greedy, ScmObj lazy);
 278 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx);
 279 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx);
 280 static ScmChar rc1_lex_xdigits(ScmPort *port, int ndigs, int key);
 281 
 282 /*----------------------------------------------------------------
 283  * pass1 - parser
 284  */
 285 
 286 /* EBNF Syntax of Gauche's regexp.
 287  * This is a rough sketch.  The condition of BOL/EOL ("^" and "$"), for
 288  * example, has to be dealt with context information.
 289  * To follow the convention, "{" and "}" token that don't appear to
 290  * consist of the "limited repetition" syntax are regarded as literal
 291  * characters.
 292  *
 293  *  <re>   :
 294  *         | <re> <alt>
 295  *
 296  *  <alt>  :
 297  *         | <item>
 298  *         | <alt> "|" <item>
 299  *
 300  *  <item> : <atom> "*"
 301  *         | <atom> "+"
 302  *         | <atom> "?"
 303  *         | <atom> "{" <n> ("," <m>?)? "}"
 304  *         | <atom> "*?"
 305  *         | <atom> "+?"
 306  *         | <atom> "??"
 307  *         | <atom> "{" <n> ("," <m>?)? "}?"
 308  *         | <atom>
 309  *
 310  *  <atom> : a normal char, an escaped char, or a char-set
 311  *         | "(" <re> ")"       ;; grouping w/  capturing
 312  *         | "(?:"   <re> ")"   ;; grouping w/o capturing
 313  *         | "(?i:"  <re> ")"   ;; grouping w/o capturing (case insensitive)
 314  *         | "(?-i:" <re> ")"   ;; grouping w/o capturing (case sensitive)
 315  *         | "(?="   <re> ")"   ;; positive look-ahead assertion
 316  *         | "(?!"   <re> ")"   ;; negative look-ahead assertion
 317  */
 318 
 319 /* Lexer */
 320 static ScmObj rc1_lex(regcomp_ctx *ctx)
 321 {
 322     ScmChar ch;
 323     ScmObj cs;
 324 
 325     ch = Scm_GetcUnsafe(ctx->ipat);
 326     if (ch == SCM_CHAR_INVALID) return SCM_EOF;
 327     switch (ch) {
 328     case '(': return rc1_lex_open_paren(ctx);
 329     case ')': return SCM_SYM_CLOSE_PAREN;
 330     case '|': return SCM_SYM_ALT;
 331     case '^': return SCM_SYM_BOL;
 332     case '.': return SCM_SYM_ANY;
 333     case '$': return SCM_SYM_EOL;
 334     case '[': return rc_charset(ctx);
 335     case '{': return rc1_lex_minmax(ctx);
 336     case '+': return rc1_maybe_lazy(ctx, SCM_SYM_PLUS, SCM_SYM_PLUSQ);
 337     case '*': return rc1_maybe_lazy(ctx, SCM_SYM_STAR, SCM_SYM_STARQ);
 338     case '?': return rc1_maybe_lazy(ctx, SCM_SYM_QUESTION, SCM_SYM_QUESTIONQ);
 339     case '\\':
 340         ch = Scm_GetcUnsafe(ctx->ipat);
 341         if (ch == SCM_CHAR_INVALID) {
 342             Scm_Error("stray backslash at the end of pattern: %S\n",
 343                       ctx->pattern);
 344         }
 345         switch (ch) {
 346         case 'a': return SCM_MAKE_CHAR(0x07);
 347         case 'n': return SCM_MAKE_CHAR('\n');
 348         case 'r': return SCM_MAKE_CHAR('\r');
 349         case 't': return SCM_MAKE_CHAR('\t');
 350         case 'f': return SCM_MAKE_CHAR('\f');
 351         case 'e': return SCM_MAKE_CHAR(0x1b);
 352         case 'b': return SCM_SYM_WB;
 353         case 'B': return SCM_SYM_NWB;
 354         case 'x':
 355             ch = rc1_lex_xdigits(ctx->ipat, 2, 'x');
 356             return SCM_MAKE_CHAR(ch);
 357         case 'u':
 358             ch = rc1_lex_xdigits(ctx->ipat, 4, 'u');
 359             return SCM_MAKE_CHAR(Scm_UcsToChar(ch));
 360         case 'U':
 361             ch = rc1_lex_xdigits(ctx->ipat, 8, 'U');
 362             return SCM_MAKE_CHAR(Scm_UcsToChar(ch));
 363         case 'd':
 364             cs = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
 365             rc_register_charset(ctx, SCM_CHARSET(cs));
 366             return cs;
 367         case 'D':
 368             cs = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
 369             rc_register_charset(ctx, SCM_CHARSET(cs));
 370             return Scm_Cons(SCM_SYM_COMP, cs);
 371         case 'w':
 372             cs = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
 373             rc_register_charset(ctx, SCM_CHARSET(cs));
 374             return cs;
 375         case 'W':
 376             cs = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
 377             rc_register_charset(ctx, SCM_CHARSET(cs));
 378             return Scm_Cons(SCM_SYM_COMP, cs);
 379         case 's':
 380             cs = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
 381             rc_register_charset(ctx, SCM_CHARSET(cs));
 382             return cs;
 383         case 'S':
 384             cs = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
 385             rc_register_charset(ctx, SCM_CHARSET(cs));
 386             return Scm_Cons(SCM_SYM_COMP, cs);
 387         }
 388         /*FALLTHROUGH*/
 389     default:
 390         if (ctx->casefoldp) ch = SCM_CHAR_DOWNCASE(ch);
 391         return SCM_MAKE_CHAR(ch);
 392     }
 393     /*NOTREACHED*/
 394 }
 395 
 396 /* Read \x, \u, \U escape sequence in the regexp spec. */
 397 static ScmChar rc1_lex_xdigits(ScmPort *port, int ndigs, int key)
 398 {
 399     char buf[8];
 400     int nread;
 401     ScmChar r;
 402     SCM_ASSERT(ndigs <= 8);
 403     r = Scm_ReadXdigitsFromPort(port, ndigs, buf, &nread);
 404     if (r == SCM_CHAR_INVALID) {
 405         ScmDString ds;
 406         int c, i;
 407         /* skip chars to the end of regexp, so that the reader will read
 408            after the erroneous string */
 409         for (;;) {
 410             SCM_GETC(c, port);
 411             if (c == EOF || c == '/') break;
 412             if (c == '\\') SCM_GETC(c, port);
 413         }
 414         /* construct an error message */
 415         Scm_DStringInit(&ds);
 416         Scm_DStringPutc(&ds, '\\');
 417         Scm_DStringPutc(&ds, key);
 418         for (i=0; i<nread; i++) Scm_DStringPutc(&ds, (unsigned char)buf[i]);
 419         Scm_Error("Bad '\\%c' escape sequence in a regexp literal: %s",
 420                   key, Scm_DStringGetz(&ds));
 421     }
 422     return r;
 423 }
 424 
 425 /* Called after '+', '*' or '?' is read, and check if there's a
 426    following '?' (lazy quantifier) */
 427 static ScmObj rc1_maybe_lazy(regcomp_ctx *ctx, ScmObj greedy, ScmObj lazy)
 428 {
 429     ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
 430     if (ch == '?') return lazy;
 431     Scm_UngetcUnsafe(ch, ctx->ipat);
 432     return greedy;
 433 }
 434 
 435 /* Reads '('-sequence - either one of "(", "(?:", "(?i:" or "(?-i:".
 436    The leading "(" has already been read. */
 437 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx)
 438 {
 439     ScmObj pos;
 440     ScmChar ch;
 441     
 442     pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
 443     ch = Scm_GetcUnsafe(ctx->ipat);
 444     if (ch != '?') {
 445         Scm_UngetcUnsafe(ch, ctx->ipat);
 446         return SCM_SYM_OPEN_PAREN;
 447     }
 448     ch = Scm_GetcUnsafe(ctx->ipat);
 449     if (ch == ':') return SCM_SYM_SEQ;
 450     if (ch == '=') return SCM_SYM_ASSERT;
 451     if (ch == '!') return SCM_SYM_NASSERT;
 452     if (ch == 'i') {
 453         ch = Scm_GetcUnsafe(ctx->ipat);
 454         if (ch == ':') return SCM_SYM_SEQ_UNCASE;
 455         /* fall through */
 456     } else if (ch == '-') {
 457         ch = Scm_GetcUnsafe(ctx->ipat);
 458         if (ch == 'i') {
 459             ch = Scm_GetcUnsafe(ctx->ipat);
 460             if (ch == ':') return SCM_SYM_SEQ_CASE;
 461         }
 462         /* fall through */
 463     }
 464     /* fail. */
 465     Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
 466     return SCM_SYM_OPEN_PAREN;
 467 }
 468 
 469 /* Reads {n,m}-type repeat syntax.  The leading "{" has been read.
 470    If the character sequence doesn't consist of valid syntax, rollback
 471    to the ordinary character sequence.
 472    If successfully parsed, returns (rep-bound <n> . <m>) where
 473     <m> == #f if the pattern is "{n}"   (exact count), or
 474     <m> == #t if the pattern is "{n,}"  (minimum count), or
 475     <m> == integer if the pattern is "{n,m}" (limited count).
 476    If the pattern is followed by '?', rep-bound-min is used instead.
 477  */
 478 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx)
 479 {
 480     int rep_min = -1, rep_max = -1, exact = FALSE, ch;
 481     ScmObj pos, m;
 482     ScmObj type = SCM_SYM_REP_BOUND; /* default is greedy */
 483 
 484     pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
 485     
 486     for (;;) {
 487         ch = Scm_GetcUnsafe(ctx->ipat);
 488         if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
 489             if (rep_min < 0) {
 490                 rep_min = (ch - '0');
 491             } else {
 492                 rep_min = rep_min*10 + (ch - '0');
 493             }
 494         } else if (ch == ',') {
 495             break;
 496         } else if (ch == '}') {
 497             exact = TRUE;
 498             break;
 499         } else {
 500             goto bad_min_max;
 501         }
 502     }
 503     if (rep_min < 0) goto bad_min_max;
 504     if (rep_min > MAX_LIMITED_REPEAT) goto out_of_range;
 505     if (!exact) {
 506         for (;;) {
 507             ch = Scm_GetcUnsafe(ctx->ipat);
 508             if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
 509                 if (rep_max < 0) {
 510                     rep_max = (ch - '0');
 511                 } else {
 512                     rep_max = rep_max*10 + (ch - '0');
 513                 }
 514             } else if (ch == '}') {
 515                 break;
 516             } else {
 517                 goto bad_min_max;
 518             }
 519         }
 520         if (rep_max > MAX_LIMITED_REPEAT) goto out_of_range;
 521         if (rep_max >= 0 && rep_max < rep_min) {
 522             Scm_Error("{n,m}-syntax requires n <= m: %S", ctx->pattern);
 523         }
 524     }
 525 
 526     if (exact)            m = SCM_FALSE;
 527     else if (rep_max < 0) m = SCM_TRUE;
 528     else                  m = SCM_MAKE_INT(rep_max);
 529 
 530     ch = Scm_GetcUnsafe(ctx->ipat);
 531     if (ch == '?') type = SCM_SYM_REP_BOUND_MIN;
 532     else Scm_UngetcUnsafe(ch, ctx->ipat);
 533     return Scm_Cons(type, Scm_Cons(SCM_MAKE_INT(rep_min), m));
 534 
 535   out_of_range:
 536     Scm_Error("{n,m}-syntax can accept up to %d count: %S",
 537               MAX_LIMITED_REPEAT, ctx->pattern);
 538     /*NOTREACHED*/
 539   bad_min_max:
 540     /* back up */
 541     Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
 542     return SCM_MAKE_CHAR('{');
 543 }
 544 
 545 static ScmObj rc1_fold_alts(regcomp_ctx *ctx, ScmObj alts)
 546 {
 547     ScmObj r = SCM_NIL, ap;
 548     SCM_FOR_EACH(ap, alts) {
 549         ScmObj alt = SCM_CAR(ap);
 550         if (SCM_PAIRP(alt) && SCM_NULLP(SCM_CDR(alt))) {
 551             r = Scm_Cons(SCM_CAR(alt), r);
 552         } else {
 553             r = Scm_Cons(Scm_Cons(SCM_SYM_SEQ, alt), r);
 554         }
 555     }
 556     return Scm_Cons(SCM_SYM_ALT, r);
 557 }
 558 
 559 /* Parser */
 560 static ScmObj rc1_parse(regcomp_ctx *ctx, int bolp, int topp)
 561 {
 562     ScmObj stack = SCM_NIL, alts = SCM_NIL;
 563     ScmObj token, item;
 564     int bolpsave = bolp;
 565 
 566 #define PUSH(elt)  (stack = Scm_Cons((elt), stack))
 567 #define PUSH1(elt) (stack = Scm_Cons((elt), SCM_CDR(stack)))
 568 
 569     for (;;) {
 570         token = rc1_lex(ctx);
 571         if (SCM_EOFP(token)) {
 572             if (!topp) {
 573                 Scm_Error("unterminated grouping in regexp %S", ctx->pattern);
 574             }
 575             break;
 576         }
 577         if (SCM_EQ(token, SCM_SYM_CLOSE_PAREN)) {
 578             if (topp) {
 579                 Scm_Error("extra close parenthesis in regexp %S", ctx->pattern);
 580             }
 581             break;
 582         }
 583         if (SCM_EQ(token, SCM_SYM_BOL)) {
 584             if (bolp) {
 585                 PUSH(SCM_SYM_BOL);
 586                 bolp = FALSE;
 587                 continue;
 588             } else {
 589                 token = SCM_MAKE_CHAR('^');
 590             }
 591             /*FALLTHROUGH*/
 592         }
 593         if (SCM_EQ(token, SCM_SYM_ALT)) {
 594             alts = Scm_Cons(Scm_ReverseX(stack), alts);
 595             stack = SCM_NIL;
 596             bolp = bolpsave;
 597             continue;
 598         }
 599         if (SCM_EQ(token, SCM_SYM_OPEN_PAREN)) {
 600             int grpno = ++ctx->grpcount;
 601             item = rc1_parse(ctx, bolp, FALSE);
 602             PUSH(Scm_Cons(SCM_MAKE_INT(grpno), item));
 603             bolp = FALSE;
 604             continue;
 605         }
 606         if (SCM_EQ(token, SCM_SYM_SEQ)) {
 607             item = rc1_parse(ctx, bolp, FALSE);
 608             PUSH(Scm_Cons(SCM_SYM_SEQ, item));
 609             bolp = FALSE;
 610             continue;
 611         }
 612         if (SCM_EQ(token, SCM_SYM_SEQ_UNCASE) || SCM_EQ(token, SCM_SYM_SEQ_CASE)) {
 613             int oldflag = ctx->casefoldp;
 614             ctx->casefoldp = SCM_EQ(token, SCM_SYM_SEQ_UNCASE);
 615             item = rc1_parse(ctx, bolp, FALSE);
 616             PUSH(Scm_Cons(token, item));
 617             ctx->casefoldp = oldflag;
 618             bolp = FALSE;
 619             continue;
 620         }
 621         if (SCM_EQ(token, SCM_SYM_ASSERT)) {
 622             item = rc1_parse(ctx, bolp, FALSE);
 623             PUSH(Scm_Cons(SCM_SYM_ASSERT, item));
 624             continue;
 625         }
 626         if (SCM_EQ(token, SCM_SYM_NASSERT)) {
 627             item = rc1_parse(ctx, bolp, FALSE);
 628             PUSH(Scm_Cons(SCM_SYM_NASSERT, item));
 629             continue;
 630         }
 631         if (SCM_EQ(token, SCM_SYM_STAR)) {
 632             /* "x*" => (rep x) */
 633             if (SCM_NULLP(stack)) goto synerr;
 634             item = SCM_LIST2(SCM_SYM_REP, SCM_CAR(stack));
 635             PUSH1(item);
 636             continue;
 637         }
 638         if (SCM_EQ(token, SCM_SYM_STARQ)) {
 639             /* "x*?" => (rep-min x) */
 640             if (SCM_NULLP(stack)) goto synerr;
 641             item = SCM_LIST2(SCM_SYM_REP_MIN, SCM_CAR(stack));
 642             PUSH1(item);
 643             continue;
 644         }
 645         if (SCM_EQ(token, SCM_SYM_PLUS)) {
 646             /* "x+" => (seq x (rep x)) */
 647             if (SCM_NULLP(stack)) goto synerr;
 648             item = SCM_LIST3(SCM_SYM_SEQ, SCM_CAR(stack),
 649                              SCM_LIST2(SCM_SYM_REP, SCM_CAR(stack)));
 650             PUSH1(item);
 651             continue;
 652         }
 653         if (SCM_EQ(token, SCM_SYM_PLUSQ)) {
 654             /* "x+?" => (seq x (rep-min x)) */
 655             if (SCM_NULLP(stack)) goto synerr;
 656             item = SCM_LIST3(SCM_SYM_SEQ, SCM_CAR(stack),
 657                              SCM_LIST2(SCM_SYM_REP_MIN, SCM_CAR(stack)));
 658             PUSH1(item);
 659             continue;
 660         }
 661         if (SCM_EQ(token, SCM_SYM_QUESTION)) {
 662             /* "x?" => (alt x ()) */
 663             if (SCM_NULLP(stack)) goto synerr;
 664             item = rc1_fold_alts(ctx,
 665                                  SCM_LIST2(SCM_NIL, SCM_LIST1(SCM_CAR(stack))));
 666             PUSH1(item);
 667             continue;
 668         }
 669         if (SCM_EQ(token, SCM_SYM_QUESTIONQ)) {
 670             /* "x??" => (alt () x) */
 671             if (SCM_NULLP(stack)) goto synerr;
 672             item = rc1_fold_alts(ctx,
 673                                  SCM_LIST2(SCM_LIST1(SCM_CAR(stack)), SCM_NIL));
 674             PUSH1(item);
 675             continue;
 676         }
 677         if (SCM_PAIRP(token)&&
 678             (SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND) ||
 679              SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND_MIN))) {
 680             /* "x{n}"    => (seq x .... x) 
 681                "x{n,}"   => (seq x .... x (rep x))
 682                "x{n,m}"  => (seq x .... x (rep-bound m-n x))
 683                "x{n,}?"  => (seq x .... x (rep-min x))
 684                "x{n,m}?" => (seq x .... x (rep-bound-min m-n x)) */
 685             ScmObj n = SCM_CADR(token), m = SCM_CDDR(token);
 686             int greedy = SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND);
 687 
 688             if (SCM_NULLP(stack)) goto synerr;
 689             SCM_ASSERT(SCM_INTP(n));
 690             item = Scm_MakeList(SCM_INT_VALUE(n), SCM_CAR(stack));
 691             if (SCM_FALSEP(m)) {
 692                 item = Scm_Cons(SCM_SYM_SEQ, item);
 693             } else if (SCM_TRUEP(m)) {
 694                 item = Scm_Cons(SCM_SYM_SEQ, item);
 695                 item = Scm_Append2X(item,
 696                                     SCM_LIST1(SCM_LIST2((greedy? SCM_SYM_REP : SCM_SYM_REP_MIN),
 697                                                         SCM_CAR(stack))));
 698             } else {
 699                 int m_n;
 700                 SCM_ASSERT(SCM_INTP(m));
 701                 m_n = SCM_INT_VALUE(m)-SCM_INT_VALUE(n);
 702                 SCM_ASSERT(m_n >= 0);
 703                 item = Scm_Cons(SCM_SYM_SEQ, item);
 704                 if (m_n > 0) {
 705                     item = Scm_Append2X(item,
 706                                         SCM_LIST1(SCM_LIST3((greedy? SCM_SYM_REP_BOUND : SCM_SYM_REP_BOUND_MIN),
 707                                                             SCM_MAKE_INT(m_n),
 708                                                             SCM_CAR(stack))));
 709                 }
 710             }
 711             PUSH1(item);
 712             continue;
 713         }
 714         PUSH(token);
 715         bolp = FALSE;
 716         continue;
 717     }
 718     if (SCM_NULLP(alts)) {
 719         return Scm_ReverseX(stack);
 720     } else {
 721         alts = Scm_Cons(Scm_ReverseX(stack), alts);
 722         return SCM_LIST1(rc1_fold_alts(ctx, alts));
 723     }
 724   synerr:
 725     Scm_Error("bad regexp syntax in %S", ctx->pattern);
 726     return SCM_UNDEFINED;       /* dummy */
 727 #undef PUSH
 728 #undef PUSH1
 729 }
 730 
 731 static ScmObj rc1(regcomp_ctx *ctx)
 732 {
 733     ScmObj ast = rc1_parse(ctx, TRUE, TRUE);
 734     if (ctx->casefoldp) {
 735         ast = SCM_LIST2(SCM_MAKE_INT(0), Scm_Cons(SCM_SYM_SEQ_UNCASE, ast));
 736     } else {
 737         ast = Scm_Cons(SCM_MAKE_INT(0), ast);
 738     }
 739     ctx->rx->numGroups = ctx->grpcount+1;
 740     return ast;
 741 }
 742 
 743 /* character range */
 744 static ScmObj rc_charset(regcomp_ctx *ctx)
 745 {
 746     int complement;
 747     ScmObj set = Scm_CharSetRead(ctx->ipat, &complement, FALSE, TRUE);
 748     if (!SCM_CHARSETP(set)) {
 749         Scm_Error("bad charset spec in pattern: %S", ctx->pattern);
 750     }
 751     if (ctx->casefoldp) {
 752         Scm_CharSetCaseFold(SCM_CHARSET(set));
 753     }
 754     
 755     rc_register_charset(ctx, SCM_CHARSET(set));
 756     if (complement) {
 757         return Scm_Cons(SCM_SYM_COMP, SCM_OBJ(set));
 758     } else {
 759         return SCM_OBJ(set);
 760     }
 761 }
 762 
 763 /* Remember charset so that we can construct charset vector later */
 764 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs)
 765 {
 766     if (SCM_FALSEP(Scm_Memq(SCM_OBJ(cs), ctx->sets))) {
 767         ctx->sets = Scm_Cons(SCM_OBJ(cs), ctx->sets);
 768     }
 769 }
 770 
 771 /* An interlude between pass1 and pass2.  From the information of
 772    parser context, build a charset vector. */
 773 static void rc_setup_charsets(ScmRegexp *rx, regcomp_ctx *ctx)
 774 {
 775     ScmObj cp;
 776     int i = 0;
 777     rx->numSets = Scm_Length(ctx->sets);
 778     rx->sets = SCM_NEW_ARRAY(ScmCharSet*, rx->numSets);
 779     for (i=0, cp = Scm_Reverse(ctx->sets); !SCM_NULLP(cp); cp = SCM_CDR(cp)) {
 780         rx->sets[i++] = SCM_CHARSET(SCM_CAR(cp));
 781     }
 782 }
 783 
 784 /*-------------------------------------------------------------
 785  * pass 2: optimizer
 786  *
 787  *  - flattening nested sequences: (seq a (seq b) c) => (seq a b c)
 788  *  - introduces short-cut construct for certain cases.
 789  *       (... (rep #\a) #\b ...) => (... (rep-while #\a) #\b ...)
 790  */
 791 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest);
 792 static int    is_distinct(ScmObj x, ScmObj y);
 793 
 794 static ScmObj rc2_optimize_seq(ScmObj seq, ScmObj rest)
 795 {
 796     ScmObj elt, tail, etype, opted;
 797     if (!SCM_PAIRP(seq)) return seq;
 798     elt = SCM_CAR(seq);
 799     tail = rc2_optimize_seq(SCM_CDR(seq), rest);
 800     rest = SCM_NULLP(tail)? rest : tail;
 801     if (!SCM_PAIRP(elt) || SCM_EQ(SCM_CAR(elt), SCM_SYM_COMP)) {
 802         if (SCM_EQ(tail, SCM_CDR(seq))) return seq;
 803         else return Scm_Cons(elt, tail);
 804     }
 805     etype = SCM_CAR(elt);
 806     if (SCM_EQ(etype, SCM_SYM_SEQ)) {
 807         return Scm_Append2(rc2_optimize_seq(SCM_CDR(elt), rest), tail);
 808     }
 809     if (SCM_EQ(etype, SCM_SYM_REP)) {
 810         /* If the head of repeating sequence and the beginning of the
 811            following sequence are distinct, like #/\s*foo/, the branch
 812            becomes deterministic (i.e. we don't need backtrack). */
 813         ScmObj repbody = rc2_optimize_seq(SCM_CDR(elt), rest);
 814         SCM_ASSERT(SCM_PAIRP(repbody));
 815         if (SCM_NULLP(rest) || is_distinct(SCM_CAR(repbody), SCM_CAR(rest))) {
 816             return Scm_Cons(Scm_Cons(SCM_SYM_REP_WHILE, repbody), tail);
 817         }
 818         if (SCM_EQ(repbody, SCM_CDR(elt))) opted = elt;
 819         else opted = Scm_Cons(SCM_SYM_REP, repbody);
 820     } else {
 821         opted = rc2_optimize(elt, rest);
 822     }
 823     if (SCM_EQ(elt, opted) && SCM_EQ(tail, SCM_CDR(seq))) return seq;
 824     else return Scm_Cons(opted, tail);
 825 }
 826 
 827 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest)
 828 {
 829     ScmObj type, seq, seqo;
 830     if (!SCM_PAIRP(ast)) return ast;
 831     type = SCM_CAR(ast);
 832     if (SCM_EQ(type, SCM_SYM_COMP)) return ast;
 833 
 834     if (SCM_EQ(type, SCM_SYM_ALT)) {
 835         ScmObj sp, sp2, e = SCM_UNBOUND, h, t;
 836         SCM_FOR_EACH(sp, SCM_CDR(ast)) {
 837             e = rc2_optimize(SCM_CAR(sp), rest);
 838             if (!SCM_EQ(e, SCM_CAR(sp))) break;
 839         }
 840         if (SCM_NULLP(sp)) return ast;
 841         /* need to copy the spine */
 842         h = t = SCM_NIL;
 843         SCM_FOR_EACH(sp2, SCM_CDR(ast)) {
 844             if (SCM_EQ(sp, sp2)) { SCM_APPEND1(h, t, e); break; }
 845             SCM_APPEND1(h, t, SCM_CAR(sp2));
 846         }
 847         SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
 848             SCM_APPEND1(h, t, rc2_optimize(SCM_CAR(sp2), rest));
 849         }
 850         return Scm_Cons(SCM_SYM_ALT, h);
 851     }
 852     if (SCM_EQ(type, SCM_SYM_REP_BOUND)) seq = SCM_CDDR(ast);
 853     else seq = SCM_CDR(ast);
 854     seqo = rc2_optimize_seq(seq, rest);
 855     if (SCM_EQ(seq, seqo)) return ast;
 856     else {
 857         if (SCM_EQ(type, SCM_SYM_REP_BOUND)) {
 858             return Scm_Cons(type, Scm_Cons(SCM_CADR(ast), seqo));
 859         } else {
 860             return Scm_Cons(type, seqo);
 861         }
 862     }
 863 }
 864 
 865 static int is_distinct(ScmObj x, ScmObj y)
 866 {
 867     ScmObj carx;
 868     if (SCM_PAIRP(x)) {
 869         carx = SCM_CAR(x);
 870         if (SCM_EQ(carx, SCM_SYM_COMP)) {
 871             SCM_ASSERT(SCM_CHARSETP(SCM_CDR(x)));
 872             if (SCM_CHARP(y) || SCM_CHARSETP(y)) {
 873                 return !is_distinct(SCM_CDR(x), y);
 874             }
 875             return FALSE;
 876         }
 877         if (SCM_INTP(carx)
 878             || SCM_EQ(carx, SCM_SYM_SEQ_UNCASE)
 879             || SCM_EQ(carx, SCM_SYM_SEQ_CASE)) {
 880             if (SCM_PAIRP(SCM_CDR(x))) {
 881                 return is_distinct(SCM_CADR(x), y);
 882             }
 883         }
 884         return FALSE;
 885     }
 886     if (SCM_CHARP(x)) {
 887         if (SCM_CHARP(y)) return !SCM_EQ(x, y);
 888         return is_distinct(y, x);
 889     }
 890     if (SCM_CHARSETP(x)) {
 891         if (SCM_CHARP(y)) {
 892             return !Scm_CharSetContains(SCM_CHARSET(x), SCM_CHAR_VALUE(y));
 893         }
 894         if (SCM_CHARSETP(y)) {
 895             ScmObj ccs = Scm_CopyCharSet(SCM_CHARSET(y));
 896             ccs = Scm_CharSetComplement(SCM_CHARSET(ccs));
 897             return Scm_CharSetLE(SCM_CHARSET(x), SCM_CHARSET(ccs));
 898         }
 899         return is_distinct(y, x);
 900     }
 901     return FALSE;
 902 }
 903 
 904 ScmObj Scm_RegOptimizeAST(ScmObj ast)
 905 {
 906     return rc2_optimize(ast, SCM_NIL);
 907 }
 908 
 909 /*-------------------------------------------------------------
 910  * pass 3 - code generation
 911  *          This pass actually called twice; the first run counts
 912  *          the size of the bytecode, and the second run fills
 913  *          the bytecode.   EMITP == FALSE for the first, EMITP == TRUE
 914  *          for the second.
 915  *          LASTP indicates this call is dealing with the last part of
 916  *          the compiled tree, thus need to deal with EOL marker.
 917  */
 918 
 919 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp, int toplevelp);
 920 
 921 /* Util function for pass2, to get an index of the charset vector
 922  * for the given charset.
 923  */
 924 static int rc3_charset_index(ScmRegexp *rx, ScmObj cs)
 925 {
 926     int i;
 927     for (i=0; i<rx->numSets; i++)
 928         if (cs == SCM_OBJ(rx->sets[i])) return i;
 929     Scm_Panic("rc3_charset_index: can't be here");
 930     return 0;                   /* dummy */
 931 }
 932 
 933 static void rc3_emit(regcomp_ctx *ctx, char code)
 934 {
 935     if (ctx->emitp) {
 936         SCM_ASSERT(ctx->codep < ctx->codemax);
 937         ctx->code[ctx->codep++] = code;
 938     } else {
 939         ctx->codemax++;
 940     }
 941 }
 942 
 943 static void rc3_emit_offset(regcomp_ctx *ctx, int offset)
 944 {
 945     if (offset > REGEXP_OFFSET_MAX) {
 946         Scm_Error("regexp too large.  consider splitting it up: %50.1S",
 947                   SCM_OBJ(ctx->rx));
 948     }
 949     
 950     if (ctx->emitp) {
 951         SCM_ASSERT(ctx->codep < ctx->codemax-1);
 952         ctx->code[ctx->codep++] = (offset>>8) & 0xff;
 953         ctx->code[ctx->codep++] = offset & 0xff;
 954     } else {
 955         ctx->codemax+=2;
 956     }
 957 }
 958 
 959 static void rc3_fill_offset(regcomp_ctx *ctx, int codep, int offset)
 960 {
 961     if (offset > REGEXP_OFFSET_MAX) {
 962         Scm_Error("regexp too large.  consider splitting it up: %50.1S",
 963                   SCM_OBJ(ctx->rx));
 964     }
 965 
 966     if (ctx->emitp) {
 967         SCM_ASSERT(codep < ctx->codemax-1);
 968         ctx->code[codep] = (offset >> 8) & 0xff;
 969         ctx->code[codep+1] = offset & 0xff;
 970     }
 971 }
 972 
 973 static void rc3_seq(regcomp_ctx *ctx, ScmObj seq, int lastp, int toplevelp)
 974 {
 975     ScmObj cp, item;
 976     
 977     SCM_FOR_EACH(cp, seq) {
 978         item = SCM_CAR(cp);
 979 
 980         /* concatenate literal character sequence */
 981         if (SCM_CHARP(item)) {
 982             int nrun = 0, ocodep = ctx->codep, nb, i;
 983             ScmChar ch;
 984             char chbuf[SCM_CHAR_MAX_BYTES];
 985             
 986             rc3_emit(ctx, (ctx->casefoldp? RE_MATCH_CI:RE_MATCH));
 987             rc3_emit(ctx, 0); /* patched later */
 988             do {
 989                 ch = SCM_CHAR_VALUE(item);
 990                 nb = SCM_CHAR_NBYTES(ch);
 991                 SCM_CHAR_PUT(chbuf, ch);
 992                 for (i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
 993                 nrun += nb;
 994                 cp = SCM_CDR(cp);
 995                 if (SCM_NULLP(cp)) break;
 996                 item = SCM_CAR(cp);
 997             } while (SCM_CHARP(item) && nrun < CHAR_MAX);
 998             if (ctx->emitp) {
 999                 /* patches the run length.  if we are matching to a
1000                    single byte char, use MATCH1 insn. */
1001                 if (nrun == 1) {
1002                     ctx->code[ocodep] =
1003                         ctx->casefoldp?RE_MATCH1_CI:RE_MATCH1;
1004                     ctx->code[ocodep+1] = ctx->code[ocodep+2];
1005                     ctx->codep = ocodep+2;
1006                 } else {
1007                     ctx->code[ocodep+1] = (char)nrun;
1008                 }
1009             }
1010             if (SCM_NULLP(cp)) break;
1011             cp = Scm_Cons(item, cp); /* pushback */
1012         } else {
1013             rc3_rec(ctx, item, lastp&&SCM_NULLP(SCM_CDR(cp)), toplevelp);
1014         }
1015     }
1016 }
1017 
1018 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp, int toplevelp)
1019 {
1020     ScmObj type;
1021     ScmRegexp *rx = ctx->rx;
1022 
1023     /* first, deal with atoms */
1024     if (!SCM_PAIRP(ast)) {
1025         /* a char */
1026         if (SCM_CHARP(ast)) {
1027             char chbuf[SCM_CHAR_MAX_BYTES];
1028             ScmChar ch = SCM_CHAR_VALUE(ast);
1029             int i, nb = SCM_CHAR_NBYTES(ch);
1030             SCM_CHAR_PUT(chbuf, ch);
1031             if (nb == 1) {
1032                 rc3_emit(ctx, (ctx->casefoldp? RE_MATCH1_CI:RE_MATCH1));
1033                 rc3_emit(ctx, chbuf[0]);
1034             } else {
1035                 rc3_emit(ctx, (ctx->casefoldp? RE_MATCH_CI:RE_MATCH));
1036                 rc3_emit(ctx, nb);
1037                 for (i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
1038             }
1039             return;
1040         }
1041         /* charset */
1042         if (SCM_CHARSETP(ast)) {
1043             if (SCM_CHARSET_SMALLP(ast)) {
1044                 rc3_emit(ctx, RE_SET1);
1045             } else {
1046                 rc3_emit(ctx, RE_SET);
1047             }
1048             rc3_emit(ctx, rc3_charset_index(rx, ast));
1049             return;
1050         }
1051         /* special stuff */
1052         if (SCM_SYMBOLP(ast)) {
1053             if (SCM_EQ(ast, SCM_SYM_ANY)) {
1054                 rc3_emit(ctx, RE_ANY);
1055                 return;
1056             }
1057             if (SCM_EQ(ast, SCM_SYM_BOL)) {
1058                 rc3_emit(ctx, RE_BOL);
1059                 return;
1060             }
1061             if (SCM_EQ(ast, SCM_SYM_EOL)) {
1062                 if (lastp) {
1063                     rc3_emit(ctx, RE_EOL);
1064                 } else {
1065                     rc3_emit(ctx, RE_MATCH1);
1066                     rc3_emit(ctx, '$');
1067                 }
1068                 return;
1069             }
1070             if (SCM_EQ(ast, SCM_SYM_WB)) {
1071                 rc3_emit(ctx, RE_WB);
1072                 return;
1073             }
1074             if (SCM_EQ(ast, SCM_SYM_NWB)) {
1075                 rc3_emit(ctx, RE_NWB);
1076                 return;
1077             }
1078             /* fallback */
1079         }
1080         Scm_Error("internal error in regexp compilation: unrecognized AST item: %S", ast);
1081     }
1082 
1083     /* now we have a structured node */
1084     type = SCM_CAR(ast);
1085     if (SCM_EQ(type, SCM_SYM_COMP)) {
1086         ScmObj cs = SCM_CDR(ast);
1087         SCM_ASSERT(SCM_CHARSETP(cs));
1088         if (SCM_CHARSET_SMALLP(cs)) {
1089             rc3_emit(ctx, RE_NSET1);
1090         } else {
1091             rc3_emit(ctx, RE_NSET);
1092         }
1093         rc3_emit(ctx, rc3_charset_index(rx, cs));
1094         return;
1095     }
1096     if (SCM_EQ(type, SCM_SYM_SEQ)) {
1097         rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1098         return;
1099     }
1100     if (SCM_INTP(type)) {
1101         int grpno = SCM_INT_VALUE(type);
1102         rc3_emit(ctx, RE_BEGIN);
1103         rc3_emit(ctx, grpno);
1104         rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1105         rc3_emit(ctx, RE_END);
1106         rc3_emit(ctx, grpno);
1107         return;
1108     }
1109     if (SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1110         int oldcase = ctx->casefoldp;
1111         ctx->casefoldp = SCM_EQ(type, SCM_SYM_SEQ_UNCASE);
1112         rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1113         ctx->casefoldp = oldcase;
1114         return;
1115     }
1116     if (SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1117         /* here we have an opportunity to generate an optimized code. */
1118         if (SCM_PAIRP(SCM_CDR(ast)) && SCM_NULLP(SCM_CDDR(ast))) {
1119             ScmObj elem = SCM_CADR(ast);
1120             if (SCM_CHARSETP(elem)) {
1121                 rc3_emit(ctx, SCM_CHARSET_SMALLP(elem)?RE_SET1R:RE_SETR);
1122                 rc3_emit(ctx, rc3_charset_index(rx, elem));
1123                 return;
1124             }
1125             if (SCM_PAIRP(elem)&&SCM_EQ(SCM_CAR(elem), SCM_SYM_COMP)) {
1126                 elem = SCM_CDR(elem);
1127                 SCM_ASSERT(SCM_CHARSETP(elem));
1128                 rc3_emit(ctx, SCM_CHARSET_SMALLP(elem)?RE_NSET1R:RE_NSETR);
1129                 rc3_emit(ctx, rc3_charset_index(rx, elem));
1130                 return;
1131             }
1132         }
1133         /* fallthrough to rep */
1134         type = SCM_SYM_REP;
1135     }
1136     if (SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)) {
1137         int ocodep = ctx->codep;
1138         rc3_emit(ctx, SCM_EQ(type, SCM_SYM_ASSERT) ? RE_ASSERT : RE_NASSERT);
1139         rc3_emit_offset(ctx, 0); /* will be patched */
1140         rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1141         rc3_emit(ctx, RE_SUCCESS);
1142         rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1143         return;
1144     }
1145     if (SCM_EQ(type, SCM_SYM_REP)) {
1146         /* rep: TRY next
1147                 <seq>
1148                 JUMP rep
1149            next:
1150         */
1151         int ocodep = ctx->codep;
1152         rc3_emit(ctx, RE_TRY);
1153         rc3_emit_offset(ctx, 0); /* will be patched */
1154         rc3_seq(ctx, SCM_CDR(ast), FALSE, FALSE);
1155         rc3_emit(ctx, RE_JUMP);
1156         rc3_emit_offset(ctx, ocodep);
1157         rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1158         return;
1159     }
1160     if (SCM_EQ(type, SCM_SYM_REP_MIN)) {
1161         /* non-greedy repeat
1162            rep: TRY seq
1163                 JUMP next
1164            seq: <seq>
1165                 JUMP rep
1166            next:
1167         */
1168         int ocodep1 = ctx->codep, ocodep2;
1169         rc3_emit(ctx, RE_TRY);
1170         rc3_emit_offset(ctx, 0); /* will be patched */
1171         ocodep2 = ctx->codep;
1172         rc3_emit(ctx, RE_JUMP);
1173         rc3_emit_offset(ctx, 0); /* will be patched */
1174         rc3_fill_offset(ctx, ocodep1+1, ctx->codep);
1175         rc3_seq(ctx, SCM_CDR(ast), FALSE, FALSE);
1176         rc3_emit(ctx, RE_JUMP);
1177         rc3_emit_offset(ctx, ocodep1);
1178         rc3_fill_offset(ctx, ocodep2+1, ctx->codep);
1179         return;
1180     }
1181     if (SCM_EQ(type, SCM_SYM_ALT)) {
1182         /*     TRY #1
1183                <alt0>
1184                JUMP next
1185            #1: TRY #2
1186                <alt1>
1187                JUMP next
1188                 :
1189                 :
1190                TRY next
1191                <altN>
1192            next:
1193         */
1194         ScmObj clause;
1195         ScmObj jumps = SCM_NIL;
1196         int patchp;
1197 
1198         for (clause = SCM_CDR(ast);
1199              SCM_PAIRP(SCM_CDR(clause));
1200              clause = SCM_CDR(clause)) {
1201             rc3_emit(ctx, RE_TRY);
1202             patchp = ctx->codep;
1203             rc3_emit_offset(ctx, 0); /* will be patched */
1204             rc3_rec(ctx, SCM_CAR(clause), lastp, FALSE);
1205             rc3_emit(ctx, RE_JUMP);
1206             if (ctx->emitp) {
1207                 jumps = Scm_Cons(SCM_MAKE_INT(ctx->codep), jumps);
1208             }
1209             rc3_emit_offset(ctx, 0); /* will be patched */
1210             rc3_fill_offset(ctx, patchp, ctx->codep);
1211         }
1212         rc3_rec(ctx, SCM_CAR(clause), lastp, FALSE);
1213         if (ctx->emitp) {
1214             SCM_FOR_EACH(jumps, jumps) {
1215                 patchp = SCM_INT_VALUE(SCM_CAR(jumps));
1216                 rc3_fill_offset(ctx, patchp, ctx->codep);
1217             }
1218         }
1219         return;
1220     }
1221     if (SCM_EQ(type, SCM_SYM_REP_BOUND) || SCM_EQ(type, SCM_SYM_REP_BOUND_MIN)) {
1222         /* (rep-bound <n> . <x>)
1223 
1224                TRY  #01
1225                JUMP #11
1226            #01:TRY  #02
1227                JUMP #12
1228                 :
1229            #0n:JUMP #1N
1230            #11:<X>
1231            #12:<X>
1232                 :
1233            #1n:<X>
1234            #1N:
1235 
1236            (rep-bound-min <n> . <x>)
1237            
1238                TRY  #01
1239                JUMP #1N
1240            #01:TRY  #02
1241                JUMP #1n
1242                 :
1243            #0n TRY  #11
1244                JUMP #12
1245            #11:<X>
1246            #12:<X>
1247                 :
1248            #1n:<X>
1249            #1N:
1250 
1251          */
1252         ScmObj item, jlist = SCM_NIL;
1253         int count, n, j0 = 0, jn;
1254         int greedy = SCM_EQ(type, SCM_SYM_REP_BOUND);
1255 
1256         SCM_ASSERT(Scm_Length(ast) >= 3 && SCM_INTP(SCM_CADR(ast)));
1257         count = SCM_INT_VALUE(SCM_CADR(ast));
1258         SCM_ASSERT(count > 0);
1259         item = SCM_CDDR(ast);
1260         /* first part - TRYs and JUMPs
1261            j0 is used to patch the label #0k
1262            the destination of jumps to be patched are linked to jlist */
1263         for (n=0; n<count; n++) {
1264             if (n>0) rc3_fill_offset(ctx, j0, ctx->codep);
1265             rc3_emit(ctx, RE_TRY);
1266             if (ctx->emitp) j0 = ctx->codep;
1267             rc3_emit_offset(ctx, 0); /* to be patched */
1268             rc3_emit(ctx, RE_JUMP);
1269             if (ctx->emitp) {
1270                 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1271             }
1272             rc3_emit_offset(ctx, 0); /* to be patched */
1273         }
1274         rc3_fill_offset(ctx, j0, ctx->codep); /* patch #0n */
1275         /* finishing the first part.
1276            for non-greedy match, we need one more TRY. */
1277         if (greedy) {
1278             rc3_emit(ctx, RE_JUMP);
1279             jn = ctx->codep;
1280             rc3_emit_offset(ctx, 0); /* to be patched */
1281         } else {
1282             rc3_emit(ctx, RE_TRY);
1283             jn = ctx->codep;
1284             rc3_emit_offset(ctx, 0);  /* to be patched */
1285             rc3_emit(ctx, RE_JUMP);
1286             if (ctx->emitp) {
1287                 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1288             }
1289             rc3_emit_offset(ctx, 0);  /* to be patched */
1290             rc3_fill_offset(ctx, jn, ctx->codep);
1291         }
1292         if (ctx->emitp && greedy) jlist = Scm_ReverseX(jlist);
1293         if (!greedy) rc3_seq(ctx, item, FALSE, toplevelp);
1294         for (n=0; n<count; n++) {
1295             if (ctx->emitp) {
1296                 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)),
1297                                 ctx->codep);
1298             }
1299             rc3_seq(ctx, item, FALSE, toplevelp);
1300             if (ctx->emitp) jlist = SCM_CDR(jlist);
1301         }
1302         if (greedy) {
1303             /* the last JUMP to #1N */
1304             rc3_fill_offset(ctx, jn, ctx->codep);
1305         } else {
1306             /* the first JUMP to #1N */
1307             if (ctx->emitp) {
1308                 SCM_ASSERT(SCM_PAIRP(jlist));
1309                 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)), ctx->codep);
1310             }
1311         }
1312         return;
1313     }
1314     Scm_Error("internal error in regexp compilation: bad node: %S", ast);
1315 }
1316 
1317 static int is_bol_anchored(ScmObj ast)
1318 {
1319     ScmObj type;
1320     if (!SCM_PAIRP(ast)) {
1321         if (SCM_EQ(ast, SCM_SYM_BOL)) return TRUE;
1322         else return FALSE;
1323     }
1324     type = SCM_CAR(ast);
1325     if (SCM_INTP(type) || SCM_EQ(type, SCM_SYM_SEQ)
1326         || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1327         if (!SCM_PAIRP(SCM_CDR(ast))) return FALSE;
1328         return is_bol_anchored(SCM_CADR(ast));
1329     }
1330     if (SCM_EQ(type, SCM_SYM_ALT)) {
1331         ScmObj ap;
1332         SCM_FOR_EACH(ap, SCM_CDR(ast)) { 
1333             if (!is_bol_anchored(SCM_CAR(ap))) return FALSE;
1334         }
1335         return TRUE;
1336     }
1337     return FALSE;
1338 }
1339 
1340 /* pass 3 */
1341 static ScmObj rc3(regcomp_ctx *ctx, ScmObj ast)
1342 {
1343     /* check if ast is bol-anchored */
1344     if (is_bol_anchored(ast)) ctx->rx->flags |= SCM_REGEXP_BOL_ANCHORED;
1345 
1346     /* pass 3-1 : count # of insns */
1347     ctx->codemax = 1;
1348     ctx->emitp = FALSE;
1349     rc3_rec(ctx, ast, TRUE, TRUE);
1350     
1351     /* pass 3-2 : code generation */
1352     ctx->code = SCM_NEW_ATOMIC2(unsigned char *, ctx->codemax);
1353     ctx->emitp = TRUE;
1354     rc3_rec(ctx, ast, TRUE, TRUE);
1355     rc3_emit(ctx, RE_SUCCESS);
1356     ctx->rx->code = ctx->code;
1357     ctx->rx->numCodes = ctx->codep;
1358     return SCM_OBJ(ctx->rx);
1359 }
1360 
1361 /* For debug */
1362 #if SCM_DEBUG_HELPER
1363 void Scm_RegDump(ScmRegexp *rx)
1364 {
1365     int end = rx->numCodes, codep;
1366 
1367     Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x)\n", rx, rx->flags);
1368     Scm_Printf(SCM_CUROUT, "  must = ");
1369     if (rx->mustMatch) {
1370         Scm_Printf(SCM_CUROUT, "%S\n", rx->mustMatch);
1371     } else {
1372         Scm_Printf(SCM_CUROUT, "(none)\n");
1373     }
1374 
1375     for (codep = 0; codep < end; codep++) {
1376         switch (rx->code[codep]) {
1377         case RE_MATCH1:;
1378         case RE_MATCH1_CI:
1379             codep++;
1380             Scm_Printf(SCM_CUROUT, "%4d  %s  0x%02x  '%c'\n",
1381                        codep-1,
1382                        (rx->code[codep-1]==RE_MATCH1? "MATCH1":"MATCH1_CI"),
1383                        rx->code[codep], rx->code[codep]);
1384             continue;
1385         case RE_MATCH:;
1386         case RE_MATCH_CI:
1387             codep++;
1388             {
1389                 u_int numchars = (u_int)rx->code[codep];
1390                 int i;
1391                 Scm_Printf(SCM_CUROUT, "%4d  %s(%3d) '",
1392                            codep-1,
1393                            (rx->code[codep-1]==RE_MATCH? "MATCH":"MATCH_CI"),
1394                            numchars);
1395                 for (i=0; i< numchars; i++)
1396                     Scm_Printf(SCM_CUROUT, "%c", rx->code[++codep]);
1397                 Scm_Printf(SCM_CUROUT, "'\n");
1398             }
1399             continue;
1400         case RE_ANY:
1401             Scm_Printf(SCM_CUROUT, "%4d  ANY\n", codep);
1402             continue;
1403         case RE_TRY:
1404             codep++;
1405             Scm_Printf(SCM_CUROUT, "%4d  TRY  %d\n", codep-1,
1406                        (rx->code[codep])*256 + rx->code[codep+1]);
1407             codep++;
1408             continue;
1409         case RE_SET:
1410             codep++;
1411             Scm_Printf(SCM_CUROUT, "%4d  SET  %d    %S\n",
1412                        codep-1, rx->code[codep],
1413                        rx->sets[rx->code[codep]]);
1414             continue;
1415         case RE_NSET:
1416             codep++;
1417             Scm_Printf(SCM_CUROUT, "%4d  NSET %d    %S\n",
1418                        codep-1, rx->code[codep],
1419                        rx->sets[rx->code[codep]]);
1420             continue;
1421         case RE_SET1:
1422             codep++;
1423             Scm_Printf(SCM_CUROUT, "%4d  SET1 %d    %S\n",
1424                        codep-1, rx->code[codep],
1425                        rx->sets[rx->code[codep]]);
1426             continue;
1427         case RE_NSET1:
1428             codep++;
1429             Scm_Printf(SCM_CUROUT, "%4d  NSET1 %d    %S\n",
1430                        codep-1, rx->code[codep],
1431                        rx->sets[rx->code[codep]]);
1432             continue;
1433         case RE_JUMP:
1434             codep++;
1435             Scm_Printf(SCM_CUROUT, "%4d  JUMP %d\n", codep-1,
1436                        (rx->code[codep])*256 + rx->code[codep+1]);
1437             codep++;
1438             continue;
1439         case RE_FAIL:
1440             Scm_Printf(SCM_CUROUT, "%4d  FAIL\n", codep);
1441             continue;
1442         case RE_SUCCESS:
1443             Scm_Printf(SCM_CUROUT, "%4d  SUCCESS\n", codep);
1444             continue;
1445         case RE_BEGIN:
1446             codep++;
1447             Scm_Printf(SCM_CUROUT, "%4d  BEGIN %d\n", codep-1, rx->code[codep]);
1448             continue;
1449         case RE_END:
1450             codep++;
1451             Scm_Printf(SCM_CUROUT, "%4d  END %d\n", codep-1, rx->code[codep]);
1452             continue;
1453         case RE_BOL:
1454             Scm_Printf(SCM_CUROUT, "%4d  BOL\n", codep);
1455             continue;
1456         case RE_EOL:
1457             Scm_Printf(SCM_CUROUT, "%4d  EOL\n", codep);
1458             continue;
1459         case RE_WB:
1460             Scm_Printf(SCM_CUROUT, "%4d  WB\n", codep);
1461             continue;
1462         case RE_NWB:
1463             Scm_Printf(SCM_CUROUT, "%4d  NWB\n", codep);
1464             continue;
1465         case RE_MATCH1B:
1466             Scm_Printf(SCM_CUROUT, "%4d  MATCH1B %02x '%c', %d\n",
1467                        codep, rx->code[codep+1], rx->code[codep+1],
1468                        rx->code[codep+2]*256+rx->code[codep+3]);
1469             codep += 3;
1470             continue;
1471         case RE_SET1R:
1472             codep++;
1473             Scm_Printf(SCM_CUROUT, "%4d  SET1R %d   %S\n",
1474                        codep-1, rx->code[codep],
1475                        rx->sets[rx->code[codep]]);
1476             continue;
1477         case RE_NSET1R:
1478             codep++;
1479             Scm_Printf(SCM_CUROUT, "%4d  NSET1R %d  %S\n",
1480                        codep-1, rx->code[codep],
1481                        rx->sets[rx->code[codep]]);
1482             continue;
1483         case RE_SETR:
1484             codep++;
1485             Scm_Printf(SCM_CUROUT, "%4d  SETR %d    %S\n",
1486                        codep-1, rx->code[codep],
1487                        rx->sets[rx->code[codep]]);
1488             continue;
1489         case RE_NSETR:
1490             codep++;
1491             Scm_Printf(SCM_CUROUT, "%4d  NSETR %d   %S\n",
1492                        codep-1, rx->code[codep],
1493                        rx->sets[rx->code[codep]]);
1494             continue;
1495         case RE_ASSERT:
1496             codep++;
1497             Scm_Printf(SCM_CUROUT, "%4d  ASSERT %d\n", codep-1,
1498                        (rx->code[codep])*256 + rx->code[codep+1]);
1499             codep++;
1500             continue;            
1501         case RE_NASSERT:
1502             codep++;
1503             Scm_Printf(SCM_CUROUT, "%4d  NASSERT %d\n", codep-1,
1504                        (rx->code[codep])*256 + rx->code[codep+1]);
1505             codep++;
1506             continue;            
1507         default:
1508             Scm_Error("regexp screwed up\n");
1509         }
1510     }
1511 }
1512 #endif /* SCM_DEBUG_HELPER */
1513 
1514 /* Helper routine to be used for compilation from AST.
1515    Traverses AST to reorder groups and collect charsets.
1516    Note that the native regcomp path doesn't use these fns.
1517    Only the AST provided from outside is processed. */
1518 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq);
1519 
1520 static ScmObj rc_setup_context(regcomp_ctx *ctx, ScmObj ast)
1521 {
1522     ScmObj type, rest;
1523     if (!SCM_PAIRP(ast)) {
1524         if (SCM_CHARP(ast)) return ast;
1525         if (SCM_CHARSETP(ast)) {
1526             rc_register_charset(ctx, SCM_CHARSET(ast));
1527             return ast;
1528         }
1529         if (SCM_EQ(ast, SCM_SYM_BOL) || SCM_EQ(ast, SCM_SYM_EOL)
1530             || SCM_EQ(ast, SCM_SYM_WB) || SCM_EQ(ast, SCM_SYM_NWB)
1531             || SCM_EQ(ast, SCM_SYM_ANY)) {
1532             return ast;
1533         }
1534         goto badast;
1535     }
1536     type = SCM_CAR(ast);
1537     if (SCM_INTP(type)) {
1538         int grpno = ctx->grpcount++;
1539         rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
1540         if (SCM_INT_VALUE(type) == grpno && SCM_EQ(SCM_CDR(ast), rest)) {
1541             return ast;
1542         } else {
1543             return Scm_Cons(SCM_MAKE_INT(grpno), rest);
1544         }
1545     }
1546     if (SCM_EQ(type, SCM_SYM_COMP)) {
1547         if (!SCM_CHARSETP(SCM_CDR(ast))) goto badast;
1548         rc_register_charset(ctx, SCM_CHARSET(SCM_CDR(ast)));
1549         return ast;
1550     }
1551     if (SCM_EQ(type, SCM_SYM_SEQ) || SCM_EQ(type, SCM_SYM_ALT)
1552         || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)
1553         || SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)
1554         || SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)
1555         || SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1556         rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
1557         if (SCM_EQ(SCM_CDR(ast), rest)) return ast;
1558         else return Scm_Cons(type, rest);
1559     }
1560     if (SCM_EQ(type, SCM_SYM_REP_BOUND) || SCM_EQ(type, SCM_SYM_REP_BOUND_MIN)) {
1561         if (!SCM_PAIRP(SCM_CDR(ast)) || !SCM_INTP(SCM_CADR(ast))
1562             || SCM_INT_VALUE(SCM_CADR(ast)) < 0) {
1563             goto badast;
1564         }
1565         rest = rc_setup_context_seq(ctx, SCM_CDDR(ast));
1566         if (SCM_EQ(SCM_CDDR(ast), rest)) return ast;
1567         else return Scm_Cons(type, Scm_Cons(SCM_CADR(ast), rest));
1568     }
1569   badast:
1570     Scm_Error("invalid regexp AST: %S", ast);
1571     return SCM_UNDEFINED;       /* dummy */
1572 }
1573 
1574 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq) 
1575 {
1576     ScmObj sp, sp2, obj, head = SCM_NIL, tail = SCM_NIL;
1577     SCM_FOR_EACH(sp, seq) {
1578         obj = rc_setup_context(ctx, SCM_CAR(sp));
1579         if (!SCM_EQ(obj, SCM_CAR(sp))) break;
1580     }
1581     if (SCM_NULLP(sp)) return seq;
1582     /* we need to copy the spine */
1583     SCM_FOR_EACH(sp2, seq) {
1584         if (SCM_EQ(sp2, sp)) break;
1585         SCM_APPEND1(head, tail, SCM_CAR(sp2));
1586     }
1587     SCM_FOR_EACH(sp2, sp2) {
1588         SCM_APPEND1(head, tail, rc_setup_context(ctx, SCM_CAR(sp2)));
1589     }
1590     return head;
1591 }
1592 
1593 /*--------------------------------------------------------------
1594  * Compiler entry point
1595  */
1596 ScmObj Scm_RegComp(ScmString *pattern, int flags)
1597 {
1598     ScmRegexp *rx = make_regexp();
1599     ScmObj ast;
1600     regcomp_ctx cctx;
1601     
1602     if (SCM_STRING_INCOMPLETE_P(pattern)) {
1603         Scm_Error("incomplete string is not allowed: %S", pattern);
1604     }
1605     rx->pattern = SCM_STRING(Scm_CopyStringWithFlags(pattern, 
1606                                                      SCM_STRING_IMMUTABLE,
1607                                                      SCM_STRING_IMMUTABLE));
1608     rc_ctx_init(&cctx, rx);
1609     cctx.casefoldp = flags & SCM_REGEXP_CASE_FOLD;
1610     rx->flags |= (flags & SCM_REGEXP_CASE_FOLD);
1611 
1612     /* pass 1 : parse regexp spec */
1613     ast = rc1(&cctx);
1614     rc_setup_charsets(rx, &cctx);
1615     if (flags & SCM_REGEXP_PARSE_ONLY) return ast;
1616 
1617     /* pass 2 : optimization */
1618     ast = rc2_optimize(ast, SCM_NIL);
1619 
1620     /* pass 3 : generate bytecode */
1621     return rc3(&cctx, ast);
1622 }
1623 
1624 /* alternative entry that compiles from AST */
1625 ScmObj Scm_RegCompFromAST(ScmObj ast)
1626 {
1627     ScmRegexp *rx = make_regexp();
1628     regcomp_ctx cctx;
1629     rc_ctx_init(&cctx, rx);
1630 
1631     /* prepare some context */
1632     if (!SCM_PAIRP(ast) || !SCM_INTP(SCM_CAR(ast))) {
1633         /* ensure the entire AST is in a group #0 */
1634         ast = SCM_LIST2(SCM_MAKE_INT(0), ast);
1635     }
1636     ast = rc_setup_context(&cctx, ast);
1637     rc_setup_charsets(rx, &cctx);
1638     rx->numGroups = cctx.grpcount+1;
1639     
1640     /* pass 3 */
1641     return rc3(&cctx, ast);
1642 }
1643 
1644 /*=======================================================================
1645  * Matcher
1646  */
1647 
1648 /* For now, I use C-stack directly to keep information for backtrack,
1649  * i.e. anytime I should try something I recursively call rex_rec().
1650  * It may run out the stack space if regexp requires deep recursion.
1651  *
1652  * Rex_rec doesn't return as long as match succeeds.  At the end of
1653  * code, it longjmp's to the start of matcher.
1654  *
1655  * My preliminary test showed that using C-stack & longjmp is faster than
1656  * allocating and maintaining the stack by myself.   Further test is required
1657  * for practical case, though.
1658  */
1659 
1660 struct match_list {
1661     struct match_list *next;
1662     int grpnum;
1663     const char *ptr;
1664 };
1665 
1666 struct match_ctx {
1667     ScmRegexp *rx;
1668     const unsigned char *codehead; /* start of code */
1669     const char *input;          /* start of input */
1670     const char *stop;           /* end of input */
1671     const char *last;
1672     struct match_list *matches;
1673     void *begin_stack;          /* C stack pointer the match began from. */
1674     sigjmp_buf *cont;
1675 };
1676 
1677 #define MAX_STACK_USAGE   0x100000
1678 
1679 static struct match_list *push_match(struct match_list *mlist,
1680                                             int grpnum, const char *ptr)
1681 {
1682     struct match_list *elt = SCM_NEW(struct match_list);
1683     elt->next = mlist;
1684     elt->grpnum = grpnum;
1685     elt->ptr = ptr;
1686     return elt;
1687 }
1688 
1689 static int match_ci(const char **input, const unsigned char **code, int length)
1690 {
1691     unsigned char inch, c;
1692     int csize, i;
1693     do {
1694         inch = *(*input)++;
1695         c = *(*code)++;
1696         if ((csize = SCM_CHAR_NFOLLOWS(inch)) == 0) {
1697             if (c != SCM_CHAR_DOWNCASE(inch)) return FALSE;
1698         } else {
1699             if (c != inch) return FALSE;
1700             for (i=0; i<csize; i++) {
1701                 if ((unsigned char)*(*code)++ != (unsigned char)*(*input)++)
1702                     return FALSE;
1703             }
1704         }
1705         length -= (csize+1);
1706     } while (length > 0);
1707     return TRUE;
1708 }
1709 
1710 /* Check if input points to the word boundary.  For now, I consider
1711    all multibyte chars word-constituent. */
1712 static int is_word_constituent(unsigned char b)
1713 {
1714     if (b >= 128) return TRUE;
1715     if (b >= '0' && b <= '9') return TRUE;
1716     if (b >= 'A' && b <= 'Z') return TRUE;
1717     if (b >= 'a' && b <= 'z') return TRUE;
1718     return FALSE;
1719 }
1720 
1721 static int is_word_boundary(struct match_ctx *ctx, const char *input)
1722 {
1723     unsigned char nextb, prevb;
1724     const char *prevp;
1725     
1726     if (input == ctx->input || input == ctx->stop) return TRUE;
1727     nextb = (unsigned char)*input;
1728     SCM_CHAR_BACKWARD(input, ctx->input, prevp);
1729     SCM_ASSERT(prevp != NULL);
1730     prevb = (unsigned char)*prevp;
1731     if ((is_word_constituent(nextb) && !is_word_constituent(prevb))
1732         || (!is_word_constituent(nextb) && is_word_constituent(prevb))) {
1733         return TRUE;
1734     }
1735     return FALSE;
1736 }
1737 
1738 static void rex_rec(const unsigned char *code,
1739                     const char *input,
1740                     struct match_ctx *ctx,                 
1741                     struct match_list *mlist)
1742 {
1743     register int param;
1744     register ScmChar ch;
1745     ScmCharSet *cset;
1746 
1747     /* TODO: here we assume C-stack grows downward; need to check by
1748        configure */
1749     if ((char*)&cset < (char*)ctx->begin_stack - MAX_STACK_USAGE) {
1750         Scm_Error("stack overrun during matching regexp %S", ctx->rx);
1751     }
1752     
1753     for (;;) {
1754         switch(*code++) {
1755         case RE_MATCH:
1756             param = *code++;
1757             if (ctx->stop - input < param) return;
1758             while (param-- > 0) {
1759                 if (*code++ != (unsigned char)*input++) return;
1760             }
1761             continue;
1762         case RE_MATCH1:
1763             if (ctx->stop == input) return;
1764             if (*code++ != (unsigned char)*input++) return;
1765             continue;
1766         case RE_MATCH_CI:
1767             param = *code++;
1768             if (ctx->stop - input < param) return;
1769             if (!match_ci(&input, &code, param)) return;
1770             continue;
1771         case RE_MATCH1_CI:
1772             if (ctx->stop == input) return;
1773             param  = (unsigned char)*input++;
1774             if (SCM_CHAR_NFOLLOWS(param)!=0
1775                 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
1776                 return;
1777             }
1778             continue;
1779         case RE_ANY:
1780             if (ctx->stop == input) return;
1781             input += SCM_CHAR_NFOLLOWS(*input) + 1;
1782             continue;
1783         case RE_TRY:
1784             rex_rec(code+2, input, ctx, mlist);
1785             code = ctx->codehead + code[0]*256 + code[1];
1786             continue;
1787         case RE_JUMP:
1788             code = ctx->codehead + code[0]*256 + code[1];
1789             continue;
1790         case RE_SET1:
1791             if (ctx->stop == input) return;
1792             if ((unsigned char)*input >= 128) return;
1793             if (!Scm_CharSetContains(ctx->rx->sets[*code++], *input)) return;
1794             input++;
1795             continue;
1796         case RE_NSET1:
1797             if (ctx->stop == input) return;
1798             if ((unsigned char)*input < 128) {
1799                 if (Scm_CharSetContains(ctx->rx->sets[*code++], *input))
1800                     return;
1801                 input++;
1802             } else {
1803                 code++;
1804                 input += SCM_CHAR_NFOLLOWS((unsigned char)*input) + 1;
1805             }
1806             continue;
1807         case RE_SET:
1808             if (ctx->stop == input) return;
1809             SCM_CHAR_GET(input, ch);
1810             cset = ctx->rx->sets[*code++];
1811             if (!Scm_CharSetContains(cset, ch)) return;
1812             input += SCM_CHAR_NBYTES(ch);
1813             continue;
1814         case RE_NSET:
1815             if (ctx->stop == input) return;
1816             SCM_CHAR_GET(input, ch);
1817             cset = ctx->rx->sets[*code++];
1818             if (Scm_CharSetContains(cset, ch)) return;
1819             input += SCM_CHAR_NBYTES(ch);
1820             continue;
1821         case RE_BEGIN:
1822             mlist = push_match(mlist, *code++, input);
1823             continue;
1824         case RE_END:
1825             mlist = push_match(mlist, -(*code++), input);
1826             continue;
1827         case RE_BOL:
1828             if (input != ctx->input) return;
1829             continue;
1830         case RE_EOL:
1831             if (input != ctx->stop) return;
1832             continue;
1833         case RE_WB:
1834             if (!is_word_boundary(ctx, input)) return;
1835             continue;
1836         case RE_NWB:
1837             if (is_word_boundary(ctx, input)) return;
1838             continue;
1839         case RE_SUCCESS:
1840             ctx->last = input;
1841             ctx->matches = mlist;
1842             siglongjmp(*ctx->cont, 1);
1843             /*NOTREACHED*/
1844         case RE_FAIL:
1845             return;
1846         case RE_SET1R:
1847             cset = ctx->rx->sets[*code++];
1848             for (;;) {
1849                 if (ctx->stop <= input) break;
1850                 if ((unsigned char)*input >= 128) break;
1851                 if (!Scm_CharSetContains(cset, *input)) break;
1852                 input++;
1853             }
1854             continue;
1855         case RE_NSET1R:
1856             cset = ctx->rx->sets[*code++];
1857             for (;;) {
1858                 if (ctx->stop <= input) break;
1859                 if ((unsigned char)*input < 128 ) {
1860                     if (Scm_CharSetContains(cset, *input)) break;
1861                     input++;
1862                 } else {
1863                     input+=SCM_CHAR_NFOLLOWS(*input)+1;
1864                 }
1865             }
1866             continue;
1867         case RE_SETR:
1868             cset = ctx->rx->sets[*code++];
1869             for (;;) {
1870                 if (ctx->stop <= input) break;
1871                 SCM_CHAR_GET(input, ch);
1872                 if (!Scm_CharSetContains(cset, ch)) break;
1873                 input += SCM_CHAR_NBYTES(ch);
1874             }
1875             continue;
1876         case RE_NSETR:
1877             cset = ctx->rx->sets[*code++];
1878             for (;;) {
1879                 if (ctx->stop <= input) break;
1880                 SCM_CHAR_GET(input, ch);
1881                 if (Scm_CharSetContains(cset, ch)) break;
1882                 input += SCM_CHAR_NBYTES(ch);
1883             }
1884             continue;
1885         case RE_ASSERT: {
1886             sigjmp_buf cont, *ocont = ctx->cont;
1887             ctx->cont = &cont;
1888             if (sigsetjmp(cont, FALSE) == 0) {
1889                 rex_rec(code+2, input, ctx, mlist);
1890                 ctx->cont = ocont;
1891                 return;
1892             }
1893             code = ctx->codehead + code[0]*256 + code[1];
1894             ctx->cont = ocont;
1895             mlist = ctx->matches;
1896             continue;
1897         }
1898         case RE_NASSERT: {
1899             sigjmp_buf cont, *ocont = ctx->cont;
1900             ctx->cont = &cont;
1901             if (sigsetjmp(cont, FALSE) == 0) {
1902                 rex_rec(code+2, input, ctx, mlist);
1903                 code = ctx->codehead + code[0]*256 + code[1];
1904                 ctx->cont = ocont;
1905                 continue;
1906             }
1907             ctx->cont = ocont;
1908             return;
1909         }
1910         default:
1911             /* shouldn't be here */
1912             Scm_Error("regexp implementation seems broken\n");
1913         }
1914     }
1915 }
1916 
1917 static ScmObj make_match(ScmRegexp *rx, ScmString *orig,
1918                          struct match_ctx *ctx)
1919 {
1920     int i;
1921     struct match_list *ml;
1922     ScmRegMatch *rm = SCM_NEW(ScmRegMatch);
1923     const ScmStringBody *origb;
1924     SCM_SET_CLASS(rm, SCM_CLASS_REGMATCH);
1925     rm->numMatches = rx->numGroups;
1926     rm->matches = SCM_NEW_ARRAY(struct ScmRegMatchSub, rx->numGroups);
1927     /* we keep information of original string separately, instead of
1928        keeping a pointer to orig; For orig may be destructively modified,
1929        but its elements are not. */
1930     origb = SCM_STRING_BODY(orig);
1931     rm->input = SCM_STRING_BODY_START(origb);
1932     rm->inputLen = SCM_STRING_BODY_LENGTH(origb);
1933     rm->inputSize = SCM_STRING_BODY_SIZE(origb);
1934     for (i=0; i<rx->numGroups; i++) {
1935         rm->matches[i].start = -1;
1936         rm->matches[i].length = -1;
1937         rm->matches[i].startp = NULL;
1938         rm->matches[i].endp = NULL;
1939     }
1940 
1941     rm->matches[0].endp = ctx->last;
1942     /* scan through match result */
1943     for (ml = ctx->matches; ml; ml = ml->next) {
1944         if (ml->grpnum >= 0) {
1945             rm->matches[ml->grpnum].startp = ml->ptr;
1946         } else {
1947             rm->matches[-ml->grpnum].endp = ml->ptr;
1948         }
1949     }
1950 
1951     /* sanity check (not necessary, but for now...) */
1952     for (i=0; i<rx->numGroups; i++) {
1953         if ((rm->matches[i].startp && !rm->matches[i].endp)
1954             || (!rm->matches[i].startp && rm->matches[i].endp)) {
1955             Scm_Panic("implementation error: discrepancy in regexp match #%d!", i);
1956         }
1957     }
1958     return SCM_OBJ(rm);
1959 }
1960 
1961 static ScmObj rex(ScmRegexp *rx, ScmString *orig,
1962                   const char *start, const char *end)
1963 {
1964     struct match_ctx ctx;
1965     sigjmp_buf cont;
1966     ctx.rx = rx;
1967     ctx.codehead = rx->code;
1968     ctx.input = SCM_STRING_BODY_START(SCM_STRING_BODY(orig));
1969     ctx.stop = end;
1970     ctx.matches = NULL;
1971     ctx.begin_stack = (void*)&ctx;
1972     ctx.cont = &cont;
1973 
1974     if (sigsetjmp(cont, FALSE) == 0) {
1975         rex_rec(ctx.codehead, start, &ctx, NULL);
1976         return SCM_FALSE;
1977     } else {
1978         return make_match(rx, orig, &ctx);
1979     }
1980 }
1981 
1982 /*----------------------------------------------------------------------
1983  * entry point
1984  */
1985 ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str)
1986 {
1987     const ScmStringBody *b = SCM_STRING_BODY(str);
1988     const char *start = SCM_STRING_BODY_START(b);
1989     const char *end = start + SCM_STRING_BODY_SIZE(b);
1990     const ScmStringBody *mb = rx->mustMatch? SCM_STRING_BODY(rx->mustMatch) : NULL;
1991     int mustMatchLen = mb? SCM_STRING_BODY_SIZE(mb) : 0;
1992 
1993     if (SCM_STRING_INCOMPLETE_P(str)) {
1994         Scm_Error("incomplete string is not allowed: %S", str);
1995     }
1996 #if 0
1997     /* Disabled for now; we need to use more heuristics to determine
1998        when we should apply mustMatch.  For example, if the regexp
1999        begins with BOL assertion and constant string, then it would be
2000        faster to go for rex directly. */
2001     if (rx->mustMatch) {
2002         /* Prescreening.  If the input string doesn't contain mustMatch
2003            string, it can't match the entire expression. */
2004         if (SCM_FALSEP(Scm_StringScan(str, rx->mustMatch,
2005                                       SCM_STRING_SCAN_INDEX))) {
2006             return SCM_FALSE;
2007         }
2008     }
2009 #endif
2010     /* short cut : if rx matches only at the beginning of the string,
2011        we only run from the beginning of the string */
2012     if (rx->flags & SCM_REGEXP_BOL_ANCHORED) {
2013         return rex(rx, str, start, end);
2014     }
2015     /* normal matching */
2016     while (start <= end-mustMatchLen) {
2017         ScmObj r = rex(rx, str, start, end);
2018         if (!SCM_FALSEP(r)) return r;
2019         start += SCM_CHAR_NFOLLOWS(*start)+1;
2020     }
2021     return SCM_FALSE;
2022 }
2023 
2024 /*=======================================================================
2025  * Retrieving matches
2026  */
2027 
2028 /* TODO: MT Warning: these retrival functions change match object's     
2029  * internal state.
2030  */
2031 ScmObj Scm_RegMatchSubstr(ScmRegMatch *rm, int i)
2032 {
2033     struct ScmRegMatchSub *sub;
2034     if (i < 0 || i >= rm->numMatches)
2035         Scm_Error("submatch index out of range: %d", i);
2036     sub = &rm->matches[i];
2037     if (sub->startp == NULL) {
2038         return SCM_FALSE;
2039     } else if (sub->length >= 0) {
2040         return Scm_MakeString(sub->startp, sub->endp - sub->startp,
2041                               sub->length, 0);
2042     } else {
2043         ScmObj s = Scm_MakeString(sub->startp, sub->endp - sub->startp, -1, 0);
2044         sub->length = SCM_STRING_BODY_LENGTH(SCM_STRING_BODY(s));
2045         return s;
2046     }
2047 }
2048 
2049 ScmObj Scm_RegMatchStart(ScmRegMatch *rm, int i)
2050 {
2051     struct ScmRegMatchSub *sub;
2052     if (i < 0 || i >= rm->numMatches)
2053         Scm_Error("submatch index out of range: %d", i);
2054     sub = &rm->matches[i];
2055     if (sub->startp == NULL) {
2056         return SCM_FALSE;
2057     } else if (sub->start < 0) {
2058         sub->start = Scm_MBLen(rm->input, sub->startp);
2059     }
2060     return Scm_MakeInteger(sub->start);
2061 }
2062 
2063 ScmObj Scm_RegMatchEnd(ScmRegMatch *rm, int i)
2064 {
2065     struct ScmRegMatchSub *sub;
2066     if (i < 0 || i >= rm->numMatches)
2067         Scm_Error("submatch index out of range: %d", i);
2068     sub = &rm->matches[i];
2069     if (sub->startp == NULL) {
2070         return SCM_FALSE;
2071     } else if (sub->start < 0) {
2072         sub->start = Scm_MBLen(rm->input, sub->startp);
2073     }
2074     if (sub->length < 0) {
2075         sub->length = Scm_MBLen(sub->startp, sub->endp);
2076     }
2077     return Scm_MakeInteger(sub->start + sub->length);
2078 }
2079 
2080 ScmObj Scm_RegMatchBefore(ScmRegMatch *rm, int i)
2081 {
2082     struct ScmRegMatchSub *sub;
2083     if (i < 0 || i >= rm->numMatches)
2084         Scm_Error("submatch index out of range: %d", i);
2085     sub = &rm->matches[i];
2086     if (sub->startp == NULL) return SCM_FALSE;
2087     return Scm_MakeString(rm->input, sub->startp - rm->input, -1, 0);
2088 }
2089 
2090 ScmObj Scm_RegMatchAfter(ScmRegMatch *rm, int i)
2091 {
2092     struct ScmRegMatchSub *sub;
2093     if (i < 0 || i >= rm->numMatches)
2094         Scm_Error("submatch index out of range: %d", i);
2095     sub = &rm->matches[i];
2096     if (sub->startp == NULL) return SCM_FALSE;
2097     return Scm_MakeString(sub->endp,
2098                           rm->input + rm->inputSize - sub->endp, -1, 0);
2099 }
2100 
2101 /* for debug */
2102 #if SCM_DEBUG_HELPER
2103 void Scm_RegMatchDump(ScmRegMatch *rm)
2104 {
2105     int i;
2106     
2107     Scm_Printf(SCM_CUROUT, "RegMatch %p\n", rm);
2108     Scm_Printf(SCM_CUROUT, "  numMatches = %d\n", rm->numMatches);
2109     Scm_Printf(SCM_CUROUT, "  input = %S\n", rm->input);
2110     for (i=0; i<rm->numMatches; i++) {
2111         struct ScmRegMatchSub *sub = &rm->matches[i];
2112         if (sub->startp) {
2113             Scm_Printf(SCM_CUROUT, "[%3d-%3d]  %S\n",
2114                        sub->startp - rm->input,
2115                        sub->endp - rm->input,
2116                        Scm_MakeString(sub->startp, sub->endp-sub->startp,
2117                                       -1, 0));
2118         } else {
2119             Scm_Printf(SCM_CUROUT, "[---] #f\n");
2120         }
2121     }
2122 }
2123 #endif /*SCM_DEBUG_HELPER*/
2124 
2125 /*=======================================================================
2126  * Initializing stuff
2127  */
2128 
2129 void Scm__InitRegexp(void)
2130 {
2131 }

/* [<][>][^][v][top][bottom][index][help] */