/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes following definitions.
- make_regexp
- regexp_print
- regexp_compare
- regcomp_ctx
- rc_ctx_init
- rc1_lex
- rc1_lex_xdigits
- rc1_maybe_lazy
- rc1_lex_open_paren
- rc1_lex_minmax
- rc1_fold_alts
- rc1_parse
- rc1
- rc_charset
- rc_register_charset
- rc_setup_charsets
- rc2_optimize_seq
- rc2_optimize
- is_distinct
- Scm_RegOptimizeAST
- rc3_charset_index
- rc3_emit
- rc3_emit_offset
- rc3_fill_offset
- rc3_seq
- rc3_rec
- is_bol_anchored
- rc3
- Scm_RegDump
- rc_setup_context
- rc_setup_context_seq
- Scm_RegComp
- Scm_RegCompFromAST
- push_match
- match_ci
- is_word_constituent
- is_word_boundary
- rex_rec
- make_match
- rex
- Scm_RegExec
- Scm_RegMatchSubstr
- Scm_RegMatchStart
- Scm_RegMatchEnd
- Scm_RegMatchBefore
- Scm_RegMatchAfter
- Scm_RegMatchDump
- Scm__InitRegexp
1 /*
2 * regexp.c - regular expression
3 *
4 * Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: regexp.c,v 1.56 2005/11/02 10:49:19 shirok Exp $
34 */
35
36 #include <setjmp.h>
37 #include <ctype.h>
38 #define LIBGAUCHE_BODY
39 #include "gauche.h"
40 #include "gauche/class.h"
41 #include "gauche/builtin-syms.h"
42
43 /* I don't like to reinvent wheels, so I looked for a regexp implementation
44 * that can handle multibyte encodings and not bound to Unicode.
45 * Without assuming Unicode it'll be difficult to define character classes
46 * correctly, but there are domains that you don't want to do native
47 * charset <-> UTF-8 each time for regexp match, trading correctness of
48 * character classes.
49 *
50 * The most recent version of famous Henry Spencer's regex is found in Tcl
51 * 8.3, that supports wide characters (the state machine seems to work
52 * with UCS-4, but the internal tables seem to be set up for UCS-2 only).
53 * Tcl does UTF-8 <-> UCS-2 conversion in order to do regexp match.
54 *
55 * Lots of variants of Spencer's old regex code is floating around, such
56 * as http://arglist.com/regex/ and the one in BSD. They don't support
57 * multibyte strings, as far as I know.
58 *
59 * Another popular package is PCRE. PCRE 3.4 has UTF-8 support, but only
60 * experimentally.
61 *
62 * None seems to satisfy my criteria.
63 *
64 * So I reluctantly started to write my own. I don't think I can beat
65 * those guys, and am willing to grab someone's code anytime if it's suitable
66 * for my purpose and under a license like BSD one.
67 */
68
69 /*
70 * The idea here is to match string without converting mb <-> char as
71 * much as possible. Actually, the converion is done only when we see
72 * large character sets.
73 *
74 * The engine is a sort of NFA, by keeping state information for backtrack
75 * in C stack. It'll bust the C stack if you try to match something like
76 * (..)* with a long input string (there's a code to check the stack size
77 * and aborts matching when the recursion goes too deep).
78 * A possible fix is to check if recursion level exceeds some limit,
79 * then save the C stack into heap (as in the C-stack-copying continuation
80 * does) and reuse the stack area.
81 */
82
83 /* Instructions */
84 enum {
85 RE_MATCH1, /* followed by 1 byte to match */
86 RE_MATCH, /* followed by length, and bytes to match */
87 RE_MATCH1_CI, /* case insenstive match */
88 RE_MATCH_CI, /* case insenstive match */
89 RE_ANY, /* match any char */
90 RE_TRY, /* followed by offset (2 bytes). try matching
91 the following sequence, and if fails,
92 jump to offset. */
93 RE_SET, /* followed by charset #. match any char in
94 the charset. */
95 RE_NSET, /* followed by charset #. mathc any char but
96 in the charset */
97 RE_SET1, /* followed by charset #. match any char in
98 the charset. guaranteed that the charset
99 holds only range 0-127 */
100 RE_NSET1, /* followed by charset #. match any char
101 but the ones in the charset. guaranteed
102 that the charset holds only range 0-127. */
103 RE_JUMP, /* followed by offset (2 bytes). jump to that
104 bytecode. */
105 RE_FAIL, /* fail */
106 RE_SUCCESS, /* success */
107 RE_BEGIN, /* followed by a group number. start the
108 group. */
109 RE_END, /* followed by a group number. end the
110 group. */
111 RE_BOL, /* beginning of line assertion */
112 RE_EOL, /* end of line assertion */
113 RE_WB, /* word boundary assertion */
114 RE_NWB, /* negative word boundary assertion */
115 RE_ASSERT, /* positive look-ahead assertion. followed by
116 offset (2 bytes). */
117 RE_NASSERT, /* negative look-ahead assertion. followed by
118 * offset (2 bytes). */
119 /* The following instructions are not necessary to implement the basic
120 engine, but used in the optimized code */
121 RE_MATCH1B, /* (match 1 byte or branch)
122 followed by a byte, and offset.
123 if the next byte matches the input,
124 proceed. otherwise, jump to the offset. */
125 RE_SET1R, /* (1-byte set match repeat)
126 followed by charset #. Consumes all input
127 that matches the given set. */
128 RE_NSET1R, /* (1-byte negative set match repeat)
129 followed by charset #. Consumes all input
130 that don't match the given set. */
131 RE_SETR, /* (set match repeat)
132 followed by charset #. Consumes all input
133 that matches the given set. */
134 RE_NSETR, /* (negative set match repeat)
135 followed by charset #. Consumes all input
136 that don't match the given set. */
137 RE_NUM_INSN
138 };
139
140 /* maximum # of {n,m}-type limited repeat count */
141 #define MAX_LIMITED_REPEAT 255
142
143 /* internal regexp flag. */
144 #define SCM_REGEXP_BOL_ANCHORED (1L<<2)
145
146 /* AST - the first pass of regexp compiler creates intermediate AST.
147 * Alternatively, you can provide AST directly to the regexp compiler,
148 * using Scm_RegCompFromAST().
149 *
150 * <ast> : (<element> ...)
151 *
152 * <element> : <clause> ; special clause
153 * | <item> ; matches <item>
154 *
155 * <item> : <char> ; matches char
156 * | <char-set> ; matches char set
157 * | (comp . <char-set>) ; matches complement of char set
158 * | any ; matches any char
159 * | bol | eol ; beginning/end of line assertion
160 * | wb | nwb ; word-boundary/negative word boundary assertion
161 *
162 * <clause> : (seq . <ast>) ; sequence
163 * | (seq-uncase . <ast>) ; sequence (case insensitive match)
164 * | (seq-case . <ast>) ; sequence (case sensitive match)
165 * | (alt . <ast>) ; alternative
166 * | (rep . <ast>) ; 0 or more repetition of <ast> (greedy)
167 * | (rep-min . <ast>) ; 0 or more repetition of <ast> (lazy)
168 * | (rep-bound <n> . <ast>) ; repetition up to <n> (greedy)
169 * | (rep-bound-min <n> . <ast>) ; repetition up to <n> (lazy)
170 * | (rep-while . <ast>) ; like rep, but no backtrack
171 * | (<integer> . <ast>) ; capturing group
172 * | (assert . <ast>) ; positive look-ahead assertion
173 * | (nassert . <ast>) ; negative look-ahead assertion
174 *
175 * For seq-uncase, items inside <ast> has to be prepared for case-insensitive
176 * match, i.e. chars have to be downcased and char-sets have to be
177 * case-folded.
178 */
179
180 static void regexp_print(ScmObj obj, ScmPort *port, ScmWriteContext *c);
181 static int regexp_compare(ScmObj x, ScmObj y, int equalp);
182
183 SCM_DEFINE_BUILTIN_CLASS(Scm_RegexpClass,
184 regexp_print, regexp_compare, NULL, NULL,
185 SCM_CLASS_DEFAULT_CPL);
186 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_RegMatchClass, NULL);
187
188 static ScmRegexp *make_regexp(void)
189 {
190 ScmRegexp *rx = SCM_NEW(ScmRegexp);
191 SCM_SET_CLASS(rx, SCM_CLASS_REGEXP);
192 rx->code = NULL;
193 rx->numCodes = 0;
194 rx->numGroups = 0;
195 rx->numSets = 0;
196 rx->sets = NULL;
197 rx->mustMatch = NULL;
198 rx->flags = 0;
199 rx->pattern = NULL;
200 return rx;
201 }
202
203 static void regexp_print(ScmObj rx, ScmPort *out, ScmWriteContext *ctx)
204 {
205 if (SCM_REGEXP(rx)->pattern) {
206 Scm_Printf(out, "#/%A/", SCM_REGEXP(rx)->pattern);
207 } else {
208 /* fail safe */
209 Scm_Printf(out, "#<regexp %p>", rx);
210 }
211 }
212
213 static int regexp_compare(ScmObj x, ScmObj y, int equalp)
214 {
215 if (!equalp) {
216 Scm_Error("cannot compare regexps: %S and %S", x, y);
217 }
218 return !(SCM_REGEXP(x)->pattern
219 && SCM_REGEXP(y)->pattern
220 && Scm_StringEqual(SCM_STRING(SCM_REGEXP(x)->pattern),
221 SCM_STRING(SCM_REGEXP(y)->pattern))
222 && ((SCM_REGEXP(x)->flags&SCM_REGEXP_CASE_FOLD)
223 == (SCM_REGEXP(y)->flags&SCM_REGEXP_CASE_FOLD)));
224 }
225
226 #ifndef CHAR_MAX
227 #define CHAR_MAX 256
228 #endif
229
230 #define REGEXP_OFFSET_MAX 65535
231
232 /*=======================================================================
233 * Compiler
234 */
235
236 /* 3-pass compiler.
237 *
238 * pass 1: parses the pattern and creates an AST.
239 * pass 2: optimize on AST.
240 * pass 3: byte code generation.
241 */
242
243 /* compiler state information */
244 typedef struct regcomp_ctx_rec {
245 ScmRegexp *rx; /* the building regexp */
246 ScmString *pattern; /* original pattern */
247 int casefoldp; /* TRUE if case-folding match */
248 ScmPort *ipat; /* [pass1] string port for pattern */
249 ScmObj sets; /* [pass1] list of charsets */
250 int grpcount; /* [pass1] group count */
251 unsigned char *code; /* [pass3] code being built */
252 int codep; /* [pass3] front of code generation */
253 int emitp; /* [pass3] am I generating code? */
254 int codemax; /* [pass3] max codep */
255 } regcomp_ctx;
256
257 static void rc_ctx_init(regcomp_ctx *ctx, ScmRegexp *rx)
258 {
259 ctx->rx = rx;
260 ctx->pattern = rx->pattern;
261 ctx->casefoldp = FALSE;
262 if (rx->pattern) {
263 ctx->ipat = SCM_PORT(Scm_MakeInputStringPort(rx->pattern, FALSE));
264 } else {
265 ctx->ipat = NULL;
266 }
267 ctx->sets = SCM_NIL;
268 ctx->grpcount = 0;
269 ctx->code = NULL;
270 ctx->codep = 0;
271 ctx->emitp = FALSE;
272 ctx->codemax = 1;
273 }
274
275 static ScmObj rc_charset(regcomp_ctx *ctx);
276 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs);
277 static ScmObj rc1_maybe_lazy(regcomp_ctx *ctx, ScmObj greedy, ScmObj lazy);
278 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx);
279 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx);
280 static ScmChar rc1_lex_xdigits(ScmPort *port, int ndigs, int key);
281
282 /*----------------------------------------------------------------
283 * pass1 - parser
284 */
285
286 /* EBNF Syntax of Gauche's regexp.
287 * This is a rough sketch. The condition of BOL/EOL ("^" and "$"), for
288 * example, has to be dealt with context information.
289 * To follow the convention, "{" and "}" token that don't appear to
290 * consist of the "limited repetition" syntax are regarded as literal
291 * characters.
292 *
293 * <re> :
294 * | <re> <alt>
295 *
296 * <alt> :
297 * | <item>
298 * | <alt> "|" <item>
299 *
300 * <item> : <atom> "*"
301 * | <atom> "+"
302 * | <atom> "?"
303 * | <atom> "{" <n> ("," <m>?)? "}"
304 * | <atom> "*?"
305 * | <atom> "+?"
306 * | <atom> "??"
307 * | <atom> "{" <n> ("," <m>?)? "}?"
308 * | <atom>
309 *
310 * <atom> : a normal char, an escaped char, or a char-set
311 * | "(" <re> ")" ;; grouping w/ capturing
312 * | "(?:" <re> ")" ;; grouping w/o capturing
313 * | "(?i:" <re> ")" ;; grouping w/o capturing (case insensitive)
314 * | "(?-i:" <re> ")" ;; grouping w/o capturing (case sensitive)
315 * | "(?=" <re> ")" ;; positive look-ahead assertion
316 * | "(?!" <re> ")" ;; negative look-ahead assertion
317 */
318
319 /* Lexer */
320 static ScmObj rc1_lex(regcomp_ctx *ctx)
321 {
322 ScmChar ch;
323 ScmObj cs;
324
325 ch = Scm_GetcUnsafe(ctx->ipat);
326 if (ch == SCM_CHAR_INVALID) return SCM_EOF;
327 switch (ch) {
328 case '(': return rc1_lex_open_paren(ctx);
329 case ')': return SCM_SYM_CLOSE_PAREN;
330 case '|': return SCM_SYM_ALT;
331 case '^': return SCM_SYM_BOL;
332 case '.': return SCM_SYM_ANY;
333 case '$': return SCM_SYM_EOL;
334 case '[': return rc_charset(ctx);
335 case '{': return rc1_lex_minmax(ctx);
336 case '+': return rc1_maybe_lazy(ctx, SCM_SYM_PLUS, SCM_SYM_PLUSQ);
337 case '*': return rc1_maybe_lazy(ctx, SCM_SYM_STAR, SCM_SYM_STARQ);
338 case '?': return rc1_maybe_lazy(ctx, SCM_SYM_QUESTION, SCM_SYM_QUESTIONQ);
339 case '\\':
340 ch = Scm_GetcUnsafe(ctx->ipat);
341 if (ch == SCM_CHAR_INVALID) {
342 Scm_Error("stray backslash at the end of pattern: %S\n",
343 ctx->pattern);
344 }
345 switch (ch) {
346 case 'a': return SCM_MAKE_CHAR(0x07);
347 case 'n': return SCM_MAKE_CHAR('\n');
348 case 'r': return SCM_MAKE_CHAR('\r');
349 case 't': return SCM_MAKE_CHAR('\t');
350 case 'f': return SCM_MAKE_CHAR('\f');
351 case 'e': return SCM_MAKE_CHAR(0x1b);
352 case 'b': return SCM_SYM_WB;
353 case 'B': return SCM_SYM_NWB;
354 case 'x':
355 ch = rc1_lex_xdigits(ctx->ipat, 2, 'x');
356 return SCM_MAKE_CHAR(ch);
357 case 'u':
358 ch = rc1_lex_xdigits(ctx->ipat, 4, 'u');
359 return SCM_MAKE_CHAR(Scm_UcsToChar(ch));
360 case 'U':
361 ch = rc1_lex_xdigits(ctx->ipat, 8, 'U');
362 return SCM_MAKE_CHAR(Scm_UcsToChar(ch));
363 case 'd':
364 cs = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
365 rc_register_charset(ctx, SCM_CHARSET(cs));
366 return cs;
367 case 'D':
368 cs = Scm_GetStandardCharSet(SCM_CHARSET_DIGIT);
369 rc_register_charset(ctx, SCM_CHARSET(cs));
370 return Scm_Cons(SCM_SYM_COMP, cs);
371 case 'w':
372 cs = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
373 rc_register_charset(ctx, SCM_CHARSET(cs));
374 return cs;
375 case 'W':
376 cs = Scm_GetStandardCharSet(SCM_CHARSET_WORD);
377 rc_register_charset(ctx, SCM_CHARSET(cs));
378 return Scm_Cons(SCM_SYM_COMP, cs);
379 case 's':
380 cs = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
381 rc_register_charset(ctx, SCM_CHARSET(cs));
382 return cs;
383 case 'S':
384 cs = Scm_GetStandardCharSet(SCM_CHARSET_SPACE);
385 rc_register_charset(ctx, SCM_CHARSET(cs));
386 return Scm_Cons(SCM_SYM_COMP, cs);
387 }
388 /*FALLTHROUGH*/
389 default:
390 if (ctx->casefoldp) ch = SCM_CHAR_DOWNCASE(ch);
391 return SCM_MAKE_CHAR(ch);
392 }
393 /*NOTREACHED*/
394 }
395
396 /* Read \x, \u, \U escape sequence in the regexp spec. */
397 static ScmChar rc1_lex_xdigits(ScmPort *port, int ndigs, int key)
398 {
399 char buf[8];
400 int nread;
401 ScmChar r;
402 SCM_ASSERT(ndigs <= 8);
403 r = Scm_ReadXdigitsFromPort(port, ndigs, buf, &nread);
404 if (r == SCM_CHAR_INVALID) {
405 ScmDString ds;
406 int c, i;
407 /* skip chars to the end of regexp, so that the reader will read
408 after the erroneous string */
409 for (;;) {
410 SCM_GETC(c, port);
411 if (c == EOF || c == '/') break;
412 if (c == '\\') SCM_GETC(c, port);
413 }
414 /* construct an error message */
415 Scm_DStringInit(&ds);
416 Scm_DStringPutc(&ds, '\\');
417 Scm_DStringPutc(&ds, key);
418 for (i=0; i<nread; i++) Scm_DStringPutc(&ds, (unsigned char)buf[i]);
419 Scm_Error("Bad '\\%c' escape sequence in a regexp literal: %s",
420 key, Scm_DStringGetz(&ds));
421 }
422 return r;
423 }
424
425 /* Called after '+', '*' or '?' is read, and check if there's a
426 following '?' (lazy quantifier) */
427 static ScmObj rc1_maybe_lazy(regcomp_ctx *ctx, ScmObj greedy, ScmObj lazy)
428 {
429 ScmChar ch = Scm_GetcUnsafe(ctx->ipat);
430 if (ch == '?') return lazy;
431 Scm_UngetcUnsafe(ch, ctx->ipat);
432 return greedy;
433 }
434
435 /* Reads '('-sequence - either one of "(", "(?:", "(?i:" or "(?-i:".
436 The leading "(" has already been read. */
437 static ScmObj rc1_lex_open_paren(regcomp_ctx *ctx)
438 {
439 ScmObj pos;
440 ScmChar ch;
441
442 pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
443 ch = Scm_GetcUnsafe(ctx->ipat);
444 if (ch != '?') {
445 Scm_UngetcUnsafe(ch, ctx->ipat);
446 return SCM_SYM_OPEN_PAREN;
447 }
448 ch = Scm_GetcUnsafe(ctx->ipat);
449 if (ch == ':') return SCM_SYM_SEQ;
450 if (ch == '=') return SCM_SYM_ASSERT;
451 if (ch == '!') return SCM_SYM_NASSERT;
452 if (ch == 'i') {
453 ch = Scm_GetcUnsafe(ctx->ipat);
454 if (ch == ':') return SCM_SYM_SEQ_UNCASE;
455 /* fall through */
456 } else if (ch == '-') {
457 ch = Scm_GetcUnsafe(ctx->ipat);
458 if (ch == 'i') {
459 ch = Scm_GetcUnsafe(ctx->ipat);
460 if (ch == ':') return SCM_SYM_SEQ_CASE;
461 }
462 /* fall through */
463 }
464 /* fail. */
465 Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
466 return SCM_SYM_OPEN_PAREN;
467 }
468
469 /* Reads {n,m}-type repeat syntax. The leading "{" has been read.
470 If the character sequence doesn't consist of valid syntax, rollback
471 to the ordinary character sequence.
472 If successfully parsed, returns (rep-bound <n> . <m>) where
473 <m> == #f if the pattern is "{n}" (exact count), or
474 <m> == #t if the pattern is "{n,}" (minimum count), or
475 <m> == integer if the pattern is "{n,m}" (limited count).
476 If the pattern is followed by '?', rep-bound-min is used instead.
477 */
478 static ScmObj rc1_lex_minmax(regcomp_ctx *ctx)
479 {
480 int rep_min = -1, rep_max = -1, exact = FALSE, ch;
481 ScmObj pos, m;
482 ScmObj type = SCM_SYM_REP_BOUND; /* default is greedy */
483
484 pos = Scm_PortSeekUnsafe(ctx->ipat, SCM_MAKE_INT(0), SEEK_CUR);
485
486 for (;;) {
487 ch = Scm_GetcUnsafe(ctx->ipat);
488 if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
489 if (rep_min < 0) {
490 rep_min = (ch - '0');
491 } else {
492 rep_min = rep_min*10 + (ch - '0');
493 }
494 } else if (ch == ',') {
495 break;
496 } else if (ch == '}') {
497 exact = TRUE;
498 break;
499 } else {
500 goto bad_min_max;
501 }
502 }
503 if (rep_min < 0) goto bad_min_max;
504 if (rep_min > MAX_LIMITED_REPEAT) goto out_of_range;
505 if (!exact) {
506 for (;;) {
507 ch = Scm_GetcUnsafe(ctx->ipat);
508 if (SCM_CHAR_ASCII_P(ch) && isdigit(ch)) {
509 if (rep_max < 0) {
510 rep_max = (ch - '0');
511 } else {
512 rep_max = rep_max*10 + (ch - '0');
513 }
514 } else if (ch == '}') {
515 break;
516 } else {
517 goto bad_min_max;
518 }
519 }
520 if (rep_max > MAX_LIMITED_REPEAT) goto out_of_range;
521 if (rep_max >= 0 && rep_max < rep_min) {
522 Scm_Error("{n,m}-syntax requires n <= m: %S", ctx->pattern);
523 }
524 }
525
526 if (exact) m = SCM_FALSE;
527 else if (rep_max < 0) m = SCM_TRUE;
528 else m = SCM_MAKE_INT(rep_max);
529
530 ch = Scm_GetcUnsafe(ctx->ipat);
531 if (ch == '?') type = SCM_SYM_REP_BOUND_MIN;
532 else Scm_UngetcUnsafe(ch, ctx->ipat);
533 return Scm_Cons(type, Scm_Cons(SCM_MAKE_INT(rep_min), m));
534
535 out_of_range:
536 Scm_Error("{n,m}-syntax can accept up to %d count: %S",
537 MAX_LIMITED_REPEAT, ctx->pattern);
538 /*NOTREACHED*/
539 bad_min_max:
540 /* back up */
541 Scm_PortSeekUnsafe(ctx->ipat, pos, SEEK_SET);
542 return SCM_MAKE_CHAR('{');
543 }
544
545 static ScmObj rc1_fold_alts(regcomp_ctx *ctx, ScmObj alts)
546 {
547 ScmObj r = SCM_NIL, ap;
548 SCM_FOR_EACH(ap, alts) {
549 ScmObj alt = SCM_CAR(ap);
550 if (SCM_PAIRP(alt) && SCM_NULLP(SCM_CDR(alt))) {
551 r = Scm_Cons(SCM_CAR(alt), r);
552 } else {
553 r = Scm_Cons(Scm_Cons(SCM_SYM_SEQ, alt), r);
554 }
555 }
556 return Scm_Cons(SCM_SYM_ALT, r);
557 }
558
559 /* Parser */
560 static ScmObj rc1_parse(regcomp_ctx *ctx, int bolp, int topp)
561 {
562 ScmObj stack = SCM_NIL, alts = SCM_NIL;
563 ScmObj token, item;
564 int bolpsave = bolp;
565
566 #define PUSH(elt) (stack = Scm_Cons((elt), stack))
567 #define PUSH1(elt) (stack = Scm_Cons((elt), SCM_CDR(stack)))
568
569 for (;;) {
570 token = rc1_lex(ctx);
571 if (SCM_EOFP(token)) {
572 if (!topp) {
573 Scm_Error("unterminated grouping in regexp %S", ctx->pattern);
574 }
575 break;
576 }
577 if (SCM_EQ(token, SCM_SYM_CLOSE_PAREN)) {
578 if (topp) {
579 Scm_Error("extra close parenthesis in regexp %S", ctx->pattern);
580 }
581 break;
582 }
583 if (SCM_EQ(token, SCM_SYM_BOL)) {
584 if (bolp) {
585 PUSH(SCM_SYM_BOL);
586 bolp = FALSE;
587 continue;
588 } else {
589 token = SCM_MAKE_CHAR('^');
590 }
591 /*FALLTHROUGH*/
592 }
593 if (SCM_EQ(token, SCM_SYM_ALT)) {
594 alts = Scm_Cons(Scm_ReverseX(stack), alts);
595 stack = SCM_NIL;
596 bolp = bolpsave;
597 continue;
598 }
599 if (SCM_EQ(token, SCM_SYM_OPEN_PAREN)) {
600 int grpno = ++ctx->grpcount;
601 item = rc1_parse(ctx, bolp, FALSE);
602 PUSH(Scm_Cons(SCM_MAKE_INT(grpno), item));
603 bolp = FALSE;
604 continue;
605 }
606 if (SCM_EQ(token, SCM_SYM_SEQ)) {
607 item = rc1_parse(ctx, bolp, FALSE);
608 PUSH(Scm_Cons(SCM_SYM_SEQ, item));
609 bolp = FALSE;
610 continue;
611 }
612 if (SCM_EQ(token, SCM_SYM_SEQ_UNCASE) || SCM_EQ(token, SCM_SYM_SEQ_CASE)) {
613 int oldflag = ctx->casefoldp;
614 ctx->casefoldp = SCM_EQ(token, SCM_SYM_SEQ_UNCASE);
615 item = rc1_parse(ctx, bolp, FALSE);
616 PUSH(Scm_Cons(token, item));
617 ctx->casefoldp = oldflag;
618 bolp = FALSE;
619 continue;
620 }
621 if (SCM_EQ(token, SCM_SYM_ASSERT)) {
622 item = rc1_parse(ctx, bolp, FALSE);
623 PUSH(Scm_Cons(SCM_SYM_ASSERT, item));
624 continue;
625 }
626 if (SCM_EQ(token, SCM_SYM_NASSERT)) {
627 item = rc1_parse(ctx, bolp, FALSE);
628 PUSH(Scm_Cons(SCM_SYM_NASSERT, item));
629 continue;
630 }
631 if (SCM_EQ(token, SCM_SYM_STAR)) {
632 /* "x*" => (rep x) */
633 if (SCM_NULLP(stack)) goto synerr;
634 item = SCM_LIST2(SCM_SYM_REP, SCM_CAR(stack));
635 PUSH1(item);
636 continue;
637 }
638 if (SCM_EQ(token, SCM_SYM_STARQ)) {
639 /* "x*?" => (rep-min x) */
640 if (SCM_NULLP(stack)) goto synerr;
641 item = SCM_LIST2(SCM_SYM_REP_MIN, SCM_CAR(stack));
642 PUSH1(item);
643 continue;
644 }
645 if (SCM_EQ(token, SCM_SYM_PLUS)) {
646 /* "x+" => (seq x (rep x)) */
647 if (SCM_NULLP(stack)) goto synerr;
648 item = SCM_LIST3(SCM_SYM_SEQ, SCM_CAR(stack),
649 SCM_LIST2(SCM_SYM_REP, SCM_CAR(stack)));
650 PUSH1(item);
651 continue;
652 }
653 if (SCM_EQ(token, SCM_SYM_PLUSQ)) {
654 /* "x+?" => (seq x (rep-min x)) */
655 if (SCM_NULLP(stack)) goto synerr;
656 item = SCM_LIST3(SCM_SYM_SEQ, SCM_CAR(stack),
657 SCM_LIST2(SCM_SYM_REP_MIN, SCM_CAR(stack)));
658 PUSH1(item);
659 continue;
660 }
661 if (SCM_EQ(token, SCM_SYM_QUESTION)) {
662 /* "x?" => (alt x ()) */
663 if (SCM_NULLP(stack)) goto synerr;
664 item = rc1_fold_alts(ctx,
665 SCM_LIST2(SCM_NIL, SCM_LIST1(SCM_CAR(stack))));
666 PUSH1(item);
667 continue;
668 }
669 if (SCM_EQ(token, SCM_SYM_QUESTIONQ)) {
670 /* "x??" => (alt () x) */
671 if (SCM_NULLP(stack)) goto synerr;
672 item = rc1_fold_alts(ctx,
673 SCM_LIST2(SCM_LIST1(SCM_CAR(stack)), SCM_NIL));
674 PUSH1(item);
675 continue;
676 }
677 if (SCM_PAIRP(token)&&
678 (SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND) ||
679 SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND_MIN))) {
680 /* "x{n}" => (seq x .... x)
681 "x{n,}" => (seq x .... x (rep x))
682 "x{n,m}" => (seq x .... x (rep-bound m-n x))
683 "x{n,}?" => (seq x .... x (rep-min x))
684 "x{n,m}?" => (seq x .... x (rep-bound-min m-n x)) */
685 ScmObj n = SCM_CADR(token), m = SCM_CDDR(token);
686 int greedy = SCM_EQ(SCM_CAR(token), SCM_SYM_REP_BOUND);
687
688 if (SCM_NULLP(stack)) goto synerr;
689 SCM_ASSERT(SCM_INTP(n));
690 item = Scm_MakeList(SCM_INT_VALUE(n), SCM_CAR(stack));
691 if (SCM_FALSEP(m)) {
692 item = Scm_Cons(SCM_SYM_SEQ, item);
693 } else if (SCM_TRUEP(m)) {
694 item = Scm_Cons(SCM_SYM_SEQ, item);
695 item = Scm_Append2X(item,
696 SCM_LIST1(SCM_LIST2((greedy? SCM_SYM_REP : SCM_SYM_REP_MIN),
697 SCM_CAR(stack))));
698 } else {
699 int m_n;
700 SCM_ASSERT(SCM_INTP(m));
701 m_n = SCM_INT_VALUE(m)-SCM_INT_VALUE(n);
702 SCM_ASSERT(m_n >= 0);
703 item = Scm_Cons(SCM_SYM_SEQ, item);
704 if (m_n > 0) {
705 item = Scm_Append2X(item,
706 SCM_LIST1(SCM_LIST3((greedy? SCM_SYM_REP_BOUND : SCM_SYM_REP_BOUND_MIN),
707 SCM_MAKE_INT(m_n),
708 SCM_CAR(stack))));
709 }
710 }
711 PUSH1(item);
712 continue;
713 }
714 PUSH(token);
715 bolp = FALSE;
716 continue;
717 }
718 if (SCM_NULLP(alts)) {
719 return Scm_ReverseX(stack);
720 } else {
721 alts = Scm_Cons(Scm_ReverseX(stack), alts);
722 return SCM_LIST1(rc1_fold_alts(ctx, alts));
723 }
724 synerr:
725 Scm_Error("bad regexp syntax in %S", ctx->pattern);
726 return SCM_UNDEFINED; /* dummy */
727 #undef PUSH
728 #undef PUSH1
729 }
730
731 static ScmObj rc1(regcomp_ctx *ctx)
732 {
733 ScmObj ast = rc1_parse(ctx, TRUE, TRUE);
734 if (ctx->casefoldp) {
735 ast = SCM_LIST2(SCM_MAKE_INT(0), Scm_Cons(SCM_SYM_SEQ_UNCASE, ast));
736 } else {
737 ast = Scm_Cons(SCM_MAKE_INT(0), ast);
738 }
739 ctx->rx->numGroups = ctx->grpcount+1;
740 return ast;
741 }
742
743 /* character range */
744 static ScmObj rc_charset(regcomp_ctx *ctx)
745 {
746 int complement;
747 ScmObj set = Scm_CharSetRead(ctx->ipat, &complement, FALSE, TRUE);
748 if (!SCM_CHARSETP(set)) {
749 Scm_Error("bad charset spec in pattern: %S", ctx->pattern);
750 }
751 if (ctx->casefoldp) {
752 Scm_CharSetCaseFold(SCM_CHARSET(set));
753 }
754
755 rc_register_charset(ctx, SCM_CHARSET(set));
756 if (complement) {
757 return Scm_Cons(SCM_SYM_COMP, SCM_OBJ(set));
758 } else {
759 return SCM_OBJ(set);
760 }
761 }
762
763 /* Remember charset so that we can construct charset vector later */
764 static void rc_register_charset(regcomp_ctx *ctx, ScmCharSet *cs)
765 {
766 if (SCM_FALSEP(Scm_Memq(SCM_OBJ(cs), ctx->sets))) {
767 ctx->sets = Scm_Cons(SCM_OBJ(cs), ctx->sets);
768 }
769 }
770
771 /* An interlude between pass1 and pass2. From the information of
772 parser context, build a charset vector. */
773 static void rc_setup_charsets(ScmRegexp *rx, regcomp_ctx *ctx)
774 {
775 ScmObj cp;
776 int i = 0;
777 rx->numSets = Scm_Length(ctx->sets);
778 rx->sets = SCM_NEW_ARRAY(ScmCharSet*, rx->numSets);
779 for (i=0, cp = Scm_Reverse(ctx->sets); !SCM_NULLP(cp); cp = SCM_CDR(cp)) {
780 rx->sets[i++] = SCM_CHARSET(SCM_CAR(cp));
781 }
782 }
783
784 /*-------------------------------------------------------------
785 * pass 2: optimizer
786 *
787 * - flattening nested sequences: (seq a (seq b) c) => (seq a b c)
788 * - introduces short-cut construct for certain cases.
789 * (... (rep #\a) #\b ...) => (... (rep-while #\a) #\b ...)
790 */
791 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest);
792 static int is_distinct(ScmObj x, ScmObj y);
793
794 static ScmObj rc2_optimize_seq(ScmObj seq, ScmObj rest)
795 {
796 ScmObj elt, tail, etype, opted;
797 if (!SCM_PAIRP(seq)) return seq;
798 elt = SCM_CAR(seq);
799 tail = rc2_optimize_seq(SCM_CDR(seq), rest);
800 rest = SCM_NULLP(tail)? rest : tail;
801 if (!SCM_PAIRP(elt) || SCM_EQ(SCM_CAR(elt), SCM_SYM_COMP)) {
802 if (SCM_EQ(tail, SCM_CDR(seq))) return seq;
803 else return Scm_Cons(elt, tail);
804 }
805 etype = SCM_CAR(elt);
806 if (SCM_EQ(etype, SCM_SYM_SEQ)) {
807 return Scm_Append2(rc2_optimize_seq(SCM_CDR(elt), rest), tail);
808 }
809 if (SCM_EQ(etype, SCM_SYM_REP)) {
810 /* If the head of repeating sequence and the beginning of the
811 following sequence are distinct, like #/\s*foo/, the branch
812 becomes deterministic (i.e. we don't need backtrack). */
813 ScmObj repbody = rc2_optimize_seq(SCM_CDR(elt), rest);
814 SCM_ASSERT(SCM_PAIRP(repbody));
815 if (SCM_NULLP(rest) || is_distinct(SCM_CAR(repbody), SCM_CAR(rest))) {
816 return Scm_Cons(Scm_Cons(SCM_SYM_REP_WHILE, repbody), tail);
817 }
818 if (SCM_EQ(repbody, SCM_CDR(elt))) opted = elt;
819 else opted = Scm_Cons(SCM_SYM_REP, repbody);
820 } else {
821 opted = rc2_optimize(elt, rest);
822 }
823 if (SCM_EQ(elt, opted) && SCM_EQ(tail, SCM_CDR(seq))) return seq;
824 else return Scm_Cons(opted, tail);
825 }
826
827 static ScmObj rc2_optimize(ScmObj ast, ScmObj rest)
828 {
829 ScmObj type, seq, seqo;
830 if (!SCM_PAIRP(ast)) return ast;
831 type = SCM_CAR(ast);
832 if (SCM_EQ(type, SCM_SYM_COMP)) return ast;
833
834 if (SCM_EQ(type, SCM_SYM_ALT)) {
835 ScmObj sp, sp2, e = SCM_UNBOUND, h, t;
836 SCM_FOR_EACH(sp, SCM_CDR(ast)) {
837 e = rc2_optimize(SCM_CAR(sp), rest);
838 if (!SCM_EQ(e, SCM_CAR(sp))) break;
839 }
840 if (SCM_NULLP(sp)) return ast;
841 /* need to copy the spine */
842 h = t = SCM_NIL;
843 SCM_FOR_EACH(sp2, SCM_CDR(ast)) {
844 if (SCM_EQ(sp, sp2)) { SCM_APPEND1(h, t, e); break; }
845 SCM_APPEND1(h, t, SCM_CAR(sp2));
846 }
847 SCM_FOR_EACH(sp2, SCM_CDR(sp2)) {
848 SCM_APPEND1(h, t, rc2_optimize(SCM_CAR(sp2), rest));
849 }
850 return Scm_Cons(SCM_SYM_ALT, h);
851 }
852 if (SCM_EQ(type, SCM_SYM_REP_BOUND)) seq = SCM_CDDR(ast);
853 else seq = SCM_CDR(ast);
854 seqo = rc2_optimize_seq(seq, rest);
855 if (SCM_EQ(seq, seqo)) return ast;
856 else {
857 if (SCM_EQ(type, SCM_SYM_REP_BOUND)) {
858 return Scm_Cons(type, Scm_Cons(SCM_CADR(ast), seqo));
859 } else {
860 return Scm_Cons(type, seqo);
861 }
862 }
863 }
864
865 static int is_distinct(ScmObj x, ScmObj y)
866 {
867 ScmObj carx;
868 if (SCM_PAIRP(x)) {
869 carx = SCM_CAR(x);
870 if (SCM_EQ(carx, SCM_SYM_COMP)) {
871 SCM_ASSERT(SCM_CHARSETP(SCM_CDR(x)));
872 if (SCM_CHARP(y) || SCM_CHARSETP(y)) {
873 return !is_distinct(SCM_CDR(x), y);
874 }
875 return FALSE;
876 }
877 if (SCM_INTP(carx)
878 || SCM_EQ(carx, SCM_SYM_SEQ_UNCASE)
879 || SCM_EQ(carx, SCM_SYM_SEQ_CASE)) {
880 if (SCM_PAIRP(SCM_CDR(x))) {
881 return is_distinct(SCM_CADR(x), y);
882 }
883 }
884 return FALSE;
885 }
886 if (SCM_CHARP(x)) {
887 if (SCM_CHARP(y)) return !SCM_EQ(x, y);
888 return is_distinct(y, x);
889 }
890 if (SCM_CHARSETP(x)) {
891 if (SCM_CHARP(y)) {
892 return !Scm_CharSetContains(SCM_CHARSET(x), SCM_CHAR_VALUE(y));
893 }
894 if (SCM_CHARSETP(y)) {
895 ScmObj ccs = Scm_CopyCharSet(SCM_CHARSET(y));
896 ccs = Scm_CharSetComplement(SCM_CHARSET(ccs));
897 return Scm_CharSetLE(SCM_CHARSET(x), SCM_CHARSET(ccs));
898 }
899 return is_distinct(y, x);
900 }
901 return FALSE;
902 }
903
904 ScmObj Scm_RegOptimizeAST(ScmObj ast)
905 {
906 return rc2_optimize(ast, SCM_NIL);
907 }
908
909 /*-------------------------------------------------------------
910 * pass 3 - code generation
911 * This pass actually called twice; the first run counts
912 * the size of the bytecode, and the second run fills
913 * the bytecode. EMITP == FALSE for the first, EMITP == TRUE
914 * for the second.
915 * LASTP indicates this call is dealing with the last part of
916 * the compiled tree, thus need to deal with EOL marker.
917 */
918
919 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp, int toplevelp);
920
921 /* Util function for pass2, to get an index of the charset vector
922 * for the given charset.
923 */
924 static int rc3_charset_index(ScmRegexp *rx, ScmObj cs)
925 {
926 int i;
927 for (i=0; i<rx->numSets; i++)
928 if (cs == SCM_OBJ(rx->sets[i])) return i;
929 Scm_Panic("rc3_charset_index: can't be here");
930 return 0; /* dummy */
931 }
932
933 static void rc3_emit(regcomp_ctx *ctx, char code)
934 {
935 if (ctx->emitp) {
936 SCM_ASSERT(ctx->codep < ctx->codemax);
937 ctx->code[ctx->codep++] = code;
938 } else {
939 ctx->codemax++;
940 }
941 }
942
943 static void rc3_emit_offset(regcomp_ctx *ctx, int offset)
944 {
945 if (offset > REGEXP_OFFSET_MAX) {
946 Scm_Error("regexp too large. consider splitting it up: %50.1S",
947 SCM_OBJ(ctx->rx));
948 }
949
950 if (ctx->emitp) {
951 SCM_ASSERT(ctx->codep < ctx->codemax-1);
952 ctx->code[ctx->codep++] = (offset>>8) & 0xff;
953 ctx->code[ctx->codep++] = offset & 0xff;
954 } else {
955 ctx->codemax+=2;
956 }
957 }
958
959 static void rc3_fill_offset(regcomp_ctx *ctx, int codep, int offset)
960 {
961 if (offset > REGEXP_OFFSET_MAX) {
962 Scm_Error("regexp too large. consider splitting it up: %50.1S",
963 SCM_OBJ(ctx->rx));
964 }
965
966 if (ctx->emitp) {
967 SCM_ASSERT(codep < ctx->codemax-1);
968 ctx->code[codep] = (offset >> 8) & 0xff;
969 ctx->code[codep+1] = offset & 0xff;
970 }
971 }
972
973 static void rc3_seq(regcomp_ctx *ctx, ScmObj seq, int lastp, int toplevelp)
974 {
975 ScmObj cp, item;
976
977 SCM_FOR_EACH(cp, seq) {
978 item = SCM_CAR(cp);
979
980 /* concatenate literal character sequence */
981 if (SCM_CHARP(item)) {
982 int nrun = 0, ocodep = ctx->codep, nb, i;
983 ScmChar ch;
984 char chbuf[SCM_CHAR_MAX_BYTES];
985
986 rc3_emit(ctx, (ctx->casefoldp? RE_MATCH_CI:RE_MATCH));
987 rc3_emit(ctx, 0); /* patched later */
988 do {
989 ch = SCM_CHAR_VALUE(item);
990 nb = SCM_CHAR_NBYTES(ch);
991 SCM_CHAR_PUT(chbuf, ch);
992 for (i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
993 nrun += nb;
994 cp = SCM_CDR(cp);
995 if (SCM_NULLP(cp)) break;
996 item = SCM_CAR(cp);
997 } while (SCM_CHARP(item) && nrun < CHAR_MAX);
998 if (ctx->emitp) {
999 /* patches the run length. if we are matching to a
1000 single byte char, use MATCH1 insn. */
1001 if (nrun == 1) {
1002 ctx->code[ocodep] =
1003 ctx->casefoldp?RE_MATCH1_CI:RE_MATCH1;
1004 ctx->code[ocodep+1] = ctx->code[ocodep+2];
1005 ctx->codep = ocodep+2;
1006 } else {
1007 ctx->code[ocodep+1] = (char)nrun;
1008 }
1009 }
1010 if (SCM_NULLP(cp)) break;
1011 cp = Scm_Cons(item, cp); /* pushback */
1012 } else {
1013 rc3_rec(ctx, item, lastp&&SCM_NULLP(SCM_CDR(cp)), toplevelp);
1014 }
1015 }
1016 }
1017
1018 static void rc3_rec(regcomp_ctx *ctx, ScmObj ast, int lastp, int toplevelp)
1019 {
1020 ScmObj type;
1021 ScmRegexp *rx = ctx->rx;
1022
1023 /* first, deal with atoms */
1024 if (!SCM_PAIRP(ast)) {
1025 /* a char */
1026 if (SCM_CHARP(ast)) {
1027 char chbuf[SCM_CHAR_MAX_BYTES];
1028 ScmChar ch = SCM_CHAR_VALUE(ast);
1029 int i, nb = SCM_CHAR_NBYTES(ch);
1030 SCM_CHAR_PUT(chbuf, ch);
1031 if (nb == 1) {
1032 rc3_emit(ctx, (ctx->casefoldp? RE_MATCH1_CI:RE_MATCH1));
1033 rc3_emit(ctx, chbuf[0]);
1034 } else {
1035 rc3_emit(ctx, (ctx->casefoldp? RE_MATCH_CI:RE_MATCH));
1036 rc3_emit(ctx, nb);
1037 for (i=0; i<nb; i++) rc3_emit(ctx, chbuf[i]);
1038 }
1039 return;
1040 }
1041 /* charset */
1042 if (SCM_CHARSETP(ast)) {
1043 if (SCM_CHARSET_SMALLP(ast)) {
1044 rc3_emit(ctx, RE_SET1);
1045 } else {
1046 rc3_emit(ctx, RE_SET);
1047 }
1048 rc3_emit(ctx, rc3_charset_index(rx, ast));
1049 return;
1050 }
1051 /* special stuff */
1052 if (SCM_SYMBOLP(ast)) {
1053 if (SCM_EQ(ast, SCM_SYM_ANY)) {
1054 rc3_emit(ctx, RE_ANY);
1055 return;
1056 }
1057 if (SCM_EQ(ast, SCM_SYM_BOL)) {
1058 rc3_emit(ctx, RE_BOL);
1059 return;
1060 }
1061 if (SCM_EQ(ast, SCM_SYM_EOL)) {
1062 if (lastp) {
1063 rc3_emit(ctx, RE_EOL);
1064 } else {
1065 rc3_emit(ctx, RE_MATCH1);
1066 rc3_emit(ctx, '$');
1067 }
1068 return;
1069 }
1070 if (SCM_EQ(ast, SCM_SYM_WB)) {
1071 rc3_emit(ctx, RE_WB);
1072 return;
1073 }
1074 if (SCM_EQ(ast, SCM_SYM_NWB)) {
1075 rc3_emit(ctx, RE_NWB);
1076 return;
1077 }
1078 /* fallback */
1079 }
1080 Scm_Error("internal error in regexp compilation: unrecognized AST item: %S", ast);
1081 }
1082
1083 /* now we have a structured node */
1084 type = SCM_CAR(ast);
1085 if (SCM_EQ(type, SCM_SYM_COMP)) {
1086 ScmObj cs = SCM_CDR(ast);
1087 SCM_ASSERT(SCM_CHARSETP(cs));
1088 if (SCM_CHARSET_SMALLP(cs)) {
1089 rc3_emit(ctx, RE_NSET1);
1090 } else {
1091 rc3_emit(ctx, RE_NSET);
1092 }
1093 rc3_emit(ctx, rc3_charset_index(rx, cs));
1094 return;
1095 }
1096 if (SCM_EQ(type, SCM_SYM_SEQ)) {
1097 rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1098 return;
1099 }
1100 if (SCM_INTP(type)) {
1101 int grpno = SCM_INT_VALUE(type);
1102 rc3_emit(ctx, RE_BEGIN);
1103 rc3_emit(ctx, grpno);
1104 rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1105 rc3_emit(ctx, RE_END);
1106 rc3_emit(ctx, grpno);
1107 return;
1108 }
1109 if (SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1110 int oldcase = ctx->casefoldp;
1111 ctx->casefoldp = SCM_EQ(type, SCM_SYM_SEQ_UNCASE);
1112 rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1113 ctx->casefoldp = oldcase;
1114 return;
1115 }
1116 if (SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1117 /* here we have an opportunity to generate an optimized code. */
1118 if (SCM_PAIRP(SCM_CDR(ast)) && SCM_NULLP(SCM_CDDR(ast))) {
1119 ScmObj elem = SCM_CADR(ast);
1120 if (SCM_CHARSETP(elem)) {
1121 rc3_emit(ctx, SCM_CHARSET_SMALLP(elem)?RE_SET1R:RE_SETR);
1122 rc3_emit(ctx, rc3_charset_index(rx, elem));
1123 return;
1124 }
1125 if (SCM_PAIRP(elem)&&SCM_EQ(SCM_CAR(elem), SCM_SYM_COMP)) {
1126 elem = SCM_CDR(elem);
1127 SCM_ASSERT(SCM_CHARSETP(elem));
1128 rc3_emit(ctx, SCM_CHARSET_SMALLP(elem)?RE_NSET1R:RE_NSETR);
1129 rc3_emit(ctx, rc3_charset_index(rx, elem));
1130 return;
1131 }
1132 }
1133 /* fallthrough to rep */
1134 type = SCM_SYM_REP;
1135 }
1136 if (SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)) {
1137 int ocodep = ctx->codep;
1138 rc3_emit(ctx, SCM_EQ(type, SCM_SYM_ASSERT) ? RE_ASSERT : RE_NASSERT);
1139 rc3_emit_offset(ctx, 0); /* will be patched */
1140 rc3_seq(ctx, SCM_CDR(ast), lastp, toplevelp);
1141 rc3_emit(ctx, RE_SUCCESS);
1142 rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1143 return;
1144 }
1145 if (SCM_EQ(type, SCM_SYM_REP)) {
1146 /* rep: TRY next
1147 <seq>
1148 JUMP rep
1149 next:
1150 */
1151 int ocodep = ctx->codep;
1152 rc3_emit(ctx, RE_TRY);
1153 rc3_emit_offset(ctx, 0); /* will be patched */
1154 rc3_seq(ctx, SCM_CDR(ast), FALSE, FALSE);
1155 rc3_emit(ctx, RE_JUMP);
1156 rc3_emit_offset(ctx, ocodep);
1157 rc3_fill_offset(ctx, ocodep+1, ctx->codep);
1158 return;
1159 }
1160 if (SCM_EQ(type, SCM_SYM_REP_MIN)) {
1161 /* non-greedy repeat
1162 rep: TRY seq
1163 JUMP next
1164 seq: <seq>
1165 JUMP rep
1166 next:
1167 */
1168 int ocodep1 = ctx->codep, ocodep2;
1169 rc3_emit(ctx, RE_TRY);
1170 rc3_emit_offset(ctx, 0); /* will be patched */
1171 ocodep2 = ctx->codep;
1172 rc3_emit(ctx, RE_JUMP);
1173 rc3_emit_offset(ctx, 0); /* will be patched */
1174 rc3_fill_offset(ctx, ocodep1+1, ctx->codep);
1175 rc3_seq(ctx, SCM_CDR(ast), FALSE, FALSE);
1176 rc3_emit(ctx, RE_JUMP);
1177 rc3_emit_offset(ctx, ocodep1);
1178 rc3_fill_offset(ctx, ocodep2+1, ctx->codep);
1179 return;
1180 }
1181 if (SCM_EQ(type, SCM_SYM_ALT)) {
1182 /* TRY #1
1183 <alt0>
1184 JUMP next
1185 #1: TRY #2
1186 <alt1>
1187 JUMP next
1188 :
1189 :
1190 TRY next
1191 <altN>
1192 next:
1193 */
1194 ScmObj clause;
1195 ScmObj jumps = SCM_NIL;
1196 int patchp;
1197
1198 for (clause = SCM_CDR(ast);
1199 SCM_PAIRP(SCM_CDR(clause));
1200 clause = SCM_CDR(clause)) {
1201 rc3_emit(ctx, RE_TRY);
1202 patchp = ctx->codep;
1203 rc3_emit_offset(ctx, 0); /* will be patched */
1204 rc3_rec(ctx, SCM_CAR(clause), lastp, FALSE);
1205 rc3_emit(ctx, RE_JUMP);
1206 if (ctx->emitp) {
1207 jumps = Scm_Cons(SCM_MAKE_INT(ctx->codep), jumps);
1208 }
1209 rc3_emit_offset(ctx, 0); /* will be patched */
1210 rc3_fill_offset(ctx, patchp, ctx->codep);
1211 }
1212 rc3_rec(ctx, SCM_CAR(clause), lastp, FALSE);
1213 if (ctx->emitp) {
1214 SCM_FOR_EACH(jumps, jumps) {
1215 patchp = SCM_INT_VALUE(SCM_CAR(jumps));
1216 rc3_fill_offset(ctx, patchp, ctx->codep);
1217 }
1218 }
1219 return;
1220 }
1221 if (SCM_EQ(type, SCM_SYM_REP_BOUND) || SCM_EQ(type, SCM_SYM_REP_BOUND_MIN)) {
1222 /* (rep-bound <n> . <x>)
1223
1224 TRY #01
1225 JUMP #11
1226 #01:TRY #02
1227 JUMP #12
1228 :
1229 #0n:JUMP #1N
1230 #11:<X>
1231 #12:<X>
1232 :
1233 #1n:<X>
1234 #1N:
1235
1236 (rep-bound-min <n> . <x>)
1237
1238 TRY #01
1239 JUMP #1N
1240 #01:TRY #02
1241 JUMP #1n
1242 :
1243 #0n TRY #11
1244 JUMP #12
1245 #11:<X>
1246 #12:<X>
1247 :
1248 #1n:<X>
1249 #1N:
1250
1251 */
1252 ScmObj item, jlist = SCM_NIL;
1253 int count, n, j0 = 0, jn;
1254 int greedy = SCM_EQ(type, SCM_SYM_REP_BOUND);
1255
1256 SCM_ASSERT(Scm_Length(ast) >= 3 && SCM_INTP(SCM_CADR(ast)));
1257 count = SCM_INT_VALUE(SCM_CADR(ast));
1258 SCM_ASSERT(count > 0);
1259 item = SCM_CDDR(ast);
1260 /* first part - TRYs and JUMPs
1261 j0 is used to patch the label #0k
1262 the destination of jumps to be patched are linked to jlist */
1263 for (n=0; n<count; n++) {
1264 if (n>0) rc3_fill_offset(ctx, j0, ctx->codep);
1265 rc3_emit(ctx, RE_TRY);
1266 if (ctx->emitp) j0 = ctx->codep;
1267 rc3_emit_offset(ctx, 0); /* to be patched */
1268 rc3_emit(ctx, RE_JUMP);
1269 if (ctx->emitp) {
1270 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1271 }
1272 rc3_emit_offset(ctx, 0); /* to be patched */
1273 }
1274 rc3_fill_offset(ctx, j0, ctx->codep); /* patch #0n */
1275 /* finishing the first part.
1276 for non-greedy match, we need one more TRY. */
1277 if (greedy) {
1278 rc3_emit(ctx, RE_JUMP);
1279 jn = ctx->codep;
1280 rc3_emit_offset(ctx, 0); /* to be patched */
1281 } else {
1282 rc3_emit(ctx, RE_TRY);
1283 jn = ctx->codep;
1284 rc3_emit_offset(ctx, 0); /* to be patched */
1285 rc3_emit(ctx, RE_JUMP);
1286 if (ctx->emitp) {
1287 jlist = Scm_Cons(SCM_MAKE_INT(ctx->codep), jlist);
1288 }
1289 rc3_emit_offset(ctx, 0); /* to be patched */
1290 rc3_fill_offset(ctx, jn, ctx->codep);
1291 }
1292 if (ctx->emitp && greedy) jlist = Scm_ReverseX(jlist);
1293 if (!greedy) rc3_seq(ctx, item, FALSE, toplevelp);
1294 for (n=0; n<count; n++) {
1295 if (ctx->emitp) {
1296 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)),
1297 ctx->codep);
1298 }
1299 rc3_seq(ctx, item, FALSE, toplevelp);
1300 if (ctx->emitp) jlist = SCM_CDR(jlist);
1301 }
1302 if (greedy) {
1303 /* the last JUMP to #1N */
1304 rc3_fill_offset(ctx, jn, ctx->codep);
1305 } else {
1306 /* the first JUMP to #1N */
1307 if (ctx->emitp) {
1308 SCM_ASSERT(SCM_PAIRP(jlist));
1309 rc3_fill_offset(ctx, SCM_INT_VALUE(SCM_CAR(jlist)), ctx->codep);
1310 }
1311 }
1312 return;
1313 }
1314 Scm_Error("internal error in regexp compilation: bad node: %S", ast);
1315 }
1316
1317 static int is_bol_anchored(ScmObj ast)
1318 {
1319 ScmObj type;
1320 if (!SCM_PAIRP(ast)) {
1321 if (SCM_EQ(ast, SCM_SYM_BOL)) return TRUE;
1322 else return FALSE;
1323 }
1324 type = SCM_CAR(ast);
1325 if (SCM_INTP(type) || SCM_EQ(type, SCM_SYM_SEQ)
1326 || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
1327 if (!SCM_PAIRP(SCM_CDR(ast))) return FALSE;
1328 return is_bol_anchored(SCM_CADR(ast));
1329 }
1330 if (SCM_EQ(type, SCM_SYM_ALT)) {
1331 ScmObj ap;
1332 SCM_FOR_EACH(ap, SCM_CDR(ast)) {
1333 if (!is_bol_anchored(SCM_CAR(ap))) return FALSE;
1334 }
1335 return TRUE;
1336 }
1337 return FALSE;
1338 }
1339
1340 /* pass 3 */
1341 static ScmObj rc3(regcomp_ctx *ctx, ScmObj ast)
1342 {
1343 /* check if ast is bol-anchored */
1344 if (is_bol_anchored(ast)) ctx->rx->flags |= SCM_REGEXP_BOL_ANCHORED;
1345
1346 /* pass 3-1 : count # of insns */
1347 ctx->codemax = 1;
1348 ctx->emitp = FALSE;
1349 rc3_rec(ctx, ast, TRUE, TRUE);
1350
1351 /* pass 3-2 : code generation */
1352 ctx->code = SCM_NEW_ATOMIC2(unsigned char *, ctx->codemax);
1353 ctx->emitp = TRUE;
1354 rc3_rec(ctx, ast, TRUE, TRUE);
1355 rc3_emit(ctx, RE_SUCCESS);
1356 ctx->rx->code = ctx->code;
1357 ctx->rx->numCodes = ctx->codep;
1358 return SCM_OBJ(ctx->rx);
1359 }
1360
1361 /* For debug */
1362 #if SCM_DEBUG_HELPER
1363 void Scm_RegDump(ScmRegexp *rx)
1364 {
1365 int end = rx->numCodes, codep;
1366
1367 Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x)\n", rx, rx->flags);
1368 Scm_Printf(SCM_CUROUT, " must = ");
1369 if (rx->mustMatch) {
1370 Scm_Printf(SCM_CUROUT, "%S\n", rx->mustMatch);
1371 } else {
1372 Scm_Printf(SCM_CUROUT, "(none)\n");
1373 }
1374
1375 for (codep = 0; codep < end; codep++) {
1376 switch (rx->code[codep]) {
1377 case RE_MATCH1:;
1378 case RE_MATCH1_CI:
1379 codep++;
1380 Scm_Printf(SCM_CUROUT, "%4d %s 0x%02x '%c'\n",
1381 codep-1,
1382 (rx->code[codep-1]==RE_MATCH1? "MATCH1":"MATCH1_CI"),
1383 rx->code[codep], rx->code[codep]);
1384 continue;
1385 case RE_MATCH:;
1386 case RE_MATCH_CI:
1387 codep++;
1388 {
1389 u_int numchars = (u_int)rx->code[codep];
1390 int i;
1391 Scm_Printf(SCM_CUROUT, "%4d %s(%3d) '",
1392 codep-1,
1393 (rx->code[codep-1]==RE_MATCH? "MATCH":"MATCH_CI"),
1394 numchars);
1395 for (i=0; i< numchars; i++)
1396 Scm_Printf(SCM_CUROUT, "%c", rx->code[++codep]);
1397 Scm_Printf(SCM_CUROUT, "'\n");
1398 }
1399 continue;
1400 case RE_ANY:
1401 Scm_Printf(SCM_CUROUT, "%4d ANY\n", codep);
1402 continue;
1403 case RE_TRY:
1404 codep++;
1405 Scm_Printf(SCM_CUROUT, "%4d TRY %d\n", codep-1,
1406 (rx->code[codep])*256 + rx->code[codep+1]);
1407 codep++;
1408 continue;
1409 case RE_SET:
1410 codep++;
1411 Scm_Printf(SCM_CUROUT, "%4d SET %d %S\n",
1412 codep-1, rx->code[codep],
1413 rx->sets[rx->code[codep]]);
1414 continue;
1415 case RE_NSET:
1416 codep++;
1417 Scm_Printf(SCM_CUROUT, "%4d NSET %d %S\n",
1418 codep-1, rx->code[codep],
1419 rx->sets[rx->code[codep]]);
1420 continue;
1421 case RE_SET1:
1422 codep++;
1423 Scm_Printf(SCM_CUROUT, "%4d SET1 %d %S\n",
1424 codep-1, rx->code[codep],
1425 rx->sets[rx->code[codep]]);
1426 continue;
1427 case RE_NSET1:
1428 codep++;
1429 Scm_Printf(SCM_CUROUT, "%4d NSET1 %d %S\n",
1430 codep-1, rx->code[codep],
1431 rx->sets[rx->code[codep]]);
1432 continue;
1433 case RE_JUMP:
1434 codep++;
1435 Scm_Printf(SCM_CUROUT, "%4d JUMP %d\n", codep-1,
1436 (rx->code[codep])*256 + rx->code[codep+1]);
1437 codep++;
1438 continue;
1439 case RE_FAIL:
1440 Scm_Printf(SCM_CUROUT, "%4d FAIL\n", codep);
1441 continue;
1442 case RE_SUCCESS:
1443 Scm_Printf(SCM_CUROUT, "%4d SUCCESS\n", codep);
1444 continue;
1445 case RE_BEGIN:
1446 codep++;
1447 Scm_Printf(SCM_CUROUT, "%4d BEGIN %d\n", codep-1, rx->code[codep]);
1448 continue;
1449 case RE_END:
1450 codep++;
1451 Scm_Printf(SCM_CUROUT, "%4d END %d\n", codep-1, rx->code[codep]);
1452 continue;
1453 case RE_BOL:
1454 Scm_Printf(SCM_CUROUT, "%4d BOL\n", codep);
1455 continue;
1456 case RE_EOL:
1457 Scm_Printf(SCM_CUROUT, "%4d EOL\n", codep);
1458 continue;
1459 case RE_WB:
1460 Scm_Printf(SCM_CUROUT, "%4d WB\n", codep);
1461 continue;
1462 case RE_NWB:
1463 Scm_Printf(SCM_CUROUT, "%4d NWB\n", codep);
1464 continue;
1465 case RE_MATCH1B:
1466 Scm_Printf(SCM_CUROUT, "%4d MATCH1B %02x '%c', %d\n",
1467 codep, rx->code[codep+1], rx->code[codep+1],
1468 rx->code[codep+2]*256+rx->code[codep+3]);
1469 codep += 3;
1470 continue;
1471 case RE_SET1R:
1472 codep++;
1473 Scm_Printf(SCM_CUROUT, "%4d SET1R %d %S\n",
1474 codep-1, rx->code[codep],
1475 rx->sets[rx->code[codep]]);
1476 continue;
1477 case RE_NSET1R:
1478 codep++;
1479 Scm_Printf(SCM_CUROUT, "%4d NSET1R %d %S\n",
1480 codep-1, rx->code[codep],
1481 rx->sets[rx->code[codep]]);
1482 continue;
1483 case RE_SETR:
1484 codep++;
1485 Scm_Printf(SCM_CUROUT, "%4d SETR %d %S\n",
1486 codep-1, rx->code[codep],
1487 rx->sets[rx->code[codep]]);
1488 continue;
1489 case RE_NSETR:
1490 codep++;
1491 Scm_Printf(SCM_CUROUT, "%4d NSETR %d %S\n",
1492 codep-1, rx->code[codep],
1493 rx->sets[rx->code[codep]]);
1494 continue;
1495 case RE_ASSERT:
1496 codep++;
1497 Scm_Printf(SCM_CUROUT, "%4d ASSERT %d\n", codep-1,
1498 (rx->code[codep])*256 + rx->code[codep+1]);
1499 codep++;
1500 continue;
1501 case RE_NASSERT:
1502 codep++;
1503 Scm_Printf(SCM_CUROUT, "%4d NASSERT %d\n", codep-1,
1504 (rx->code[codep])*256 + rx->code[codep+1]);
1505 codep++;
1506 continue;
1507 default:
1508 Scm_Error("regexp screwed up\n");
1509 }
1510 }
1511 }
1512 #endif /* SCM_DEBUG_HELPER */
1513
1514 /* Helper routine to be used for compilation from AST.
1515 Traverses AST to reorder groups and collect charsets.
1516 Note that the native regcomp path doesn't use these fns.
1517 Only the AST provided from outside is processed. */
1518 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq);
1519
1520 static ScmObj rc_setup_context(regcomp_ctx *ctx, ScmObj ast)
1521 {
1522 ScmObj type, rest;
1523 if (!SCM_PAIRP(ast)) {
1524 if (SCM_CHARP(ast)) return ast;
1525 if (SCM_CHARSETP(ast)) {
1526 rc_register_charset(ctx, SCM_CHARSET(ast));
1527 return ast;
1528 }
1529 if (SCM_EQ(ast, SCM_SYM_BOL) || SCM_EQ(ast, SCM_SYM_EOL)
1530 || SCM_EQ(ast, SCM_SYM_WB) || SCM_EQ(ast, SCM_SYM_NWB)
1531 || SCM_EQ(ast, SCM_SYM_ANY)) {
1532 return ast;
1533 }
1534 goto badast;
1535 }
1536 type = SCM_CAR(ast);
1537 if (SCM_INTP(type)) {
1538 int grpno = ctx->grpcount++;
1539 rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
1540 if (SCM_INT_VALUE(type) == grpno && SCM_EQ(SCM_CDR(ast), rest)) {
1541 return ast;
1542 } else {
1543 return Scm_Cons(SCM_MAKE_INT(grpno), rest);
1544 }
1545 }
1546 if (SCM_EQ(type, SCM_SYM_COMP)) {
1547 if (!SCM_CHARSETP(SCM_CDR(ast))) goto badast;
1548 rc_register_charset(ctx, SCM_CHARSET(SCM_CDR(ast)));
1549 return ast;
1550 }
1551 if (SCM_EQ(type, SCM_SYM_SEQ) || SCM_EQ(type, SCM_SYM_ALT)
1552 || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)
1553 || SCM_EQ(type, SCM_SYM_REP) || SCM_EQ(type, SCM_SYM_REP_MIN)
1554 || SCM_EQ(type, SCM_SYM_ASSERT) || SCM_EQ(type, SCM_SYM_NASSERT)
1555 || SCM_EQ(type, SCM_SYM_REP_WHILE)) {
1556 rest = rc_setup_context_seq(ctx, SCM_CDR(ast));
1557 if (SCM_EQ(SCM_CDR(ast), rest)) return ast;
1558 else return Scm_Cons(type, rest);
1559 }
1560 if (SCM_EQ(type, SCM_SYM_REP_BOUND) || SCM_EQ(type, SCM_SYM_REP_BOUND_MIN)) {
1561 if (!SCM_PAIRP(SCM_CDR(ast)) || !SCM_INTP(SCM_CADR(ast))
1562 || SCM_INT_VALUE(SCM_CADR(ast)) < 0) {
1563 goto badast;
1564 }
1565 rest = rc_setup_context_seq(ctx, SCM_CDDR(ast));
1566 if (SCM_EQ(SCM_CDDR(ast), rest)) return ast;
1567 else return Scm_Cons(type, Scm_Cons(SCM_CADR(ast), rest));
1568 }
1569 badast:
1570 Scm_Error("invalid regexp AST: %S", ast);
1571 return SCM_UNDEFINED; /* dummy */
1572 }
1573
1574 static ScmObj rc_setup_context_seq(regcomp_ctx *ctx, ScmObj seq)
1575 {
1576 ScmObj sp, sp2, obj, head = SCM_NIL, tail = SCM_NIL;
1577 SCM_FOR_EACH(sp, seq) {
1578 obj = rc_setup_context(ctx, SCM_CAR(sp));
1579 if (!SCM_EQ(obj, SCM_CAR(sp))) break;
1580 }
1581 if (SCM_NULLP(sp)) return seq;
1582 /* we need to copy the spine */
1583 SCM_FOR_EACH(sp2, seq) {
1584 if (SCM_EQ(sp2, sp)) break;
1585 SCM_APPEND1(head, tail, SCM_CAR(sp2));
1586 }
1587 SCM_FOR_EACH(sp2, sp2) {
1588 SCM_APPEND1(head, tail, rc_setup_context(ctx, SCM_CAR(sp2)));
1589 }
1590 return head;
1591 }
1592
1593 /*--------------------------------------------------------------
1594 * Compiler entry point
1595 */
1596 ScmObj Scm_RegComp(ScmString *pattern, int flags)
1597 {
1598 ScmRegexp *rx = make_regexp();
1599 ScmObj ast;
1600 regcomp_ctx cctx;
1601
1602 if (SCM_STRING_INCOMPLETE_P(pattern)) {
1603 Scm_Error("incomplete string is not allowed: %S", pattern);
1604 }
1605 rx->pattern = SCM_STRING(Scm_CopyStringWithFlags(pattern,
1606 SCM_STRING_IMMUTABLE,
1607 SCM_STRING_IMMUTABLE));
1608 rc_ctx_init(&cctx, rx);
1609 cctx.casefoldp = flags & SCM_REGEXP_CASE_FOLD;
1610 rx->flags |= (flags & SCM_REGEXP_CASE_FOLD);
1611
1612 /* pass 1 : parse regexp spec */
1613 ast = rc1(&cctx);
1614 rc_setup_charsets(rx, &cctx);
1615 if (flags & SCM_REGEXP_PARSE_ONLY) return ast;
1616
1617 /* pass 2 : optimization */
1618 ast = rc2_optimize(ast, SCM_NIL);
1619
1620 /* pass 3 : generate bytecode */
1621 return rc3(&cctx, ast);
1622 }
1623
1624 /* alternative entry that compiles from AST */
1625 ScmObj Scm_RegCompFromAST(ScmObj ast)
1626 {
1627 ScmRegexp *rx = make_regexp();
1628 regcomp_ctx cctx;
1629 rc_ctx_init(&cctx, rx);
1630
1631 /* prepare some context */
1632 if (!SCM_PAIRP(ast) || !SCM_INTP(SCM_CAR(ast))) {
1633 /* ensure the entire AST is in a group #0 */
1634 ast = SCM_LIST2(SCM_MAKE_INT(0), ast);
1635 }
1636 ast = rc_setup_context(&cctx, ast);
1637 rc_setup_charsets(rx, &cctx);
1638 rx->numGroups = cctx.grpcount+1;
1639
1640 /* pass 3 */
1641 return rc3(&cctx, ast);
1642 }
1643
1644 /*=======================================================================
1645 * Matcher
1646 */
1647
1648 /* For now, I use C-stack directly to keep information for backtrack,
1649 * i.e. anytime I should try something I recursively call rex_rec().
1650 * It may run out the stack space if regexp requires deep recursion.
1651 *
1652 * Rex_rec doesn't return as long as match succeeds. At the end of
1653 * code, it longjmp's to the start of matcher.
1654 *
1655 * My preliminary test showed that using C-stack & longjmp is faster than
1656 * allocating and maintaining the stack by myself. Further test is required
1657 * for practical case, though.
1658 */
1659
1660 struct match_list {
1661 struct match_list *next;
1662 int grpnum;
1663 const char *ptr;
1664 };
1665
1666 struct match_ctx {
1667 ScmRegexp *rx;
1668 const unsigned char *codehead; /* start of code */
1669 const char *input; /* start of input */
1670 const char *stop; /* end of input */
1671 const char *last;
1672 struct match_list *matches;
1673 void *begin_stack; /* C stack pointer the match began from. */
1674 sigjmp_buf *cont;
1675 };
1676
1677 #define MAX_STACK_USAGE 0x100000
1678
1679 static struct match_list *push_match(struct match_list *mlist,
1680 int grpnum, const char *ptr)
1681 {
1682 struct match_list *elt = SCM_NEW(struct match_list);
1683 elt->next = mlist;
1684 elt->grpnum = grpnum;
1685 elt->ptr = ptr;
1686 return elt;
1687 }
1688
1689 static int match_ci(const char **input, const unsigned char **code, int length)
1690 {
1691 unsigned char inch, c;
1692 int csize, i;
1693 do {
1694 inch = *(*input)++;
1695 c = *(*code)++;
1696 if ((csize = SCM_CHAR_NFOLLOWS(inch)) == 0) {
1697 if (c != SCM_CHAR_DOWNCASE(inch)) return FALSE;
1698 } else {
1699 if (c != inch) return FALSE;
1700 for (i=0; i<csize; i++) {
1701 if ((unsigned char)*(*code)++ != (unsigned char)*(*input)++)
1702 return FALSE;
1703 }
1704 }
1705 length -= (csize+1);
1706 } while (length > 0);
1707 return TRUE;
1708 }
1709
1710 /* Check if input points to the word boundary. For now, I consider
1711 all multibyte chars word-constituent. */
1712 static int is_word_constituent(unsigned char b)
1713 {
1714 if (b >= 128) return TRUE;
1715 if (b >= '0' && b <= '9') return TRUE;
1716 if (b >= 'A' && b <= 'Z') return TRUE;
1717 if (b >= 'a' && b <= 'z') return TRUE;
1718 return FALSE;
1719 }
1720
1721 static int is_word_boundary(struct match_ctx *ctx, const char *input)
1722 {
1723 unsigned char nextb, prevb;
1724 const char *prevp;
1725
1726 if (input == ctx->input || input == ctx->stop) return TRUE;
1727 nextb = (unsigned char)*input;
1728 SCM_CHAR_BACKWARD(input, ctx->input, prevp);
1729 SCM_ASSERT(prevp != NULL);
1730 prevb = (unsigned char)*prevp;
1731 if ((is_word_constituent(nextb) && !is_word_constituent(prevb))
1732 || (!is_word_constituent(nextb) && is_word_constituent(prevb))) {
1733 return TRUE;
1734 }
1735 return FALSE;
1736 }
1737
1738 static void rex_rec(const unsigned char *code,
1739 const char *input,
1740 struct match_ctx *ctx,
1741 struct match_list *mlist)
1742 {
1743 register int param;
1744 register ScmChar ch;
1745 ScmCharSet *cset;
1746
1747 /* TODO: here we assume C-stack grows downward; need to check by
1748 configure */
1749 if ((char*)&cset < (char*)ctx->begin_stack - MAX_STACK_USAGE) {
1750 Scm_Error("stack overrun during matching regexp %S", ctx->rx);
1751 }
1752
1753 for (;;) {
1754 switch(*code++) {
1755 case RE_MATCH:
1756 param = *code++;
1757 if (ctx->stop - input < param) return;
1758 while (param-- > 0) {
1759 if (*code++ != (unsigned char)*input++) return;
1760 }
1761 continue;
1762 case RE_MATCH1:
1763 if (ctx->stop == input) return;
1764 if (*code++ != (unsigned char)*input++) return;
1765 continue;
1766 case RE_MATCH_CI:
1767 param = *code++;
1768 if (ctx->stop - input < param) return;
1769 if (!match_ci(&input, &code, param)) return;
1770 continue;
1771 case RE_MATCH1_CI:
1772 if (ctx->stop == input) return;
1773 param = (unsigned char)*input++;
1774 if (SCM_CHAR_NFOLLOWS(param)!=0
1775 || (*code++)!=SCM_CHAR_DOWNCASE(param)) {
1776 return;
1777 }
1778 continue;
1779 case RE_ANY:
1780 if (ctx->stop == input) return;
1781 input += SCM_CHAR_NFOLLOWS(*input) + 1;
1782 continue;
1783 case RE_TRY:
1784 rex_rec(code+2, input, ctx, mlist);
1785 code = ctx->codehead + code[0]*256 + code[1];
1786 continue;
1787 case RE_JUMP:
1788 code = ctx->codehead + code[0]*256 + code[1];
1789 continue;
1790 case RE_SET1:
1791 if (ctx->stop == input) return;
1792 if ((unsigned char)*input >= 128) return;
1793 if (!Scm_CharSetContains(ctx->rx->sets[*code++], *input)) return;
1794 input++;
1795 continue;
1796 case RE_NSET1:
1797 if (ctx->stop == input) return;
1798 if ((unsigned char)*input < 128) {
1799 if (Scm_CharSetContains(ctx->rx->sets[*code++], *input))
1800 return;
1801 input++;
1802 } else {
1803 code++;
1804 input += SCM_CHAR_NFOLLOWS((unsigned char)*input) + 1;
1805 }
1806 continue;
1807 case RE_SET:
1808 if (ctx->stop == input) return;
1809 SCM_CHAR_GET(input, ch);
1810 cset = ctx->rx->sets[*code++];
1811 if (!Scm_CharSetContains(cset, ch)) return;
1812 input += SCM_CHAR_NBYTES(ch);
1813 continue;
1814 case RE_NSET:
1815 if (ctx->stop == input) return;
1816 SCM_CHAR_GET(input, ch);
1817 cset = ctx->rx->sets[*code++];
1818 if (Scm_CharSetContains(cset, ch)) return;
1819 input += SCM_CHAR_NBYTES(ch);
1820 continue;
1821 case RE_BEGIN:
1822 mlist = push_match(mlist, *code++, input);
1823 continue;
1824 case RE_END:
1825 mlist = push_match(mlist, -(*code++), input);
1826 continue;
1827 case RE_BOL:
1828 if (input != ctx->input) return;
1829 continue;
1830 case RE_EOL:
1831 if (input != ctx->stop) return;
1832 continue;
1833 case RE_WB:
1834 if (!is_word_boundary(ctx, input)) return;
1835 continue;
1836 case RE_NWB:
1837 if (is_word_boundary(ctx, input)) return;
1838 continue;
1839 case RE_SUCCESS:
1840 ctx->last = input;
1841 ctx->matches = mlist;
1842 siglongjmp(*ctx->cont, 1);
1843 /*NOTREACHED*/
1844 case RE_FAIL:
1845 return;
1846 case RE_SET1R:
1847 cset = ctx->rx->sets[*code++];
1848 for (;;) {
1849 if (ctx->stop <= input) break;
1850 if ((unsigned char)*input >= 128) break;
1851 if (!Scm_CharSetContains(cset, *input)) break;
1852 input++;
1853 }
1854 continue;
1855 case RE_NSET1R:
1856 cset = ctx->rx->sets[*code++];
1857 for (;;) {
1858 if (ctx->stop <= input) break;
1859 if ((unsigned char)*input < 128 ) {
1860 if (Scm_CharSetContains(cset, *input)) break;
1861 input++;
1862 } else {
1863 input+=SCM_CHAR_NFOLLOWS(*input)+1;
1864 }
1865 }
1866 continue;
1867 case RE_SETR:
1868 cset = ctx->rx->sets[*code++];
1869 for (;;) {
1870 if (ctx->stop <= input) break;
1871 SCM_CHAR_GET(input, ch);
1872 if (!Scm_CharSetContains(cset, ch)) break;
1873 input += SCM_CHAR_NBYTES(ch);
1874 }
1875 continue;
1876 case RE_NSETR:
1877 cset = ctx->rx->sets[*code++];
1878 for (;;) {
1879 if (ctx->stop <= input) break;
1880 SCM_CHAR_GET(input, ch);
1881 if (Scm_CharSetContains(cset, ch)) break;
1882 input += SCM_CHAR_NBYTES(ch);
1883 }
1884 continue;
1885 case RE_ASSERT: {
1886 sigjmp_buf cont, *ocont = ctx->cont;
1887 ctx->cont = &cont;
1888 if (sigsetjmp(cont, FALSE) == 0) {
1889 rex_rec(code+2, input, ctx, mlist);
1890 ctx->cont = ocont;
1891 return;
1892 }
1893 code = ctx->codehead + code[0]*256 + code[1];
1894 ctx->cont = ocont;
1895 mlist = ctx->matches;
1896 continue;
1897 }
1898 case RE_NASSERT: {
1899 sigjmp_buf cont, *ocont = ctx->cont;
1900 ctx->cont = &cont;
1901 if (sigsetjmp(cont, FALSE) == 0) {
1902 rex_rec(code+2, input, ctx, mlist);
1903 code = ctx->codehead + code[0]*256 + code[1];
1904 ctx->cont = ocont;
1905 continue;
1906 }
1907 ctx->cont = ocont;
1908 return;
1909 }
1910 default:
1911 /* shouldn't be here */
1912 Scm_Error("regexp implementation seems broken\n");
1913 }
1914 }
1915 }
1916
1917 static ScmObj make_match(ScmRegexp *rx, ScmString *orig,
1918 struct match_ctx *ctx)
1919 {
1920 int i;
1921 struct match_list *ml;
1922 ScmRegMatch *rm = SCM_NEW(ScmRegMatch);
1923 const ScmStringBody *origb;
1924 SCM_SET_CLASS(rm, SCM_CLASS_REGMATCH);
1925 rm->numMatches = rx->numGroups;
1926 rm->matches = SCM_NEW_ARRAY(struct ScmRegMatchSub, rx->numGroups);
1927 /* we keep information of original string separately, instead of
1928 keeping a pointer to orig; For orig may be destructively modified,
1929 but its elements are not. */
1930 origb = SCM_STRING_BODY(orig);
1931 rm->input = SCM_STRING_BODY_START(origb);
1932 rm->inputLen = SCM_STRING_BODY_LENGTH(origb);
1933 rm->inputSize = SCM_STRING_BODY_SIZE(origb);
1934 for (i=0; i<rx->numGroups; i++) {
1935 rm->matches[i].start = -1;
1936 rm->matches[i].length = -1;
1937 rm->matches[i].startp = NULL;
1938 rm->matches[i].endp = NULL;
1939 }
1940
1941 rm->matches[0].endp = ctx->last;
1942 /* scan through match result */
1943 for (ml = ctx->matches; ml; ml = ml->next) {
1944 if (ml->grpnum >= 0) {
1945 rm->matches[ml->grpnum].startp = ml->ptr;
1946 } else {
1947 rm->matches[-ml->grpnum].endp = ml->ptr;
1948 }
1949 }
1950
1951 /* sanity check (not necessary, but for now...) */
1952 for (i=0; i<rx->numGroups; i++) {
1953 if ((rm->matches[i].startp && !rm->matches[i].endp)
1954 || (!rm->matches[i].startp && rm->matches[i].endp)) {
1955 Scm_Panic("implementation error: discrepancy in regexp match #%d!", i);
1956 }
1957 }
1958 return SCM_OBJ(rm);
1959 }
1960
1961 static ScmObj rex(ScmRegexp *rx, ScmString *orig,
1962 const char *start, const char *end)
1963 {
1964 struct match_ctx ctx;
1965 sigjmp_buf cont;
1966 ctx.rx = rx;
1967 ctx.codehead = rx->code;
1968 ctx.input = SCM_STRING_BODY_START(SCM_STRING_BODY(orig));
1969 ctx.stop = end;
1970 ctx.matches = NULL;
1971 ctx.begin_stack = (void*)&ctx;
1972 ctx.cont = &cont;
1973
1974 if (sigsetjmp(cont, FALSE) == 0) {
1975 rex_rec(ctx.codehead, start, &ctx, NULL);
1976 return SCM_FALSE;
1977 } else {
1978 return make_match(rx, orig, &ctx);
1979 }
1980 }
1981
1982 /*----------------------------------------------------------------------
1983 * entry point
1984 */
1985 ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str)
1986 {
1987 const ScmStringBody *b = SCM_STRING_BODY(str);
1988 const char *start = SCM_STRING_BODY_START(b);
1989 const char *end = start + SCM_STRING_BODY_SIZE(b);
1990 const ScmStringBody *mb = rx->mustMatch? SCM_STRING_BODY(rx->mustMatch) : NULL;
1991 int mustMatchLen = mb? SCM_STRING_BODY_SIZE(mb) : 0;
1992
1993 if (SCM_STRING_INCOMPLETE_P(str)) {
1994 Scm_Error("incomplete string is not allowed: %S", str);
1995 }
1996 #if 0
1997 /* Disabled for now; we need to use more heuristics to determine
1998 when we should apply mustMatch. For example, if the regexp
1999 begins with BOL assertion and constant string, then it would be
2000 faster to go for rex directly. */
2001 if (rx->mustMatch) {
2002 /* Prescreening. If the input string doesn't contain mustMatch
2003 string, it can't match the entire expression. */
2004 if (SCM_FALSEP(Scm_StringScan(str, rx->mustMatch,
2005 SCM_STRING_SCAN_INDEX))) {
2006 return SCM_FALSE;
2007 }
2008 }
2009 #endif
2010 /* short cut : if rx matches only at the beginning of the string,
2011 we only run from the beginning of the string */
2012 if (rx->flags & SCM_REGEXP_BOL_ANCHORED) {
2013 return rex(rx, str, start, end);
2014 }
2015 /* normal matching */
2016 while (start <= end-mustMatchLen) {
2017 ScmObj r = rex(rx, str, start, end);
2018 if (!SCM_FALSEP(r)) return r;
2019 start += SCM_CHAR_NFOLLOWS(*start)+1;
2020 }
2021 return SCM_FALSE;
2022 }
2023
2024 /*=======================================================================
2025 * Retrieving matches
2026 */
2027
2028 /* TODO: MT Warning: these retrival functions change match object's
2029 * internal state.
2030 */
2031 ScmObj Scm_RegMatchSubstr(ScmRegMatch *rm, int i)
2032 {
2033 struct ScmRegMatchSub *sub;
2034 if (i < 0 || i >= rm->numMatches)
2035 Scm_Error("submatch index out of range: %d", i);
2036 sub = &rm->matches[i];
2037 if (sub->startp == NULL) {
2038 return SCM_FALSE;
2039 } else if (sub->length >= 0) {
2040 return Scm_MakeString(sub->startp, sub->endp - sub->startp,
2041 sub->length, 0);
2042 } else {
2043 ScmObj s = Scm_MakeString(sub->startp, sub->endp - sub->startp, -1, 0);
2044 sub->length = SCM_STRING_BODY_LENGTH(SCM_STRING_BODY(s));
2045 return s;
2046 }
2047 }
2048
2049 ScmObj Scm_RegMatchStart(ScmRegMatch *rm, int i)
2050 {
2051 struct ScmRegMatchSub *sub;
2052 if (i < 0 || i >= rm->numMatches)
2053 Scm_Error("submatch index out of range: %d", i);
2054 sub = &rm->matches[i];
2055 if (sub->startp == NULL) {
2056 return SCM_FALSE;
2057 } else if (sub->start < 0) {
2058 sub->start = Scm_MBLen(rm->input, sub->startp);
2059 }
2060 return Scm_MakeInteger(sub->start);
2061 }
2062
2063 ScmObj Scm_RegMatchEnd(ScmRegMatch *rm, int i)
2064 {
2065 struct ScmRegMatchSub *sub;
2066 if (i < 0 || i >= rm->numMatches)
2067 Scm_Error("submatch index out of range: %d", i);
2068 sub = &rm->matches[i];
2069 if (sub->startp == NULL) {
2070 return SCM_FALSE;
2071 } else if (sub->start < 0) {
2072 sub->start = Scm_MBLen(rm->input, sub->startp);
2073 }
2074 if (sub->length < 0) {
2075 sub->length = Scm_MBLen(sub->startp, sub->endp);
2076 }
2077 return Scm_MakeInteger(sub->start + sub->length);
2078 }
2079
2080 ScmObj Scm_RegMatchBefore(ScmRegMatch *rm, int i)
2081 {
2082 struct ScmRegMatchSub *sub;
2083 if (i < 0 || i >= rm->numMatches)
2084 Scm_Error("submatch index out of range: %d", i);
2085 sub = &rm->matches[i];
2086 if (sub->startp == NULL) return SCM_FALSE;
2087 return Scm_MakeString(rm->input, sub->startp - rm->input, -1, 0);
2088 }
2089
2090 ScmObj Scm_RegMatchAfter(ScmRegMatch *rm, int i)
2091 {
2092 struct ScmRegMatchSub *sub;
2093 if (i < 0 || i >= rm->numMatches)
2094 Scm_Error("submatch index out of range: %d", i);
2095 sub = &rm->matches[i];
2096 if (sub->startp == NULL) return SCM_FALSE;
2097 return Scm_MakeString(sub->endp,
2098 rm->input + rm->inputSize - sub->endp, -1, 0);
2099 }
2100
2101 /* for debug */
2102 #if SCM_DEBUG_HELPER
2103 void Scm_RegMatchDump(ScmRegMatch *rm)
2104 {
2105 int i;
2106
2107 Scm_Printf(SCM_CUROUT, "RegMatch %p\n", rm);
2108 Scm_Printf(SCM_CUROUT, " numMatches = %d\n", rm->numMatches);
2109 Scm_Printf(SCM_CUROUT, " input = %S\n", rm->input);
2110 for (i=0; i<rm->numMatches; i++) {
2111 struct ScmRegMatchSub *sub = &rm->matches[i];
2112 if (sub->startp) {
2113 Scm_Printf(SCM_CUROUT, "[%3d-%3d] %S\n",
2114 sub->startp - rm->input,
2115 sub->endp - rm->input,
2116 Scm_MakeString(sub->startp, sub->endp-sub->startp,
2117 -1, 0));
2118 } else {
2119 Scm_Printf(SCM_CUROUT, "[---] #f\n");
2120 }
2121 }
2122 }
2123 #endif /*SCM_DEBUG_HELPER*/
2124
2125 /*=======================================================================
2126 * Initializing stuff
2127 */
2128
2129 void Scm__InitRegexp(void)
2130 {
2131 }