/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes following definitions.
- make_str
- make_str_body
- Scm_StringDump
- count_size_and_length
- count_length
- Scm_MBLen
- Scm_MakeString
- Scm_MakeFillString
- Scm_ListToString
- Scm_GetString
- get_string_from_body
- Scm_GetStringConst
- Scm_GetStringContent
- Scm_CopyStringWithFlags
- Scm_StringCompleteToIncompleteX
- Scm_StringCompleteToIncomplete
- Scm_StringIncompleteToCompleteX
- Scm_StringIncompleteToComplete
- Scm_StringEqual
- Scm_StringCmp
- sb_strcasecmp
- mb_strcasecmp
- Scm_StringCiCmp
- forward_pos
- Scm_StringRef
- Scm_StringByteRef
- Scm_StringPosition
- Scm_StringAppend2
- Scm_StringAppendC
- Scm_StringAppend
- Scm_StringJoin
- string_substitute
- Scm_StringSubstitute
- Scm_StringSet
- Scm_StringByteSet
- substring
- Scm_Substring
- Scm_MaybeSubstring
- Scm_StringSplitByChar
- boyer_moore
- string_scan
- Scm_StringScan
- Scm_StringScanChar
- Scm_StringToList
- Scm_StringFill
- Scm_ConstCStringArrayToList
- Scm_CStringArrayToList
- string_putc
- string_print
- Scm_MakeStringPointer
- Scm_StringPointerRef
- Scm_StringPointerNext
- Scm_StringPointerPrev
- Scm_StringPointerSet
- Scm_StringPointerSubstring
- Scm_StringPointerCopy
- Scm_StringPointerDump
- Scm_DStringInit
- Scm_DStringSize
- Scm__DStringRealloc
- dstring_getz
- Scm_DStringGet
- Scm_DStringGetz
- Scm_DStringPutz
- Scm_DStringAdd
- Scm_DStringPutb
- Scm_DStringPutc
- Scm_DStringDump
1 /*
2 * string.c - string implementation
3 *
4 * Copyright (c) 2000-2005 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: string.c,v 1.75 2005/10/13 08:14:13 shirok Exp $
34 */
35
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <sys/types.h>
39 #include <string.h>
40 #define LIBGAUCHE_BODY
41 #include "gauche.h"
42
43 void Scm_DStringDump(FILE *out, ScmDString *dstr);
44
45 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx);
46 SCM_DEFINE_BUILTIN_CLASS(Scm_StringClass, string_print, NULL, NULL, NULL,
47 SCM_CLASS_SEQUENCE_CPL);
48
49 #define CHECK_MUTABLE(str) \
50 do { \
51 if (SCM_STRING_IMMUTABLE_P(str)) \
52 Scm_Error("attempted to modify immutable string: %S", str); \
53 } while (0)
54
55 /* Internal primitive constructor. LEN can be negative if the string
56 is incomplete. */
57 static ScmString *make_str(int len, int siz, const char *p, int flags)
58 {
59 ScmString *s = SCM_NEW(ScmString);
60 SCM_SET_CLASS(s, SCM_CLASS_STRING);
61
62 if (len < 0) flags |= SCM_STRING_INCOMPLETE;
63 if (flags & SCM_STRING_INCOMPLETE) len = siz;
64
65 s->body = NULL;
66 s->initialBody.flags = flags & SCM_STRING_FLAG_MASK;
67 s->initialBody.length = len;
68 s->initialBody.size = siz;
69 s->initialBody.start = p;
70 return s;
71 }
72
73 static ScmStringBody *make_str_body(int len, int siz, const char *p, int flags)
74 {
75 ScmStringBody *b = SCM_NEW(ScmStringBody);
76 b->length = (len < 0)? siz : len;
77 b->size = siz;
78 b->start = p;
79 b->flags = flags;
80 return b;
81 }
82
83 #define DUMP_LENGTH 50
84
85 /* for debug */
86 #if SCM_DEBUG_HELPER
87 void Scm_StringDump(FILE *out, ScmObj str)
88 {
89 int i;
90 const ScmStringBody *b = SCM_STRING_BODY(str);
91 int s = SCM_STRING_BODY_SIZE(b);
92 const char *p = SCM_STRING_BODY_START(b);
93
94 fprintf(out, "STR(len=%d,siz=%d) \"", SCM_STRING_BODY_LENGTH(b), s);
95 for (i=0; i < DUMP_LENGTH && s > 0;) {
96 int n = SCM_CHAR_NFOLLOWS(*p) + 1;
97 for (; n > 0 && s > 0; p++, n--, s--, i++) {
98 putc(*p, out);
99 }
100 }
101 if (s > 0) {
102 fputs("...\"\n", out);
103 } else {
104 fputs("\"\n", out);
105 }
106 }
107 #endif /*SCM_DEBUG_HELPER*/
108
109 /*
110 * Multibyte length calculation
111 */
112
113 /* We have multiple similar functions, due to performance reasons. */
114
115 /* Calculate both length and size of C-string str.
116 If str is incomplete, *plen gets -1. */
117 static inline int count_size_and_length(const char *str, int *psize, int *plen)
118 {
119 char c;
120 const char *p = str;
121 int size = 0, len = 0;
122 while ((c = *p++) != 0) {
123 int i = SCM_CHAR_NFOLLOWS(c);
124 len++;
125 size++;
126 while (i-- > 0) {
127 if (!*p++) { len = -1; goto eos; }
128 size++;
129 }
130 }
131 eos:
132 *psize = size;
133 *plen = len;
134 return len;
135 }
136
137 /* Calculate length of known size string. str can contain NUL character. */
138 static inline int count_length(const char *str, int size)
139 {
140 int count = 0;
141
142 while (size-- > 0) {
143 ScmChar ch;
144 unsigned char c = (unsigned char)*str;
145 int i = SCM_CHAR_NFOLLOWS(c);
146 if (i < 0 || i > size) return -1;
147 SCM_CHAR_GET(str, ch);
148 if (ch == SCM_CHAR_INVALID) return -1;
149 count++;
150 str += i+1;
151 size -= i;
152 }
153 return count;
154 }
155
156 /* Returns length of string, starts from str and end at stop.
157 If stop is NULL, str is regarded as C-string (NUL terminated).
158 If the string is incomplete, returns -1. */
159 int Scm_MBLen(const char *str, const char *stop)
160 {
161 int size = (stop == NULL)? strlen(str) : (stop - str);
162 return count_length(str, size);
163 }
164
165 /*----------------------------------------------------------------
166 * Constructors
167 */
168
169 /* General constructor. */
170 ScmObj Scm_MakeString(const char *str, int size, int len, int flags)
171 {
172 ScmString *s;
173
174 if (size < 0) count_size_and_length(str, &size, &len);
175 else if (len < 0) len = count_length(str, size);
176 if (flags & SCM_MAKSTR_COPYING) {
177 char *nstr = SCM_NEW_ATOMIC2(char *, size + 1);
178 memcpy(nstr, str, size);
179 nstr[size] = '\0'; /* be kind to C */
180 s = make_str(len, size, nstr, flags);
181 } else {
182 s = make_str(len, size, str, flags);
183 }
184 return SCM_OBJ(s);
185 }
186
187 ScmObj Scm_MakeFillString(int len, ScmChar fill)
188 {
189 int size = SCM_CHAR_NBYTES(fill), i;
190 char *ptr = SCM_NEW_ATOMIC2(char *, size*len+1);
191 char *p;
192
193 if (len < 0) Scm_Error("length out of range: %d", len);
194 for (i=0, p=ptr; i<len; i++, p+=size) {
195 SCM_CHAR_PUT(p, fill);
196 }
197 ptr[size*len] = '\0';
198 return SCM_OBJ(make_str(len, size*len, ptr, 0));
199 }
200
201 ScmObj Scm_ListToString(ScmObj chars)
202 {
203 ScmObj cp;
204 int size = 0, len = 0;
205 ScmChar ch;
206 char *buf, *bufp;
207
208 SCM_FOR_EACH(cp, chars) {
209 if (!SCM_CHARP(SCM_CAR(cp)))
210 Scm_Error("character required, but got %S", SCM_CAR(cp));
211 ch = SCM_CHAR_VALUE(SCM_CAR(cp));
212 size += SCM_CHAR_NBYTES(ch);
213 len++;
214 }
215 bufp = buf = SCM_NEW_ATOMIC2(char *, size+1);
216 SCM_FOR_EACH(cp, chars) {
217 ch = SCM_CHAR_VALUE(SCM_CAR(cp));
218 SCM_CHAR_PUT(bufp, ch);
219 bufp += SCM_CHAR_NBYTES(ch);
220 }
221 *bufp = '\0';
222 return Scm_MakeString(buf, size, len, 0);
223 }
224
225 /* Extract string as C-string. This one guarantees to return
226 mutable string (we always copy) */
227 char *Scm_GetString(ScmString *str)
228 {
229 int size;
230 char *p;
231 const ScmStringBody *b = SCM_STRING_BODY(str);
232
233 size = SCM_STRING_BODY_SIZE(b);
234 p = SCM_NEW_ATOMIC2(char *, size+1);
235 memcpy(p, SCM_STRING_BODY_START(b), size);
236 p[size] = '\0';
237 return p;
238 }
239
240 /* Common routine for Scm_GetStringConst and Scm_GetStringContent */
241 static const char *get_string_from_body(const ScmStringBody *b)
242 {
243 int size = SCM_STRING_BODY_SIZE(b);
244 if (SCM_STRING_BODY_START(b)[size] == '\0') {
245 /* we can use string data as C-string */
246 return SCM_STRING_BODY_START(b);
247 } else {
248 char *p = SCM_NEW_ATOMIC2(char *, size+1);
249 memcpy(p, SCM_STRING_BODY_START(b), size);
250 p[size] = '\0';
251 /* kludge! This breaks 'const' qualification, but we know
252 this is an idempotent operation from the outside */
253 ((ScmStringBody*)b)->start = p; /* discard const qualifier */
254 return p;
255 }
256 }
257
258
259 /* Extract string as C-string. Returned string is immutable,
260 so we can directly return the body of the string. */
261 const char *Scm_GetStringConst(ScmString *str)
262 {
263 return get_string_from_body(SCM_STRING_BODY(str));
264 }
265
266 /* Atomically extracts C-string, length, size, and incomplete flag.
267 MT-safe. */
268 const char *Scm_GetStringContent(ScmString *str,
269 unsigned int *psize, /* out */
270 unsigned int *plength, /* out */
271 unsigned int *pflags) /* out */
272 {
273 const ScmStringBody *b = SCM_STRING_BODY(str);
274 if (psize) *psize = SCM_STRING_BODY_SIZE(b);
275 if (plength) *plength = SCM_STRING_BODY_LENGTH(b);
276 if (pflags) *pflags = SCM_STRING_BODY_FLAGS(b);
277 return get_string_from_body(b);
278 }
279
280
281 /* Copy string. You can modify the flags of the newly created string
282 by FLAGS and MASK arguments; for the bits set in MASK, corresponding
283 bits in FLAGS are copied to the new string, and for other bits, the
284 original flags are copied.
285
286 The typical semantics of copy-string is achieved by passing 0 to
287 FLAGS and SCM_STRING_IMMUTABLE to MASK (i.e. reset IMMUTABLE flag,
288 and keep other flags intact.
289
290 NB: This routine doesn't check whether specified flag is valid
291 with the string content, i.e. you can drop INCOMPLETE flag with
292 copying, while the string content won't be checked if it consists
293 valid complete string. */
294 ScmObj Scm_CopyStringWithFlags(ScmString *x, int flags, int mask)
295 {
296 const ScmStringBody *b = SCM_STRING_BODY(x);
297 int size = SCM_STRING_BODY_SIZE(b);
298 int len = SCM_STRING_BODY_LENGTH(b);
299 const char *start = SCM_STRING_BODY_START(b);
300 int newflags = ((SCM_STRING_BODY_FLAGS(b) & ~mask)
301 | (flags & mask));
302
303 return SCM_OBJ(make_str(len, size, start, newflags));
304 }
305
306 ScmObj Scm_StringCompleteToIncompleteX(ScmString *x)
307 {
308 const ScmStringBody *b;
309 CHECK_MUTABLE(x);
310 b = SCM_STRING_BODY(x);
311 x->body = make_str_body(SCM_STRING_BODY_SIZE(b),
312 SCM_STRING_BODY_SIZE(b),
313 SCM_STRING_BODY_START(b),
314 SCM_STRING_BODY_FLAGS(b) | SCM_STRING_INCOMPLETE);
315 return SCM_OBJ(x);
316 }
317
318 ScmObj Scm_StringCompleteToIncomplete(ScmString *x)
319 {
320 return Scm_CopyStringWithFlags(x, SCM_STRING_INCOMPLETE,
321 SCM_STRING_INCOMPLETE);
322 }
323
324 /* DEPRECATED. MT-UNSAFE */
325 ScmObj Scm_StringIncompleteToCompleteX(ScmString *x)
326 {
327 ScmStringBody *b;
328 CHECK_MUTABLE(x);
329 b = (ScmStringBody*)SCM_STRING_BODY(x);
330 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
331 int len = count_length(SCM_STRING_BODY_START(b),
332 SCM_STRING_BODY_SIZE(b));
333 if (len < 0) return SCM_FALSE;
334 b->flags &= ~SCM_STRING_INCOMPLETE;
335 b->length = len;
336 }
337 return SCM_OBJ(x);
338 }
339
340 ScmObj Scm_StringIncompleteToComplete(ScmString *x)
341 {
342 return Scm_StringIncompleteToCompleteX(SCM_STRING(Scm_CopyString(x)));
343 }
344
345 /*----------------------------------------------------------------
346 * Comparison
347 */
348
349 /* TODO: merge Equal and Cmp API; required generic comparison protocol */
350 int Scm_StringEqual(ScmString *x, ScmString *y)
351 {
352 const ScmStringBody *xb = SCM_STRING_BODY(x);
353 const ScmStringBody *yb = SCM_STRING_BODY(y);
354 if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
355 return FALSE;
356 }
357 if (SCM_STRING_BODY_SIZE(xb) != SCM_STRING_BODY_SIZE(yb)) {
358 return FALSE;
359 }
360 return (memcmp(SCM_STRING_BODY_START(xb),
361 SCM_STRING_BODY_START(yb),
362 SCM_STRING_BODY_SIZE(xb)) == 0? TRUE : FALSE);
363 }
364
365 int Scm_StringCmp(ScmString *x, ScmString *y)
366 {
367 int sizx, sizy, siz, r;
368 const ScmStringBody *xb = SCM_STRING_BODY(x);
369 const ScmStringBody *yb = SCM_STRING_BODY(y);
370 if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
371 Scm_Error("cannot compare incomplete vs complete string: %S, %S",
372 SCM_OBJ(x), SCM_OBJ(y));
373 }
374 sizx = SCM_STRING_BODY_SIZE(xb);
375 sizy = SCM_STRING_BODY_SIZE(yb);
376 siz = (sizx < sizy)? sizx : sizy;
377 r = memcmp(SCM_STRING_BODY_START(xb), SCM_STRING_BODY_START(yb), siz);
378 if (r == 0) return (sizx - sizy);
379 else return r;
380 }
381
382 /* single-byte case insensitive comparison */
383 static int sb_strcasecmp(const char *px, int sizx,
384 const char *py, int sizy)
385 {
386 char cx, cy;
387 for (; sizx > 0 && sizy > 0; sizx--, sizy--, px++, py++) {
388 cx = tolower(*px);
389 cy = tolower(*py);
390 if (cx == cy) continue;
391 return (cx - cy);
392 }
393 if (sizx > 0) return 1;
394 if (sizy > 0) return -1;
395 return 0;
396 }
397
398 /* multi-byte case insensitive comparison */
399 static int mb_strcasecmp(const char *px, int lenx,
400 const char *py, int leny)
401 {
402 int cx, cy, ccx, ccy, ix, iy;
403 for (; lenx > 0 && leny > 0; lenx--, leny--, px+=ix, py+=iy) {
404 SCM_CHAR_GET(px, cx);
405 SCM_CHAR_GET(py, cy);
406 ccx = SCM_CHAR_UPCASE(cx);
407 ccy = SCM_CHAR_UPCASE(cy);
408 if (ccx != ccy) return (ccx - ccy);
409 ix = SCM_CHAR_NBYTES(cx);
410 iy = SCM_CHAR_NBYTES(cy);
411 }
412 if (lenx > 0) return 1;
413 if (leny > 0) return -1;
414 return 0;
415 }
416
417 int Scm_StringCiCmp(ScmString *x, ScmString *y)
418 {
419 int sizx, lenx, sizy, leny;
420 const char *px, *py;
421 const ScmStringBody *xb = SCM_STRING_BODY(x);
422 const ScmStringBody *yb = SCM_STRING_BODY(y);
423
424 if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
425 Scm_Error("cannot compare incomplete strings in case-insensitive way: %S, %S",
426 SCM_OBJ(x), SCM_OBJ(y));
427 }
428 sizx = SCM_STRING_BODY_SIZE(xb); lenx = SCM_STRING_BODY_SIZE(xb);
429 sizy = SCM_STRING_BODY_SIZE(yb); leny = SCM_STRING_BODY_SIZE(yb);
430 px = SCM_STRING_BODY_START(xb);
431 py = SCM_STRING_BODY_START(yb);
432
433 if (sizx == lenx && sizy == leny) {
434 return sb_strcasecmp(px, sizx, py, sizy);
435 } else {
436 return mb_strcasecmp(px, lenx, py, leny);
437 }
438 }
439
440 /*----------------------------------------------------------------
441 * Reference
442 */
443
444 /* Internal fn for index -> position. Args assumed in boundary. */
445 static const char *forward_pos(const char *current, int offset)
446 {
447 int n;
448
449 while (offset--) {
450 n = SCM_CHAR_NFOLLOWS(*current);
451 current += n + 1;
452 }
453 return current;
454 }
455
456 /* string-ref.
457 * If POS is out of range,
458 * - returns SCM_CHAR_INVALID if range_error is FALSE
459 * - raise error otherwise.
460 * This differs from Scheme version, which takes an optional 'fallback'
461 * argument which will be returned when POS is out-of-range. We can't
462 * have the same semantics since the return type is limited.
463 */
464 ScmChar Scm_StringRef(ScmString *str, int pos, int range_error)
465 {
466 const ScmStringBody *b = SCM_STRING_BODY(str);
467 int len = SCM_STRING_BODY_LENGTH(b);
468
469 /* we can't allow string-ref on incomplete strings, since it may yield
470 invalid character object. */
471 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
472 Scm_Error("incomplete string not allowed : %S", str);
473 }
474 if (pos < 0 || pos >= len) {
475 if (range_error) {
476 Scm_Error("argument out of range: %d", pos);
477 } else {
478 return SCM_CHAR_INVALID;
479 }
480 }
481 if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
482 return (ScmChar)(((unsigned char *)SCM_STRING_BODY_START(b))[pos]);
483 } else {
484 const char *p = forward_pos(SCM_STRING_BODY_START(b), pos);
485 ScmChar c;
486 SCM_CHAR_GET(p, c);
487 return c;
488 }
489 }
490
491 /* The meaning and rationale of range_error is the same as Scm_StringRef.
492 * Returns -1 if OFFSET is out-of-range and RANGE_ERROR is FALSE.
493 * (Because of this, the return type is not ScmByte but int.
494 */
495 int Scm_StringByteRef(ScmString *str, int offset, int range_error)
496 {
497 const ScmStringBody *b = SCM_STRING_BODY(str);
498 if (offset < 0 || offset >= SCM_STRING_BODY_SIZE(b)) {
499 if (range_error) {
500 Scm_Error("argument out of range: %d", offset);
501 } else {
502 return -1;
503 }
504 }
505 return (ScmByte)SCM_STRING_BODY_START(b)[offset];
506 }
507
508 /* External interface of forward_pos. Returns the pointer to the
509 offset-th character in str. */
510 /* NB: this function allows offset == length of the string; in that
511 case, the return value points the location past the string body,
512 but it is necessary sometimes to do a pointer arithmetic with the
513 returned values. */
514 const char *Scm_StringPosition(ScmString *str, int offset)
515 {
516 const ScmStringBody *b = SCM_STRING_BODY(str);
517 if (offset < 0 || offset > SCM_STRING_BODY_LENGTH(b)) {
518 Scm_Error("argument out of range: %d", offset);
519 }
520 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
521 return (SCM_STRING_BODY_START(b)+offset);
522 } else {
523 return (forward_pos(SCM_STRING_BODY_START(b), offset));
524 }
525 }
526
527 /*----------------------------------------------------------------
528 * Concatenation
529 */
530
531 ScmObj Scm_StringAppend2(ScmString *x, ScmString *y)
532 {
533 const ScmStringBody *xb = SCM_STRING_BODY(x);
534 const ScmStringBody *yb = SCM_STRING_BODY(y);
535 int sizex = SCM_STRING_BODY_SIZE(xb), lenx = SCM_STRING_BODY_LENGTH(xb);
536 int sizey = SCM_STRING_BODY_SIZE(yb), leny = SCM_STRING_BODY_LENGTH(yb);
537 int flags = 0;
538 char *p = SCM_NEW_ATOMIC2(char *,sizex + sizey + 1);
539
540 memcpy(p, xb->start, sizex);
541 memcpy(p+sizex, yb->start, sizey);
542 p[sizex + sizey] = '\0';
543
544 if (SCM_STRING_BODY_INCOMPLETE_P(xb) || SCM_STRING_BODY_INCOMPLETE_P(yb)) {
545 flags |= SCM_STRING_INCOMPLETE; /* yields incomplete string */
546 }
547 return SCM_OBJ(make_str(lenx+leny, sizex+sizey, p, flags));
548 }
549
550 ScmObj Scm_StringAppendC(ScmString *x, const char *str, int sizey, int leny)
551 {
552 const ScmStringBody *xb = SCM_STRING_BODY(x);
553 int sizex = SCM_STRING_BODY_SIZE(xb), lenx = SCM_STRING_BODY_LENGTH(xb);
554 int flags = 0;
555 char *p;
556
557 if (sizey < 0) count_size_and_length(str, &sizey, &leny);
558 else if (leny < 0) leny = count_length(str, sizey);
559
560 p = SCM_NEW_ATOMIC2(char *, sizex + sizey + 1);
561 memcpy(p, xb->start, sizex);
562 memcpy(p+sizex, str, sizey);
563 p[sizex+sizey] = '\0';
564
565 if (SCM_STRING_BODY_INCOMPLETE_P(xb) || leny < 0) {
566 flags |= SCM_STRING_INCOMPLETE;
567 }
568 return SCM_OBJ(make_str(lenx + leny, sizex + sizey, p, flags));
569 }
570
571 ScmObj Scm_StringAppend(ScmObj strs)
572 {
573 #define BODY_ARRAY_SIZE 32
574 ScmObj cp;
575 int size = 0, len = 0, flags = 0, numstrs, i;
576 char *buf, *bufp;
577 const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
578
579 /* It is trickier than it appears, since the strings may be modified
580 by another thread during we're dealing with it. So in the first
581 pass to sum up the lenghts of strings, we extract the string bodies
582 and save it. */
583 numstrs = Scm_Length(strs);
584 if (numstrs < 0) Scm_Error("improper list not allowed: %S", strs);
585 if (numstrs >= BODY_ARRAY_SIZE) {
586 bodies = SCM_NEW_ARRAY(const ScmStringBody*, numstrs);
587 } else {
588 bodies = bodies_s;
589 }
590
591 i=0;
592 SCM_FOR_EACH(cp, strs) {
593 const ScmStringBody *b;
594 if (!SCM_STRINGP(SCM_CAR(cp))) {
595 Scm_Error("string required, but got %S\n", SCM_CAR(cp));
596 }
597 b = SCM_STRING_BODY(SCM_CAR(cp));
598 size += SCM_STRING_BODY_SIZE(b);
599 len += SCM_STRING_BODY_LENGTH(b);
600 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
601 flags |= SCM_STRING_INCOMPLETE;
602 }
603 bodies[i++] = b;
604 }
605
606 bufp = buf = SCM_NEW_ATOMIC2(char *, size+1);
607 for (i=0; i<numstrs; i++) {
608 const ScmStringBody *b = bodies[i];
609 memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
610 bufp += SCM_STRING_BODY_SIZE(b);
611 }
612 *bufp = '\0';
613 bodies = NULL; /* to help GC */
614 return SCM_OBJ(make_str(len, size, buf, flags));
615 #undef BODY_ARRAY_SIZE
616 }
617
618 ScmObj Scm_StringJoin(ScmObj strs, ScmString *delim, int grammer)
619 {
620 #define BODY_ARRAY_SIZE 32
621 ScmObj cp;
622 int size = 0, len = 0, nstrs, ndelim, i, flags = 0;
623 int dsize, dlen; /* for delimiter string */
624 const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
625 const ScmStringBody *dbody;
626 char *buf, *bufp;
627
628 nstrs = Scm_Length(strs);
629 if (nstrs < 0) Scm_Error("improper list not allowed: %S", strs);
630 if (nstrs == 0) {
631 if (grammer == SCM_STRING_JOIN_STRICT_INFIX) {
632 Scm_Error("can't join empty list of strings with strict-infix grammer");
633 }
634 return SCM_MAKE_STR("");
635 }
636
637 if (nstrs >= BODY_ARRAY_SIZE) {
638 bodies = SCM_NEW_ARRAY(const ScmStringBody *, nstrs);
639 } else {
640 bodies = bodies_s;
641 }
642
643 dbody = SCM_STRING_BODY(delim);
644 dsize = SCM_STRING_BODY_SIZE(dbody);
645 dlen = SCM_STRING_BODY_LENGTH(dbody);
646 if (SCM_STRING_BODY_INCOMPLETE_P(dbody)) {
647 flags |= SCM_STRING_INCOMPLETE;
648 }
649
650 i = 0;
651 SCM_FOR_EACH(cp, strs) {
652 const ScmStringBody *b;
653 if (!SCM_STRINGP(SCM_CAR(cp))) {
654 Scm_Error("string required, but got %S\n", SCM_CAR(cp));
655 }
656 b = SCM_STRING_BODY(SCM_CAR(cp));
657 size += SCM_STRING_BODY_SIZE(b);
658 len += SCM_STRING_BODY_LENGTH(b);
659 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
660 flags |= SCM_STRING_INCOMPLETE;
661 }
662 bodies[i++] = b;
663 }
664 if (grammer == SCM_STRING_JOIN_INFIX
665 || grammer == SCM_STRING_JOIN_STRICT_INFIX) {
666 ndelim = nstrs - 1;
667 } else {
668 ndelim = nstrs;
669 }
670 size += dsize * ndelim;
671 len += dlen * ndelim;
672
673 bufp = buf = SCM_NEW_ATOMIC2(char *, size+1);
674 if (grammer == SCM_STRING_JOIN_PREFIX) {
675 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
676 bufp += dsize;
677 }
678 for (i=0; i<nstrs; i++) {
679 const ScmStringBody *b = bodies[i];
680 memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
681 bufp += SCM_STRING_BODY_SIZE(b);
682 if (i < nstrs-1) {
683 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
684 bufp += dsize;
685 }
686 }
687 if (grammer == SCM_STRING_JOIN_SUFFIX) {
688 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
689 bufp += dsize;
690 }
691 *bufp = '\0';
692 bodies = NULL; /* to help GC */
693 return SCM_OBJ(make_str(len, size, buf, flags));
694 #undef BODY_ARRAY_SIZE
695 }
696
697 /*----------------------------------------------------------------
698 * Substitution
699 */
700
701 static ScmObj string_substitute(ScmString *x,
702 const ScmStringBody *xb, int start,
703 const char *str, int sizey, int leny,
704 int incompletep)
705 {
706 int sizex = SCM_STRING_BODY_SIZE(xb), lenx = SCM_STRING_BODY_LENGTH(xb);
707 int end = start + leny, sizez, newlen;
708 unsigned int newflags;
709 char *p;
710
711 if (start < 0) Scm_Error("start index out of range: %d", start);
712 if (end > lenx) {
713 Scm_Error("substitution string too long: %S",
714 make_str(leny, sizey, str, 0));
715 }
716
717 if (SCM_STRING_BODY_SINGLE_BYTE_P(xb)) {
718 /* x is sbstring */
719 sizez = sizex - leny + sizey;
720 p = SCM_NEW_ATOMIC2(char *, sizez+1);
721 if (start > 0) memcpy(p, SCM_STRING_BODY_START(xb), start);
722 memcpy(p+start, str, sizey);
723 memcpy(p+start+sizey, SCM_STRING_BODY_START(xb)+end, sizex-end);
724 p[sizez] = '\0';
725 } else {
726 /* x is mbstring */
727 const char *s, *e;
728 s = forward_pos(SCM_STRING_BODY_START(xb), start);
729 e = forward_pos(s, end - start);
730 sizez = sizex + sizey - (e - s);
731 p = SCM_NEW_ATOMIC2(char *, sizez+1);
732 if (start > 0) {
733 memcpy(p, SCM_STRING_BODY_START(xb), s - SCM_STRING_BODY_START(xb));
734 }
735 memcpy(p + (s - SCM_STRING_BODY_START(xb)), str, sizey);
736 memcpy(p + (s - SCM_STRING_BODY_START(xb)) + sizey, e,
737 SCM_STRING_BODY_START(xb) + sizex - e);
738 p[sizez] = '\0';
739 }
740 /* Modify x atomically */
741 newlen = SCM_STRING_BODY_INCOMPLETE_P(xb)? sizez : lenx;
742 newflags = SCM_STRING_BODY_FLAGS(xb) & ~SCM_STRING_IMMUTABLE;
743 if (incompletep) newflags |= SCM_STRING_INCOMPLETE;
744 x->body = make_str_body(newlen, /* len */
745 sizez, /* size */
746 p, /* start */
747 newflags);/* flags */
748 return SCM_OBJ(x);
749 }
750
751 ScmObj Scm_StringSubstitute(ScmString *x, int start, ScmString *y)
752 {
753 const ScmStringBody *yb = SCM_STRING_BODY(y);
754 CHECK_MUTABLE(x);
755 return string_substitute(x, SCM_STRING_BODY(x),
756 start,
757 SCM_STRING_BODY_START(yb),
758 SCM_STRING_BODY_SIZE(yb),
759 SCM_STRING_BODY_LENGTH(yb),
760 SCM_STRING_BODY_INCOMPLETE_P(yb));
761 }
762
763 ScmObj Scm_StringSet(ScmString *x, int k, ScmChar ch)
764 {
765 const ScmStringBody *xb = SCM_STRING_BODY(x);
766 CHECK_MUTABLE(x);
767 if (SCM_STRING_BODY_INCOMPLETE_P(xb)) {
768 char byte = (char)ch;
769 return string_substitute(x, xb, k, &byte, 1, 1, TRUE);
770 } else {
771 char buf[SCM_CHAR_MAX_BYTES+1];
772 int size = SCM_CHAR_NBYTES(ch);
773 SCM_CHAR_PUT(buf, ch);
774 return string_substitute(x, xb, k, buf, size, 1, FALSE);
775 }
776 }
777
778 ScmObj Scm_StringByteSet(ScmString *x, int k, ScmByte b)
779 {
780 const ScmStringBody *xb = SCM_STRING_BODY(x);
781 int size = SCM_STRING_BODY_SIZE(xb);
782 char *p;
783
784 CHECK_MUTABLE(x);
785 if (k < 0 || k >= size) Scm_Error("argument out of range: %d", k);
786 p = SCM_NEW_ATOMIC2(char *, size+1);
787 memcpy(p, xb->start, size);
788 p[size] = '\0';
789 p[k] = (char)b;
790
791 /* Modify x atomically */
792 x->body = make_str_body(size, size, p, SCM_STRING_INCOMPLETE);
793 return SCM_OBJ(x);
794 }
795
796 /*----------------------------------------------------------------
797 * Substring
798 */
799
800 static ScmObj substring(const ScmStringBody *xb, int start, int end)
801 {
802 if (start < 0)
803 Scm_Error("start argument needs to be positive: %d", start);
804 if (end > SCM_STRING_BODY_LENGTH(xb))
805 Scm_Error("end argument is out of range: %d", end);
806 if (end < start)
807 Scm_Error("end argument must be equal to or greater than the start argument: start=%d, end=%d", start, end);
808 if (SCM_STRING_BODY_SINGLE_BYTE_P(xb)) {
809 return SCM_OBJ(make_str(end-start,
810 end-start,
811 SCM_STRING_BODY_START(xb) + start,
812 SCM_STRING_BODY_FLAGS(xb)&~SCM_STRING_IMMUTABLE));
813 } else {
814 const char *s, *e;
815 if (start) s = forward_pos(SCM_STRING_BODY_START(xb), start);
816 else s = SCM_STRING_BODY_START(xb);
817 e = forward_pos(s, end - start);
818 return SCM_OBJ(make_str(end - start, e - s, s, 0));
819 }
820 }
821
822 ScmObj Scm_Substring(ScmString *x, int start, int end)
823 {
824 return substring(SCM_STRING_BODY(x), start, end);
825 }
826
827 /* Auxiliary procedure to support optional start/end parameter specified
828 in lots of SRFI-13 functions. If start and end is specified and restricts
829 string range, call substring. Otherwise returns x itself. */
830 ScmObj Scm_MaybeSubstring(ScmString *x, ScmObj start, ScmObj end)
831 {
832 int istart, iend;
833 const ScmStringBody *xb = SCM_STRING_BODY(x);
834 if (SCM_UNBOUNDP(start) || SCM_UNDEFINEDP(start)) {
835 istart = 0;
836 } else {
837 if (!SCM_INTP(start))
838 Scm_Error("exact integer required for start, but got %S", start);
839 istart = SCM_INT_VALUE(start);
840 }
841
842 if (SCM_UNBOUNDP(end) || SCM_UNDEFINEDP(end)) {
843 if (istart == 0) return SCM_OBJ(x);
844 iend = SCM_STRING_BODY_LENGTH(xb);
845 } else {
846 if (!SCM_INTP(end))
847 Scm_Error("exact integer required for start, but got %S", end);
848 iend = SCM_INT_VALUE(end);
849 }
850 return substring(xb, istart, iend);
851 }
852
853 /*----------------------------------------------------------------
854 * Search & parse
855 */
856
857 /* Split string by char. Char itself is not included in the result. */
858 /* TODO: fix semantics. What should be returned for (string-split "" #\.)? */
859 ScmObj Scm_StringSplitByChar(ScmString *str, ScmChar ch)
860 {
861 const ScmStringBody *strb = SCM_STRING_BODY(str);
862 int size = SCM_STRING_BODY_SIZE(strb), sizecnt = 0;
863 int lencnt = 0;
864 const char *s = SCM_STRING_BODY_START(strb), *p = s, *e = s + size;
865 ScmObj head = SCM_NIL, tail = SCM_NIL;
866
867 if (SCM_STRING_BODY_INCOMPLETE_P(strb)) {
868 /* TODO: fix the policy of handling incomplete string */
869 Scm_Error("incomplete string not accepted: %S", str);
870 }
871
872 while (p < e) {
873 ScmChar cc;
874 int ncc;
875
876 SCM_CHAR_GET(p, cc);
877 ncc = SCM_CHAR_NBYTES(cc);
878 if (ch == cc) {
879 SCM_APPEND1(head, tail, Scm_MakeString(s, sizecnt, lencnt, 0));
880 sizecnt = lencnt = 0;
881 p += ncc;
882 s = p;
883 } else {
884 p += ncc;
885 sizecnt += ncc;
886 lencnt ++;
887 }
888 }
889 SCM_APPEND1(head, tail, Scm_MakeString(s, sizecnt, lencnt, 0));
890 return head;
891 }
892
893 /* Boyer-Moore string search. assuming siz1 > siz2, siz2 < 256. */
894 static inline int boyer_moore(const char *ss1, int siz1,
895 const char *ss2, int siz2)
896 {
897 unsigned char shift[256];
898 int i, j, k;
899 for (i=0; i<256; i++) { shift[i] = siz2; }
900 for (j=0; j<siz2-1; j++) {
901 shift[(unsigned char)ss2[j]] = siz2-j-1;
902 }
903 for (i=siz2-1; i<siz1; i+=shift[(unsigned char)ss1[i]]) {
904 for (j=siz2-1, k = i; j>=0 && ss1[k] == ss2[j]; j--, k--)
905 ;
906 if (j == -1) return k+1;
907 }
908 return -1;
909 }
910
911 /* Scan s2 in s1. If both strings are single-byte, and s1 is long,
912 we use Boyer-Moore.
913
914 To avoid rescanning of the string, this function can return
915 various information, depends on retmode argument.
916
917 SCM_STRING_SCAN_INDEX : return the index of s1
918 s1 = "abcde" and s2 = "cd" => 2
919 SCM_STRING_SCAN_BEFORE : return substring of s1 before s2
920 s1 = "abcde" and s2 = "cd" => "ab"
921 SCM_STRING_SCAN_AFTER : return substring of s1 after s2
922 s1 = "abcde" and s2 = "cd" => "e"
923 SCM_STRING_SCAN_BEFORE2 : return substring of s1 before s2, and rest
924 s1 = "abcde" and s2 = "cd" => "ab" and "cde"
925 SCM_STRING_SCAN_AFTER2 : return substring of s1 up to s2 and rest
926 s1 = "abcde" and s2 = "cd" => "abcd" and "e"
927 SCM_STRING_SCAN_BOTH : return substring of s1 before and after s2
928 s1 = "abcde" and s2 = "cd" => "ab" and "e"
929 */
930 static ScmObj string_scan(ScmString *s1, const char *ss2,
931 int siz2, int len2, int incomplete2,
932 int retmode)
933 {
934 int i, incomplete;
935 const ScmStringBody *sb = SCM_STRING_BODY(s1);
936 const char *ss1 = SCM_STRING_BODY_START(sb);
937 int siz1 = SCM_STRING_BODY_SIZE(sb);
938 int len1 = SCM_STRING_BODY_LENGTH(sb);
939
940 if (retmode < 0 || retmode > SCM_STRING_SCAN_BOTH) {
941 Scm_Error("return mode out fo range: %d", retmode);
942 }
943
944 if (siz2 == 0) {
945 /* shortcut */
946 switch (retmode) {
947 case SCM_STRING_SCAN_INDEX: return SCM_MAKE_INT(0);
948 case SCM_STRING_SCAN_BEFORE: return SCM_MAKE_STR("");
949 case SCM_STRING_SCAN_AFTER: return Scm_CopyString(s1);
950 case SCM_STRING_SCAN_BEFORE2:;
951 case SCM_STRING_SCAN_AFTER2:;
952 case SCM_STRING_SCAN_BOTH:
953 return Scm_Values2(SCM_MAKE_STR(""), Scm_CopyString(s1));
954 }
955 }
956
957 if (siz1 == len1) {
958 if (siz2 == len2) goto sbstring;
959 goto failed; /* sbstring can't contain mbstring. */
960 }
961 if (len1 >= len2) {
962 const char *ssp = ss1;
963 for (i=0; i<=len1-len2; i++) {
964 if (memcmp(ssp, ss2, siz2) == 0) {
965 switch (retmode) {
966 case SCM_STRING_SCAN_INDEX:
967 return Scm_MakeInteger(i);
968 case SCM_STRING_SCAN_BEFORE:
969 return Scm_MakeString(ss1, ssp-ss1, i, 0);
970 case SCM_STRING_SCAN_AFTER:
971 return Scm_MakeString(ssp+siz2, siz1-(ssp-ss1+siz2),
972 len1-i-len2, 0);
973 case SCM_STRING_SCAN_BEFORE2:
974 return Scm_Values2(Scm_MakeString(ss1, ssp-ss1, i, 0),
975 Scm_MakeString(ssp, siz1-(ssp-ss1),
976 len1-i, 0));
977 case SCM_STRING_SCAN_AFTER2:
978 return Scm_Values2(Scm_MakeString(ss1, ssp-ss1+siz2,
979 i+len2, 0),
980 Scm_MakeString(ssp+siz2,
981 siz1-(ssp-ss1+siz2),
982 len1-i-len2, 0));
983 case SCM_STRING_SCAN_BOTH:
984 return Scm_Values2(Scm_MakeString(ss1, ssp-ss1, i, 0),
985 Scm_MakeString(ssp+siz2,
986 siz1-(ssp-ss1+siz2),
987 len1-i-len2, 0));
988 }
989 }
990 ssp += SCM_CHAR_NFOLLOWS(*ssp) + 1;
991 }
992 }
993 goto failed;
994
995 sbstring: /* short cut for single-byte strings */
996 if (siz1 < siz2) goto failed;
997 if (siz1 < 256 || siz2 >= 256) {
998 /* brute-force search */
999 for (i=0; i<=siz1-siz2; i++) {
1000 if (memcmp(ss2, ss1+i, siz2) == 0) break;
1001 }
1002 if (i == siz1-siz2+1) goto failed;
1003 } else {
1004 i = boyer_moore(ss1, siz1, ss2, siz2);
1005 if (i < 0) goto failed;
1006 }
1007 incomplete =
1008 (SCM_STRING_BODY_INCOMPLETE_P(sb) || incomplete2)?
1009 SCM_MAKSTR_INCOMPLETE : 0;
1010 switch (retmode) {
1011 case SCM_STRING_SCAN_INDEX:
1012 return Scm_MakeInteger(i);
1013 case SCM_STRING_SCAN_BEFORE:
1014 return Scm_MakeString(ss1, i, i, incomplete);
1015 case SCM_STRING_SCAN_AFTER:
1016 return Scm_MakeString(ss1+i+siz2, siz1-(i+siz2), siz1-(i+siz2),
1017 incomplete);
1018 case SCM_STRING_SCAN_BEFORE2:
1019 return Scm_Values2(Scm_MakeString(ss1, i, i, incomplete),
1020 Scm_MakeString(ss1+i, siz1-i, siz1-i, incomplete));
1021 case SCM_STRING_SCAN_AFTER2:
1022 return Scm_Values2(Scm_MakeString(ss1, i+siz2, i+siz2, incomplete),
1023 Scm_MakeString(ss1+i+siz2, siz1-(i+siz2),
1024 siz1-(i+siz2), incomplete));
1025 case SCM_STRING_SCAN_BOTH:
1026 return Scm_Values2(Scm_MakeString(ss1, i, i, incomplete),
1027 Scm_MakeString(ss1+i+siz2, siz1-(i+siz2),
1028 siz1-(i+siz2), incomplete));
1029 }
1030 failed:
1031 if (retmode <= SCM_STRING_SCAN_AFTER) {
1032 return SCM_FALSE;
1033 } else {
1034 return Scm_Values2(SCM_FALSE, SCM_FALSE);
1035 }
1036 }
1037
1038
1039 ScmObj Scm_StringScan(ScmString *s1, ScmString *s2, int retmode)
1040 {
1041 const ScmStringBody *s2b = SCM_STRING_BODY(s2);
1042 return string_scan(s1,
1043 SCM_STRING_BODY_START(s2b),
1044 SCM_STRING_BODY_SIZE(s2b),
1045 SCM_STRING_BODY_LENGTH(s2b),
1046 SCM_STRING_BODY_INCOMPLETE_P(s2b),
1047 retmode);
1048 }
1049
1050 ScmObj Scm_StringScanChar(ScmString *s1, ScmChar ch, int retmode)
1051 {
1052 char buf[SCM_CHAR_MAX_BYTES];
1053 SCM_CHAR_PUT(buf, ch);
1054 return string_scan(s1, buf, SCM_CHAR_NBYTES(ch), 1, FALSE, retmode);
1055 }
1056
1057 /*----------------------------------------------------------------
1058 * Miscellaneous functions
1059 */
1060
1061 ScmObj Scm_StringToList(ScmString *str)
1062 {
1063 const ScmStringBody *b = SCM_STRING_BODY(str);
1064 ScmObj start = SCM_NIL, end = SCM_NIL;
1065 const char *bufp = SCM_STRING_BODY_START(b);
1066 int len = SCM_STRING_BODY_LENGTH(b);
1067 ScmChar ch;
1068
1069 if (SCM_STRING_BODY_INCOMPLETE_P(b))
1070 Scm_Error("incomplete string not supported: %S", str);
1071 while (len-- > 0) {
1072 SCM_CHAR_GET(bufp, ch);
1073 bufp += SCM_CHAR_NBYTES(ch);
1074 SCM_APPEND1(start, end, SCM_MAKE_CHAR(ch));
1075 }
1076 return start;
1077 }
1078
1079 ScmObj Scm_StringFill(ScmString *str, ScmChar ch,
1080 ScmObj maybe_start, ScmObj maybe_end)
1081 {
1082 int len, i, start, end, prelen, midlen, postlen;
1083 int chlen = SCM_CHAR_NBYTES(ch);
1084 char *newstr, *p;
1085 const unsigned char *s, *r;
1086 const ScmStringBody *strb = SCM_STRING_BODY(str);
1087
1088 CHECK_MUTABLE(str);
1089 if (SCM_STRING_BODY_INCOMPLETE_P(strb)) {
1090 Scm_Error("incomplete string not allowed: %S", str);
1091 }
1092 len = SCM_STRING_BODY_LENGTH(strb);
1093
1094 if (SCM_UNBOUNDP(maybe_start) || SCM_UNDEFINEDP(maybe_start)) {
1095 start = 0;
1096 } else {
1097 if (!SCM_INTP(maybe_start))
1098 Scm_Error("exact integer required for start, but got %S",
1099 maybe_start);
1100 start = SCM_INT_VALUE(maybe_start);
1101 }
1102 if (SCM_UNBOUNDP(maybe_end) || SCM_UNDEFINEDP(maybe_end)) {
1103 end = len;
1104 } else {
1105 if (!SCM_INTP(maybe_end))
1106 Scm_Error("exact integer required for end, but got %S",
1107 maybe_end);
1108 end = SCM_INT_VALUE(maybe_end);
1109 }
1110 if (start < 0 || start > end || end > len) {
1111 Scm_Error("start/end pair is out of range: (%d %d)", start, end);
1112 }
1113 if (start == end) return SCM_OBJ(str);
1114
1115 s = (unsigned char*)SCM_STRING_BODY_START(strb);
1116 for (i = 0; i < start; i++) s += SCM_CHAR_NFOLLOWS(*s)+1;
1117 prelen = s - (unsigned char*)SCM_STRING_BODY_START(strb);
1118 r = s;
1119 for (; i < end; i++) s += SCM_CHAR_NFOLLOWS(*s)+1;
1120 midlen = s - r;
1121 postlen = SCM_STRING_BODY_SIZE(strb) - midlen - prelen;
1122
1123 p = newstr = SCM_NEW_ATOMIC2(char *,
1124 prelen + (end-start)*chlen + postlen + 1);
1125 memcpy(p, SCM_STRING_BODY_START(strb), prelen);
1126 p += prelen;
1127 for (i=0; i < end-start; i++) {
1128 SCM_CHAR_PUT(p, ch);
1129 p += chlen;
1130 }
1131 memcpy(p, SCM_STRING_BODY_START(strb) + prelen + midlen, postlen);
1132 p[postlen] = '\0'; /* be friendly to C */
1133 /* modify str atomically */
1134 str->body = make_str_body(SCM_STRING_BODY_LENGTH(strb),
1135 prelen + (end-start)*chlen + postlen,
1136 newstr,
1137 0);
1138 return SCM_OBJ(str);
1139 }
1140
1141 ScmObj Scm_ConstCStringArrayToList(const char **array, int size)
1142 {
1143 int i;
1144 ScmObj h = SCM_NIL, t = SCM_NIL;
1145 if (size < 0) {
1146 for (;*array; array++) SCM_APPEND1(h, t, SCM_MAKE_STR(*array));
1147 } else {
1148 for (i=0; i<size; i++) SCM_APPEND1(h, t, SCM_MAKE_STR(*array++));
1149 }
1150 return h;
1151 }
1152
1153 ScmObj Scm_CStringArrayToList(char **array, int size)
1154 {
1155 int i;
1156 ScmObj h = SCM_NIL, t = SCM_NIL;
1157 if (size < 0) {
1158 for (;*array; array++)
1159 SCM_APPEND1(h, t, SCM_MAKE_STR_COPYING(*array));
1160 } else {
1161 for (i=0; i<size; i++)
1162 SCM_APPEND1(h, t, SCM_MAKE_STR_COPYING(*array++));
1163 }
1164 return h;
1165 }
1166
1167 /*----------------------------------------------------------------
1168 * printer
1169 */
1170 static inline void string_putc(ScmChar ch, ScmPort *port, int bytemode)
1171 {
1172 char buf[5];
1173 switch (ch) {
1174 case '\\': SCM_PUTZ("\\\\", -1, port); break;
1175 case '"': SCM_PUTZ("\\\"", -1, port); break;
1176 case '\n': SCM_PUTZ("\\n", -1, port); break;
1177 case '\t': SCM_PUTZ("\\t", -1, port); break;
1178 case '\r': SCM_PUTZ("\\r", -1, port); break;
1179 case '\f': SCM_PUTZ("\\f", -1, port); break;
1180 case '\0': SCM_PUTZ("\\0", -1, port); break;
1181 default:
1182 if (ch < ' ' || ch == 0x7f || (bytemode && ch >= 0x80)) {
1183 snprintf(buf, 5, "\\x%02x", (unsigned char)ch);
1184 SCM_PUTZ(buf, -1, port);
1185 } else {
1186 SCM_PUTC(ch, port);
1187 }
1188 }
1189 }
1190
1191 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx)
1192 {
1193 ScmString *str = SCM_STRING(obj);
1194 if (SCM_WRITE_MODE(ctx) == SCM_WRITE_DISPLAY) {
1195 SCM_PUTS(str, port);
1196 } else {
1197 const ScmStringBody *b = SCM_STRING_BODY(str);
1198 if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
1199 const char *cp = SCM_STRING_BODY_START(b);
1200 int size = SCM_STRING_BODY_SIZE(b);
1201 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
1202 SCM_PUTZ("#*\"", -1, port);
1203 } else {
1204 SCM_PUTC('"', port);
1205 }
1206 while (size--) {
1207 string_putc(*cp++, port, SCM_STRING_BODY_INCOMPLETE_P(b));
1208 }
1209 } else {
1210 ScmChar ch;
1211 const char *cp = SCM_STRING_BODY_START(b);
1212 int len = SCM_STRING_BODY_LENGTH(b);
1213
1214 SCM_PUTC('"', port);
1215 while (len--) {
1216 SCM_CHAR_GET(cp, ch);
1217 string_putc(ch, port, FALSE);
1218 cp += SCM_CHAR_NBYTES(ch);
1219 }
1220 }
1221 SCM_PUTC('"', port);
1222 }
1223 }
1224
1225 /*==================================================================
1226 *
1227 * String pointer
1228 *
1229 */
1230
1231 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_StringPointerClass, NULL);
1232
1233 ScmObj Scm_MakeStringPointer(ScmString *src, int index, int start, int end)
1234 {
1235 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1236 int len = SCM_STRING_BODY_LENGTH(srcb);
1237 int effective_size;
1238 const char *sptr, *ptr, *eptr;
1239 ScmStringPointer *sp;
1240
1241 SCM_CHECK_START_END(start, end, len);
1242 while (index < 0) index += (end - start) + 1;
1243 if (index > (end - start)) goto badindex;
1244
1245 if (SCM_STRING_BODY_SINGLE_BYTE_P(srcb)) {
1246 sptr = SCM_STRING_BODY_START(srcb) + start;
1247 ptr = sptr + index;
1248 effective_size = end - start;
1249 } else {
1250 sptr = forward_pos(SCM_STRING_BODY_START(srcb), start);
1251 ptr = forward_pos(sptr, index);
1252 eptr = forward_pos(sptr, end - start);
1253 effective_size = eptr - ptr;
1254 }
1255 sp = SCM_NEW(ScmStringPointer);
1256 SCM_SET_CLASS(sp, SCM_CLASS_STRING_POINTER);
1257 sp->length = (SCM_STRING_BODY_INCOMPLETE_P(srcb)? -1 : (end-start));
1258 sp->size = effective_size;
1259 sp->start = sptr;
1260 sp->index = index;
1261 sp->current = ptr;
1262 return SCM_OBJ(sp);
1263 badindex:
1264 Scm_Error("index out of range: %d", index);
1265 return SCM_UNDEFINED;
1266 }
1267
1268 ScmObj Scm_StringPointerRef(ScmStringPointer *sp)
1269 {
1270 ScmChar ch;
1271 if (sp->length < 0 || sp->size == sp->length) {
1272 if (sp->index >= sp->size) return SCM_EOF;
1273 ch = *sp->current;
1274 } else {
1275 if (sp->index >= sp->length) return SCM_EOF;
1276 SCM_CHAR_GET(sp->current, ch);
1277 }
1278 return SCM_MAKE_CHAR(ch);
1279 }
1280
1281 ScmObj Scm_StringPointerNext(ScmStringPointer *sp)
1282 {
1283 ScmChar ch;
1284 if (sp->length < 0 || sp->size == sp->length) {
1285 if (sp->index >= sp->size) return SCM_EOF;
1286 sp->index++;
1287 ch = *sp->current++;
1288 } else {
1289 if (sp->index >= sp->length) return SCM_EOF;
1290 SCM_CHAR_GET(sp->current, ch);
1291 sp->index++;
1292 sp->current += SCM_CHAR_NFOLLOWS(*sp->current) + 1;
1293 }
1294 return SCM_MAKE_CHAR(ch);
1295 }
1296
1297 ScmObj Scm_StringPointerPrev(ScmStringPointer *sp)
1298 {
1299 ScmChar ch;
1300 if (sp->index <= 0) return SCM_EOF;
1301 if (sp->length < 0 || sp->size == sp->length) {
1302 sp->index--;
1303 ch = *--sp->current;
1304 } else {
1305 const char *prev;
1306 SCM_CHAR_BACKWARD(sp->current, sp->start, prev);
1307 SCM_ASSERT(prev != NULL);
1308 SCM_CHAR_GET(prev, ch);
1309 sp->index--;
1310 sp->current = prev;
1311 }
1312 return SCM_MAKE_CHAR(ch);
1313 }
1314
1315 ScmObj Scm_StringPointerSet(ScmStringPointer *sp, int index)
1316 {
1317 if (index < 0) goto badindex;
1318 if (sp->length < 0 || sp->size == sp->length) {
1319 if (index > sp->size) goto badindex;
1320 sp->index = index;
1321 sp->current = sp->start + index;
1322 } else {
1323 if (index > sp->length) goto badindex;
1324 sp->index = index;
1325 sp->current = forward_pos(sp->start, index);
1326 }
1327 return SCM_OBJ(sp);
1328 badindex:
1329 Scm_Error("index out of range: %d", index);
1330 return SCM_UNDEFINED;
1331 }
1332
1333 ScmObj Scm_StringPointerSubstring(ScmStringPointer *sp, int afterp)
1334 {
1335 if (sp->length < 0) {
1336 if (afterp)
1337 return SCM_OBJ(make_str(-1, sp->size - sp->index, sp->current, 0));
1338 else
1339 return SCM_OBJ(make_str(-1, sp->index, sp->start, 0));
1340 } else {
1341 if (afterp)
1342 return SCM_OBJ(make_str(sp->length - sp->index,
1343 sp->start + sp->size - sp->current,
1344 sp->current, 0));
1345 else
1346 return SCM_OBJ(make_str(sp->index,
1347 sp->current - sp->start,
1348 sp->start, 0));
1349 }
1350 }
1351
1352 /* Copy string pointer.
1353 Thanks to Alex Shinn (foof@synthcode.com) */
1354 ScmObj Scm_StringPointerCopy(ScmStringPointer *sp1)
1355 {
1356 ScmStringPointer *sp2 = SCM_NEW(ScmStringPointer);
1357 SCM_SET_CLASS(sp2, SCM_CLASS_STRING_POINTER);
1358 sp2->length = sp1->length;
1359 sp2->size = sp1->size;
1360 sp2->start = sp1->start;
1361 sp2->index = sp1->index;
1362 sp2->current = sp1->current;
1363 return SCM_OBJ(sp2);
1364 }
1365
1366 /* Dump string pointer info for debugging.
1367 Thanks to Alex Shinn (foof@synthcode.com) */
1368 #if SCM_DEBUG_HELPER
1369 void Scm_StringPointerDump(ScmStringPointer *sp1)
1370 {
1371 Scm_Printf(SCM_CUROUT,
1372 "<sp addr: %p len: %d size: %d start: %p index: %d cur: %d>\n",
1373 sp1, sp1->length, sp1->size, sp1->start, sp1->index,
1374 sp1->current);
1375 }
1376 #endif /*SCM_DEBUG_HELPER*/
1377
1378 /*==================================================================
1379 *
1380 * Dynamic strings
1381 *
1382 */
1383
1384 /* I used to use realloc() to grow the storage; now I avoid it, for
1385 Boehm GC's realloc almost always copies the original content and
1386 we don't get any benefit.
1387 The growing string is kept in the chained chunks. The size of
1388 chunk getting bigger as the string grows, until a certain threshold.
1389 The memory for actual chunks and the chain is allocated separately,
1390 in order to use SCM_NEW_ATOMIC.
1391 */
1392
1393 /* NB: it is important that DString functions don't call any
1394 * time-consuming procedures except memory allocation. Some of
1395 * mutex code in other parts relies on that fact.
1396 */
1397
1398 /* maximum chunk size */
1399 #define DSTRING_MAX_CHUNK_SIZE 8180
1400
1401 void Scm_DStringInit(ScmDString *dstr)
1402 {
1403 dstr->init.bytes = 0;
1404 dstr->anchor = dstr->tail = NULL;
1405 dstr->current = dstr->init.data;
1406 dstr->end = dstr->current + SCM_DSTRING_INIT_CHUNK_SIZE;
1407 dstr->lastChunkSize = SCM_DSTRING_INIT_CHUNK_SIZE;
1408 dstr->length = 0;
1409 }
1410
1411 inline int Scm_DStringSize(ScmDString *dstr)
1412 {
1413 ScmDStringChain *chain;
1414 int size;
1415 if (dstr->tail) {
1416 size = dstr->init.bytes;
1417 dstr->tail->chunk->bytes = (int)(dstr->current - dstr->tail->chunk->data);
1418 for (chain = dstr->anchor; chain; chain = chain->next) {
1419 size += chain->chunk->bytes;
1420 }
1421 } else {
1422 size = (int)(dstr->current - dstr->init.data);
1423 }
1424 return size;
1425 }
1426
1427 void Scm__DStringRealloc(ScmDString *dstr, int minincr)
1428 {
1429 ScmDStringChunk *newchunk;
1430 ScmDStringChain *newchain;
1431 int newsize;
1432
1433 /* sets the byte count of the last chunk */
1434 if (dstr->tail) {
1435 dstr->tail->chunk->bytes = (int)(dstr->current - dstr->tail->chunk->data);
1436 } else {
1437 dstr->init.bytes = (int)(dstr->current - dstr->init.data);
1438 }
1439
1440 /* determine the size of the new chunk. the increase factor 3 is
1441 somewhat arbitrary, determined by rudimental benchmarking. */
1442 newsize = dstr->lastChunkSize * 3;
1443 if (newsize > DSTRING_MAX_CHUNK_SIZE) {
1444 newsize = DSTRING_MAX_CHUNK_SIZE;
1445 }
1446 if (newsize < minincr) {
1447 newsize = minincr;
1448 }
1449
1450 newchunk = SCM_NEW_ATOMIC2(ScmDStringChunk*,
1451 sizeof(ScmDStringChunk)+newsize-SCM_DSTRING_INIT_CHUNK_SIZE);
1452 newchunk->bytes = 0;
1453
1454 newchain = SCM_NEW(ScmDStringChain);
1455
1456 newchain->next = NULL;
1457 newchain->chunk = newchunk;
1458 if (dstr->tail) {
1459 dstr->tail->next = newchain;
1460 dstr->tail = newchain;
1461 } else {
1462 dstr->anchor = dstr->tail = newchain;
1463 }
1464 dstr->current = newchunk->data;
1465 dstr->end = newchunk->data + newsize;
1466 dstr->lastChunkSize = newsize;
1467 }
1468
1469 /* Retrieve accumulated string. */
1470 static const char *dstring_getz(ScmDString *dstr, int *plen, int *psiz)
1471 {
1472 int size, len;
1473 char *buf;
1474 if (dstr->anchor == NULL) {
1475 /* we only have one chunk */
1476 size = (int)(dstr->current - dstr->init.data);
1477 len = dstr->length;
1478 buf = SCM_NEW_ATOMIC2(char*, size+1);
1479 memcpy(buf, dstr->init.data, size);
1480 buf[size] = '\0';
1481 } else {
1482 ScmDStringChain *chain = dstr->anchor;
1483 char *bptr;
1484
1485 size = Scm_DStringSize(dstr);
1486 len = dstr->length;
1487 bptr = buf = SCM_NEW_ATOMIC2(char*, size+1);
1488
1489 memcpy(bptr, dstr->init.data, dstr->init.bytes);
1490 bptr += dstr->init.bytes;
1491 for (; chain; chain = chain->next) {
1492 memcpy(bptr, chain->chunk->data, chain->chunk->bytes);
1493 bptr += chain->chunk->bytes;
1494 }
1495 *bptr = '\0';
1496 }
1497 if (len < 0) len = count_length(buf, size);
1498 *plen = len;
1499 *psiz = size;
1500 return buf;
1501 }
1502
1503 ScmObj Scm_DStringGet(ScmDString *dstr, int flags)
1504 {
1505 int len, size;
1506 const char *str = dstring_getz(dstr, &len, &size);
1507 return SCM_OBJ(make_str(len, size, str, flags));
1508 }
1509
1510 /* For conveninence. Note that dstr may already contain NUL byte in it,
1511 in that case you'll get chopped string. */
1512 const char *Scm_DStringGetz(ScmDString *dstr)
1513 {
1514 int len, size;
1515 return dstring_getz(dstr, &len, &size);
1516 }
1517
1518 void Scm_DStringPutz(ScmDString *dstr, const char *str, int size)
1519 {
1520 if (size < 0) size = strlen(str);
1521 if (dstr->current + size > dstr->end) {
1522 Scm__DStringRealloc(dstr, size);
1523 }
1524 memcpy(dstr->current, str, size);
1525 dstr->current += size;
1526 if (dstr->length >= 0) {
1527 int len = count_length(str, size);
1528 if (len >= 0) dstr->length += len;
1529 else dstr->length = -1;
1530 }
1531 }
1532
1533 void Scm_DStringAdd(ScmDString *dstr, ScmString *str)
1534 {
1535 const ScmStringBody *b = SCM_STRING_BODY(str);
1536 int size = SCM_STRING_BODY_SIZE(b);
1537 if (size == 0) return;
1538 if (dstr->current + size > dstr->end) {
1539 Scm__DStringRealloc(dstr, size);
1540 }
1541 memcpy(dstr->current, SCM_STRING_BODY_START(b), size);
1542 dstr->current += size;
1543 if (dstr->length >= 0 && !SCM_STRING_BODY_INCOMPLETE_P(b)) {
1544 dstr->length += SCM_STRING_BODY_LENGTH(b);
1545 } else {
1546 dstr->length = -1;
1547 }
1548 }
1549
1550 void Scm_DStringPutb(ScmDString *ds, char byte)
1551 {
1552 SCM_DSTRING_PUTB(ds, byte);
1553 }
1554
1555 void Scm_DStringPutc(ScmDString *ds, ScmChar ch)
1556 {
1557 SCM_DSTRING_PUTC(ds, ch);
1558 }
1559
1560
1561 /* for debug */
1562 #if SCM_DEBUG_HELPER
1563 void Scm_DStringDump(FILE *out, ScmDString *dstr)
1564 {
1565 fprintf(out, "DString %p\n", dstr);
1566 if (dstr->anchor) {
1567 ScmDStringChain *chain; int i;
1568 fprintf(out, " chunk0[%3d] = \"", dstr->init.bytes);
1569 fwrite(dstr->init.data, 1, dstr->init.bytes, out);
1570 fprintf(out, "\"\n");
1571 for (i=1, chain = dstr->anchor; chain; chain = chain->next, i++) {
1572 int size = (chain->next? chain->chunk->bytes : (int)(dstr->current - dstr->tail->chunk->data));
1573 fprintf(out, " chunk%d[%3d] = \"", i, size);
1574 fwrite(chain->chunk->data, 1, size, out);
1575 fprintf(out, "\"\n");
1576 }
1577 } else {
1578 int size = (int)(dstr->current - dstr->init.data);
1579 fprintf(out, " chunk0[%3d] = \"", size);
1580 fwrite(dstr->init.data, 1, size, out);
1581 fprintf(out, "\"\n");
1582 }
1583 }
1584 #endif /*SCM_DEBUG_HELPER*/
1585