root/src/gauche/char_utf_8.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes the following definitions.
  1. Scm_CharUtf8Getc
  2. Scm_CharUtf8Putc

   1 /*
   2  * char_utf_8.h - UTF-8 encoding interface
   3  *
   4  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: char_utf_8.h,v 1.10 2005/08/05 11:38:26 shirok Exp $
  34  */
  35 
  36 #ifndef SCM_CHAR_ENCODING_BODY
  37 /*===============================================================
  38  * Header part
  39  */
  40 
  41 /* The name of the encoding.  Scheme procedure 
  42  * gauche-character-encoding returns a symbol with this name.
  43  */
  44 #define SCM_CHAR_ENCODING_NAME "utf-8"
  45 
  46 SCM_EXTERN char Scm_CharSizeTable[];
  47 SCM_EXTERN ScmChar Scm_CharUtf8Getc(const unsigned char *);
  48 SCM_EXTERN void Scm_CharUtf8Putc(unsigned char *, ScmChar);
  49 
  50 /* Given first byte of the multibyte character, returns # of
  51  * bytes that follows, i.e. if the byte consists a single-byte
  52  * character, it returns 0; if the byte is the first byte of
  53  * two-byte character, it returns 1.   It may return -1 if
  54  * the given byte can't be a valid first byte of multibyte characters.
  55  */
  56 #define SCM_CHAR_NFOLLOWS(ch) ((int)Scm_CharSizeTable[(unsigned char)(ch)])
  57 
  58 /* Given wide character CH, returns # of bytes used when CH is
  59  * encoded in multibyte string.
  60  */
  61 #define SCM_CHAR_NBYTES(ch)                     \
  62     (((ch) < 0x80) ? 1 :                        \
  63      (((ch) < 0x800) ? 2 :                      \
  64       (((ch) < 0x10000) ? 3 :                   \
  65        (((ch) < 0x200000) ? 4 :                 \
  66         (((ch) < 0x4000000) ? 5 : 6)))))
  67 
  68 /* Maximun # of multibyte character */
  69 #define SCM_CHAR_MAX_BYTES     6
  70 
  71 /* From a multibyte string pointed by const char *cp, extract a character
  72  * and store it in ScmChar ch.  If cp doesn't point to valid multibyte
  73  * character, store SCM_CHAR_INVALID to ch.  cp is not modified.
  74  */
  75 #define SCM_CHAR_GET(cp, ch)                                    \
  76     do {                                                        \
  77         if (((ch) = (unsigned char)*(cp)) >= 0x80) {            \
  78             (ch) = Scm_CharUtf8Getc((unsigned char*)cp);        \
  79         }                                                       \
  80     } while (0)
  81 
  82 /* Convert a character CH to multibyte form and put it to the buffer
  83  * starting from char *cp.  You can assume the buffer has enough length
  84  * to contain the multibyte char.   cp is not modified.
  85  */
  86 #define SCM_CHAR_PUT(cp, ch)                            \
  87     do {                                                \
  88         if (ch >= 0x80) {                               \
  89             Scm_CharUtf8Putc((unsigned char*)cp, ch);   \
  90         } else {                                        \
  91             *(cp) = (ch);                               \
  92         }                                               \
  93     } while (0)
  94 
  95 /* const char *cp points to a multibyte string.  Set const char *result
  96  * to point to the previous character of the one cp points to.
  97  * const char *start points to the beginning of the buffer.
  98  * result is set to NULL if there's no valid multibyte char found
  99  * just before cp.   cp and start is not modified.
 100  */
 101 #define SCM_CHAR_BACKWARD(cp, start, result)                    \
 102     do {                                                        \
 103         switch ((cp) - (start)) {                               \
 104         default:                                                \
 105             (result) = (cp) - 6;                                \
 106             if (SCM_CHAR_NFOLLOWS(*(result)) == 5) break;       \
 107             /* FALLTHROUGH */                                   \
 108         case 5:                                                 \
 109             (result) = (cp) - 5;                                \
 110             if (SCM_CHAR_NFOLLOWS(*(result)) == 4) break;       \
 111             /* FALLTHROUGH */                                   \
 112         case 4:                                                 \
 113             (result) = (cp) - 4;                                \
 114             if (SCM_CHAR_NFOLLOWS(*(result)) == 3) break;       \
 115             /* FALLTHROUGH */                                   \
 116         case 3:                                                 \
 117             (result) = (cp) - 3;                                \
 118             if (SCM_CHAR_NFOLLOWS(*(result)) == 2) break;       \
 119             /* FALLTHROUGH */                                   \
 120         case 2:                                                 \
 121             (result) = (cp) - 2;                                \
 122             if (SCM_CHAR_NFOLLOWS(*(result)) == 1) break;       \
 123             /* FALLTHROUGH */                                   \
 124         case 1:                                                 \
 125             (result) = (cp) - 1;                                \
 126             if (SCM_CHAR_NFOLLOWS(*(result)) == 0) break;       \
 127             (result) = NULL;                                    \
 128         }                                                       \
 129     } while (0)
 130 
 131 #else  /* !SCM_CHAR_ENCODING_BODY */
 132 /*==================================================================
 133  * This part is included in char.c
 134  */
 135 
 136 /* Array of character encoding names, recognizable by iconv, that are
 137    compatible with this native encoding. */
 138 static const char *supportedCharacterEncodings[] = {
 139     "UTF-8",
 140     "ISO-10646/UTF-8",
 141     "UTF8",
 142     NULL
 143 };
 144 
 145 char Scm_CharSizeTable[256] = {
 146     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */
 147     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */
 148     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
 149     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */
 150     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 4x */
 151     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5x */
 152     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 6x */
 153     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7x */
 154     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 8x */
 155     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9x */
 156     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* ax */
 157     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* bx */
 158     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* cx */
 159     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* dx */
 160     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* ex */
 161     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0  /* fx */
 162 };
 163 
 164 ScmChar Scm_CharUtf8Getc(const unsigned char *cp)
 165 {
 166     ScmChar ch;
 167     unsigned char *ucp = (unsigned char *)cp;
 168     unsigned char first = *ucp++;
 169     if (first < 0x80) { ch = first; }
 170     else if (first < 0xc0) { ch = SCM_CHAR_INVALID; }
 171     else if (first < 0xe0) {
 172         ch = first&0x1f;
 173         ch = (ch<<6) | (*ucp++&0x3f);
 174         if (ch < 0x80) ch = SCM_CHAR_INVALID;
 175     }
 176     else if (first < 0xf0) {
 177         ch = first&0x0f;
 178         ch = (ch<<6) | (*ucp++&0x3f);
 179         ch = (ch<<6) | (*ucp++&0x3f);
 180         if (ch < 0x800) ch = SCM_CHAR_INVALID;
 181     }
 182     else if (first < 0xf8) {
 183         ch = first&0x07;
 184         ch = (ch<<6) | (*ucp++&0x3f);
 185         ch = (ch<<6) | (*ucp++&0x3f);
 186         ch = (ch<<6) | (*ucp++&0x3f);
 187         if (ch < 0x10000) ch = SCM_CHAR_INVALID;
 188     }
 189     else if (first < 0xfc) {
 190         ch = first&0x03;
 191         ch = (ch<<6) | (*ucp++&0x3f);
 192         ch = (ch<<6) | (*ucp++&0x3f);
 193         ch = (ch<<6) | (*ucp++&0x3f);
 194         ch = (ch<<6) | (*ucp++&0x3f);
 195         if (ch < 0x200000) ch = SCM_CHAR_INVALID;
 196     }
 197     else if (first < 0xfe) {
 198         ch = first&0x01;
 199         ch = (ch<<6) | (*ucp++&0x3f);
 200         ch = (ch<<6) | (*ucp++&0x3f);
 201         ch = (ch<<6) | (*ucp++&0x3f);
 202         ch = (ch<<6) | (*ucp++&0x3f);
 203         ch = (ch<<6) | (*ucp++&0x3f);
 204         if (ch < 0x4000000) ch = SCM_CHAR_INVALID;
 205     }
 206     else {
 207         ch = SCM_CHAR_INVALID;
 208     }
 209     return ch;
 210 }
 211 
 212 void Scm_CharUtf8Putc(unsigned char *cp, ScmChar ch)
 213 {
 214     if (ch < 0x80) {
 215         *cp = ch;
 216     }
 217     else if (ch < 0x800) {
 218         *cp++ = ((ch>>6)&0x1f) | 0xc0;
 219         *cp = (ch&0x3f) | 0x80;
 220     }
 221     else if (ch < 0x10000) {
 222         *cp++ = ((ch>>12)&0x0f) | 0xe0;
 223         *cp++ = ((ch>>6)&0x3f) | 0x80;
 224         *cp = (ch&0x3f) | 0x80;
 225     }
 226     else if (ch < 0x200000) {
 227         *cp++ = ((ch>>18)&0x07) | 0xf0;
 228         *cp++ = ((ch>>12)&0x3f) | 0x80;
 229         *cp++ = ((ch>>6)&0x3f) | 0x80;
 230         *cp = (ch&0x3f) | 0x80;
 231     }
 232     else if (ch < 0x4000000) {
 233         *cp++ = ((ch>>24)&0x03) | 0xf8;
 234         *cp++ = ((ch>>18)&0x3f) | 0x80;
 235         *cp++ = ((ch>>12)&0x3f) | 0x80;
 236         *cp++ = ((ch>>6)&0x3f) | 0x80;
 237         *cp = (ch&0x3f) | 0x80;
 238     } else {
 239         *cp++ = ((ch>>30)&0x1) | 0xfc;
 240         *cp++ = ((ch>>24)&0x3f) | 0x80;
 241         *cp++ = ((ch>>18)&0x3f) | 0x80;
 242         *cp++ = ((ch>>12)&0x3f) | 0x80;
 243         *cp++ = ((ch>>6)&0x3f) | 0x80;
 244         *cp++ = (ch&0x3f) | 0x80;
 245     }
 246 }
 247 
 248 #endif /* !SCM_CHAR_ENCODING_BODY */

/* [<][>][^][v][top][bottom][index][help] */