src/gauche/char_euc

root/src/gauche/char_euc_jp.h

/* [<][>][^][v][top][bottom][index][help] */
INCLUDED FROM

DEFINITIONS

This source file includes following definitions.
Scm_CharBackwardEUC
   1 /*
   2  * char-euc-jp.h
   3  *
   4  *   Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: char_euc_jp.h,v 1.15 2004/08/01 05:27:11 shirok Exp $
  34  */
  35 
  36 #ifndef SCM_CHAR_ENCODING_BODY
  37 /*===============================================================
  38  * Header part
  39  */
  40 
  41 /* Name of this native character encoding.  The Scheme procedure
  42  * gauche-character-encoding returns a symbol with this name.
  43  */
  44 #define SCM_CHAR_ENCODING_NAME "euc-jp"
  45 
  46 /* Given first byte of the multibyte character, returns # of
  47  * bytes that follows, i.e. if the byte consists a single-byte
  48  * character, it returns 0; if the byte is the first byte of
  49  * two-byte character, it returns 1.   It may return -1 if
  50  * the given byte can't be a valid first byte of multibyte characters.
  51  */
  52 #define SCM_CHAR_NFOLLOWS(ch)                           \
  53     ((((unsigned char)(ch)) >= 0x80) ?                  \
  54      ((((unsigned char)(ch)) == 0x8f) ? 2 : 1) : 0)
  55 
  56 /* Given wide character CH, returns # of bytes used when CH is
  57  * encoded in multibyte string.
  58  */
  59 #define SCM_CHAR_NBYTES(ch) \
  60     (((ch) >= 0x80) ? (((ch) >= 0x10000)? 3 : 2) : 1)
  61 
  62 /* Maximum number of bytes in one multibyte character */
  63 #define SCM_CHAR_MAX_BYTES     3
  64 
  65 /* From a multibyte string pointed by const char *cp, extract a character
  66  * and store it in ScmChar ch.  If cp doesn't point to valid multibyte
  67  * character, store SCM_CHAR_INVALID to ch.  cp is not modified.
  68  */
  69 /* The tests aren't "exact" in the sense that it accepts not-quite
  70    EUC-JP sequence, but I hope they at least exclude the 'harmful'
  71    sequences */
  72 #define SCM_CHAR_GET(cp, ch)                                    \
  73     do {                                                        \
  74         if (((ch) = (unsigned char)*(cp)) >= 0x80) {            \
  75             if ((ch) == 0x8f &&                                 \
  76                 (unsigned char)(cp)[1] >= 0xa1 &&               \
  77                 (unsigned char)(cp)[2] >= 0xa1) {               \
  78                  (ch) = ((ch) << 16)                            \
  79                      + ((unsigned char)(cp)[1] << 8)            \
  80                      + (unsigned char)(cp)[2];                  \
  81             } else if ((unsigned char)(cp)[1] >= 0xa1) {        \
  82                 (ch) = ((ch) << 8) + (unsigned char)(cp)[1];    \
  83             } else {                                            \
  84                 (ch) = SCM_CHAR_INVALID;                        \
  85             }                                                   \
  86         }                                                       \
  87     } while (0)
  88 
  89 /* Convert a character CH to multibyte form and put it to the buffer
  90  * starting from char *cp.  You can assume the buffer has enough length
  91  * to contain the multibyte char.   cp is not modified.
  92  */
  93 #define SCM_CHAR_PUT(cp, ch)                    \
  94     do {                                        \
  95         if ((ch) > 0xff) {                      \
  96             if ((ch) > 0xffff) {                \
  97                 (cp)[0] = ((ch) >> 16) & 0xff;  \
  98                 (cp)[1] = ((ch) >> 8) & 0xff;   \
  99                 (cp)[2] = (ch) & 0xff;          \
 100             } else {                            \
 101                 (cp)[0] = (ch >> 8) & 0xff;     \
 102                 (cp)[1] = ch & 0xff;            \
 103             }                                   \
 104         } else {                                \
 105             (cp)[0] = ch & 0xff;                \
 106         }                                       \
 107     } while (0)
 108 
 109 /* const char *cp points to a multibyte string.  Set const char *result
 110  * to point to the previous character of the one cp points to.
 111  * const char *start points to the beginning of the buffer.
 112  * result is set to NULL if there's no valid multibyte char found
 113  * just before cp.   cp and start is not modified.
 114  */
 115 #define SCM_CHAR_BACKWARD(cp, start, result)                    \
 116     ((result) = Scm_CharBackwardEUC(cp, start))
 117 
 118 SCM_EXTERN const char *Scm_CharBackwardEUC(const char *cp, const char *start);
 119 
 120 #else  /* !SCM_CHAR_ENCODING_BODY */
 121 /*==================================================================
 122  * This part is included in char.c
 123  */
 124 
 125 /* Array of character encoding names, recognizable by iconv, that are
 126    compatible with this native encoding. */
 127 static const char *supportedCharacterEncodings[] = {
 128     "EUC-JP",
 129     "EUCJP",
 130     NULL
 131 };
 132 
 133 /* An ad-hoc algorithm to return a ptr to the previous character
 134    boundary.  Note that it is pretty permissive---the string
 135    can possibly include a illegal encoding. */
 136 
 137 const char *Scm_CharBackwardEUC(const char *cp, const char *start)
 138 {
 139     const unsigned char *t;
 140     /* be careful not to access beyond the beginning of the string */
 141     switch (cp - start) {
 142     default:
 143         t = (unsigned char*)(cp-3);
 144         if (t[0] == 0x8f && t[1] >= 0x80 && t[2] >= 0x80) {
 145             return (const char *)t;
 146         }
 147         /*FALLTHROUGH*/
 148     case 2:
 149         t = (unsigned char*)(cp-2);
 150         if (t[0] >= 0x80 && t[1] >= 0x80) {
 151             return (const char*)t;
 152         }
 153         /*FALLTHROUGH*/
 154     case 1:
 155         t = (unsigned char*)(cp-1);
 156         if (t[0] < 0x80) {
 157             return (const char*)t;
 158         }
 159     }
 160     return NULL;
 161 }
 162 
 163 #endif /* !SCM_CHAR_ENCODING_BODY */
/* [<][>][^][v][top][bottom][index][help] */