/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- Scm_CharBackwardEUC
1 /*
2 * char-euc-jp.h
3 *
4 * Copyright (c) 2000-2004 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: char_euc_jp.h,v 1.15 2004/08/01 05:27:11 shirok Exp $
34 */
35
36 #ifndef SCM_CHAR_ENCODING_BODY
37 /*===============================================================
38 * Header part
39 */
40
/* Name of this native encoding.  The Scheme procedure
 * gauche-character-encoding returns a symbol with this name.
 */
#define SCM_CHAR_ENCODING_NAME  "euc-jp"
45
/* Given the first byte CH of a character, yield the number of bytes
 * that follow it: 0 when the byte is below 0x80 (a single-byte
 * character), 2 when the byte is 0x8f, and 1 for every other byte of
 * 0x80 or above.
 * The general contract allows -1 for a byte that cannot start a
 * multibyte character, but this definition never produces it.
 * CH is converted to unsigned char before being tested.  It is expanded
 * more than once, so it should be free of side effects.
 */
#define SCM_CHAR_NFOLLOWS(ch)                           \
    ((((unsigned char)(ch)) < 0x80) ? 0 :               \
     ((((unsigned char)(ch)) != 0x8f) ? 1 : 2))
55
/* Given a wide character CH, yield the number of bytes it occupies
 * when encoded in a multibyte string: 1 for values below 0x80,
 * 2 for values below 0x10000, and 3 otherwise.
 * CH may be expanded twice, so it should be free of side effects.
 * NOTE(review): SCM_CHAR_PUT stores a single byte for values in
 * 0x80..0xff while this macro reports 2 -- presumably such values never
 * occur as characters in this encoding; confirm against callers.
 */
#define SCM_CHAR_NBYTES(ch)                             \
    (((ch) < 0x80) ? 1 : (((ch) < 0x10000) ? 2 : 3))
61
/* Maximum number of bytes a single multibyte character can occupy. */
#define SCM_CHAR_MAX_BYTES  3
64
/* Decode one character from the multibyte string at const char *cp
 * and store it in ScmChar ch.  cp itself is not modified.
 *
 *  - A first byte below 0x80 is stored as is.
 *  - Otherwise the second byte must be 0xa1 or above, or ch is set to
 *    SCM_CHAR_INVALID.
 *  - If the first byte is 0x8f and the third byte is also 0xa1 or
 *    above, the three bytes are packed as (b0 << 16) + (b1 << 8) + b2.
 *  - In every remaining case the first two bytes are packed as
 *    (b0 << 8) + b1.
 *
 * The checks are deliberately loose: byte sequences that are not
 * strictly EUC-JP are accepted; the intent is only to reject the
 * 'harmful' ones.  Both cp and ch are expanded several times, so they
 * should be free of side effects.
 */
#define SCM_CHAR_GET(cp, ch)                                        \
    do {                                                            \
        (ch) = (unsigned char)*(cp);                                \
        if ((ch) >= 0x80) {                                         \
            if ((unsigned char)(cp)[1] < 0xa1) {                    \
                (ch) = SCM_CHAR_INVALID;                            \
            } else if ((ch) == 0x8f &&                              \
                       (unsigned char)(cp)[2] >= 0xa1) {            \
                (ch) = ((ch) << 16)                                 \
                    + ((unsigned char)(cp)[1] << 8)                 \
                    + (unsigned char)(cp)[2];                       \
            } else {                                                \
                (ch) = ((ch) << 8) + (unsigned char)(cp)[1];        \
            }                                                       \
        }                                                           \
    } while (0)
88
/* Convert a character CH to multibyte form and put it to the buffer
 * starting from char *cp.  You can assume the buffer has enough length
 * to contain the multibyte char.  cp is not modified.
 *
 *  - CH above 0xffff is written as three bytes (bits 16-23, 8-15, 0-7).
 *  - CH above 0xff is written as two bytes (bits 8-15, 0-7).
 *  - Any other CH is written as one byte (bits 0-7).
 *
 * Every use of CH in the expansion is parenthesized, so an argument
 * containing low-precedence operators (e.g. `a | b') is handled
 * correctly.  Both cp and ch are still expanded several times, so they
 * should be free of side effects.
 */
#define SCM_CHAR_PUT(cp, ch)                            \
    do {                                                \
        if ((ch) > 0xff) {                              \
            if ((ch) > 0xffff) {                        \
                (cp)[0] = ((ch) >> 16) & 0xff;          \
                (cp)[1] = ((ch) >> 8) & 0xff;           \
                (cp)[2] = (ch) & 0xff;                  \
            } else {                                    \
                (cp)[0] = ((ch) >> 8) & 0xff;           \
                (cp)[1] = (ch) & 0xff;                  \
            }                                           \
        } else {                                        \
            (cp)[0] = (ch) & 0xff;                      \
        }                                               \
    } while (0)
108
109 /* const char *cp points to a multibyte string. Set const char *result
110 * to point to the previous character of the one cp points to.
111 * const char *start points to the beginning of the buffer.
112 * result is set to NULL if there's no valid multibyte char found
113 * just before cp. cp and start is not modified.
114 */
115 #define SCM_CHAR_BACKWARD(cp, start, result) \
116 ((result) = Scm_CharBackwardEUC(cp, start))
117
118 SCM_EXTERN const char *Scm_CharBackwardEUC(const char *cp, const char *start);
119
120 #else /* !SCM_CHAR_ENCODING_BODY */
121 /*==================================================================
122 * This part is included in char.c
123 */
124
/* NULL-terminated list of character encoding names, as recognized by
   iconv, that are compatible with this native encoding. */
static const char *supportedCharacterEncodings[] = { "EUC-JP", "EUCJP", NULL };
132
/* An ad-hoc algorithm to return a ptr to the previous character
   boundary.  Note that it is pretty permissive---the string
   can possibly include an illegal encoding.

   CP points just past the character being looked for and START points
   to the beginning of the buffer.  Candidates are tried from longest
   to shortest:
     - three bytes: 0x8f followed by two bytes of 0x80 or above;
     - two bytes:   both 0x80 or above;
     - one byte:    below 0x80.
   The first candidate that matches is returned.  NULL is returned if
   none matches, or if CP is not after START (there is no previous
   character to find).  No byte before START is ever read. */

const char *Scm_CharBackwardEUC(const char *cp, const char *start)
{
    const unsigned char *t;

    /* With nothing between start and cp the distance is 0 (or negative);
       bail out here so the switch below cannot reach `default' and look
       at bytes in front of the buffer. */
    if (cp <= start) return NULL;

    /* be careful not to access beyond the beginning of the string */
    switch (cp - start) {
    default:
        /* at least three bytes available */
        t = (const unsigned char*)(cp-3);
        if (t[0] == 0x8f && t[1] >= 0x80 && t[2] >= 0x80) {
            return (const char *)t;
        }
        /*FALLTHROUGH*/
    case 2:
        t = (const unsigned char*)(cp-2);
        if (t[0] >= 0x80 && t[1] >= 0x80) {
            return (const char*)t;
        }
        /*FALLTHROUGH*/
    case 1:
        t = (const unsigned char*)(cp-1);
        if (t[0] < 0x80) {
            return (const char*)t;
        }
    }
    return NULL;
}
162
163 #endif /* !SCM_CHAR_ENCODING_BODY */