/* [<][>][^][v][top][bottom][index][help] */
DEFINITIONS
This source file includes the following definitions.
- guess_arc
- guess_dfa
- guess_jp
- Scm_Init_convguess
1 /*
2 * guess.c - guessing character encoding
3 *
4 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * $Id: guess.c,v 1.4 2004/10/06 09:25:36 shirok Exp $
34 */
35
36 #include <gauche.h>
37 #include <gauche/extend.h>
38 #include "charconv.h"
39
/*
 * One arc (transition) of a guessing DFA.
 */
struct guess_arc_rec {
    unsigned int next;  /* state the DFA moves to when this arc is taken */
    double score;       /* factor multiplied into the DFA's running score */
};

typedef struct guess_arc_rec guess_arc;
44
45 typedef struct guess_dfa_rec {
46 signed char (*states)[256];
47 guess_arc *arcs;
48 int state;
49 double score;
50 } guess_dfa;
51
/*
 * Brace initializer for a guess_dfa: ST becomes the state table and AR
 * the arc table; the DFA starts in state 0 with a score of 1.0 (scores
 * are accumulated by multiplication).
 */
#define DFA_INIT(st, ar) \
    { (st), (ar), 0, 1.0 }
54
/*
 * Feed one input byte CH to DFA and advance it in place.
 *
 * DFA must be a modifiable guess_dfa lvalue; it is evaluated several
 * times, so it must not have side effects.  CH is used to index a
 * 256-entry row and must therefore be in the range 0..255 (callers
 * should convert through unsigned char first).
 *
 * A DFA that is already dead (state < 0) is left untouched.  If the
 * current state has no arc for CH the DFA dies (state = -1); otherwise
 * the state moves to the arc's target and the arc's score is multiplied
 * into the running score.
 *
 * Every use of the arguments is parenthesized so that expressions such
 * as `*ptr` can be passed safely.
 */
#define DFA_NEXT(dfa, ch)                                               \
    do {                                                                \
        if ((dfa).state >= 0) {                                         \
            int dfa_arc_idx_ = (dfa).states[(dfa).state][(ch)];         \
            if (dfa_arc_idx_ < 0) {                                     \
                (dfa).state = -1;                                       \
            } else {                                                    \
                (dfa).state = (int)(dfa).arcs[dfa_arc_idx_].next;       \
                (dfa).score *= (dfa).arcs[dfa_arc_idx_].score;          \
            }                                                           \
        }                                                               \
    } while (0)
68
/* True while DFA has not rejected the input (its state is non-negative).
   The argument is parenthesized so expressions such as `*ptr` work. */
#define DFA_ALIVE(dfa)  ((dfa).state >= 0)
70
71 /* include DFA table generated by guess.scm */
72 #include "guess_tab.c"
73
74 static const char *guess_jp(const char *buf, int buflen, void *data)
75 {
76 int i;
77 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
78 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
79 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
80 guess_dfa *top = NULL;
81
82 for (i=0; i<buflen; i++) {
83 int c = (unsigned char)buf[i];
84
85 /* special treatment of jis escape sequence */
86 if (c == 0x1b) {
87 if (i < buflen-1) {
88 c = (unsigned char)buf[++i];
89 if (c == '$' || c == '(') return "ISO-2022-JP";
90 }
91 }
92
93 if (DFA_ALIVE(eucj)) {
94 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) return "EUC-JP";
95 DFA_NEXT(eucj, c);
96 }
97 if (DFA_ALIVE(sjis)) {
98 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) return "Shift_JIS";
99 DFA_NEXT(sjis, c);
100 }
101 if (DFA_ALIVE(utf8)) {
102 if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) return "UTF-8";
103 DFA_NEXT(utf8, c);
104 }
105
106 if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
107 /* we ran out the possibilities */
108 return NULL;
109 }
110 }
111
112 /* Now, we have ambigous code. Pick the highest score. If more than
113 one candidate tie, pick the default encoding. */
114 if (DFA_ALIVE(eucj)) top = &eucj;
115 if (DFA_ALIVE(utf8)) {
116 if (top) {
117 #if defined GAUCHE_CHAR_ENCODING_UTF_8
118 if (top->score <= utf8.score) top = &utf8;
119 #else
120 if (top->score < utf8.score) top = &utf8;
121 #endif
122 } else {
123 top = &utf8;
124 }
125 }
126 if (DFA_ALIVE(sjis)) {
127 if (top) {
128 #if defined GAUCHE_CHAR_ENCODING_SJIS
129 if (top->score <= sjis.score) top = &sjis;
130 #else
131 if (top->score < sjis.score) top = &sjis;
132 #endif
133 } else {
134 top = &sjis;
135 }
136 }
137
138 if (top == &eucj) return "EUC-JP";
139 if (top == &utf8) return "UTF-8";
140 if (top == &sjis) return "Shift_JIS";
141 return NULL;
142 }
143
144
145 /*
146 * Initialization
147 */
148
149 void Scm_Init_convguess(void)
150 {
151 Scm_RegisterCodeGuessingProc("*JP", guess_jp, NULL);
152 }
153