root/ext/charconv/guess.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. guess_arc
  2. guess_dfa
  3. guess_jp
  4. Scm_Init_convguess

   1 /*
   2  * guess.c - guessing character encoding 
   3  *
   4  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
   5  * 
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  * 
  10  *   1. Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *
  13  *   2. Redistributions in binary form must reproduce the above copyright
  14  *      notice, this list of conditions and the following disclaimer in the
  15  *      documentation and/or other materials provided with the distribution.
  16  *
  17  *   3. Neither the name of the authors nor the names of its contributors
  18  *      may be used to endorse or promote products derived from this
  19  *      software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  $Id: guess.c,v 1.4 2004/10/06 09:25:36 shirok Exp $
  34  */
  35 
  36 #include <gauche.h>
  37 #include <gauche/extend.h>
  38 #include "charconv.h"
  39 
  /*
   * One arc (transition) of a guessing DFA.  When DFA_NEXT takes an arc,
   * the automaton moves to `next' and its running score is multiplied
   * by `score'.
   */
  struct guess_arc_rec {
      unsigned int next;          /* state entered when this arc is taken */
      double score;               /* factor multiplied into the DFA score */
  };
  typedef struct guess_arc_rec guess_arc;
  44 
  45 typedef struct guess_dfa_rec {
  46     signed char (*states)[256];
  47     guess_arc *arcs;
  48     int state;
  49     double score;
  50 } guess_dfa;
  51 
  /*
   * Positional initializer for a guess_dfa: `st' is the transition table,
   * `ar' the arc table.  The DFA starts in state 0 with a score of 1.0.
   */
  #define DFA_INIT(st, ar)        { (st), (ar), 0, 1.0 }
  54 
  /*
   * Advance `dfa' (an lvalue of type guess_dfa) by one input byte `ch'.
   *
   * Nothing happens if the DFA is already dead (state < 0).  Otherwise the
   * arc for `ch' is looked up in the current state's row of the transition
   * table: a negative entry kills the DFA (state becomes -1); any other
   * entry selects an arc, whose `next' becomes the new state and whose
   * `score' is multiplied into the running score.
   *
   * `ch' is converted to unsigned char before it is used as an index, so
   * a plain (possibly negative) char can be passed safely.  Both macro
   * parameters are parenthesized so that expressions such as *ptr work.
   * `dfa' is evaluated several times and must not have side effects.
   */
  #define DFA_NEXT(dfa, ch)                                                 \
      do {                                                                  \
          if ((dfa).state >= 0) {                                           \
              int dfa_arc_ =                                                \
                  (dfa).states[(dfa).state][(unsigned char)(ch)];           \
              if (dfa_arc_ < 0) {                                           \
                  (dfa).state = -1;                                         \
              } else {                                                      \
                  (dfa).state = (int)(dfa).arcs[dfa_arc_].next;             \
                  (dfa).score *= (dfa).arcs[dfa_arc_].score;                \
              }                                                             \
          }                                                                 \
      } while (0)
  68 
  /*
   * True while `dfa' has not rejected the input, i.e. its state is
   * non-negative.  The parameter is parenthesized so that an expression
   * such as *ptr can be passed.
   */
  #define DFA_ALIVE(dfa)  ((dfa).state >= 0)
  70 
  71 /* include DFA table generated by guess.scm */
  72 #include "guess_tab.c"
  73 
  74 static const char *guess_jp(const char *buf, int buflen, void *data)
  75 {
  76     int i;
  77     guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar);
  78     guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar);
  79     guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar);
  80     guess_dfa *top = NULL;
  81 
  82     for (i=0; i<buflen; i++) {
  83         int c = (unsigned char)buf[i];
  84 
  85         /* special treatment of jis escape sequence */
  86         if (c == 0x1b) {
  87             if (i < buflen-1) {
  88                 c = (unsigned char)buf[++i];
  89                 if (c == '$' || c == '(') return "ISO-2022-JP";
  90             }
  91         }
  92         
  93         if (DFA_ALIVE(eucj)) {
  94             if (!DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) return "EUC-JP";
  95             DFA_NEXT(eucj, c);
  96         }
  97         if (DFA_ALIVE(sjis)) {
  98             if (!DFA_ALIVE(eucj) && !DFA_ALIVE(utf8)) return "Shift_JIS";
  99             DFA_NEXT(sjis, c);
 100         }
 101         if (DFA_ALIVE(utf8)) {
 102             if (!DFA_ALIVE(sjis) && !DFA_ALIVE(eucj)) return "UTF-8";
 103             DFA_NEXT(utf8, c);
 104         }
 105 
 106         if (!DFA_ALIVE(eucj) && !DFA_ALIVE(sjis) && !DFA_ALIVE(utf8)) {
 107             /* we ran out the possibilities */
 108             return NULL;
 109         }
 110     }
 111 
 112     /* Now, we have ambigous code.  Pick the highest score.  If more than
 113        one candidate tie, pick the default encoding. */
 114     if (DFA_ALIVE(eucj)) top = &eucj;
 115     if (DFA_ALIVE(utf8)) {
 116         if (top) {
 117 #if defined GAUCHE_CHAR_ENCODING_UTF_8
 118             if (top->score <= utf8.score)  top = &utf8;
 119 #else
 120             if (top->score <  utf8.score) top = &utf8;
 121 #endif
 122         } else {
 123             top = &utf8;
 124         }
 125     }
 126     if (DFA_ALIVE(sjis)) {
 127         if (top) {
 128 #if defined GAUCHE_CHAR_ENCODING_SJIS
 129             if (top->score <= sjis.score)  top = &sjis;
 130 #else
 131             if (top->score <  sjis.score) top = &sjis;
 132 #endif
 133         } else {
 134             top = &sjis;
 135         }
 136     }
 137 
 138     if (top == &eucj) return "EUC-JP";
 139     if (top == &utf8) return "UTF-8";
 140     if (top == &sjis) return "Shift_JIS";
 141     return NULL;
 142 }
 143 
 144 
 145 /*
 146  * Initialization
 147  */
 148 
 149 void Scm_Init_convguess(void)
 150 {
 151     Scm_RegisterCodeGuessingProc("*JP", guess_jp, NULL);
 152 }
 153 

/* [<][>][^][v][top][bottom][index][help] */