Julius: libsent/src/util/charconv_win32.c ソースファイル

00001 
00034 /*
00035  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00036  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00037  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00038  * All rights reserved
00039  */
00040 
00041 #include <sent/stddefs.h>
00042 
00043 #ifdef CHARACTER_CONVERSION
00044 
00045 #ifdef USE_WIN32_MULTIBYTE
00046 
00047 #include <windows.h>
00048 #include <winnls.h>
00049 #include <jlib.h>
00050 
00051 static boolean euctosjis = FALSE; /* TRUE when the source charset was given as "euc-jp"/"euc"/"eucjp": charconv_win32() then runs toStringSJIS() on the input first */
00052 static boolean only_euc_conv = FALSE; /* TRUE when euctosjis is set and source/target code pages resolve to the same one: charconv_win32() returns right after the EUC pre-conversion */
00053 
00054 static unsigned int from_cp;    /* source code page, passed to MultiByteToWideChar() */
00055 static unsigned int to_cp;      /* target code page, passed to WideCharToMultiByte() */
00056 
00066 boolean
00067 charconv_win32_setup(char *fromcode, char *tocode, boolean *enable_conv)
00068 {
00069   unsigned int src_p, dst_p;
00070   
00071   if (tocode == NULL) {
00072     /* just disable conversion */
00073     *enable_conv = FALSE;
00074   } else {
00075     /* determine source character set */
00076     if (fromcode == NULL) {
00077       j_printerr("Error: charset names of both input and output should be given.\n");
00078       j_printerr("Error: use \"-charconv from to\" instead of \"-kanji\".\n");
00079       *enable_conv = FALSE;
00080       return FALSE;
00081     }
00082     euctosjis = FALSE;
00083     if (strmatch(fromcode, "euc-jp")
00084                || strmatch(fromcode, "euc")
00085                || strmatch(fromcode, "eucjp")) {
00086       /* pre-convert Japanese euc to Shift-jis */
00087       euctosjis = TRUE;
00088       /* input = Shift_jis (codepage 932) */
00089       from_cp = 932;
00090     } else if (strmatch(fromcode, "ansi")) {
00091       /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/
00092       from_cp = CP_ACP;
00093     } else if (strmatch(fromcode, "mac")) {
00094       /* Macintosh codepage */
00095       from_cp = CP_MACCP;
00096     } else if (strmatch(fromcode, "oem")) {
00097       /* OEM localized default codepage */
00098       from_cp = CP_OEMCP;
00099     } else if (strmatch(fromcode, "utf-7")) {
00100       /* UTF-7 codepage */
00101       from_cp = CP_UTF7;
00102     } else if (strmatch(fromcode, "utf-8")) {
00103       /* UTF-8 codepage */
00104       from_cp = CP_UTF8;
00105     } else if (strmatch(fromcode, "sjis")
00106                || strmatch(fromcode, "sjis-win")
00107                || strmatch(fromcode, "shift-jis")
00108                || strmatch(fromcode, "shift_jis")) {
00109       /* sjis codepage = 932 */
00110       from_cp = 932;
00111     } else if (fromcode[0] >= '0' && fromcode[0] <= '9') {
00112       /* codepage number */
00113       from_cp = atoi(fromcode);
00114       if (! IsValidCodePage(from_cp)) {
00115         j_printerr("Error: codepage #%d not found\n", from_cp);
00116         *enable_conv = FALSE;
00117         return FALSE;
00118       }
00119     } else {
00120       j_printerr("Error: unknown source codepage \"%s\"\n", fromcode);
00121       j_printerr("Error: valids are \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\" and codepage number\n");
00122       j_printerr("Error: the default local charcode can be speicified by \"ansi\".\n");
00123       *enable_conv = FALSE;
00124       return FALSE;
00125     }
00126     /* determine the target character set */
00127     if (strmatch(tocode, "ansi")) {
00128       /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/
00129       to_cp = CP_ACP;
00130     } else if (strmatch(tocode, "mac")) {
00131       /* Macintosh codepage */
00132       to_cp = CP_MACCP;
00133     } else if (strmatch(tocode, "oem")) {
00134       /* OEM codepage */
00135       to_cp = CP_OEMCP;
00136     } else if (strmatch(tocode, "utf-7")) {
00137       /* UTF-7 codepage */
00138       to_cp = CP_UTF7;
00139     } else if (strmatch(tocode, "utf-8")) {
00140       /* UTF-8 codepage */
00141       to_cp = CP_UTF8;
00142     } else if (strmatch(tocode, "sjis")
00143                || strmatch(tocode, "sjis-win")
00144                || strmatch(tocode, "shift-jis")
00145                || strmatch(tocode, "shift_jis")) {
00146       /* sjis codepage = 932 */
00147       to_cp = 932;
00148     } else if (tocode[0] >= '0' && tocode[0] <= '9') {
00149       /* codepage number */
00150       to_cp = atoi(tocode);
00151       if (! IsValidCodePage(to_cp)) {
00152         j_printerr("Error: codepage #%d not found\n", to_cp);
00153         *enable_conv = FALSE;
00154         return FALSE;
00155       }
00156     } else {
00157       j_printerr("Error: unknown target codepage \"%s\"\n", tocode);
00158       j_printerr("Error: valids are \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\" and codepage number\n");
00159       j_printerr("Error: the default local charcode can be speicified by \"ansi\".\n");
00160       *enable_conv = FALSE;
00161       return FALSE;
00162     }
00163     
00164     /* check whether the actual conversion is needed */
00165     src_p = from_cp;
00166     dst_p = to_cp;
00167     if (src_p == CP_ACP) src_p = GetACP();
00168     if (dst_p == CP_ACP) dst_p = GetACP();
00169     if (src_p == CP_OEMCP) src_p = GetOEMCP();
00170     if (dst_p == CP_OEMCP) dst_p = GetOEMCP();
00171     
00172     if (src_p == dst_p) {
00173       if (euctosjis == FALSE) {
00174         only_euc_conv = FALSE;
00175         *enable_conv = FALSE;
00176       } else {
00177         only_euc_conv = TRUE;
00178         *enable_conv = TRUE;
00179       }
00180     } else {
00181       only_euc_conv = FALSE;
00182       *enable_conv = TRUE;
00183     }
00184   }
00185   
00186   return TRUE;
00187 }
00188 
00189 #define UNICODE_BUFFER_SIZE 4096 /* capacity of unibuf, counted in wchar_t elements */
00190 static wchar_t unibuf[UNICODE_BUFFER_SIZE]; /* intermediate wide-character buffer used by charconv_win32() between the two conversion steps */
00191 
00192 
00202 char *
00203 charconv_win32(char *instr, char *outstr, int maxoutlen)
00204 {
00205   int unilen, newlen;
00206   char *srcbuf;
00207   
00208   srcbuf = instr;
00209   if (euctosjis == TRUE) {
00210     /* euc->sjis conversion */
00211     toStringSJIS(instr, outstr, maxoutlen);
00212     srcbuf = outstr;
00213     if (only_euc_conv) {
00214       return(outstr);
00215     }
00216   }
00217   
00218   /* get length of unicode string */
00219   unilen = MultiByteToWideChar(from_cp, 0, srcbuf, -1, NULL, 0);
00220   if (unilen <= 0) {
00221     j_printerr("conversion error?\n");
00222     return(instr);
00223   }
00224   if (unilen > UNICODE_BUFFER_SIZE) {
00225     j_printerr("InternalError: unicode buffer size exceeded (%d > %d)!\n", unilen, UNICODE_BUFFER_SIZE);
00226     return(instr);
00227   }
00228   /* convert source string to unicode */
00229   MultiByteToWideChar(from_cp, 0, srcbuf, -1, unibuf, unilen);
00230   /* get length of target string */
00231   newlen = WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, 0, NULL, NULL);
00232   if (newlen <= 0) {
00233     j_printerr("conversion error?\n");
00234     return(instr);
00235   }
00236   if (newlen > maxoutlen) {
00237     j_printerr("InternalError: target buffer size exceeded (%d > %d)!\n", newlen, maxoutlen);
00238     return(instr);
00239   }
00240   /* convert unicode to target string */
00241   WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, newlen, NULL, NULL);
00242 
00243   return(outstr);
00244 }
00245 
00246 #endif /* USE_WIN32_MULTIBYTE */
00247 
00248 #endif /* CHARACTER_CONVERSION */