Julius: julius/charconv_win32.c Source File

00001 
00035 /*
00036  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00037  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00038  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00039  * All rights reserved
00040  */
00041 
00042 #include "app.h"
00043 
00044 #ifdef CHARACTER_CONVERSION
00045 
00046 #ifdef USE_WIN32_MULTIBYTE
00047 
00048 #include <windows.h>
00049 #include <winnls.h>
00050 #include "libjcode/jlib.h"
00051 
00052 static boolean euctosjis = FALSE; /* TRUE when the source charset was given as EUC-JP: charconv_win32() then runs toStringSJIS() on the input before any code page conversion */
00053 static boolean only_euc_conv = FALSE; /* TRUE when source and target code pages turned out equal but the EUC pre-conversion is still required: charconv_win32() returns right after that step */
00054 
00055 static unsigned int from_cp;    /* source code page, passed to MultiByteToWideChar(); set by charconv_win32_setup() */
00056 static unsigned int to_cp;      /* target code page, passed to WideCharToMultiByte(); set by charconv_win32_setup() */
00057 
00067 boolean
00068 charconv_win32_setup(char *fromcode, char *tocode, boolean *enable_conv)
00069 {
00070   unsigned int src_p, dst_p;
00071   
00072   if (tocode == NULL) {
00073     /* just disable conversion */
00074     *enable_conv = FALSE;
00075   } else {
00076     /* determine source character set */
00077     if (fromcode == NULL) {
00078       jlog("Error: charconv_win32: charset names of both input and output should be given.\n");
00079       jlog("Error: charconv_win32: use \"-charconv from to\" instead of \"-kanji\".\n");
00080       *enable_conv = FALSE;
00081       return FALSE;
00082     }
00083     euctosjis = FALSE;
00084     if (strmatch(fromcode, "euc-jp")
00085                || strmatch(fromcode, "euc")
00086                || strmatch(fromcode, "eucjp")) {
00087       /* pre-convert Japanese euc to Shift-jis */
00088       euctosjis = TRUE;
00089       /* input = Shift_jis (codepage 932) */
00090       from_cp = 932;
00091     } else if (strmatch(fromcode, "ansi")) {
00092       /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/
00093       from_cp = CP_ACP;
00094     } else if (strmatch(fromcode, "mac")) {
00095       /* Macintosh codepage */
00096       from_cp = CP_MACCP;
00097     } else if (strmatch(fromcode, "oem")) {
00098       /* OEM localized default codepage */
00099       from_cp = CP_OEMCP;
00100     } else if (strmatch(fromcode, "utf-7")) {
00101       /* UTF-7 codepage */
00102       from_cp = CP_UTF7;
00103     } else if (strmatch(fromcode, "utf-8")) {
00104       /* UTF-8 codepage */
00105       from_cp = CP_UTF8;
00106     } else if (strmatch(fromcode, "sjis")
00107                || strmatch(fromcode, "sjis-win")
00108                || strmatch(fromcode, "shift-jis")
00109                || strmatch(fromcode, "shift_jis")) {
00110       /* sjis codepage = 932 */
00111       from_cp = 932;
00112     } else if (fromcode[0] >= '0' && fromcode[0] <= '9') {
00113       /* codepage number */
00114       from_cp = atoi(fromcode);
00115       if (! IsValidCodePage(from_cp)) {
00116         jlog("Error: charconv_win32: codepage #%d not found\n", from_cp);
00117         *enable_conv = FALSE;
00118         return FALSE;
00119       }
00120     } else {
00121       jlog("Error: charconv_win32: unknown source codepage \"%s\"\n", fromcode);
00122       jlog("Error: charconv_win32: valids are \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\" and codepage number\n");
00123       jlog("Error: charconv_win32: the default local charcode can be speicified by \"ansi\".\n");
00124       *enable_conv = FALSE;
00125       return FALSE;
00126     }
00127     /* determine the target character set */
00128     if (strmatch(tocode, "ansi")) {
00129       /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/
00130       to_cp = CP_ACP;
00131     } else if (strmatch(tocode, "mac")) {
00132       /* Macintosh codepage */
00133       to_cp = CP_MACCP;
00134     } else if (strmatch(tocode, "oem")) {
00135       /* OEM codepage */
00136       to_cp = CP_OEMCP;
00137     } else if (strmatch(tocode, "utf-7")) {
00138       /* UTF-7 codepage */
00139       to_cp = CP_UTF7;
00140     } else if (strmatch(tocode, "utf-8")) {
00141       /* UTF-8 codepage */
00142       to_cp = CP_UTF8;
00143     } else if (strmatch(tocode, "sjis")
00144                || strmatch(tocode, "sjis-win")
00145                || strmatch(tocode, "shift-jis")
00146                || strmatch(tocode, "shift_jis")) {
00147       /* sjis codepage = 932 */
00148       to_cp = 932;
00149     } else if (tocode[0] >= '0' && tocode[0] <= '9') {
00150       /* codepage number */
00151       to_cp = atoi(tocode);
00152       if (! IsValidCodePage(to_cp)) {
00153         jlog("Error: charconv_win32: codepage #%d not found\n", to_cp);
00154         *enable_conv = FALSE;
00155         return FALSE;
00156       }
00157     } else {
00158       jlog("Error: charconv_win32: unknown target codepage \"%s\"\n", tocode);
00159       jlog("Error: charconv_win32: valids are \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\" and codepage number\n");
00160       jlog("Error: charconv_win32: the default local charcode can be speicified by \"ansi\".\n");
00161       *enable_conv = FALSE;
00162       return FALSE;
00163     }
00164     
00165     /* check whether the actual conversion is needed */
00166     src_p = from_cp;
00167     dst_p = to_cp;
00168     if (src_p == CP_ACP) src_p = GetACP();
00169     if (dst_p == CP_ACP) dst_p = GetACP();
00170     if (src_p == CP_OEMCP) src_p = GetOEMCP();
00171     if (dst_p == CP_OEMCP) dst_p = GetOEMCP();
00172     
00173     if (src_p == dst_p) {
00174       if (euctosjis == FALSE) {
00175         only_euc_conv = FALSE;
00176         *enable_conv = FALSE;
00177       } else {
00178         only_euc_conv = TRUE;
00179         *enable_conv = TRUE;
00180       }
00181     } else {
00182       only_euc_conv = FALSE;
00183       *enable_conv = TRUE;
00184     }
00185   }
00186   
00187   return TRUE;
00188 }
00189 
00190 #define UNICODE_BUFFER_SIZE 4096 /* capacity of unibuf, counted in wchar_t elements */
00191 static wchar_t unibuf[UNICODE_BUFFER_SIZE]; /* intermediate wide-character buffer: filled by MultiByteToWideChar() and read by WideCharToMultiByte() in charconv_win32() */
00192 
00193 
00203 char *
00204 charconv_win32(char *instr, char *outstr, int maxoutlen)
00205 {
00206   int unilen, newlen;
00207   char *srcbuf;
00208   
00209   srcbuf = instr;
00210   if (euctosjis == TRUE) {
00211     /* euc->sjis conversion */
00212     toStringSJIS(instr, outstr, maxoutlen);
00213     srcbuf = outstr;
00214     if (only_euc_conv) {
00215       return(outstr);
00216     }
00217   }
00218   
00219   /* get length of unicode string */
00220   unilen = MultiByteToWideChar(from_cp, 0, srcbuf, -1, NULL, 0);
00221   if (unilen <= 0) {
00222     jlog("Error: charconv_win32: conversion error?\n");
00223     return(instr);
00224   }
00225   if (unilen > UNICODE_BUFFER_SIZE) {
00226     jlog("Error: charconv_win32: unicode buffer size exceeded (%d > %d)!\n", unilen, UNICODE_BUFFER_SIZE);
00227     return(instr);
00228   }
00229   /* convert source string to unicode */
00230   MultiByteToWideChar(from_cp, 0, srcbuf, -1, unibuf, unilen);
00231   /* get length of target string */
00232   newlen = WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, 0, NULL, NULL);
00233   if (newlen <= 0) {
00234     jlog("Error: charconv_win32: conversion error?\n");
00235     return(instr);
00236   }
00237   if (newlen > maxoutlen) {
00238     jlog("Error: charconv_win32: target buffer size exceeded (%d > %d)!\n", newlen, maxoutlen);
00239     return(instr);
00240   }
00241   /* convert unicode to target string */
00242   WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, newlen, NULL, NULL);
00243 
00244   return(outstr);
00245 }
00246 
00247 #endif /* USE_WIN32_MULTIBYTE */
00248 
00249 #endif /* CHARACTER_CONVERSION */