libsent/src/ngram/init_ngram.c

Go to the documentation of this file.
00001 
00018 /*
00019  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00020  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00021  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00022  * All rights reserved
00023  */
00024 
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027 #include <sent/vocabulary.h>
00028 
00035 boolean
00036 init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file)
00037 {
00038   FILE *fp;
00039   
00040   jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file);
00041   if ((fp = fopen_readfile(bin_ngram_file)) == NULL) {
00042     jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file);
00043     return FALSE;
00044   }
00045   if (ngram_read_bin(fp, ndata) == FALSE) {
00046     jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file);
00047     return FALSE;
00048   }
00049   if (fclose_readfile(fp) == -1) {
00050     jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file);
00051     return FALSE;
00052   }
00053   jlog("Stat: init_ngram: finished reading n-gram\n");
00054   return TRUE;
00055 }
00056 
00064 boolean
00065 init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir)
00066 {
00067   FILE *fp;
00068 
00069   ndata->root = NULL;
00070   ndata->dir = dir;
00071 
00072   jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file);
00073   /* read RL n-gram */
00074   if ((fp = fopen_readfile(ngram_file)) == NULL) {
00075     jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file);
00076     return FALSE;
00077   }
00078   if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) {
00079     jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file);
00080     return FALSE;
00081   }
00082   if (fclose_readfile(fp) == -1) {
00083     jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file);
00084     return FALSE;
00085   }
00086   jlog("Stat: init_ngram: finished reading n-gram\n");
00087 
00088   return TRUE;
00089 }
00090 
00097 boolean
00098 init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file)
00099 {
00100   FILE *fp;
00101 
00102   jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file);
00103   if ((fp = fopen_readfile(bigram_file)) == NULL) {
00104     jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file);
00105     return FALSE;
00106   }
00107   if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) {
00108     jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file);
00109     return FALSE;
00110   }
00111   if (fclose_readfile(fp) == -1) {
00112     jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file);
00113     return FALSE;
00114   }
00115   jlog("Stat: init_ngram: finished reading LR 2-gram\n");
00116 
00117   return TRUE;
00118 }
00119 
00126 void
00127 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
00128 {
00129   int i;
00130 
00131   jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");
00132   ndata->unk_num = 0;
00133   for (i = 0; i < winfo->num; i++) {
00134     winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]);
00135     if (winfo->wton[i] == ndata->unk_id) {
00136       (ndata->unk_num)++;
00137     }
00138   }
00139   if (ndata->unk_num == 0) {
00140     ndata->unk_num_log = 0.0;   /* for safe */
00141   } else {
00142     ndata->unk_num_log = (float)log10(ndata->unk_num);
00143   }
00144   jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
00145 }
00146 
00156 void
00157 set_unknown_id(NGRAM_INFO *ndata)
00158 {
00159 #if 0
00160   ndata->unk_id = ngram_lookup_word(ndata, unkword);
00161   if (ndata->unk_id == WORD_INVALID) {
00162     jlog("word %s not found, so assume this is a closed vocabulary model\n",
00163             unkword);
00164     ndata->isopen = FALSE;
00165   } else {
00166     ndata->isopen = TRUE;
00167   }
00168 #endif
00169   ndata->isopen = TRUE;
00170   ndata->unk_id = 0;            /* unknown (OOV) words are always mapped to
00171                                    the number 0 (by CMU-TK)*/
00172 }

Generated on Tue Dec 18 15:59:55 2007 for Julius by  doxygen 1.5.4