00001
00018
00019
00020
00021
00022
00023
00024
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027 #include <sent/vocabulary.h>
00028
00035 boolean
00036 init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file)
00037 {
00038 FILE *fp;
00039
00040 jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file);
00041 if ((fp = fopen_readfile(bin_ngram_file)) == NULL) {
00042 jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file);
00043 return FALSE;
00044 }
00045 if (ngram_read_bin(fp, ndata) == FALSE) {
00046 jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file);
00047 return FALSE;
00048 }
00049 if (fclose_readfile(fp) == -1) {
00050 jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file);
00051 return FALSE;
00052 }
00053 jlog("Stat: init_ngram: finished reading n-gram\n");
00054 return TRUE;
00055 }
00056
00064 boolean
00065 init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir)
00066 {
00067 FILE *fp;
00068
00069 ndata->root = NULL;
00070 ndata->dir = dir;
00071
00072 jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file);
00073
00074 if ((fp = fopen_readfile(ngram_file)) == NULL) {
00075 jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file);
00076 return FALSE;
00077 }
00078 if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) {
00079 jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file);
00080 return FALSE;
00081 }
00082 if (fclose_readfile(fp) == -1) {
00083 jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file);
00084 return FALSE;
00085 }
00086 jlog("Stat: init_ngram: finished reading n-gram\n");
00087
00088 return TRUE;
00089 }
00090
00097 boolean
00098 init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file)
00099 {
00100 FILE *fp;
00101
00102 jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file);
00103 if ((fp = fopen_readfile(bigram_file)) == NULL) {
00104 jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file);
00105 return FALSE;
00106 }
00107 if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) {
00108 jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file);
00109 return FALSE;
00110 }
00111 if (fclose_readfile(fp) == -1) {
00112 jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file);
00113 return FALSE;
00114 }
00115 jlog("Stat: init_ngram: finished reading LR 2-gram\n");
00116
00117 return TRUE;
00118 }
00119
00126 boolean
00127 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
00128 {
00129 int i;
00130 boolean ok_flag = TRUE;
00131 int count = 0;
00132
00133 jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");
00134 ndata->unk_num = 0;
00135 for (i = 0; i < winfo->num; i++) {
00136 winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]);
00137 if (winfo->wton[i] == WORD_INVALID) {
00138 ok_flag = FALSE;
00139 count++;
00140 continue;
00141 }
00142 if (winfo->wton[i] == ndata->unk_id) {
00143 (ndata->unk_num)++;
00144 }
00145 }
00146 if (ok_flag == FALSE) {
00147 jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count);
00148 jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\"\n");
00149 return FALSE;
00150 }
00151
00152 if (ndata->unk_num == 0) {
00153 ndata->unk_num_log = 0.0;
00154 } else {
00155 ndata->unk_num_log = (float)log10(ndata->unk_num);
00156 }
00157 jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
00158 return TRUE;
00159 }
00160
00168 void
00169 set_unknown_id(NGRAM_INFO *ndata, char *str)
00170 {
00171 ndata->unk_id = ngram_lookup_word(ndata, str);
00172 if (ndata->unk_id == WORD_INVALID) {
00173 if (strmatch(str, UNK_WORD_DEFAULT)) {
00174
00175 ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2);
00176 if (ndata->unk_id == WORD_INVALID) {
00177 jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
00178 ndata->isopen = FALSE;
00179 return;
00180 }
00181 }
00182 }
00183 if (ndata->unk_id == WORD_INVALID) {
00184 jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str);
00185 ndata->isopen = FALSE;
00186 } else {
00187 jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str);
00188 ndata->isopen = TRUE;
00189 }
00190 }
00191
00205 void
00206 fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo)
00207 {
00208 WORD_ID wb, we;
00209
00210 wb = winfo->wton[winfo->head_silwid];
00211 we = winfo->wton[winfo->tail_silwid];
00212 if (ndata->d[0].prob[wb] == -99.0) {
00213 jlog("Warning: BOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[wb]);
00214 jlog("Warning: assigining value of EOS word \"%s\": %f\n", ndata->wname[we], ndata->d[0].prob[we]);
00215 ndata->d[0].prob[wb] = ndata->d[0].prob[we];
00216 } else if (ndata->d[0].prob[we] == -99.0) {
00217 jlog("Warning: EOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[we]);
00218 jlog("Warning: assigining value of BOS word \"%s\": %f\n", ndata->wname[wb], ndata->d[0].prob[wb]);
00219 ndata->d[0].prob[we] = ndata->d[0].prob[wb];
00220 }
00221 }
00222