00001
00018
00019
00020
00021
00022
00023
00024
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027 #include <sent/vocabulary.h>
00028
00035 boolean
00036 init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file)
00037 {
00038 FILE *fp;
00039
00040 jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file);
00041 if ((fp = fopen_readfile(bin_ngram_file)) == NULL) {
00042 jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file);
00043 return FALSE;
00044 }
00045 if (ngram_read_bin(fp, ndata) == FALSE) {
00046 jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file);
00047 return FALSE;
00048 }
00049 if (fclose_readfile(fp) == -1) {
00050 jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file);
00051 return FALSE;
00052 }
00053 jlog("Stat: init_ngram: finished reading n-gram\n");
00054 return TRUE;
00055 }
00056
00064 boolean
00065 init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir)
00066 {
00067 FILE *fp;
00068
00069 ndata->root = NULL;
00070 ndata->dir = dir;
00071
00072 jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file);
00073
00074 if ((fp = fopen_readfile(ngram_file)) == NULL) {
00075 jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file);
00076 return FALSE;
00077 }
00078 if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) {
00079 jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file);
00080 return FALSE;
00081 }
00082 if (fclose_readfile(fp) == -1) {
00083 jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file);
00084 return FALSE;
00085 }
00086 jlog("Stat: init_ngram: finished reading n-gram\n");
00087
00088 return TRUE;
00089 }
00090
00097 boolean
00098 init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file)
00099 {
00100 FILE *fp;
00101
00102 jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file);
00103 if ((fp = fopen_readfile(bigram_file)) == NULL) {
00104 jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file);
00105 return FALSE;
00106 }
00107 if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) {
00108 jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file);
00109 return FALSE;
00110 }
00111 if (fclose_readfile(fp) == -1) {
00112 jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file);
00113 return FALSE;
00114 }
00115 jlog("Stat: init_ngram: finished reading LR 2-gram\n");
00116
00117 return TRUE;
00118 }
00119
00126 void
00127 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
00128 {
00129 int i;
00130
00131 jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");
00132 ndata->unk_num = 0;
00133 for (i = 0; i < winfo->num; i++) {
00134 winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]);
00135 if (winfo->wton[i] == ndata->unk_id) {
00136 (ndata->unk_num)++;
00137 }
00138 }
00139 if (ndata->unk_num == 0) {
00140 ndata->unk_num_log = 0.0;
00141 } else {
00142 ndata->unk_num_log = (float)log10(ndata->unk_num);
00143 }
00144 jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
00145 }
00146
00156 void
00157 set_unknown_id(NGRAM_INFO *ndata)
00158 {
00159 #if 0
00160 ndata->unk_id = ngram_lookup_word(ndata, unkword);
00161 if (ndata->unk_id == WORD_INVALID) {
00162 jlog("word %s not found, so assume this is a closed vocabulary model\n",
00163 unkword);
00164 ndata->isopen = FALSE;
00165 } else {
00166 ndata->isopen = TRUE;
00167 }
00168 #endif
00169 ndata->isopen = TRUE;
00170 ndata->unk_id = 0;
00171
00172 }