00001
00053
00054
00055
00056
00057
00058
00059
00060 #include <sent/stddefs.h>
00061 #include <sent/ngram2.h>
00062
00063 static int file_version;
00064 static boolean need_swap;
00065
00074 static void
00075 rdn(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00076 {
00077 size_t tmp;
00078 if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) {
00079 perror("ngram_read_bin");
00080 j_error("read failed\n");
00081 }
00082 if (need_swap) {
00083 if (unitbyte != 1) {
00084 swap_bytes(buf, unitbyte, unitnum);
00085 }
00086 }
00087
00088 }
00089
00095 static void
00096 check_header(FILE *fp)
00097 {
00098 char buf[BINGRAM_HDSIZE], *p;
00099 rdn(fp, buf, 1, BINGRAM_HDSIZE);
00100
00101 p = buf;
00102
00103 if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00104
00105 file_version = 3;
00106 p += strlen(BINGRAM_IDSTR) + 1;
00107 } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00108
00109 file_version = 4;
00110 p += strlen(BINGRAM_IDSTR_V4) + 1;
00111 } else {
00112
00113 j_printerr("Error: invalid header, you probably use old bingram\n");
00114 j_error("Error: if so, please re-make with newer mkbingram that comes with Julius-2.0 or later\n");
00115 }
00116
00117 if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00118 p += strlen(BINGRAM_SIZESTR_HEAD);
00119 if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00120
00121
00122 j_error("Error: word size does not match in bingram\n");
00123 }
00124 p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00125
00126
00127 if (file_version == 4) {
00128 if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00129 j_error("Error: no information for byte order in v4 format??\n");
00130 }
00131 p += strlen(BINGRAM_BYTEORDER_HEAD);
00132 if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00133
00134 need_swap = TRUE;
00135 } else {
00136 need_swap = FALSE;
00137 }
00138 p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00139 }
00140 }
00141
00142 if (file_version != 4) {
00143
00144 #ifdef WORDS_BIGENDIAN
00145 need_swap = FALSE;
00146 #else
00147 need_swap = TRUE;
00148 #endif
00149 }
00150
00151
00152 }
00153
00162 boolean
00163 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00164 {
00165 int i,n,len;
00166 char *w, *p;
00167 NNID *n3_bgn;
00168 NNID d, ntmp;
00169
00170 ndata->from_bin = TRUE;
00171
00172
00173 check_header(fp);
00174
00175
00176 for(n=0;n<MAX_N;n++) {
00177 rdn(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00178 if (file_version == 4 && ndata->ngram_num[n] >= NNIDMAX) {
00179 j_error("Error: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->ngram_num[n], NNIDMAX);
00180 }
00181 }
00182 ndata->max_word_num = ndata->ngram_num[0];
00183 if (file_version == 4) rdn(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00184
00185
00186 switch(file_version) {
00187 case 4:
00188 ndata->version = 4;
00189 break;
00190 case 3:
00191 if (ndata->ngram_num[2] >= NNIDMAX) {
00192 j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);
00193 ndata->version = 3;
00194 } else {
00195 ndata->version = 4;
00196 }
00197 break;
00198 }
00199
00200
00201 rdn(fp, &len, sizeof(int), 1);
00202 w = mymalloc(len);
00203 rdn(fp, w, 1, len);
00204
00205 ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
00206 p = w; i = 0;
00207 while (p < w + len) {
00208 ndata->wname[i++] = p;
00209 while(*p != '\0') p++;
00210 p++;
00211 }
00212 if (i != ndata->ngram_num[0]) {
00213 j_error("wname error??\n");
00214 }
00215
00216 ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00217 ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00218 ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00219 ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[0]);
00220 ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[0]);
00221 ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00222 ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00223 ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00224 if (file_version == 4) {
00225 ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00226 ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00227 ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->bigram_bo_num);
00228 ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->bigram_bo_num);
00229 ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->bigram_bo_num);
00230 ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->bigram_bo_num);
00231 } else {
00232 ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00233 ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00234 if (ndata->version == 4) {
00235 ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00236 ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00237 n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00238 } else {
00239 ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00240 }
00241 }
00242
00243 ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[2]);
00244 ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[2]);
00245
00246
00247 j_printerr("1-gram.");
00248 rdn(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00249 j_printerr(".");
00250 rdn(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00251 j_printerr(".");
00252 rdn(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00253 j_printerr(".");
00254 rdn(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00255 j_printerr(".");
00256 rdn(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00257
00258
00259 j_printerr("2-gram.");
00260 rdn(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00261 j_printerr(".");
00262 rdn(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00263 j_printerr(".");
00264 rdn(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00265 j_printerr(".");
00266 if (file_version == 4) {
00267 rdn(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00268 rdn(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00269 j_printerr(".");
00270 rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00271 j_printerr(".");
00272 rdn(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00273 rdn(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00274 j_printerr(".");
00275 rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00276 } else {
00277 rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00278 j_printerr(".");
00279 if (ndata->version == 4) {
00280 rdn(fp, n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00281 for(d=0;d<ndata->ngram_num[1];d++) {
00282 if (n3_bgn[d] == NNID_INVALID) {
00283 ndata->n3_bgn_lower[d] = 0;
00284 ndata->n3_bgn_upper[d] = NNID_INVALID_UPPER;
00285 } else {
00286 ntmp = n3_bgn[d] & 0xffff;
00287 ndata->n3_bgn_lower[d] = ntmp;
00288 ntmp = n3_bgn[d] >> 16;
00289 ndata->n3_bgn_upper[d] = ntmp;
00290 }
00291 }
00292 } else {
00293 rdn(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00294 }
00295 j_printerr(".");
00296 rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00297 }
00298
00299
00300 j_printerr("3-gram.");
00301 rdn(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00302 j_printerr(".");
00303 rdn(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00304
00305
00306 j_printerr("indexing...");
00307 ngram_make_lookup_tree(ndata);
00308
00309
00310 if (file_version != 4 && ndata->version == 4) {
00311 free(n3_bgn);
00312 ngram_compact_bigram_context(ndata);
00313 }
00314
00315
00316 set_unknown_id(ndata);
00317
00318 return TRUE;
00319 }