00001
00054
00055
00056
00057
00058
00059
00060
00061 #include <sent/stddefs.h>
00062 #include <sent/ngram2.h>
00063
00064 static int file_version;
00065 static boolean need_swap;
00066 #ifdef WORDS_INT
00067 static boolean need_conv;
00068 static boolean words_int_retry = FALSE;
00069 #endif
00070
00075 #define rdn(A,B,C,D) if (rdnfunc(A,B,C,D) == FALSE) return FALSE
00076 #define rdn_wordid(A,B,C,D) if (rdn_wordid_func(A,B,C,D) == FALSE) return FALSE
00077
00085 static boolean
00086 rdnfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
00087 {
00088 size_t tmp;
00089 if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < unitnum) {
00090 jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum);
00091 return FALSE;
00092 }
00093 if (need_swap) {
00094 if (unitbyte != 1) {
00095 swap_bytes(buf, unitbyte, unitnum);
00096 }
00097 }
00098 return TRUE;
00099 }
00100
00101 #ifdef WORDS_INT
00102
00110 static boolean
00111 rdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv)
00112 {
00113 int i;
00114 unsigned short *s;
00115 WORD_ID *t;
00116 WORD_ID d;
00117
00118 if (need_conv) {
00119
00120 rdn(fp, buf, sizeof(unsigned short), unitnum);
00121
00122 for(i=unitnum-1;i>=0;i--) {
00123 s = (unsigned short *)buf + i;
00124 t = (WORD_ID *)buf + i;
00125 d = *s;
00126 *t = d;
00127 }
00128 } else {
00129
00130 rdn(fp, buf, sizeof(WORD_ID), unitnum);
00131 }
00132 return TRUE;
00133 }
00134 #endif
00135
00141 static boolean
00142 check_header(FILE *fp)
00143 {
00144 char buf[BINGRAM_HDSIZE], *p;
00145
00146 rdn(fp, buf, 1, BINGRAM_HDSIZE);
00147 p = buf;
00148 #ifdef WORDS_INT
00149 need_conv = FALSE;
00150 #endif
00151
00152
00153 if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00154
00155 file_version = 3;
00156 p += strlen(BINGRAM_IDSTR) + 1;
00157 } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00158
00159 file_version = 4;
00160 p += strlen(BINGRAM_IDSTR_V4) + 1;
00161 } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) {
00162
00163 file_version = 5;
00164 p += strlen(BINGRAM_IDSTR_V5) + 1;
00165 } else {
00166
00167 jlog("Error: ngram_read_bin: invalid header\n");
00168 return FALSE;
00169 }
00170
00171 if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00172 p += strlen(BINGRAM_SIZESTR_HEAD);
00173 if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00174
00175 #ifdef WORDS_INT
00176 if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {
00177
00178 jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n");
00179 need_conv = TRUE;
00180 p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;
00181 } else {
00182 jlog("Error: ngram_read_bin: unknown word byte size!\n");
00183 return FALSE;
00184 }
00185 #else
00186 if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {
00187
00188 jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n");
00189 jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n");
00190 return FALSE;
00191
00192 } else {
00193 jlog("Error: ngram_read_bin: unknown word byte size!\n");
00194 return FALSE;
00195 }
00196 #endif
00197 } else {
00198 p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00199 }
00200
00201
00202 if (file_version >= 4) {
00203 if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00204 jlog("Error: ngram_read_bin: no information for byte order??\n");
00205 return FALSE;
00206 }
00207 p += strlen(BINGRAM_BYTEORDER_HEAD);
00208 if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00209
00210 need_swap = TRUE;
00211 } else {
00212 need_swap = FALSE;
00213 }
00214 p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00215 }
00216 }
00217
00218
00219
00220
00221
00222
00223
00224
00225 if (file_version < 4) {
00226
00227 #ifdef WORDS_BIGENDIAN
00228 need_swap = FALSE;
00229 #else
00230 need_swap = TRUE;
00231 #endif
00232 }
00233
00234
00235
00236 return TRUE;
00237 }
00238
00239 static boolean
00240 ngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata)
00241 {
00242 int i,n,len;
00243 char *w, *p;
00244 #ifdef WORDS_INT
00245 unsigned short *buf;
00246 #endif
00247 NGRAM_TUPLE_INFO *t;
00248
00249
00250 rdn(fp, &(ndata->n), sizeof(int), 1);
00251 rdn(fp, &(ndata->dir), sizeof(int), 1);
00252 rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00253
00254 jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n);
00255
00256 if (ndata->n > MAX_N) {
00257 jlog("Error: ngram_read_bin_v5: too long N-gram (N=%d)\n", n);
00258 jlog("Error: ngram_read_bin_v5: current maximum length of N-gram is set to %d\n", MAX_N);
00259 jlog("Error: ngram_read_bin_v5: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
00260 return FALSE;
00261 }
00262
00263
00264 for(n=0;n<ndata->n;n++) {
00265 rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00266 }
00267 ndata->max_word_num = ndata->d[0].totalnum;
00268
00269
00270 rdn(fp, &len, sizeof(int), 1);
00271 w = mymalloc(len);
00272 rdn(fp, w, 1, len);
00273
00274 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00275 p = w; i = 0;
00276 while (p < w + len) {
00277 ndata->wname[i++] = p;
00278 while(*p != '\0') p++;
00279 p++;
00280 }
00281 if (i != ndata->max_word_num) {
00282 jlog("Error: ngram_read_bin_v5: wname error??\n");
00283 return FALSE;
00284 }
00285
00286
00287 for(n=0;n<ndata->n;n++) {
00288 jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1);
00289
00290 t = &(ndata->d[n]);
00291
00292 rdn(fp, &(t->is24bit), sizeof(boolean), 1);
00293 rdn(fp, &(t->ct_compaction), sizeof(boolean), 1);
00294 rdn(fp, &(t->bgnlistlen), sizeof(NNID), 1);
00295 rdn(fp, &(t->context_num), sizeof(NNID), 1);
00296
00297 if (n > 0) {
00298 if (t->is24bit) {
00299 t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen);
00300 rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00301 t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen);
00302 rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00303 } else {
00304 t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen);
00305 rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00306 }
00307 t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen);
00308 rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00309 t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum);
00310 rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00311 } else {
00312 t->bgn_upper = NULL;
00313 t->bgn_lower = NULL;
00314 t->bgn = NULL;
00315 t->num = NULL;
00316 t->bgnlistlen = 0;
00317 t->nnid2wid = NULL;
00318 }
00319
00320 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00321 rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00322
00323 rdn(fp, &i, sizeof(int), 1);
00324 if (i == 1) {
00325 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
00326 rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00327 } else {
00328 t->bo_wt = NULL;
00329 }
00330 rdn(fp, &i, sizeof(int), 1);
00331 if (i == 1) {
00332 t->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->totalnum);
00333 t->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->totalnum);
00334 rdn(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00335 rdn(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00336 } else {
00337 t->nnid2ctid_upper = NULL;
00338 t->nnid2ctid_lower = NULL;
00339 }
00340 }
00341 rdn(fp, &i, sizeof(int), 1);
00342 if (i == 1) {
00343 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[0].context_num);
00344 rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00345 } else {
00346 ndata->bo_wt_1 = NULL;
00347 }
00348 rdn(fp, &i, sizeof(int), 1);
00349 if (i == 1) {
00350 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum);
00351 rdn(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00352 } else {
00353 ndata->p_2 = NULL;
00354 }
00355
00356 return TRUE;
00357 }
00358
00359 static boolean
00360 ngram_read_bin_compat(FILE *fp, NGRAM_INFO *ndata, int *retry_ret)
00361 {
00362 int i,n,len;
00363 char *w, *p;
00364 NNID *n3_bgn;
00365 NNID d, ntmp;
00366 #ifdef WORDS_INT
00367 unsigned short *buf;
00368 #endif
00369 NGRAM_TUPLE_INFO *t, *tt, *ttt;
00370
00371
00372 ndata->bigram_index_reversed = TRUE;
00373 ndata->n = 3;
00374 ndata->dir = DIR_RL;
00375
00376
00377 for(n=0;n<ndata->n;n++) {
00378 rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00379 }
00380 ndata->max_word_num = ndata->d[0].totalnum;
00381
00382 if (file_version == 4) {
00383 rdn(fp, &(ndata->d[1].context_num), sizeof(NNID), 1);
00384 }
00385
00386 for(n=0;n<ndata->n;n++) {
00387 if (n < 2) {
00388 ndata->d[n].is24bit = FALSE;
00389 } else {
00390 if (ndata->d[n].totalnum >= NNID_MAX_24) {
00391 jlog("Warning: ngram_read_bin_compat: num of %d-gram exceeds 24bit, now switch to %dbit index\n", n+1, sizeof(NNID) * 8);
00392 ndata->d[n].is24bit = FALSE;
00393 } else {
00394 ndata->d[n].is24bit = TRUE;
00395 }
00396 }
00397 ndata->d[n].nnid2ctid_upper = NULL;
00398 ndata->d[n].nnid2ctid_lower = NULL;
00399 }
00400
00401
00402 ndata->d[0].ct_compaction = FALSE;
00403 for(n=1;n<ndata->n;n++) {
00404 ndata->d[n].ct_compaction = TRUE;
00405 }
00406
00407
00408 rdn(fp, &len, sizeof(int), 1);
00409 w = mymalloc(len);
00410 rdn(fp, w, 1, len);
00411
00412 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00413 p = w; i = 0;
00414 while (p < w + len) {
00415 ndata->wname[i++] = p;
00416 while(*p != '\0') p++;
00417 p++;
00418 }
00419 if (i != ndata->max_word_num) {
00420 jlog("Error: ngram_read_bin_compat: wname error??\n");
00421 return FALSE;
00422 }
00423
00424
00425 t = &(ndata->d[0]);
00426 tt = &(ndata->d[1]);
00427 ttt = &(ndata->d[2]);
00428
00429 t->bgn_upper = NULL;
00430 t->bgn_lower = NULL;
00431 t->bgn = NULL;
00432 t->num = NULL;
00433 t->bgnlistlen = 0;
00434 t->nnid2wid = NULL;
00435 t->nnid2ctid_upper = NULL;
00436 t->nnid2ctid_lower = NULL;
00437
00438 t->context_num = t->totalnum;
00439 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00440 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
00441 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
00442 tt->bgnlistlen = t->context_num;
00443 tt->bgn = (NNID *)mymalloc_big(sizeof(NNID), tt->bgnlistlen);
00444 tt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->bgnlistlen);
00445
00446
00447 jlog("stat: ngram_read_bin_compat: reading 1-gram\n");
00448 rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00449 rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), t->context_num);
00450 rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00451 rdn(fp, tt->bgn, sizeof(NNID), tt->bgnlistlen);
00452 #ifdef WORDS_INT
00453 rdn_wordid(fp, tt->num, tt->bgnlistlen, need_conv);
00454 #else
00455 rdn(fp, tt->num, sizeof(WORD_ID), tt->bgnlistlen);
00456 #endif
00457
00458 #ifdef WORDS_INT
00459 {
00460
00461
00462
00463
00464
00465
00466
00467
00468 WORD_ID w;
00469 for(w=0;w<ndata->max_word_num;w++) {
00470 if (ndata->d[1].num[w] > ndata->max_word_num) {
00471 if (words_int_retry) {
00472 jlog("Error: ngram_read_bin_compat: retry failed, wrong bingram format\n");
00473 return FALSE;
00474 }
00475 jlog("Warning: ngram_read_bin_compat: incorrect data, may be a 2-byte v3 bingram, retry with conversion\n");
00476 free(ndata->wname[0]);
00477 free(ndata->wname);
00478 free(t->prob);
00479 free(ndata->bo_wt_1);
00480 free(t->bo_wt);
00481 free(tt->bgn);
00482 free(tt->num);
00483 myfrewind(fp);
00484 words_int_retry = TRUE;
00485 *retry_ret = 1;
00486 return FALSE;
00487 }
00488 }
00489 }
00490 #endif
00491
00492
00493 tt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->totalnum);
00494 tt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum);
00495 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum);
00496 if (file_version == 4) {
00497 tt->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), tt->totalnum);
00498 tt->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), tt->totalnum);
00499 tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num);
00500 ttt->bgnlistlen = tt->context_num;
00501 ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen);
00502 ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen);
00503 ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen);
00504 } else {
00505 tt->context_num = tt->totalnum;
00506 tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num);
00507 ttt->bgnlistlen = tt->context_num;
00508 ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen);
00509 if (ttt->is24bit) {
00510 ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen);
00511 ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen);
00512 n3_bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen);
00513 } else {
00514 ttt->bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen);
00515 }
00516 }
00517
00518 ttt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->totalnum);
00519 ttt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ttt->totalnum);
00520 ttt->bo_wt = NULL;
00521
00522
00523 jlog("Stat: ngram_read_bin_compat: reading 2-gram\n");
00524 #ifdef WORDS_INT
00525 rdn_wordid(fp, tt->nnid2wid, tt->totalnum, need_conv);
00526 #else
00527 rdn(fp, tt->nnid2wid, sizeof(WORD_ID), tt->totalnum);
00528 #endif
00529 rdn(fp, ndata->p_2, sizeof(LOGPROB), tt->totalnum);
00530 rdn(fp, tt->prob, sizeof(LOGPROB), tt->totalnum);
00531 if (file_version == 4) {
00532 rdn(fp, tt->nnid2ctid_upper, sizeof(NNID_UPPER), tt->totalnum);
00533 rdn(fp, tt->nnid2ctid_lower, sizeof(NNID_LOWER), tt->totalnum);
00534 rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
00535 rdn(fp, ttt->bgn_upper, sizeof(NNID_UPPER), ttt->bgnlistlen);
00536 rdn(fp, ttt->bgn_lower, sizeof(NNID_LOWER), ttt->bgnlistlen);
00537 #ifdef WORDS_INT
00538 rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
00539 #else
00540 rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
00541 #endif
00542 } else {
00543 rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
00544 if (ttt->is24bit) {
00545 rdn(fp, n3_bgn, sizeof(NNID), ttt->bgnlistlen);
00546 for(d=0;d<ttt->bgnlistlen;d++) {
00547 if (n3_bgn[d] == NNID_INVALID) {
00548 ttt->bgn_lower[d] = 0;
00549 ttt->bgn_upper[d] = NNID_INVALID_UPPER;
00550 } else {
00551 ntmp = n3_bgn[d] & 0xffff;
00552 ttt->bgn_lower[d] = ntmp;
00553 ntmp = n3_bgn[d] >> 16;
00554 ttt->bgn_upper[d] = ntmp;
00555 }
00556 }
00557 } else {
00558 rdn(fp, ttt->bgn, sizeof(NNID), ttt->bgnlistlen);
00559 }
00560 #ifdef WORDS_INT
00561 rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
00562 #else
00563 rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
00564 #endif
00565 }
00566
00567
00568 jlog("Stat: ngram_read_bin_compat: reading 3-gram\n");
00569 #ifdef WORDS_INT
00570 rdn_wordid(fp, ttt->nnid2wid, ttt->totalnum, need_conv);
00571 #else
00572 rdn(fp, ttt->nnid2wid, sizeof(WORD_ID), ttt->totalnum);
00573 #endif
00574 rdn(fp, ttt->prob, sizeof(LOGPROB), ttt->totalnum);
00575
00576
00577 if (file_version != 4) {
00578 if (ttt->is24bit) {
00579 free(n3_bgn);
00580 if (ngram_compact_context(ndata, 2) == FALSE) return FALSE;
00581 }
00582 }
00583
00584 return TRUE;
00585 }
00586
00587
00596 boolean
00597 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00598 {
00599 int retry;
00600
00601 #ifdef WORDS_INT
00602
00603 words_int_retry = FALSE;
00604
00605 ngram_read_bin_start:
00606 #endif
00607
00608 ndata->from_bin = TRUE;
00609
00610
00611 if (check_header(fp) == FALSE) return FALSE;
00612
00613 #ifdef WORDS_INT
00614
00615 if (words_int_retry) need_conv = TRUE;
00616 #endif
00617
00618 #ifdef WORDS_INT
00619 if (need_conv) jlog("Stat: ngram_read_bin: word-id size conversion enabled\n");
00620 #endif
00621
00622 if (file_version <= 4) {
00623 retry = 0;
00624 if (ngram_read_bin_compat(fp, ndata, &retry) == FALSE) {
00625 #ifdef WORDS_INT
00626 if (retry == 1) {
00627 goto ngram_read_bin_start;
00628 } else {
00629 return FALSE;
00630 }
00631 #else
00632 return FALSE;
00633 #endif
00634 }
00635 } else {
00636 if (ngram_read_bin_v5(fp, ndata) == FALSE) return FALSE;
00637 }
00638
00639
00640
00641 jlog("Stat: ngram_read_bin: making entry name index\n");
00642 ngram_make_lookup_tree(ndata);
00643
00644 bi_prob_func_set(ndata);
00645
00646 return TRUE;
00647 }
00648