00001
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 #include <sent/stddefs.h>
00037 #include <sent/ngram2.h>
00038
/* Capacity of the two line buffers below.  They deliberately share one size
   so that copying a freshly read line from buf into pbuf always fits. */
enum { ARPA_LINE_BUF_LEN = 800 };

/* Working buffer holding the input line currently being parsed. */
static char buf[ARPA_LINE_BUF_LEN];
/* Copy of the current line taken before parsing, quoted in error messages. */
static char pbuf[ARPA_LINE_BUF_LEN];
00041
00042
00051 static WORD_ID
00052 lookup_word(NGRAM_INFO *ndata, char *str)
00053 {
00054 WORD_ID wid;
00055
00056 if ((wid = ngram_lookup_word(ndata, str)) == WORD_INVALID) {
00057 j_error("word %s not in N-gram vocabulary.\n",str);
00058 }
00059 return wid;
00060 }
00061
00071 void
00072 set_unknown_id(NGRAM_INFO *ndata)
00073 {
00074 #if 0
00075 ndata->unk_id = ngram_lookup_word(ndata, unkword);
00076 if (ndata->unk_id == WORD_INVALID) {
00077 j_printerr("word %s not found, so assume this is a closed vocabulary model\n",
00078 unkword);
00079 ndata->isopen = FALSE;
00080 } else {
00081 ndata->isopen = TRUE;
00082 }
00083 #endif
00084 ndata->isopen = TRUE;
00085 ndata->unk_id = 0;
00086
00087 }
00088
00089
00096 static void
00097 set_total_info(FILE *fp, NGRAM_INFO *ndata)
00098 {
00099 char *p;
00100 int n;
00101
00102 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00103 if (strnmatch(buf, "ngram", 5)) {
00104 p = strtok(buf, "=");
00105 n = p[strlen(p)-1] - '0' - 1;
00106 p = strtok(NULL, "=");
00107 ndata->ngram_num[n] = atoi(p);
00108 }
00109 }
00110 }
00111
00112
00120 static void
00121 set_and_check_total_info(FILE *fp, NGRAM_INFO *ndata)
00122 {
00123 char *p;
00124 int n;
00125
00126 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00127 if (strnmatch(buf, "ngram", 5)) {
00128 p = strtok(buf, "=");
00129 n = p[strlen(p)-1] - '0' - 1;
00130 p = strtok(NULL, "=");
00131
00132
00133
00134
00135
00136
00137 if (n == 2) {
00138 ndata->ngram_num[n] = atoi(p);
00139 } else {
00140 if (n <= 1 && ndata->ngram_num[n] != atoi(p)) {
00141 j_printerr("Warning: %d-gram total num differ! may cause read error\n",n+1);
00142 }
00143 }
00144 }
00145 }
00146 }
00147
00154 static void
00155 set_unigram(FILE *fp, NGRAM_INFO *ndata)
00156 {
00157 WORD_ID read_word_num;
00158 WORD_ID nid, resid;
00159 LOGPROB prob, bo_wt;
00160 char *name, *p;
00161
00162
00163 ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
00164 ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
00165 ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
00166 ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[0]);
00167 ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[0]);
00168 ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[0]);
00169 read_word_num = 0;
00170
00171 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00172 prob = (LOGPROB)atof(first_token(buf));
00173 p = next_token();
00174 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00175 bo_wt = (LOGPROB)atof(next_token());
00176
00177
00178 nid = read_word_num;
00179 ndata->wname[nid] = name;
00180
00181 if (ndata->root == NULL) {
00182 ndata->root = ptree_make_root_node(nid);
00183 } else {
00184 resid = ptree_search_data(name, ndata->root);
00185 if (strmatch(name, ndata->wname[resid])) {
00186 j_error("Error: word \"%s\" multiply defined at (#%d and #%d)\n",
00187 name, resid, nid);
00188 } else {
00189 ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
00190 }
00191 }
00192 ndata->p[nid] = prob;
00193 ndata->bo_wt_lr[nid] = bo_wt;
00194 ndata->n2_bgn[nid] = NNID_INVALID;
00195 ndata->n2_num[nid] = 0;
00196
00197 read_word_num++;
00198 if (read_word_num > ndata->max_word_num) {
00199 j_printerr("Error: actual n-gram word num exceeded header value\n");
00200 j_error("%d > %d\n", read_word_num, ndata->max_word_num);
00201 }
00202 }
00203
00204 if (read_word_num != ndata->ngram_num[0]) {
00205 j_printerr("Error: actual n-gram word num not match the header value\n");
00206 j_error("%d != %d ?\n", read_word_num, ndata->ngram_num[0]);
00207 }
00208 j_printerr(" 1-gram read %d end\n", read_word_num);
00209 }
00210
00211
00219 static void
00220 add_unigram(FILE *fp, NGRAM_INFO *ndata)
00221 {
00222 WORD_ID read_word_num;
00223 WORD_ID nid;
00224 LOGPROB prob, bo_wt;
00225 char *name, *p;
00226
00227 read_word_num = 0;
00228 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00229 prob = atof(first_token(buf));
00230 p = next_token();
00231 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00232 bo_wt = (LOGPROB)atof(next_token());
00233
00234
00235 nid = lookup_word(ndata, name);
00236 if (nid == WORD_INVALID) {
00237 j_printerr("Warning: n-gram word \"%s\" in RL not exist in LR (ignored)\n", name);
00238 } else {
00239 ndata->bo_wt_rl[nid] = bo_wt;
00240 }
00241
00242 read_word_num++;
00243 if (read_word_num > ndata->max_word_num) {
00244 j_printerr("Error: actual n-gram word num exceeded header value\n");
00245 j_error("%d > %d\n", read_word_num, ndata->max_word_num);
00246 }
00247 free(name);
00248 }
00249 j_printerr(" 1-gram read %d end\n", read_word_num);
00250
00251 }
00252
00259 static void
00260 set_bigram(FILE *fp, NGRAM_INFO *ndata)
00261 {
00262 int w_l, w_r;
00263 int w_last, w_r_last;
00264 LOGPROB p;
00265 NNID n2;
00266
00267 ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]);
00268 ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
00269 ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
00270 ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[1]);
00271
00272 n2 = 0;
00273
00274
00275 w_last = -1; w_r_last = -1;
00276 for (;;) {
00277 if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
00278 strcpy(pbuf, buf);
00279 if ( n2 % 100000 == 0) {
00280 j_printerr(" 2-gram read %d (%d%%)\n", n2, n2 * 100 / ndata->ngram_num[1]);
00281 }
00282
00283
00284 p = (LOGPROB)atof(first_token(buf));
00285
00286 w_l = lookup_word(ndata, next_token());
00287
00288 if (w_l != w_last) {
00289 if (w_last != -1) ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
00290
00291 if (ndata->n2_bgn[w_l] != NNID_INVALID) {
00292 j_printerr("Error: entry not sorted (same left context not sequenced)\n");
00293 j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
00294 }
00295 ndata->n2_bgn[w_l] = n2;
00296 w_r_last = -1;
00297 }
00298
00299 w_r = lookup_word(ndata, next_token());
00300 if (w_r == w_r_last) {
00301 j_printerr("Error: duplicated entry\n");
00302 j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
00303 } else if (w_r < w_r_last) {
00304 j_printerr("Error: entry not sorted downward\n");
00305 j_error("at 2-gram #%d: \"%s\"\n", n2+1, pbuf);
00306 }
00307 ndata->n2tonid[n2] = w_r;
00308 ndata->p_lr[n2] = p;
00309
00310 n2++;
00311 w_last = w_l;
00312 w_r_last = w_r;
00313
00314
00315 if (n2 > ndata->ngram_num[1]) {
00316 j_printerr("Error: actual 2-gram num not match the header value\n");
00317 j_error("%d != %d ?\n", n2, ndata->ngram_num[1]);
00318 }
00319 }
00320
00321
00322 ndata->n2_num[w_last] = n2 - ndata->n2_bgn[w_last];
00323
00324 j_printerr(" 2-gram read %d end\n", n2);
00325
00326 }
00327
00336 static void
00337 add_bigram_rl(FILE *fp, NGRAM_INFO *ndata)
00338 {
00339 WORD_ID w_l, w_r;
00340 LOGPROB prob, bo_wt;
00341 int bi_count = 0;
00342 NNID n2;
00343
00344 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00345
00346 if ( ++bi_count % 100000 == 0) {
00347 j_printerr(" 2-gram read %d (%d%%)\n", bi_count, bi_count * 100 / ndata->ngram_num[1]);
00348 }
00349 prob = (LOGPROB)atof(first_token(buf));
00350 w_r = lookup_word(ndata, next_token());
00351 w_l = lookup_word(ndata, next_token());
00352 bo_wt = (LOGPROB)atof(next_token());
00353 n2 = search_bigram(ndata, w_l, w_r);
00354 if (n2 == NNID_INVALID) {
00355 j_printerr("Warning: (%s,%s) not exist in LR 2-gram (ignored)\n",
00356 ndata->wname[w_l], ndata->wname[w_r]);
00357 } else {
00358 ndata->p_rl[n2] = prob;
00359 ndata->bo_wt_rrl[n2] = bo_wt;
00360 }
00361 }
00362 j_printerr(" 2-gram read %d end\n", bi_count);
00363
00364 }
00365
00366
00373 static void
00374 set_trigram(FILE *fp, NGRAM_INFO *ndata)
00375 {
00376 int w_l, w_m, w_r;
00377 LOGPROB p_rl;
00378 int w_r_last, w_m_last, w_l_last;
00379 NNID n2, n2_last;
00380 NNID n3;
00381 NNID ntmp;
00382
00383
00384 switch(ndata->version) {
00385 case 3:
00386 ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[1]);
00387 for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_bgn[n2] = NNID_INVALID;
00388 break;
00389 case 4:
00390 ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER)*ndata->ngram_num[1]);
00391 ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER)*ndata->ngram_num[1]);
00392 for(n2=0;n2<ndata->ngram_num[1];n2++) {
00393 ndata->n3_bgn_upper[n2] = NNID_INVALID_UPPER;
00394 ndata->n3_bgn_lower[n2] = 0;
00395 }
00396 break;
00397 }
00398 ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]);
00399 for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_num[n2] = 0;
00400
00401
00402 ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[2]);
00403 ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[2]);
00404 n3 = 0;
00405
00406 n2 = n2_last = NNID_INVALID;
00407 w_r_last = w_m_last = w_l_last = -1;
00408 for (;;) {
00409
00410 if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
00411 strcpy(pbuf, buf);
00412 if (n3 % 100000 == 0) {
00413 j_printerr(" 3-gram read %d (%d%%)\n", n3, n3 * 100 / ndata->ngram_num[2]);
00414 }
00415
00416
00417 p_rl = (LOGPROB)atof(first_token(buf));
00418
00419 w_r = lookup_word(ndata, next_token());
00420
00421 w_m = lookup_word(ndata, next_token());
00422
00423
00424 if (w_r != w_r_last || w_m != w_m_last) {
00425 n2 = search_bigram(ndata, (WORD_ID)w_m, (WORD_ID)w_r);
00426 if (n2 == NNID_INVALID) {
00427 j_printerr("Warning: context (%s,%s) not exist in LR 2-gram (ignored)\n",
00428 ndata->wname[w_m], ndata->wname[w_r]);
00429 continue;
00430 }
00431 switch(ndata->version) {
00432 case 3:
00433 ntmp = ndata->n3_bgn[n2_last];
00434 break;
00435 case 4:
00436 ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]);
00437 break;
00438 }
00439 if (n2_last != NNID_INVALID) ndata->n3_num[n2_last] = n3 - ntmp;
00440
00441 switch(ndata->version) {
00442 case 3:
00443 if (ndata->n3_bgn[n2] != NNID_INVALID) {
00444 j_printerr("Error: entry not sorted (same left context not sequenced)\n");
00445 j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);
00446 }
00447 ndata->n3_bgn[n2] = n3;
00448 break;
00449 case 4:
00450 if (ndata->n3_bgn_upper[n2] != NNID_INVALID_UPPER) {
00451 j_printerr("Error: entry not sorted (same left context not sequenced)\n");
00452 j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);
00453 }
00454 ntmp = n3 & 0xffff;
00455 ndata->n3_bgn_lower[n2] = ntmp;
00456 ntmp = n3 >> 16;
00457 ndata->n3_bgn_upper[n2] = ntmp;
00458 break;
00459 }
00460
00461 n2_last = n2;
00462 w_l_last = -1;
00463 } else {
00464 if (n2 == NNID_INVALID) continue;
00465 }
00466
00467
00468 w_l = lookup_word(ndata, next_token());
00469 if (w_l == w_l_last) {
00470 j_printerr("Error: duplicated entry\n");
00471 j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);
00472 } else if (w_l < w_l_last) {
00473 j_printerr("Error: entry not sorted downward\n");
00474 j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf);
00475 }
00476 ndata->n3tonid[n3] = w_l;
00477 ndata->p_rrl[n3] = p_rl;
00478
00479 n3++;
00480 w_m_last = w_m;
00481 w_r_last = w_r;
00482 w_l_last = w_l;
00483
00484
00485 if (n3 > ndata->ngram_num[2]) {
00486 j_printerr("Error: actual 3-gram num not match the header value\n");
00487 j_error("%d != %d ?\n", n3, ndata->ngram_num[2]);
00488 }
00489 }
00490
00491
00492 switch(ndata->version) {
00493 case 3:
00494 ntmp = ndata->n3_bgn[n2_last];
00495 break;
00496 case 4:
00497 ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]);
00498 break;
00499 }
00500 ndata->n3_num[n2_last] = n3 - ntmp;
00501
00502 j_printerr(" 3-gram read %d end\n", n3);
00503 }
00504
00505
00506 static boolean LR_2gram_read = FALSE;
00507
00517 boolean
00518 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction)
00519 {
00520 int n;
00521
00522 ndata->from_bin = FALSE;
00523
00524 if (!LR_2gram_read && direction == DIR_RL) {
00525 j_printerr("you should first read LR 2-gram\n");
00526 return FALSE;
00527 }
00528
00529 if (direction == DIR_LR) {
00530 n = 2;
00531 } else {
00532 n = 3;
00533 }
00534
00535
00536 while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0);
00537
00538
00539 if (direction == DIR_LR) {
00540 set_total_info(fp, ndata);
00541 } else {
00542 set_and_check_total_info(fp, ndata);
00543 }
00544 if (ndata->ngram_num[0] > MAX_WORD_NUM) {
00545 j_error("Error: vocabulary size exceeded limit (%d)\n", MAX_WORD_NUM);
00546 }
00547 ndata->max_word_num = ndata->ngram_num[0];
00548
00549
00550 if (n >= 3) {
00551 if (ndata->ngram_num[2] >= NNIDMAX) {
00552 j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);
00553 ndata->version = 3;
00554 } else {
00555 ndata->version = 4;
00556 }
00557 }
00558
00559
00560 if (!strnmatch(buf,"\\1-grams",8)) {
00561 j_error("data format error: 1-gram not found\n");
00562 }
00563 j_printerr(" reading 1-gram part...\n");
00564 if (direction == DIR_LR) {
00565 set_unigram(fp, ndata);
00566 } else {
00567 add_unigram(fp, ndata);
00568 }
00569
00570 if (n >= 2) {
00571
00572 if (!strnmatch(buf,"\\2-grams", 8)) {
00573 j_error("data format error: 2-gram not found\n");
00574 }
00575 j_printerr(" reading 2-gram part...\n");
00576 if (direction == DIR_LR) {
00577 set_bigram(fp, ndata);
00578 } else {
00579 add_bigram_rl(fp, ndata);
00580 }
00581 }
00582
00583 if (n >= 3) {
00584
00585 if (!strnmatch(buf,"\\3-grams", 8)) {
00586 j_error("data format error: 3-gram not found\n");
00587 }
00588 if ( direction == DIR_LR) {
00589 j_error("should not happen..\n");
00590 } else {
00591 j_printerr(" reading 3-gram part...\n");
00592 set_trigram(fp, ndata);
00593 }
00594 }
00595
00596
00597 if (!strnmatch(buf, "\\end", 4)) {
00598 j_error("data format error: data end marker \"\\end\" not found\n");
00599 }
00600 #ifdef CLASS_NGRAM
00601
00602 if (getl(buf, sizeof(buf), fp) != NULL) {
00603 if (strnmatch(buf, "\\class", 6)) {
00604 j_printerr(" skipping in-class word entries...\n");
00605 }
00606 }
00607 #endif
00608
00609 if (n >= 3 && ndata->version == 4) {
00610
00611 ngram_compact_bigram_context(ndata);
00612 }
00613
00614
00615 set_unknown_id(ndata);
00616
00617 if (direction == DIR_LR) {
00618 LR_2gram_read = TRUE;
00619 }
00620
00621 return TRUE;
00622 }
00623
00629 void
00630 ngram_compact_bigram_context(NGRAM_INFO *ndata)
00631 {
00632 NNID i;
00633 int c;
00634 int dst;
00635 NNID ntmp;
00636
00637
00638 if (ndata->version != 4) {
00639 j_error("InternalError: bigram context compaction called for version != 4\n");
00640 }
00641
00642
00643 c = 0;
00644 for(i=0;i<ndata->ngram_num[1];i++) {
00645 if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) {
00646 c++;
00647 } else {
00648 if (ndata->n3_num[i] != 0) {
00649 printf("bgn=%d|%d, num=%d, bo_wt_rrl=%f\n",
00650 ndata->n3_bgn_upper[i],
00651 ndata->n3_bgn_lower[i],
00652 ndata->n3_num[i],
00653 ndata->bo_wt_rrl[i]);
00654 j_error("Error: ngram_compact_bigram_context: internal error\n");
00655 }
00656 if (ndata->bo_wt_rrl[i] != 0.0) {
00657 j_error("Error: 2-gram has no upper 3-gram, but not 0.0 back-off weight\n");
00658 }
00659 }
00660 }
00661 ndata->bigram_bo_num = c;
00662 j_printerr("num: %d -> %d\n", ndata->ngram_num[1], ndata->bigram_bo_num);
00663
00664
00665 ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00666 ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00667
00668 dst = 0;
00669 for(i=0;i<ndata->ngram_num[1];i++) {
00670 if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) {
00671 ndata->bo_wt_rrl[dst] = ndata->bo_wt_rrl[i];
00672 ndata->n3_bgn_upper[dst] = ndata->n3_bgn_upper[i];
00673 ndata->n3_bgn_lower[dst] = ndata->n3_bgn_lower[i];
00674 ndata->n3_num[dst] = ndata->n3_num[i];
00675 ntmp = dst & 0xffff;
00676 ndata->n2bo_lower[i] = ntmp;
00677 ntmp = dst >> 16;
00678 ndata->n2bo_upper[i] = ntmp;
00679 dst++;
00680 } else {
00681 ndata->n2bo_upper[i] = NNID_INVALID_UPPER;
00682 ndata->n2bo_lower[i] = 0;
00683 }
00684 }
00685
00686 ndata->bo_wt_rrl = (LOGPROB *)myrealloc(ndata->bo_wt_rrl, sizeof(LOGPROB) * ndata->bigram_bo_num);
00687 ndata->n3_bgn_upper = (NNID_UPPER *)myrealloc(ndata->n3_bgn_upper, sizeof(NNID_UPPER) * ndata->bigram_bo_num);
00688 ndata->n3_bgn_lower = (NNID_LOWER *)myrealloc(ndata->n3_bgn_lower, sizeof(NNID_LOWER) * ndata->bigram_bo_num);
00689 ndata->n3_num = (WORD_ID *)myrealloc(ndata->n3_num, sizeof(WORD_ID) * ndata->bigram_bo_num);
00690 }