00001
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include <sent/stddefs.h>
00038 #include <sent/ngram2.h>
00039
/* Size in bytes of the line buffers below.  Both buffers must have the
   same size because whole lines are copied from buf to pbuf with strcpy(). */
enum { ARPA_LINE_BUFLEN = 800 };

/* Work buffer that receives each input line; it is tokenized in place. */
static char buf[ARPA_LINE_BUFLEN];
/* Untouched copy of the current line (for messages) and scratch space
   for building section header strings. */
static char pbuf[ARPA_LINE_BUFLEN];
00042
00043
00050 static int
00051 get_total_info(FILE *fp, int num[])
00052 {
00053 char *p;
00054 int n;
00055 int maxn;
00056 int entry_num;
00057
00058 maxn = 0;
00059 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00060 if (strnmatch(buf, "ngram", 5)) {
00061 p = strtok(buf, "=");
00062 n = p[strlen(p)-1] - '0';
00063 if (n > MAX_N) {
00064 jlog("Error: too long N-gram (N=%d)\n", n);
00065 jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N);
00066 jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
00067 return -1;
00068 }
00069 p = strtok(NULL, "=");
00070 entry_num = atoi(p);
00071
00072 if (entry_num == 0) {
00073 jlog("Warning: empty %d-gram, skipped\n", n);
00074 } else {
00075 num[n-1] = entry_num;
00076 if (maxn < n) maxn = n;
00077 }
00078 }
00079 }
00080
00081 return(maxn);
00082 }
00083
00090 static boolean
00091 set_unigram(FILE *fp, NGRAM_INFO *ndata)
00092 {
00093 WORD_ID nid;
00094 int resid;
00095 LOGPROB prob, bo_wt;
00096 char *name, *p;
00097 boolean ok_p = TRUE;
00098 NGRAM_TUPLE_INFO *t;
00099
00100 t = &(ndata->d[0]);
00101
00102
00103 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00104
00105
00106
00107 t->bgn_upper = NULL;
00108 t->bgn_lower = NULL;
00109 t->bgn = NULL;
00110 t->num = NULL;
00111 t->bgnlistlen = 0;
00112 t->nnid2wid = NULL;
00113 t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00114 t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00115 t->context_num = t->totalnum;
00116 t->nnid2ctid_upper = NULL;
00117 t->nnid2ctid_lower = NULL;
00118
00119 nid = 0;
00120
00121 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00122 if ((p = strtok(buf, DELM)) == NULL) {
00123 jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00124 return FALSE;
00125 }
00126 prob = (LOGPROB)atof(p);
00127 if ((p = strtok(NULL, DELM)) == NULL) {
00128 jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00129 return FALSE;
00130 }
00131 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00132 if ((p = strtok(NULL, DELM)) == NULL) {
00133 jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00134 return FALSE;
00135 }
00136 bo_wt = (LOGPROB)atof(p);
00137
00138
00139 ndata->wname[nid] = name;
00140
00141
00142 if (ndata->root == NULL) {
00143 ndata->root = ptree_make_root_node(nid);
00144 } else {
00145 resid = ptree_search_data(name, ndata->root);
00146 if (resid != -1 && strmatch(name, ndata->wname[resid])) {
00147 jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid);
00148 ok_p = FALSE;
00149 continue;
00150 } else {
00151 ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
00152 }
00153 }
00154
00155 if (nid >= ndata->max_word_num) {
00156 jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num);
00157 return FALSE;
00158 }
00159
00160
00161 t->prob[nid] = prob;
00162 t->bo_wt[nid] = bo_wt;
00163
00164 nid++;
00165 }
00166
00167 if (nid != t->totalnum) {
00168 jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum);
00169 return FALSE;
00170 }
00171
00172 if (ok_p == TRUE) {
00173 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid);
00174 }
00175
00176 return ok_p;
00177 }
00178
00179
00187 static boolean
00188 add_unigram(FILE *fp, NGRAM_INFO *ndata)
00189 {
00190 WORD_ID read_word_num;
00191 WORD_ID nid;
00192 LOGPROB prob, bo_wt;
00193 char *name, *p;
00194 boolean ok_p = TRUE;
00195 boolean mismatched = FALSE;
00196
00197 ndata->bo_wt_1 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->max_word_num);
00198
00199 read_word_num = 0;
00200 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00201 if ((p = strtok(buf, DELM)) == NULL) {
00202 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00203 return FALSE;
00204 }
00205 prob = atof(p);
00206 if ((p = strtok(NULL, DELM)) == NULL) {
00207 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00208 return FALSE;
00209 }
00210 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00211 if ((p = strtok(NULL, DELM)) == NULL) {
00212 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00213 return FALSE;
00214 }
00215 bo_wt = (LOGPROB)atof(p);
00216
00217
00218 nid = ngram_lookup_word(ndata, name);
00219 if (nid == WORD_INVALID) {
00220 if (mismatched == FALSE) {
00221 jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n");
00222 mismatched = TRUE;
00223 }
00224 jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name);
00225 ok_p = FALSE;
00226 } else {
00227 ndata->bo_wt_1[nid] = bo_wt;
00228 }
00229
00230 read_word_num++;
00231 if (read_word_num > ndata->max_word_num) {
00232 jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num);
00233 return FALSE;
00234 }
00235 free(name);
00236 }
00237 if (ok_p == TRUE) {
00238 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num);
00239 }
00240
00241 return ok_p;
00242 }
00243
00252 static boolean
00253 add_bigram(FILE *fp, NGRAM_INFO *ndata)
00254 {
00255 WORD_ID w[2], wtmp;
00256 LOGPROB prob;
00257 int bi_count = 0;
00258 NNID n2;
00259 boolean ok_p = TRUE;
00260 char *s;
00261
00262 ndata->p_2 = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->d[1].totalnum);
00263
00264 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00265 strcpy(pbuf, buf);
00266 if ( ++bi_count % 100000 == 0) {
00267 jlog("Stat: ngram_read_arpa: 2-gram read %d (%d%%)\n", bi_count, bi_count * 100 / ndata->d[1].totalnum);
00268 }
00269 if ((s = strtok(buf, DELM)) == NULL) {
00270 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00271 return FALSE;
00272 }
00273 prob = (LOGPROB)atof(s);
00274 if ((s = strtok(NULL, DELM)) == NULL) {
00275 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00276 return FALSE;
00277 }
00278 w[0] = ngram_lookup_word(ndata, s);
00279 if (w[1] == WORD_INVALID) {
00280 jlog("Error: ngram_read_arpa: 2-gram #%d: \"%s\": \"%s\" not exist in 1-gram\n", n2+1, pbuf, s);
00281 ok_p = FALSE;
00282 continue;
00283 }
00284 if ((s = strtok(NULL, DELM)) == NULL) {
00285 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00286 return FALSE;
00287 }
00288 w[1] = ngram_lookup_word(ndata, s);
00289 if (w[0] == WORD_INVALID) {
00290 jlog("Error: ngram_read_arpa: 2-gram #%d: \"%s\": \"%s\" not exist in 1-gram\n", n2+1, pbuf, s);
00291 ok_p = FALSE;
00292 continue;
00293 }
00294 if (ndata->dir == DIR_RL) {
00295
00296 wtmp = w[0];
00297 w[0] = w[1];
00298 w[1] = wtmp;
00299 }
00300 n2 = search_ngram(ndata, 2, w);
00301 if (n2 == NNID_INVALID) {
00302 jlog("Warning: ngram_read_arpa: 2-gram #%d: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", n2+1, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]);
00303 } else {
00304 ndata->p_2[n2] = prob;
00305 }
00306 }
00307
00308 if (ok_p == TRUE) {
00309 jlog("Stat: ngram_read_arpa: 2-gram read %d end\n", bi_count);
00310 }
00311
00312 return ok_p;
00313 }
00314
00321 static boolean
00322 set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
00323 {
00324 NNID i;
00325 WORD_ID w[MAX_N];
00326 WORD_ID w_last[MAX_N];
00327 LOGPROB p, bowt;
00328 NNID nnid;
00329 NNID cid, cid_last;
00330 boolean ok_p = TRUE;
00331 char *s;
00332 NGRAM_TUPLE_INFO *t;
00333 NGRAM_TUPLE_INFO *tprev;
00334 NNID ntmp;
00335
00336 if (n < 2) {
00337 jlog("Error: ngram_read_arpa: unable to process 1-gram\n");
00338 return FALSE;
00339 }
00340
00341 t = &(ndata->d[n-1]);
00342 tprev = &(ndata->d[n-2]);
00343
00344
00345 t->bgnlistlen = tprev->context_num;
00346 if (t->is24bit) {
00347 t->bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * t->bgnlistlen);
00348 t->bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * t->bgnlistlen);
00349 for(i = 0; i < t->bgnlistlen; i++) {
00350 t->bgn_upper[i] = NNID_INVALID_UPPER;
00351 t->bgn_lower[i] = 0;
00352 }
00353 } else {
00354 t->bgn = (NNID *)mymalloc(sizeof(NNID) * t->bgnlistlen);
00355 for(i = 0;i < t->bgnlistlen; i++) {
00356 t->bgn[i] = NNID_INVALID;
00357 }
00358 }
00359 t->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->bgnlistlen);
00360 for(i = 0; i < t->bgnlistlen; i++) {
00361 t->num[i] = 0;
00362 }
00363
00364
00365 t->nnid2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->totalnum);
00366 t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00367 t->bo_wt = NULL;
00368 t->nnid2ctid_upper = NULL;
00369 t->nnid2ctid_lower = NULL;
00370
00371 nnid = 0;
00372 cid = cid_last = NNID_INVALID;
00373 for(i=0;i<n;i++) w_last[i] = WORD_INVALID;
00374
00375
00376 for (;;) {
00377 if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
00378 strcpy(pbuf, buf);
00379 if ( nnid % 100000 == 0) {
00380 jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum);
00381 }
00382
00383
00384 if ((s = strtok(buf, DELM)) == NULL) {
00385 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00386 return FALSE;
00387 }
00388 p = (LOGPROB)atof(s);
00389
00390 for(i=0;i<n;i++) {
00391 if ((s = strtok(NULL, DELM)) == NULL) {
00392 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00393 return FALSE;
00394 }
00395 if ((w[i] = ngram_lookup_word(ndata, s)) == WORD_INVALID) {
00396 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": \"%s\" not exist in %d-gram\n", n, nnid+1, pbuf, s, n);
00397 ok_p = FALSE;
00398 break;
00399 }
00400
00401 }
00402 if (i < n) continue;
00403
00404
00405 for(i=0;i<n-1;i++) {
00406 if (w[i] != w_last[i]) break;
00407 }
00408 if (i < n-1) {
00409
00410 cid = search_ngram(ndata, n-1, w);
00411 if (cid == NNID_INVALID) {
00412
00413 jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (",
00414 n, nnid+1, pbuf);
00415 for(i=0;i<n-1;i++) {
00416 jlog(" %s", ndata->wname[w[i]]);
00417 }
00418 jlog(") not exist in %d-gram (ignored)\n", n-1);
00419 ok_p = FALSE;
00420 continue;
00421 }
00422 if (cid_last != NNID_INVALID) {
00423
00424 if (t->is24bit) {
00425 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00426 } else {
00427 ntmp = t->bgn[cid_last];
00428 }
00429 t->num[cid_last] = nnid - ntmp;
00430 }
00431
00432 if (t->is24bit) {
00433 if (t->bgn_upper[cid] != NNID_INVALID_UPPER) {
00434 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00435 return FALSE;
00436 }
00437 ntmp = nnid & 0xffff;
00438 t->bgn_lower[cid] = ntmp;
00439 ntmp = nnid >> 16;
00440 t->bgn_upper[cid] = ntmp;
00441 } else {
00442 if (t->bgn[cid] != NNID_INVALID) {
00443 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00444 return FALSE;
00445 }
00446 t->bgn[cid] = nnid;
00447 }
00448
00449 cid_last = cid;
00450 w_last[n-1] = WORD_INVALID;
00451 }
00452
00453
00454 if (w[n-1] == w_last[n-1]) {
00455 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": duplicated entry\n", n, nnid+1, pbuf);
00456 ok_p = FALSE;
00457 continue;
00458 } else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) {
00459 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00460 return FALSE;
00461 }
00462
00463
00464 if ((s = strtok(NULL, DELM)) != NULL) {
00465 bowt = (LOGPROB) atof(s);
00466 if (t->bo_wt == NULL) {
00467 t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00468 }
00469 t->bo_wt[nnid] = bowt;
00470 }
00471
00472
00473 t->nnid2wid[nnid] = w[n-1];
00474 t->prob[nnid] = p;
00475
00476 nnid++;
00477 for(i=0;i<n;i++) w_last[i] = w[i];
00478
00479
00480 if (nnid > t->totalnum) {
00481 jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum);
00482 return FALSE;
00483 }
00484 }
00485
00486
00487 if (t->is24bit) {
00488 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00489 } else {
00490 ntmp = t->bgn[cid_last];
00491 }
00492 t->num[cid_last] = nnid - ntmp;
00493
00494 if (t->bo_wt != NULL) t->context_num = t->totalnum;
00495
00496 if (ok_p == TRUE) {
00497 jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid);
00498 }
00499
00500 return ok_p;
00501 }
00502
00513 boolean
00514 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition)
00515 {
00516 int i, n;
00517 int num[MAX_N];
00518
00519
00520 ndata->from_bin = FALSE;
00521 ndata->bigram_index_reversed = FALSE;
00522
00523
00524 while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0);
00525
00526
00527 if (addition) {
00528
00529
00530 n = get_total_info(fp, num);
00531 if (n == -1) {
00532 return FALSE;
00533 }
00534
00535
00536 if (n < 2) {
00537 jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n");
00538 return FALSE;
00539 }
00540 if (n > 2) {
00541 jlog("Warning: forward N-gram for pass1 contains %d-gram, only 2-gram will be used\n", n);
00542 }
00543
00544
00545 for(i=0;i<2;i++) {
00546 if (ndata->d[i].totalnum != num[i]) {
00547 jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", n+1);
00548 }
00549 }
00550
00551 if (!strnmatch(buf,"\\1-grams",8)) {
00552 jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n");
00553 return FALSE;
00554 }
00555 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00556 if (add_unigram(fp, ndata) == FALSE) return FALSE;
00557
00558 if (!strnmatch(buf,"\\2-grams", 8)) {
00559 jlog("Error: ngram_read_arpa: 2-gram not found for additional LR 2-gram\n");
00560 return FALSE;
00561 }
00562 jlog("Stat: ngram_read_arpa: reading 2-gram part...\n");
00563 if (add_bigram(fp, ndata) == FALSE) return FALSE;
00564
00565
00566
00567 if (strnmatch(buf,"\\3-grams", 8)) {
00568 jlog("Warning: forward n-gram contains more than 3-gram, ignored\n");
00569 }
00570
00571 } else {
00572
00573 n = get_total_info(fp, num);
00574 if (n == -1) {
00575 return FALSE;
00576 }
00577 jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n);
00578 for(i=0;i<n;i++) {
00579 ndata->d[i].totalnum = num[i];
00580 }
00581
00582
00583 if (ndata->d[0].totalnum > MAX_WORD_NUM) {
00584 jlog("Error: ngram_read_arpa: N-gram vocabulary size exceeds the limit (%d)\n", MAX_WORD_NUM);
00585 return FALSE;
00586 }
00587 ndata->max_word_num = ndata->d[0].totalnum;
00588
00589
00590
00591 for(i=0;i<n;i++) {
00592 if (i < 2) {
00593 ndata->d[i].is24bit = FALSE;
00594 } else {
00595
00596
00597 if (ndata->d[i].totalnum >= NNIDMAX) {
00598 jlog("Warning: ngram_read_arpa: more than 24bit %d-gram tuples, use 32bit index\n", NNIDMAX, n+1);
00599 ndata->d[i].is24bit = FALSE;
00600 } else {
00601 ndata->d[i].is24bit = TRUE;
00602 }
00603 }
00604 }
00605
00606 for(i=0;i<n;i++) {
00607 ndata->d[i].ct_compaction = FALSE;
00608 }
00609
00610
00611 if (!strnmatch(buf,"\\1-grams",8)) {
00612 jlog("Error: ngram_read_arpa: data format error: 1-gram not found\n");
00613 return FALSE;
00614 }
00615 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00616 if (set_unigram(fp, ndata) == FALSE) return FALSE;
00617
00618 i = 2;
00619 while(i <= n) {
00620
00621 sprintf(pbuf, "\\%d-grams", i);
00622 if (!strnmatch(buf, pbuf, 8)) {
00623 jlog("Error: ngram_read_arpa: data format error: %d-gram not found\n", i);
00624 return FALSE;
00625 }
00626 jlog("Stat: ngram_read_arpa: reading %d-gram part...\n", i);
00627 if (set_ngram(fp, ndata, i) == FALSE) return FALSE;
00628 i++;
00629 }
00630
00631 if (!strnmatch(buf, "\\end", 4)) {
00632 jlog("Error: ngram_read_arpa: data format error: end marker \"\\end\" not found\n");
00633 return FALSE;
00634 }
00635
00636 ndata->n = n;
00637
00638 for(i=1;i<n;i++) {
00639 if (ndata->d[i].bo_wt != NULL) {
00640
00641 if (ngram_compact_context(ndata, i+1) == FALSE) return FALSE;
00642 ndata->d[i].ct_compaction = TRUE;
00643 }
00644 }
00645
00646
00647 set_unknown_id(ndata);
00648
00649 }
00650
00651 #ifdef CLASS_NGRAM
00652
00653 if (getl(buf, sizeof(buf), fp) != NULL) {
00654 if (strnmatch(buf, "\\class", 6)) {
00655 jlog("Stat: ngram_read_arpa: skipping in-class word entries...\n");
00656 }
00657 }
00658 #endif
00659
00660 bi_prob_func_set(ndata);
00661
00662 return TRUE;
00663 }