00001
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
#include <sent/stddefs.h>
#include <sent/ngram2.h>

#include <stdlib.h>
00039
/* Line buffer shared by every reader in this file: getl() fills it, the
 * section loops stop when buf[0] is a backslash, and the caller then inspects
 * the same buffer to identify the section header that ended the loop. */
static char buf[800];
/* Untouched copy of the current line for diagnostics (strtok() cuts buf in
 * place); also used as scratch space to build "\N-grams" section names. */
static char pbuf[800];
00042
00043
00052 static int
00053 get_total_info(FILE *fp, NNID num[])
00054 {
00055 char *p;
00056 int n;
00057 int maxn;
00058 unsigned long entry_num;
00059
00060 maxn = 0;
00061
00062 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00063 if (strnmatch(buf, "ngram", 5)) {
00064 p = strtok(buf, "=");
00065 n = p[strlen(p)-1] - '0';
00066 if (n > MAX_N) {
00067 jlog("Error: too long N-gram (N=%d)\n", n);
00068 jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N);
00069 jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
00070 return -1;
00071 }
00072 p = strtok(NULL, "=");
00073
00074 sscanf(p, "%lu", &entry_num);
00075
00076 if (entry_num > NNID_MAX) {
00077 jlog("Error: too big %d-gram (exceeds %d bit)\n", n, sizeof(NNID) * 8);
00078 return -1;
00079 }
00080
00081 if (entry_num == 0) {
00082 jlog("Warning: empty %d-gram, skipped\n", n);
00083 } else {
00084 num[n-1] = entry_num;
00085 if (maxn < n) maxn = n;
00086 }
00087 }
00088 }
00089
00090 return(maxn);
00091 }
00092
00099 static boolean
00100 set_unigram(FILE *fp, NGRAM_INFO *ndata)
00101 {
00102 WORD_ID nid;
00103 int resid;
00104 LOGPROB prob, bo_wt;
00105 char *name, *p;
00106 boolean ok_p = TRUE;
00107 NGRAM_TUPLE_INFO *t;
00108
00109 t = &(ndata->d[0]);
00110
00111
00112 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00113
00114
00115
00116 t->bgn_upper = NULL;
00117 t->bgn_lower = NULL;
00118 t->bgn = NULL;
00119 t->num = NULL;
00120 t->bgnlistlen = 0;
00121 t->nnid2wid = NULL;
00122 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00123 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00124 t->context_num = t->totalnum;
00125 t->nnid2ctid_upper = NULL;
00126 t->nnid2ctid_lower = NULL;
00127
00128 nid = 0;
00129
00130 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00131 if ((p = strtok(buf, DELM)) == NULL) {
00132 jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
00133 return FALSE;
00134 }
00135 prob = (LOGPROB)atof(p);
00136 if ((p = strtok(NULL, DELM)) == NULL) {
00137 jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n");
00138 return FALSE;
00139 }
00140 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00141 if ((p = strtok(NULL, DELM)) == NULL) {
00142 bo_wt = 0.0;
00143 } else {
00144 bo_wt = (LOGPROB)atof(p);
00145 }
00146
00147
00148 ndata->wname[nid] = name;
00149
00150
00151 if (ndata->root == NULL) {
00152 ndata->root = ptree_make_root_node(nid);
00153 } else {
00154 resid = ptree_search_data(name, ndata->root);
00155 if (resid != -1 && strmatch(name, ndata->wname[resid])) {
00156 jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid);
00157 ok_p = FALSE;
00158 continue;
00159 } else {
00160 ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
00161 }
00162 }
00163
00164 if (nid >= ndata->max_word_num) {
00165 jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num);
00166 return FALSE;
00167 }
00168
00169
00170 t->prob[nid] = prob;
00171 t->bo_wt[nid] = bo_wt;
00172
00173 nid++;
00174 }
00175
00176 if (nid != t->totalnum) {
00177 jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum);
00178 return FALSE;
00179 }
00180
00181 if (ok_p == TRUE) {
00182 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid);
00183 }
00184
00185 return ok_p;
00186 }
00187
00188
00196 static boolean
00197 add_unigram(FILE *fp, NGRAM_INFO *ndata)
00198 {
00199 WORD_ID read_word_num;
00200 WORD_ID nid;
00201 LOGPROB prob, bo_wt;
00202 char *name, *p;
00203 boolean ok_p = TRUE;
00204 boolean mismatched = FALSE;
00205
00206 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->max_word_num);
00207
00208 read_word_num = 0;
00209 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00210 if ((p = strtok(buf, DELM)) == NULL) {
00211 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00212 return FALSE;
00213 }
00214 prob = atof(p);
00215 if ((p = strtok(NULL, DELM)) == NULL) {
00216 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00217 return FALSE;
00218 }
00219 name = strcpy((char *)mymalloc(strlen(p)+1), p);
00220 if ((p = strtok(NULL, DELM)) == NULL) {
00221 bo_wt = 0.0;
00222 } else {
00223 bo_wt = (LOGPROB)atof(p);
00224 }
00225
00226
00227 nid = ngram_lookup_word(ndata, name);
00228 if (nid == WORD_INVALID) {
00229 if (mismatched == FALSE) {
00230 jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n");
00231 mismatched = TRUE;
00232 }
00233 jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name);
00234 ok_p = FALSE;
00235 } else {
00236 ndata->bo_wt_1[nid] = bo_wt;
00237 }
00238
00239 read_word_num++;
00240 if (read_word_num > ndata->max_word_num) {
00241 jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num);
00242 return FALSE;
00243 }
00244 free(name);
00245 }
00246 if (ok_p == TRUE) {
00247 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num);
00248 }
00249
00250 return ok_p;
00251 }
00252
00260 static boolean
00261 add_bigram(FILE *fp, NGRAM_INFO *ndata)
00262 {
00263 WORD_ID w[2], wtmp;
00264 LOGPROB prob;
00265 NNID bi_count = 0;
00266 NNID n2;
00267 boolean ok_p = TRUE;
00268 char *s;
00269
00270 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum);
00271
00272 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00273 strcpy(pbuf, buf);
00274 if ( ++bi_count % 100000 == 0) {
00275 jlog("Stat: ngram_read_arpa: 2-gram read %lu (%d%%)\n", bi_count, bi_count * 100 / ndata->d[1].totalnum);
00276 }
00277 if ((s = strtok(buf, DELM)) == NULL) {
00278 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00279 return FALSE;
00280 }
00281 prob = (LOGPROB)atof(s);
00282 if ((s = strtok(NULL, DELM)) == NULL) {
00283 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00284 return FALSE;
00285 }
00286 w[0] = ngram_lookup_word(ndata, s);
00287 if (w[0] == WORD_INVALID) {
00288 jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s);
00289 ok_p = FALSE;
00290 continue;
00291 }
00292 if ((s = strtok(NULL, DELM)) == NULL) {
00293 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00294 return FALSE;
00295 }
00296 w[1] = ngram_lookup_word(ndata, s);
00297 if (w[1] == WORD_INVALID) {
00298 jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s);
00299 ok_p = FALSE;
00300 continue;
00301 }
00302 if (ndata->dir == DIR_RL) {
00303
00304 wtmp = w[0];
00305 w[0] = w[1];
00306 w[1] = wtmp;
00307 }
00308 n2 = search_ngram(ndata, 2, w);
00309 if (n2 == NNID_INVALID) {
00310 jlog("Warning: ngram_read_arpa: 2-gram #%d: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", n2+1, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]);
00311 } else {
00312 ndata->p_2[n2] = prob;
00313 }
00314 }
00315
00316 if (ok_p == TRUE) {
00317 jlog("Stat: ngram_read_arpa: 2-gram read %lu end\n", bi_count);
00318 }
00319
00320 return ok_p;
00321 }
00322
00329 static boolean
00330 set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
00331 {
00332 NNID i;
00333 WORD_ID w[MAX_N];
00334 WORD_ID w_last[MAX_N];
00335 LOGPROB p, bowt;
00336 NNID nnid;
00337 NNID cid, cid_last;
00338 boolean ok_p = TRUE;
00339 char *s;
00340 NGRAM_TUPLE_INFO *t;
00341 NGRAM_TUPLE_INFO *tprev;
00342 NNID ntmp;
00343
00344 if (n < 2) {
00345 jlog("Error: ngram_read_arpa: unable to process 1-gram\n");
00346 return FALSE;
00347 }
00348
00349 t = &(ndata->d[n-1]);
00350 tprev = &(ndata->d[n-2]);
00351
00352
00353 t->bgnlistlen = tprev->context_num;
00354 if (t->is24bit) {
00355 t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen);
00356 t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen);
00357 for(i = 0; i < t->bgnlistlen; i++) {
00358 t->bgn_upper[i] = NNID_INVALID_UPPER;
00359 t->bgn_lower[i] = 0;
00360 }
00361 } else {
00362 t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen);
00363 for(i = 0;i < t->bgnlistlen; i++) {
00364 t->bgn[i] = NNID_INVALID;
00365 }
00366 }
00367 t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen);
00368 for(i = 0; i < t->bgnlistlen; i++) {
00369 t->num[i] = 0;
00370 }
00371
00372
00373 t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum);
00374 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00375 t->bo_wt = NULL;
00376 t->nnid2ctid_upper = NULL;
00377 t->nnid2ctid_lower = NULL;
00378
00379 nnid = 0;
00380 cid = cid_last = NNID_INVALID;
00381 for(i=0;i<n;i++) w_last[i] = WORD_INVALID;
00382
00383
00384 for (;;) {
00385 if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
00386 strcpy(pbuf, buf);
00387 if ( nnid % 100000 == 0) {
00388 jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum);
00389 }
00390
00391
00392 if ((s = strtok(buf, DELM)) == NULL) {
00393 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00394 return FALSE;
00395 }
00396 p = (LOGPROB)atof(s);
00397
00398 for(i=0;i<n;i++) {
00399 if ((s = strtok(NULL, DELM)) == NULL) {
00400 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00401 return FALSE;
00402 }
00403 if ((w[i] = ngram_lookup_word(ndata, s)) == WORD_INVALID) {
00404 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": \"%s\" not exist in %d-gram\n", n, nnid+1, pbuf, s, n);
00405 ok_p = FALSE;
00406 break;
00407 }
00408
00409 }
00410 if (i < n) continue;
00411
00412
00413 for(i=0;i<n-1;i++) {
00414 if (w[i] != w_last[i]) break;
00415 }
00416 if (i < n-1) {
00417
00418 cid = search_ngram(ndata, n-1, w);
00419 if (cid == NNID_INVALID) {
00420
00421 jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (",
00422 n, nnid+1, pbuf);
00423 for(i=0;i<n-1;i++) {
00424 jlog(" %s", ndata->wname[w[i]]);
00425 }
00426 jlog(") not exist in %d-gram (ignored)\n", n-1);
00427 ok_p = FALSE;
00428 continue;
00429 }
00430 if (cid_last != NNID_INVALID) {
00431
00432 if (t->is24bit) {
00433 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00434 } else {
00435 ntmp = t->bgn[cid_last];
00436 }
00437 t->num[cid_last] = nnid - ntmp;
00438 }
00439
00440 if (t->is24bit) {
00441 if (t->bgn_upper[cid] != NNID_INVALID_UPPER) {
00442 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00443 return FALSE;
00444 }
00445 ntmp = nnid & 0xffff;
00446 t->bgn_lower[cid] = ntmp;
00447 ntmp = nnid >> 16;
00448 t->bgn_upper[cid] = ntmp;
00449 } else {
00450 if (t->bgn[cid] != NNID_INVALID) {
00451 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00452 return FALSE;
00453 }
00454 t->bgn[cid] = nnid;
00455 }
00456
00457 cid_last = cid;
00458 w_last[n-1] = WORD_INVALID;
00459 }
00460
00461
00462 if (w[n-1] == w_last[n-1]) {
00463 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": duplicated entry\n", n, nnid+1, pbuf);
00464 ok_p = FALSE;
00465 continue;
00466 } else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) {
00467 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00468 return FALSE;
00469 }
00470
00471
00472 if ((s = strtok(NULL, DELM)) != NULL) {
00473 bowt = (LOGPROB) atof(s);
00474 if (t->bo_wt == NULL) {
00475 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);
00476 for(i=0;i<nnid;i++) t->bo_wt[i] = 0.0;
00477 }
00478 t->bo_wt[nnid] = bowt;
00479 } else {
00480 if (t->bo_wt != NULL) t->bo_wt[nnid] = 0.0;
00481 }
00482
00483
00484 t->nnid2wid[nnid] = w[n-1];
00485 t->prob[nnid] = p;
00486
00487 nnid++;
00488 for(i=0;i<n;i++) w_last[i] = w[i];
00489
00490
00491 if (nnid > t->totalnum) {
00492 jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum);
00493 return FALSE;
00494 }
00495 }
00496
00497
00498 if (t->is24bit) {
00499 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00500 } else {
00501 ntmp = t->bgn[cid_last];
00502 }
00503 t->num[cid_last] = nnid - ntmp;
00504
00505 if (t->bo_wt != NULL) t->context_num = t->totalnum;
00506
00507 if (ok_p == TRUE) {
00508 jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid);
00509 }
00510
00511 return ok_p;
00512 }
00513
00524 boolean
00525 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition)
00526 {
00527 int i, n;
00528 NNID num[MAX_N];
00529
00530
00531 ndata->from_bin = FALSE;
00532 ndata->bigram_index_reversed = FALSE;
00533
00534
00535 while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0);
00536
00537
00538 if (addition) {
00539
00540
00541 n = get_total_info(fp, num);
00542 if (n == -1) {
00543 return FALSE;
00544 }
00545
00546
00547 if (n < 2) {
00548 jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n");
00549 return FALSE;
00550 }
00551 if (n > 2) {
00552 jlog("Warning: forward N-gram for pass1 contains %d-gram, only 2-gram will be used\n", n);
00553 }
00554
00555
00556 for(i=0;i<2;i++) {
00557 if (ndata->d[i].totalnum != num[i]) {
00558 jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1);
00559 }
00560 }
00561
00562 if (!strnmatch(buf,"\\1-grams",8)) {
00563 jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n");
00564 return FALSE;
00565 }
00566 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00567 if (add_unigram(fp, ndata) == FALSE) return FALSE;
00568
00569 if (!strnmatch(buf,"\\2-grams", 8)) {
00570 jlog("Error: ngram_read_arpa: 2-gram not found for additional LR 2-gram\n");
00571 return FALSE;
00572 }
00573 jlog("Stat: ngram_read_arpa: reading 2-gram part...\n");
00574 if (add_bigram(fp, ndata) == FALSE) return FALSE;
00575
00576
00577
00578 if (strnmatch(buf,"\\3-grams", 8)) {
00579 jlog("Warning: forward n-gram contains more than 3-gram, ignored\n");
00580 }
00581
00582 } else {
00583
00584 n = get_total_info(fp, num);
00585 if (n == -1) {
00586 return FALSE;
00587 }
00588 jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n);
00589 for(i=0;i<n;i++) {
00590 ndata->d[i].totalnum = num[i];
00591 }
00592
00593
00594 if (ndata->d[0].totalnum > MAX_WORD_NUM) {
00595 jlog("Error: ngram_read_arpa: N-gram vocabulary size exceeds the limit (%d)\n", MAX_WORD_NUM);
00596 return FALSE;
00597 }
00598 ndata->max_word_num = ndata->d[0].totalnum;
00599
00600
00601
00602 for(i=0;i<n;i++) {
00603 if (i < 2) {
00604 ndata->d[i].is24bit = FALSE;
00605 } else {
00606
00607
00608 if (ndata->d[i].totalnum > NNID_MAX_24) {
00609 jlog("Warning: ngram_read_arpa: num of %d-gram exceeds 24bit, now switch to %dbit index\n", i+1, sizeof(NNID) * 8);
00610 ndata->d[i].is24bit = FALSE;
00611 } else {
00612 ndata->d[i].is24bit = TRUE;
00613 }
00614 }
00615 }
00616
00617 for(i=0;i<n;i++) {
00618 ndata->d[i].ct_compaction = FALSE;
00619 }
00620
00621
00622 if (!strnmatch(buf,"\\1-grams",8)) {
00623 jlog("Error: ngram_read_arpa: data format error: 1-gram not found\n");
00624 return FALSE;
00625 }
00626 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00627 if (set_unigram(fp, ndata) == FALSE) return FALSE;
00628
00629 i = 2;
00630 while(i <= n) {
00631
00632 sprintf(pbuf, "\\%d-grams", i);
00633 if (!strnmatch(buf, pbuf, 8)) {
00634 jlog("Error: ngram_read_arpa: data format error: %d-gram not found\n", i);
00635 return FALSE;
00636 }
00637 jlog("Stat: ngram_read_arpa: reading %d-gram part...\n", i);
00638 if (set_ngram(fp, ndata, i) == FALSE) return FALSE;
00639 i++;
00640 }
00641
00642 if (!strnmatch(buf, "\\end", 4)) {
00643 jlog("Error: ngram_read_arpa: data format error: end marker \"\\end\" not found\n");
00644 return FALSE;
00645 }
00646
00647 ndata->n = n;
00648
00649 for(i=2;i<n;i++) {
00650 if (ndata->d[i-1].bo_wt != NULL) {
00651
00652 if (ngram_compact_context(ndata, i) == FALSE) return FALSE;
00653 }
00654 }
00655
00656
00657 if (ndata->dir == DIR_RL) {
00658 WORD_ID bos, eos;
00659 char *p;
00660 bos = ngram_lookup_word(ndata, BEGIN_WORD_DEFAULT);
00661 eos = ngram_lookup_word(ndata, END_WORD_DEFAULT);
00662 if (!ndata->bos_eos_swap) {
00663
00664 if (bos != WORD_INVALID && eos != WORD_INVALID && ndata->d[0].prob[bos] == -99) {
00665 jlog("Stat: \"P(%s) = -99\" in reverse N-gram, may be trained by SRILM\n", BEGIN_WORD_DEFAULT);
00666 jlog("Stat: going to swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
00667 ndata->bos_eos_swap = TRUE;
00668 }
00669 }
00670 if (ndata->bos_eos_swap) {
00671 if (bos == WORD_INVALID) {
00672 jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", BEGIN_WORD_DEFAULT);
00673 }
00674 if (eos == WORD_INVALID) {
00675 jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", END_WORD_DEFAULT);
00676 }
00677 if (bos == WORD_INVALID || eos == WORD_INVALID) {
00678 return FALSE;
00679 }
00680
00681 jlog("Stat: ngram_read_arpa: swap \"%s\" and \"%s\" at backward N-gram\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT);
00682
00683 p = ndata->wname[bos];
00684 ndata->wname[bos] = ndata->wname[eos];
00685 ndata->wname[eos] = p;
00686
00687 ptree_replace_data(BEGIN_WORD_DEFAULT, eos, ndata->root);
00688 ptree_replace_data(END_WORD_DEFAULT, bos, ndata->root);
00689 }
00690 }
00691
00692 }
00693
00694 #ifdef CLASS_NGRAM
00695
00696 if (getl(buf, sizeof(buf), fp) != NULL) {
00697 if (strnmatch(buf, "\\class", 6)) {
00698 jlog("Stat: ngram_read_arpa: skipping in-class word entries...\n");
00699 }
00700 }
00701 #endif
00702
00703 bi_prob_func_set(ndata);
00704
00705 return TRUE;
00706 }