Julius: libsent/src/ngram/ngram_read

00001 
00026 /*
00027  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00028  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00029  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00030  * All rights reserved
00031  */
00032 
00033 /* $Id: ngram_read_arpa.c,v 1.1.1.1 2007/09/28 02:50:56 sumomo Exp $ */
00034 
00035 /* words should be alphabetically sorted */
00036 
00037 #include <sent/stddefs.h>
00038 #include <sent/ngram2.h>
00039 
00040 static char buf[800];                   /* Line buffer filled by getl(); shared by every reader in this file, and the last line read (a section header such as "\2-grams:") is left here for the caller to inspect */
00041 static char pbuf[800];                  /* Scratch buffer: untouched copy of the current line for log messages (strtok modifies buf), also used to format the expected "\N-grams" section header */
00042 
00043 
00050 static int
00051 get_total_info(FILE *fp, int num[])
00052 {
00053   char *p;
00054   int n;
00055   int maxn;
00056   int entry_num;
00057 
00058   maxn = 0;
00059   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00060     if (strnmatch(buf, "ngram", 5)) { /* n-gram num */
00061       p = strtok(buf, "=");
00062       n = p[strlen(p)-1] - '0';
00063       if (n > MAX_N) {
00064         jlog("Error: too long N-gram (N=%d)\n", n);
00065         jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N);
00066         jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
00067         return -1;
00068       }
00069       p = strtok(NULL, "=");
00070       entry_num = atoi(p);
00071       /* ignore empty entry */
00072       if (entry_num == 0) {
00073         jlog("Warning: empty %d-gram, skipped\n", n);
00074       } else {
00075         num[n-1] = entry_num;
00076         if (maxn < n) maxn = n;
00077       }
00078     }
00079   }
00080 
00081   return(maxn);
00082 }
00083 
00090 static boolean
00091 set_unigram(FILE *fp, NGRAM_INFO *ndata)
00092 {
00093   WORD_ID nid;
00094   int resid;
00095   LOGPROB prob, bo_wt;
00096   char *name, *p;
00097   boolean ok_p = TRUE;
00098   NGRAM_TUPLE_INFO *t;
00099 
00100   t = &(ndata->d[0]);
00101 
00102   /* malloc name area */
00103   ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00104 
00105   /* malloc data area */
00106   //t->bgn_upper = t->bgn_lower = t->bgn = t->num = NULL;
00107   t->bgn_upper = NULL;
00108   t->bgn_lower = NULL;
00109   t->bgn = NULL;
00110   t->num = NULL;
00111   t->bgnlistlen = 0;
00112   t->nnid2wid = NULL;
00113   t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00114   t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00115   t->context_num = t->totalnum;
00116   t->nnid2ctid_upper = NULL;
00117   t->nnid2ctid_lower = NULL;
00118 
00119   nid = 0;
00120   
00121   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00122     if ((p = strtok(buf, DELM)) == NULL) {
00123       jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00124       return FALSE;
00125     }
00126     prob = (LOGPROB)atof(p);
00127     if ((p = strtok(NULL, DELM)) == NULL) {
00128       jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00129       return FALSE;
00130     }
00131     name = strcpy((char *)mymalloc(strlen(p)+1), p);
00132     if ((p = strtok(NULL, DELM)) == NULL) {
00133       jlog("Error: ngram_read_arpa: LR 1-gram: failed to parse, corrupted or invalid data?\n");
00134       return FALSE;
00135     }
00136     bo_wt = (LOGPROB)atof(p);
00137 
00138     /* register word entry name */
00139     ndata->wname[nid] = name;
00140 
00141     /* add entry name to index tree */
00142     if (ndata->root == NULL) {
00143       ndata->root = ptree_make_root_node(nid);
00144     } else {
00145       resid = ptree_search_data(name, ndata->root);
00146       if (resid != -1 && strmatch(name, ndata->wname[resid])) { /* already exist */
00147         jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid);
00148         ok_p = FALSE;
00149         continue;
00150       } else {
00151         ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root));
00152       }
00153     }
00154 
00155     if (nid >= ndata->max_word_num) {
00156       jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num);
00157       return FALSE;
00158     }
00159 
00160     /* register entry info */
00161     t->prob[nid] = prob;
00162     t->bo_wt[nid] = bo_wt;
00163   
00164     nid++;
00165   }
00166 
00167   if (nid != t->totalnum) {
00168     jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum);
00169     return FALSE;
00170   }
00171 
00172   if (ok_p == TRUE) {
00173     jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid);
00174   }
00175   
00176   return ok_p;
00177 }
00178 
00179 /* read-in 1-gram (RL) --- only add back-off weight */
00187 static boolean
00188 add_unigram(FILE *fp, NGRAM_INFO *ndata)
00189 {
00190   WORD_ID read_word_num;
00191   WORD_ID nid;
00192   LOGPROB prob, bo_wt;
00193   char *name, *p;
00194   boolean ok_p = TRUE;
00195   boolean mismatched = FALSE;
00196 
00197   ndata->bo_wt_1 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->max_word_num);
00198 
00199   read_word_num = 0;
00200   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00201     if ((p = strtok(buf, DELM)) == NULL) {
00202       jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00203       return FALSE;
00204     }
00205     prob = atof(p);
00206     if ((p = strtok(NULL, DELM)) == NULL) {
00207       jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00208       return FALSE;
00209     }
00210     name = strcpy((char *)mymalloc(strlen(p)+1), p);
00211     if ((p = strtok(NULL, DELM)) == NULL) {
00212       jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n");
00213       return FALSE;
00214     }
00215     bo_wt = (LOGPROB)atof(p);
00216   
00217     /* add bo_wt_rl to existing 1-gram entry */
00218     nid = ngram_lookup_word(ndata, name);
00219     if (nid == WORD_INVALID) {
00220       if (mismatched == FALSE) {
00221         jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n");
00222         mismatched = TRUE;
00223       }
00224       jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name);
00225       ok_p = FALSE;
00226     } else {
00227       ndata->bo_wt_1[nid] = bo_wt;
00228     }
00229   
00230     read_word_num++;
00231     if (read_word_num > ndata->max_word_num) {
00232       jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num);
00233       return FALSE;
00234     }
00235     free(name);
00236   }
00237   if (ok_p == TRUE) {
00238     jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num);
00239   }
00240 
00241   return ok_p;
00242 }
00243 
00252 static boolean
00253 add_bigram(FILE *fp, NGRAM_INFO *ndata)
00254 {
00255   WORD_ID w[2], wtmp;
00256   LOGPROB prob;
00257   int bi_count = 0;
00258   NNID n2;
00259   boolean ok_p = TRUE;
00260   char *s;
00261 
00262   ndata->p_2 = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->d[1].totalnum);
00263 
00264   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
00265     strcpy(pbuf, buf);
00266     if ( ++bi_count % 100000 == 0) {
00267       jlog("Stat: ngram_read_arpa: 2-gram read %d (%d%%)\n", bi_count, bi_count * 100 / ndata->d[1].totalnum);
00268     }
00269     if ((s = strtok(buf, DELM)) == NULL) {
00270       jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00271       return FALSE;
00272     }
00273     prob = (LOGPROB)atof(s);
00274     if ((s = strtok(NULL, DELM)) == NULL) {
00275       jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00276       return FALSE;
00277     }
00278     w[0] = ngram_lookup_word(ndata, s);
00279     if (w[1] == WORD_INVALID) {
00280       jlog("Error: ngram_read_arpa: 2-gram #%d: \"%s\": \"%s\" not exist in 1-gram\n", n2+1, pbuf, s);
00281       ok_p = FALSE;
00282       continue;
00283     }
00284     if ((s = strtok(NULL, DELM)) == NULL) {
00285       jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n");
00286       return FALSE;
00287     }
00288     w[1] = ngram_lookup_word(ndata, s);
00289     if (w[0] == WORD_INVALID) {
00290       jlog("Error: ngram_read_arpa: 2-gram #%d: \"%s\": \"%s\" not exist in 1-gram\n", n2+1, pbuf, s);
00291       ok_p = FALSE;
00292       continue;
00293     }
00294     if (ndata->dir == DIR_RL) {
00295       /* word order should be reversed */
00296       wtmp = w[0];
00297       w[0] = w[1];
00298       w[1] = wtmp;
00299     }
00300     n2 = search_ngram(ndata, 2, w);
00301     if (n2 == NNID_INVALID) {
00302       jlog("Warning: ngram_read_arpa: 2-gram #%d: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", n2+1, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]);
00303     } else {
00304       ndata->p_2[n2] = prob;
00305     }
00306   }
00307 
00308   if (ok_p == TRUE) {
00309     jlog("Stat: ngram_read_arpa: 2-gram read %d end\n", bi_count);
00310   }
00311 
00312   return ok_p;
00313 }
00314     
00321 static boolean
00322 set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
00323 {
00324   NNID i;
00325   WORD_ID w[MAX_N];
00326   WORD_ID w_last[MAX_N];
00327   LOGPROB p, bowt;
00328   NNID nnid;
00329   NNID cid, cid_last;
00330   boolean ok_p = TRUE;
00331   char *s;
00332   NGRAM_TUPLE_INFO *t;
00333   NGRAM_TUPLE_INFO *tprev;
00334   NNID ntmp;
00335 
00336   if (n < 2) {
00337     jlog("Error: ngram_read_arpa: unable to process 1-gram\n");
00338     return FALSE;
00339   }
00340 
00341   t = &(ndata->d[n-1]);
00342   tprev = &(ndata->d[n-2]);
00343 
00344   /* initialize pointer storage to access from (N-1)-gram */
00345   t->bgnlistlen = tprev->context_num;
00346   if (t->is24bit) {
00347     t->bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * t->bgnlistlen);
00348     t->bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * t->bgnlistlen);
00349     for(i = 0; i < t->bgnlistlen; i++) {    
00350       t->bgn_upper[i] = NNID_INVALID_UPPER;
00351       t->bgn_lower[i] = 0;
00352     }
00353   } else {
00354     t->bgn = (NNID *)mymalloc(sizeof(NNID) * t->bgnlistlen);
00355     for(i = 0;i < t->bgnlistlen; i++) {
00356       t->bgn[i] = NNID_INVALID;
00357     }
00358   }
00359   t->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->bgnlistlen);
00360   for(i = 0; i < t->bgnlistlen; i++) {
00361     t->num[i] = 0;
00362   }
00363 
00364   /* allocate data area */
00365   t->nnid2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->totalnum);
00366   t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00367   t->bo_wt = NULL;
00368   t->nnid2ctid_upper = NULL;
00369   t->nnid2ctid_lower = NULL;
00370 
00371   nnid = 0;
00372   cid = cid_last = NNID_INVALID;
00373   for(i=0;i<n;i++) w_last[i] = WORD_INVALID;
00374 
00375   /* read in 2-gram */
00376   for (;;) {
00377     if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break;
00378     strcpy(pbuf, buf);
00379     if ( nnid % 100000 == 0) {
00380       jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum);
00381     }
00382 
00383     /* 2-gram probability */
00384     if ((s = strtok(buf, DELM)) == NULL) {
00385       jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00386       return FALSE;
00387     }
00388     p = (LOGPROB)atof(s);
00389     /* read in context word and lookup the ID */
00390     for(i=0;i<n;i++) {
00391       if ((s = strtok(NULL, DELM)) == NULL) {
00392         jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
00393         return FALSE;
00394       }
00395       if ((w[i] = ngram_lookup_word(ndata, s)) == WORD_INVALID) {
00396         jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": \"%s\" not exist in %d-gram\n", n, nnid+1, pbuf, s, n);
00397         ok_p = FALSE;
00398         break;
00399       }
00400       /* increment nnid_bgn and nnid_num if context word changed */
00401     }
00402     if (i < n) continue;        /* error out */
00403 
00404     /* detect context entry change at this line */
00405     for(i=0;i<n-1;i++) {
00406       if (w[i] != w_last[i]) break;
00407     }
00408     if (i < n-1) {              /* context changed here */
00409       /* find new entry point */
00410       cid = search_ngram(ndata, n-1, w);
00411       if (cid == NNID_INVALID) {        /* no context */
00412         //jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (%s,%s) not exist in %d-gram (ignored)\n", n, nnid+1, pbuf, ndata->wname[w_m], ndata->wname[w_r], n-1);
00413         jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (",
00414              n, nnid+1, pbuf);
00415         for(i=0;i<n-1;i++) {
00416           jlog(" %s", ndata->wname[w[i]]);
00417         }
00418         jlog(") not exist in %d-gram (ignored)\n", n-1);
00419         ok_p = FALSE;
00420         continue;
00421       }
00422       if (cid_last != NNID_INVALID) {
00423         /* close last entry */
00424         if (t->is24bit) {
00425           ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00426         } else {
00427           ntmp = t->bgn[cid_last];
00428         }
00429         t->num[cid_last] = nnid - ntmp;
00430       }
00431       /* the next context word should be an new entry */
00432       if (t->is24bit) {
00433         if (t->bgn_upper[cid] != NNID_INVALID_UPPER) {
00434           jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00435           return FALSE;
00436         }
00437         ntmp = nnid & 0xffff;
00438         t->bgn_lower[cid] = ntmp;
00439         ntmp = nnid >> 16;
00440         t->bgn_upper[cid] = ntmp;
00441       } else {
00442         if (t->bgn[cid] != NNID_INVALID) {
00443           jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00444           return FALSE;
00445         }
00446         t->bgn[cid] = nnid;
00447       }
00448 
00449       cid_last = cid;
00450       w_last[n-1] = WORD_INVALID;
00451     }
00452 
00453     /* store the probabilities of the target word */
00454     if (w[n-1] == w_last[n-1]) {
00455       jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": duplicated entry\n", n, nnid+1, pbuf);
00456       ok_p = FALSE;
00457       continue;
00458     } else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) {
00459       jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
00460       return FALSE;
00461     }
00462 
00463     /* if the 2-gram has back-off entries, store them here */
00464     if ((s = strtok(NULL, DELM)) != NULL) {
00465       bowt = (LOGPROB) atof(s);
00466       if (t->bo_wt == NULL) {
00467         t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00468       }
00469       t->bo_wt[nnid] = bowt;
00470     }
00471 
00472     /* store the entry info */
00473     t->nnid2wid[nnid] = w[n-1];
00474     t->prob[nnid] = p;
00475 
00476     nnid++;
00477     for(i=0;i<n;i++) w_last[i] = w[i];
00478 
00479     /* check total num */
00480     if (nnid > t->totalnum) {
00481       jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum);
00482       return FALSE;
00483     }
00484   }
00485   
00486   /* set the last entry */
00487   if (t->is24bit) {
00488     ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]);
00489   } else {
00490     ntmp = t->bgn[cid_last];
00491   }
00492   t->num[cid_last] = nnid - ntmp;
00493 
00494   if (t->bo_wt != NULL) t->context_num = t->totalnum;
00495 
00496   if (ok_p == TRUE) {
00497     jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid);
00498   }
00499 
00500   return ok_p;
00501 }
00502 
00513 boolean
00514 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition)
00515 {
00516   int i, n;
00517   int num[MAX_N];
00518 
00519   /* source file is not a binary N-gram */
00520   ndata->from_bin = FALSE;
00521   ndata->bigram_index_reversed = FALSE;
00522 
00523   /* read until `\data\' found */
00524   while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0);
00525 
00526 
00527   if (addition) {
00528     /* reading additional forward 2-gram for the 1st pass */
00529     /* read n-gram total info */
00530     n = get_total_info(fp, num);
00531     if (n == -1) {              /* error */
00532       return FALSE;
00533     }
00534 
00535     /* check N limit */
00536     if (n < 2) {
00537       jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n");
00538       return FALSE;
00539     }
00540     if (n > 2) {
00541       jlog("Warning: forward N-gram for pass1 contains %d-gram, only 2-gram will be used\n", n);
00542     }
00543 
00544     /* check if the numbers are the same with already read n-gram */
00545     for(i=0;i<2;i++) {
00546       if (ndata->d[i].totalnum != num[i]) {
00547         jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", n+1);
00548       }
00549     }
00550     /* read additional 1-gram data */
00551     if (!strnmatch(buf,"\\1-grams",8)) {
00552       jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n");
00553       return FALSE;
00554     }
00555     jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00556     if (add_unigram(fp, ndata) == FALSE) return FALSE;
00557     /* read 2-gram data */
00558     if (!strnmatch(buf,"\\2-grams", 8)) {
00559       jlog("Error: ngram_read_arpa: 2-gram not found for additional LR 2-gram\n");
00560       return FALSE;
00561     }
00562     jlog("Stat: ngram_read_arpa: reading 2-gram part...\n");
00563     if (add_bigram(fp, ndata) == FALSE) return FALSE;
00564 
00565 
00566     /* ignore the rest */
00567     if (strnmatch(buf,"\\3-grams", 8)) {
00568       jlog("Warning: forward n-gram contains more than 3-gram, ignored\n");
00569     }
00570 
00571   } else {
00572     /* read n-gram total info */
00573     n = get_total_info(fp, num);
00574     if (n == -1) {              /* error */
00575       return FALSE;
00576     }
00577     jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n);
00578     for(i=0;i<n;i++) {
00579       ndata->d[i].totalnum = num[i];
00580     }
00581     
00582     /* set word num */
00583     if (ndata->d[0].totalnum > MAX_WORD_NUM) {
00584       jlog("Error: ngram_read_arpa: N-gram vocabulary size exceeds the limit (%d)\n", MAX_WORD_NUM);
00585       return FALSE;
00586     }
00587     ndata->max_word_num = ndata->d[0].totalnum;
00588     
00589     /* check if each N-gram allows 24bit and back-off compaction mode */
00590     /* for fast access, 1-gram and 2-gram always use non-compaction mode */
00591     for(i=0;i<n;i++) {
00592       if (i < 2) {              /* not use for 1-gram and 2-gram */
00593         ndata->d[i].is24bit = FALSE;
00594       } else {
00595         /* for 3-gram and later 24 bit mode is preferred,
00596            but should be disabled if number of entries is over 2^24 */
00597         if (ndata->d[i].totalnum >= NNIDMAX) {
00598           jlog("Warning: ngram_read_arpa: more than 24bit %d-gram tuples, use 32bit index\n", NNIDMAX, n+1);
00599           ndata->d[i].is24bit = FALSE;
00600         } else {
00601           ndata->d[i].is24bit = TRUE;
00602         }
00603       }
00604     }
00605     /* disable ct_compaction flag while reading ARPA data */
00606     for(i=0;i<n;i++) {
00607       ndata->d[i].ct_compaction = FALSE;
00608     }
00609     
00610     /* read 1-gram data */
00611     if (!strnmatch(buf,"\\1-grams",8)) {
00612       jlog("Error: ngram_read_arpa: data format error: 1-gram not found\n");
00613       return FALSE;
00614     }
00615     jlog("Stat: ngram_read_arpa: reading 1-gram part...\n");
00616     if (set_unigram(fp, ndata) == FALSE) return FALSE;
00617     
00618     i = 2;
00619     while(i <= n) {
00620       /* read n-gram data in turn */
00621       sprintf(pbuf, "\\%d-grams", i);
00622       if (!strnmatch(buf, pbuf, 8)) {
00623         jlog("Error: ngram_read_arpa: data format error: %d-gram not found\n", i);
00624         return FALSE;
00625       }
00626       jlog("Stat: ngram_read_arpa: reading %d-gram part...\n", i);
00627       if (set_ngram(fp, ndata, i) == FALSE) return FALSE;
00628       i++;
00629     }
00630     /* finished reading file */
00631     if (!strnmatch(buf, "\\end", 4)) {
00632       jlog("Error: ngram_read_arpa: data format error: end marker \"\\end\" not found\n");
00633       return FALSE;
00634     }
00635 
00636     ndata->n = n;
00637 
00638     for(i=1;i<n;i++) {
00639       if (ndata->d[i].bo_wt != NULL) {
00640         /* perform back-off compaction */
00641         if (ngram_compact_context(ndata, i+1) == FALSE) return FALSE;
00642         ndata->d[i].ct_compaction = TRUE;
00643       }
00644     }
00645     
00646     /* set unknown (=OOV) word id */
00647     set_unknown_id(ndata);
00648     
00649   }
00650     
00651 #ifdef CLASS_NGRAM
00652   /* skip in-class word entries (they should be in word dictionary) */
00653   if (getl(buf, sizeof(buf), fp) != NULL) {
00654     if (strnmatch(buf, "\\class", 6)) {
00655       jlog("Stat: ngram_read_arpa: skipping in-class word entries...\n");
00656     }
00657   }
00658 #endif
00659 
00660   bi_prob_func_set(ndata);
00661 
00662   return TRUE;
00663 }
libsent/src/ngram/ngram_read_arpa.c