Julius: libsent/src/ngram/ngram_read

00001 
00054 /*
00055  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00056  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00057  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00058  * All rights reserved
00059  */
00060 
00061 #include <sent/stddefs.h>
00062 #include <sent/ngram2.h>
00063 
00064 static int file_version;  /* bingram format version detected by check_header(): 3, 4 or 5 */
00065 static boolean need_swap; /* TRUE when multi-byte data read from the file must be byte-swapped */
00066 #ifdef WORDS_INT
00067 static boolean need_conv;       /* TRUE when 2-byte word IDs in the file must be widened to WORD_ID while reading */
00068 static boolean words_int_retry = FALSE; /* TRUE while the file is being re-read with word-ID conversion forced */
00069 #endif
00070 
/*
 * Read helpers for use inside functions that return boolean: they call the
 * corresponding reader and make the enclosing function return FALSE as soon
 * as the read fails.  Each expansion is wrapped in do { } while (0) so that
 * it behaves as a single statement (safe next to if/else), and every
 * argument is parenthesized.
 */
#define rdn(A,B,C,D) do { if (rdnfunc((A),(B),(C),(D)) == FALSE) return FALSE; } while (0)
#define rdn_wordid(A,B,C,D) do { if (rdn_wordid_func((A),(B),(C),(D)) == FALSE) return FALSE; } while (0)
00077 
00085 static boolean
00086 rdnfunc(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00087 {
00088   size_t tmp;
00089   if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) {
00090     jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum);
00091     return FALSE;
00092   }
00093   if (need_swap) {
00094     if (unitbyte != 1) {
00095       swap_bytes(buf, unitbyte, unitnum);
00096     }
00097   }
00098   return TRUE;
00099 }
00100 
#ifdef WORDS_INT

/**
 * Read an array of word IDs from a binary N-gram file.
 *
 * When conversion is requested the file is taken to hold unsigned short
 * entries: they are read into the head of @a buf and then widened to
 * WORD_ID in place.
 *
 * @param fp [in] file pointer to read from
 * @param buf [out] buffer large enough for @a unitnum WORD_ID entries
 * @param unitnum [in] number of word IDs to read
 * @param need_conv [in] TRUE to read unsigned short entries and widen them
 *
 * @return TRUE on success, FALSE if the read failed.
 */
static boolean
rdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv)
{
  unsigned short *narrow;
  WORD_ID *wide;
  WORD_ID value;
  int k;

  if (!need_conv) {
    /* entries are already WORD_ID-sized: plain read */
    rdn(fp, buf, sizeof(WORD_ID), unitnum);
    return TRUE;
  }

  /* load the unsigned short entries into the start of the buffer */
  rdn(fp, buf, sizeof(unsigned short), unitnum);

  /* widen from the last entry backwards so that no unread short is
     overwritten by an already widened value */
  narrow = (unsigned short *)buf;
  wide = (WORD_ID *)buf;
  for (k = unitnum; k > 0; k--) {
    value = narrow[k - 1];
    wide[k - 1] = value;
  }
  return TRUE;
}
#endif
00135 
00141 static boolean
00142 check_header(FILE *fp)
00143 {
00144   char buf[BINGRAM_HDSIZE], *p;
00145 
00146   rdn(fp, buf, 1, BINGRAM_HDSIZE);
00147   p = buf;
00148 #ifdef WORDS_INT
00149   need_conv = FALSE;
00150 #endif
00151 
00152   /* version check */
00153   if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00154     /* bingram file made by mkbingram before 3.4.2 */
00155     file_version = 3;
00156     p += strlen(BINGRAM_IDSTR) + 1;
00157   } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00158     /* bingram file made by mkbingram later than 3.5 */
00159     file_version = 4;
00160     p += strlen(BINGRAM_IDSTR_V4) + 1;
00161   } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) {
00162     /* bingram file made by JuliusLib-4 and later */
00163     file_version = 5;
00164     p += strlen(BINGRAM_IDSTR_V5) + 1;
00165   } else {
00166     /* not a bingram file */
00167     jlog("Error: ngram_read_bin: invalid header\n");
00168     return FALSE;
00169   }
00170   /* word size check (for bingram build by mkbingram 3.3p5 and later */
00171   if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00172     p += strlen(BINGRAM_SIZESTR_HEAD);
00173     if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00174       /* word size does not match (int / short) */
00175 #ifdef WORDS_INT
00176       if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {
00177         /* this is 2-byte word ID, will convert while reading */
00178         jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n");
00179         need_conv = TRUE;
00180         p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;
00181       } else {
00182         jlog("Error: ngram_read_bin: unknown word byte size!\n");
00183         return FALSE;
00184       }
00185 #else
00186       if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {
00187         /*** 4bytes to 2bytes not implemented, just terminate here... ***/
00188         jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n");
00189         jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n");
00190         return FALSE;
00191         //p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1;
00192       } else {
00193         jlog("Error: ngram_read_bin: unknown word byte size!\n");
00194         return FALSE;
00195       }
00196 #endif
00197     } else {
00198       p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00199     }
00200 
00201     /* byte order check (v4 (rev.3.5) and later) */
00202     if (file_version >= 4) {
00203       if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00204         jlog("Error: ngram_read_bin: no information for byte order??\n");
00205         return FALSE;
00206       }
00207       p += strlen(BINGRAM_BYTEORDER_HEAD);
00208       if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00209         /* file endian and running endian is different, need swapping */
00210         need_swap = TRUE;
00211       } else {
00212         need_swap = FALSE;
00213       }
00214       p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00215     }
00216   } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */
00217 
00218   /* in case of V3 bingram file, the unit size of word_id and its byte order
00219      cannot be determined from the header.  In that case, we assume 
00220      byteorder to be a BIG ENDIAN.  The word_id unit size (2byte in normal,
00221      or 4byte if bingram generated with mkbingram with --enable-words-int)
00222      will be automagically detected.
00223      */
00224 
00225   if (file_version < 4) {
00226     /* assume input as big endian */
00227 #ifdef WORDS_BIGENDIAN
00228     need_swap = FALSE;
00229 #else
00230     need_swap = TRUE;
00231 #endif
00232   }
00233     
00234   /*jlog("%s",buf);*/
00235 
00236   return TRUE;
00237 }
00238 
00239 static boolean
00240 ngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata)
00241 {
00242   int i,n,len;
00243   char *w, *p;
00244 #ifdef WORDS_INT
00245   unsigned short *buf;
00246 #endif
00247   NGRAM_TUPLE_INFO *t;
00248 
00249   /* read some info extended from version 5 */
00250   rdn(fp, &(ndata->n), sizeof(int), 1);
00251   rdn(fp, &(ndata->dir), sizeof(int), 1);
00252   rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00253 
00254   jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n);
00255 
00256   if (ndata->n > MAX_N) {
00257     jlog("Error: ngram_read_bin_v5: too long N-gram (N=%d)\n", n);
00258     jlog("Error: ngram_read_bin_v5: current maximum length of N-gram is set to %d\n", MAX_N);
00259     jlog("Error: ngram_read_bin_v5: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
00260     return FALSE;
00261   }
00262 
00263   /* read total info and set max_word_num */
00264   for(n=0;n<ndata->n;n++) {
00265     rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00266     if (ndata->d[n].totalnum >= NNIDMAX) {
00267       jlog("Error: ngram_read_bin_v5: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->d[n].totalnum, NNIDMAX);
00268       return FALSE;
00269     }
00270   }
00271   ndata->max_word_num = ndata->d[0].totalnum;
00272 
00273   /* read wname */
00274   rdn(fp, &len, sizeof(int), 1);
00275   w = mymalloc(len);
00276   rdn(fp, w, 1, len);
00277   /* assign... */
00278   ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00279   p = w; i = 0;
00280   while (p < w + len) {
00281     ndata->wname[i++] = p;
00282     while(*p != '\0') p++;
00283     p++;
00284   }
00285   if (i != ndata->max_word_num) {
00286     jlog("Error: ngram_read_bin_v5: wname error??\n");
00287     return FALSE;
00288   }
00289 
00290   /* read N-gram */
00291   for(n=0;n<ndata->n;n++) {
00292     jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1);
00293 
00294     t = &(ndata->d[n]);
00295     
00296     rdn(fp, &(t->is24bit), sizeof(boolean), 1);
00297     rdn(fp, &(t->ct_compaction), sizeof(boolean), 1);
00298     rdn(fp, &(t->bgnlistlen), sizeof(int), 1);
00299     rdn(fp, &(t->context_num), sizeof(int), 1);
00300 
00301     if (n > 0) {
00302       if (t->is24bit) {
00303         t->bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * t->bgnlistlen);
00304         rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00305         t->bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * t->bgnlistlen);
00306         rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00307       } else {
00308         t->bgn = (NNID *)mymalloc(sizeof(NNID) * t->bgnlistlen);
00309         rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00310       }
00311       t->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->bgnlistlen);
00312       rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00313       t->nnid2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * t->totalnum);
00314       rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00315     } else {
00316       t->bgn_upper = NULL;
00317       t->bgn_lower = NULL;
00318       t->bgn = NULL;
00319       t->num = NULL;
00320       t->bgnlistlen = 0;
00321       t->nnid2wid = NULL;
00322     }
00323 
00324     t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00325     rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00326 
00327     rdn(fp, &i, sizeof(int), 1);
00328     if (i == 1) {
00329       t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->context_num);
00330       rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00331     } else {
00332       t->bo_wt = NULL;
00333     }
00334     rdn(fp, &i, sizeof(int), 1);
00335     if (i == 1) {
00336       t->nnid2ctid_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * t->totalnum);
00337       t->nnid2ctid_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * t->totalnum);
00338       rdn(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00339       rdn(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00340     } else {
00341       t->nnid2ctid_upper = NULL;
00342       t->nnid2ctid_lower = NULL;
00343     }
00344   }
00345   rdn(fp, &i, sizeof(int), 1);
00346   if (i == 1) {
00347     ndata->bo_wt_1 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->d[0].context_num);
00348     rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00349   } else {
00350     ndata->bo_wt_1 = NULL;
00351   }
00352   rdn(fp, &i, sizeof(int), 1);
00353   if (i == 1) {
00354     ndata->p_2 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->d[1].totalnum);
00355     rdn(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00356   } else {
00357     ndata->p_2 = NULL;
00358   }
00359 
00360   return TRUE;
00361 }
00362 
00363 static boolean
00364 ngram_read_bin_compat(FILE *fp, NGRAM_INFO *ndata, int *retry_ret)
00365 {
00366   int i,n,len;
00367   char *w, *p;
00368   NNID *n3_bgn;
00369   NNID d, ntmp;
00370 #ifdef WORDS_INT
00371   unsigned short *buf;
00372 #endif
00373   NGRAM_TUPLE_INFO *t, *tt, *ttt;
00374 
00375   /* old binary N-gram assumes these types */
00376   ndata->bigram_index_reversed = TRUE;
00377   ndata->n = 3;
00378   ndata->dir = DIR_RL;
00379 
00380   /* read total info and set max_word_num */
00381   for(n=0;n<ndata->n;n++) {
00382     rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00383     if (file_version >= 4 && ndata->d[n].totalnum >= NNIDMAX) {
00384       jlog("Error: ngram_read_bin_compat: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->d[n].totalnum, NNIDMAX);
00385       return FALSE;
00386     }
00387   }
00388   ndata->max_word_num = ndata->d[0].totalnum;
00389 
00390   if (file_version == 4) {
00391     rdn(fp, &(ndata->d[1].context_num), sizeof(NNID), 1);
00392   }
00393 
00394   for(n=0;n<ndata->n;n++) {
00395     if (n < 2) {
00396       ndata->d[n].is24bit = FALSE;
00397     } else {
00398       if (ndata->d[n].totalnum >= NNIDMAX) {
00399         jlog("Stat: ngram_read_bin_compat: more than %d %d-gram tuples, use old structure\n", NNIDMAX, n+1);
00400         ndata->d[n].is24bit = FALSE;
00401       } else {
00402         ndata->d[n].is24bit = TRUE;
00403       }
00404     }
00405     ndata->d[n].nnid2ctid_upper = NULL;
00406     ndata->d[n].nnid2ctid_lower = NULL;
00407   }
00408   /* always do back-off compaction for 3-gram and up */
00409   /* mark 2-gram and up */
00410   ndata->d[0].ct_compaction = FALSE;
00411   for(n=1;n<ndata->n;n++) {
00412     ndata->d[n].ct_compaction = TRUE;
00413   }
00414 
00415   /* read wname */
00416   rdn(fp, &len, sizeof(int), 1);
00417   w = mymalloc(len);
00418   rdn(fp, w, 1, len);
00419   /* assign... */
00420   ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);
00421   p = w; i = 0;
00422   while (p < w + len) {
00423     ndata->wname[i++] = p;
00424     while(*p != '\0') p++;
00425     p++;
00426   }
00427   if (i != ndata->max_word_num) {
00428     jlog("Error: ngram_read_bin_compat: wname error??\n");
00429     return FALSE;
00430   }
00431 
00432   /* malloc 1-gram */
00433   t = &(ndata->d[0]);
00434   tt = &(ndata->d[1]);
00435   ttt = &(ndata->d[2]);
00436 
00437   t->bgn_upper = NULL;
00438   t->bgn_lower = NULL;
00439   t->bgn = NULL;
00440   t->num = NULL;
00441   t->bgnlistlen = 0;
00442   t->nnid2wid = NULL;
00443   t->nnid2ctid_upper = NULL;
00444   t->nnid2ctid_lower = NULL;
00445 
00446   t->context_num = t->totalnum;
00447   t->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->totalnum);
00448   ndata->bo_wt_1 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->context_num);
00449   t->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * t->context_num);
00450   tt->bgnlistlen = t->context_num;
00451   tt->bgn = (NNID *)mymalloc(sizeof(NNID) * tt->bgnlistlen);
00452   tt->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * tt->bgnlistlen);
00453 
00454   /* read 1-gram */
00455   jlog("stat: ngram_read_bin_compat: reading 1-gram\n");
00456   rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00457   rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), t->context_num);
00458   rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00459   rdn(fp, tt->bgn, sizeof(NNID), tt->bgnlistlen);
00460 #ifdef WORDS_INT
00461   rdn_wordid(fp, tt->num, tt->bgnlistlen, need_conv);
00462 #else
00463   rdn(fp, tt->num, sizeof(WORD_ID), tt->bgnlistlen);
00464 #endif
00465 
00466 #ifdef WORDS_INT
00467   {
00468     /* check if we are wrongly reading word_id=2byte bingram
00469        (if bingram version >= 4, this should not be happen because
00470         header correctly tells the word_id byte size.  This will 
00471         occur only if matches all the conditions below:
00472         - you run Julius with --enable-words-int,
00473         - you use old bingram of version <= 3, and
00474         - you use bingram file converted without --enable-words-int
00475      */
00476     WORD_ID w;
00477     for(w=0;w<ndata->max_word_num;w++) {
00478       if (ndata->d[1].num[w] > ndata->max_word_num) {
00479         if (words_int_retry) {
00480           jlog("Error: ngram_read_bin_compat: retry failed, wrong bingram format\n");
00481           return FALSE;
00482         }
00483         jlog("Warning: ngram_read_bin_compat: incorrect data, may be a 2-byte v3 bingram, retry with conversion\n");
00484         free(ndata->wname[0]);
00485         free(ndata->wname);
00486         free(t->prob);
00487         free(ndata->bo_wt_1);
00488         free(t->bo_wt);
00489         free(tt->bgn);
00490         free(tt->num);
00491         myfrewind(fp);
00492         words_int_retry = TRUE;
00493         *retry_ret = 1;
00494         return FALSE;
00495       }
00496     }
00497   }
00498 #endif
00499 
00500   /* malloc the rest */
00501   tt->nnid2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * tt->totalnum);
00502   tt->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * tt->totalnum);
00503   ndata->p_2 = (LOGPROB *)mymalloc(sizeof(LOGPROB) * tt->totalnum);
00504   if (file_version == 4) {      /* context compaction and 24bit */
00505     tt->nnid2ctid_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * tt->totalnum);
00506     tt->nnid2ctid_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * tt->totalnum);
00507     tt->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * tt->context_num);
00508     ttt->bgnlistlen = tt->context_num;
00509     ttt->bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ttt->bgnlistlen);
00510     ttt->bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ttt->bgnlistlen);
00511     ttt->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ttt->bgnlistlen);
00512   } else {
00513     tt->context_num = tt->totalnum;
00514     tt->bo_wt = (LOGPROB *)mymalloc(sizeof(LOGPROB) * tt->context_num);
00515     ttt->bgnlistlen = tt->context_num;
00516     ttt->num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ttt->bgnlistlen);
00517     if (ttt->is24bit) {
00518       ttt->bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ttt->bgnlistlen);
00519       ttt->bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ttt->bgnlistlen);
00520       n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ttt->bgnlistlen);
00521     } else {
00522       ttt->bgn = (NNID *)mymalloc(sizeof(NNID) * ttt->bgnlistlen);
00523     }
00524   }
00525       
00526   ttt->nnid2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ttt->totalnum);
00527   ttt->prob = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ttt->totalnum);
00528   ttt->bo_wt = NULL;
00529   
00530   /* read 2-gram*/
00531   jlog("Stat: ngram_read_bin_compat: reading 2-gram\n");
00532 #ifdef WORDS_INT
00533   rdn_wordid(fp, tt->nnid2wid, tt->totalnum, need_conv);
00534 #else
00535   rdn(fp, tt->nnid2wid, sizeof(WORD_ID), tt->totalnum);
00536 #endif
00537   rdn(fp, ndata->p_2, sizeof(LOGPROB), tt->totalnum);
00538   rdn(fp, tt->prob, sizeof(LOGPROB), tt->totalnum);
00539   if (file_version == 4) {
00540     rdn(fp, tt->nnid2ctid_upper, sizeof(NNID_UPPER), tt->totalnum);
00541     rdn(fp, tt->nnid2ctid_lower, sizeof(NNID_LOWER), tt->totalnum);
00542     rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
00543     rdn(fp, ttt->bgn_upper, sizeof(NNID_UPPER), ttt->bgnlistlen);
00544     rdn(fp, ttt->bgn_lower, sizeof(NNID_LOWER), ttt->bgnlistlen);
00545 #ifdef WORDS_INT
00546     rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
00547 #else
00548     rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
00549 #endif
00550   } else {
00551     rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num);
00552     if (ttt->is24bit) {
00553       rdn(fp, n3_bgn, sizeof(NNID), ttt->bgnlistlen);
00554       for(d=0;d<ttt->bgnlistlen;d++) {
00555         if (n3_bgn[d] == NNID_INVALID) {
00556           ttt->bgn_lower[d] = 0;
00557           ttt->bgn_upper[d] = NNID_INVALID_UPPER;
00558         } else {
00559           ntmp = n3_bgn[d] & 0xffff;
00560           ttt->bgn_lower[d] = ntmp;
00561           ntmp = n3_bgn[d] >> 16;
00562           ttt->bgn_upper[d] = ntmp;
00563         }
00564       }
00565     } else {
00566       rdn(fp, ttt->bgn, sizeof(NNID), ttt->bgnlistlen);
00567     }
00568 #ifdef WORDS_INT
00569     rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);
00570 #else
00571     rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);
00572 #endif
00573   }
00574 
00575   /* read 3-gram*/
00576   jlog("Stat: ngram_read_bin_compat: reading 3-gram\n");
00577 #ifdef WORDS_INT
00578   rdn_wordid(fp, ttt->nnid2wid, ttt->totalnum, need_conv);
00579 #else
00580   rdn(fp, ttt->nnid2wid, sizeof(WORD_ID), ttt->totalnum);
00581 #endif
00582   rdn(fp, ttt->prob, sizeof(LOGPROB), ttt->totalnum);
00583 
00584   /* compact the 2-gram back-off and 3-gram links */
00585   if (file_version != 4) {
00586     if (ttt->is24bit) {
00587       free(n3_bgn);
00588       if (ngram_compact_context(ndata, 2) == FALSE) return FALSE;
00589     }
00590   }
00591   
00592   return TRUE;
00593 }
00594 
00595 
00604 boolean
00605 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00606 {
00607   int retry;
00608 
00609 #ifdef WORDS_INT
00610   /* reset retry flag */
00611   words_int_retry = FALSE;
00612   /* when retrying, it restarts from here with words_int_retry = TRUE */
00613  ngram_read_bin_start:
00614 #endif
00615   
00616   ndata->from_bin = TRUE;
00617 
00618   /* check initial header */
00619   if (check_header(fp) == FALSE) return FALSE;
00620   
00621 #ifdef WORDS_INT
00622   /* in retry mode, force word_id conversion  */
00623   if (words_int_retry) need_conv = TRUE;
00624 #endif
00625   
00626 #ifdef WORDS_INT
00627   if (need_conv) jlog("Stat: ngram_read_bin: word-id size conversion enabled\n");
00628 #endif
00629 
00630   if (file_version <= 4) {
00631     retry = 0;
00632     if (ngram_read_bin_compat(fp, ndata, &retry) == FALSE) {
00633 #ifdef WORDS_INT
00634       if (retry == 1) {
00635         goto ngram_read_bin_start;
00636       } else {
00637         return FALSE;
00638       }
00639 #else
00640       return FALSE;
00641 #endif
00642     }
00643   } else {
00644     if (ngram_read_bin_v5(fp, ndata) == FALSE) return FALSE;
00645   }
00646 
00647 
00648   /* make word search tree for later lookup */
00649   jlog("Stat: ngram_read_bin: making entry name index\n");
00650   ngram_make_lookup_tree(ndata);
00651 
00652   /* set unknown id */
00653   set_unknown_id(ndata);
00654   
00655   bi_prob_func_set(ndata);
00656 
00657   return TRUE;
00658 }
00659
libsent/src/ngram/ngram_read_bin.c