Main Page | Modules | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

ngram_read_bin.c

Go to the documentation of this file.
00001 
00053 /*
00054  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00055  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00056  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology, Nagoya Institute of Technology
00057  * All rights reserved
00058  */
00059 
00060 #include <sent/stddefs.h>
00061 #include <sent/ngram2.h>
00062 
00063 static int file_version;  /* bingram format version of the file being read (3 or 4); set by check_header() */
00064 static boolean need_swap; /* TRUE when multi-byte items read by rdn() must be byte-swapped; set by check_header() */
00065 
00074 static void
00075 rdn(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00076 {
00077   size_t tmp;
00078   if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) {
00079     perror("ngram_read_bin");
00080     j_error("read failed\n");
00081   }
00082   if (need_swap) {
00083     if (unitbyte != 1) {
00084       swap_bytes(buf, unitbyte, unitnum);
00085     }
00086   }
00087 
00088 }
00089 
00095 static void
00096 check_header(FILE *fp)
00097 {
00098   char buf[BINGRAM_HDSIZE], *p;
00099   rdn(fp, buf, 1, BINGRAM_HDSIZE);
00100   
00101   p = buf;
00102   /* version check */
00103   if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00104     /* bingram file made by mkbingram before 3.4.2 */
00105     file_version = 3;
00106     p += strlen(BINGRAM_IDSTR) + 1;
00107   } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00108     /* bingram file made by mkbingram later than 3.5 */
00109     file_version = 4;
00110     p += strlen(BINGRAM_IDSTR_V4) + 1;
00111   } else {
00112     /* not a bingram file */
00113     j_printerr("Error: invalid header, you probably use old bingram\n");
00114     j_error("Error: if so, please re-make with newer mkbingram that comes with Julius-2.0 or later\n");
00115   }
00116   /* word size check (for bingram build by mkbingram 3.3p5 and later */
00117   if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00118     p += strlen(BINGRAM_SIZESTR_HEAD);
00119     if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00120       /* word size does not match (int / short) */
00121       /*** it should be converted, but just terminate here... ***/
00122       j_error("Error: word size does not match in bingram\n");
00123     }
00124     p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00125 
00126     /* byte order check (v4 (rev.3.5) and later) */
00127     if (file_version == 4) {
00128       if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00129         j_error("Error: no information for byte order in v4 format??\n");
00130       }
00131       p += strlen(BINGRAM_BYTEORDER_HEAD);
00132       if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00133         /* file endian and running endian is different, need swapping */
00134         need_swap = TRUE;
00135       } else {
00136         need_swap = FALSE;
00137       }
00138       p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00139     }
00140   } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */
00141 
00142   if (file_version != 4) {
00143     /* assume input as big endian */
00144 #ifdef WORDS_BIGENDIAN
00145     need_swap = FALSE;
00146 #else
00147     need_swap = TRUE;
00148 #endif
00149   }
00150     
00151   /*j_printf("%s",buf);*/
00152 }
00153 
00162 boolean
00163 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00164 {
00165   int i,n,len;
00166   char *w, *p;
00167   NNID *n3_bgn;
00168   NNID d, ntmp;
00169   
00170   ndata->from_bin = TRUE;
00171 
00172   /* check initial header */
00173   check_header(fp);
00174 
00175   /* read total info and set max_word_num */
00176   for(n=0;n<MAX_N;n++) {
00177     rdn(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00178     if (file_version == 4 && ndata->ngram_num[n] >= NNIDMAX) {
00179       j_error("Error: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->ngram_num[n], NNIDMAX);
00180     }
00181   }
00182   ndata->max_word_num = ndata->ngram_num[0];
00183   if (file_version == 4) rdn(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00184 
00185   /* version requirement check */
00186   switch(file_version) {
00187   case 4:
00188     ndata->version = 4;
00189     break;
00190   case 3:
00191     if (ndata->ngram_num[2] >= NNIDMAX) {
00192       j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);
00193       ndata->version = 3;
00194     } else {
00195       ndata->version = 4;       /* will be converted to v4 later */
00196     }
00197     break;
00198   }
00199 
00200   /* read wname */
00201   rdn(fp, &len, sizeof(int), 1);
00202   w = mymalloc(len);
00203   rdn(fp, w, 1, len);
00204   /* assign... */
00205   ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
00206   p = w; i = 0;
00207   while (p < w + len) {
00208     ndata->wname[i++] = p;
00209     while(*p != '\0') p++;
00210     p++;
00211   }
00212   if (i != ndata->ngram_num[0]) {
00213     j_error("wname error??\n");
00214   }
00215   /* malloc all */
00216   ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00217   ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00218   ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00219   ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[0]);
00220   ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[0]);
00221   ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00222   ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00223   ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00224   if (file_version == 4) {
00225     ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00226     ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00227     ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->bigram_bo_num);
00228     ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->bigram_bo_num);
00229     ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->bigram_bo_num);
00230     ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->bigram_bo_num);
00231   } else {
00232     ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00233     ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00234     if (ndata->version == 4) {
00235       ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00236       ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00237       n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00238     } else {
00239       ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00240     }
00241   }
00242       
00243   ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[2]);
00244   ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[2]);
00245   
00246   /* read 1-gram */
00247   j_printerr("1-gram.");
00248   rdn(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00249   j_printerr(".");
00250   rdn(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00251   j_printerr(".");
00252   rdn(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00253   j_printerr(".");
00254   rdn(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00255   j_printerr(".");
00256   rdn(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00257   
00258   /* read 2-gram*/
00259   j_printerr("2-gram.");
00260   rdn(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00261   j_printerr(".");
00262   rdn(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00263   j_printerr(".");
00264   rdn(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00265   j_printerr(".");
00266   if (file_version == 4) {
00267     rdn(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00268     rdn(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00269     j_printerr(".");
00270     rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00271     j_printerr(".");
00272     rdn(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00273     rdn(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00274     j_printerr(".");
00275     rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00276   } else {
00277     rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00278     j_printerr(".");
00279     if (ndata->version == 4) {
00280       rdn(fp, n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00281       for(d=0;d<ndata->ngram_num[1];d++) {
00282         if (n3_bgn[d] == NNID_INVALID) {
00283           ndata->n3_bgn_lower[d] = 0;
00284           ndata->n3_bgn_upper[d] = NNID_INVALID_UPPER;
00285         } else {
00286           ntmp = n3_bgn[d] & 0xffff;
00287           ndata->n3_bgn_lower[d] = ntmp;
00288           ntmp = n3_bgn[d] >> 16;
00289           ndata->n3_bgn_upper[d] = ntmp;
00290         }
00291       }
00292     } else {
00293       rdn(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00294     }
00295     j_printerr(".");
00296     rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00297   }
00298 
00299   /* read 3-gram*/
00300   j_printerr("3-gram.");
00301   rdn(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00302   j_printerr(".");
00303   rdn(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00304 
00305   /* make word search tree for later lookup */
00306   j_printerr("indexing...");
00307   ngram_make_lookup_tree(ndata);
00308 
00309   /* compact the 2-gram back-off and 3-gram links */
00310   if (file_version != 4 && ndata->version == 4) {
00311     free(n3_bgn);
00312     ngram_compact_bigram_context(ndata);
00313   }
00314   
00315   /* set unknown id */
00316   set_unknown_id(ndata);
00317   
00318   return TRUE;
00319 }

Generated on Tue Mar 28 16:01:39 2006 for Julius by  doxygen 1.4.2