libsent/src/ngram/ngram_read_bin.c

Go to the documentation of this file.
00001 
00053 /*
00054  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00055  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00056  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00057  * All rights reserved
00058  */
00059 
00060 #include <sent/stddefs.h>
00061 #include <sent/ngram2.h>
00062 
00063 static int file_version;  /* bingram format version detected by check_header(): 3 or 4 */
00064 static boolean need_swap; /* TRUE when rdn() must byte-swap every multi-byte unit it reads */
00065 #ifdef WORDS_INT
00066 static boolean need_conv;       /* TRUE when the file stores 2-byte word IDs that must be widened to WORD_ID */
00067 static boolean words_int_retry = FALSE; /* TRUE while ngram_read_bin() re-reads the file with word ID conversion forced on */
00068 #endif
00069 
00078 static void
00079 rdn(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00080 {
00081   size_t tmp;
00082   if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) {
00083     perror("ngram_read_bin");
00084     j_error("read failed\n");
00085   }
00086   if (need_swap) {
00087     if (unitbyte != 1) {
00088       swap_bytes(buf, unitbyte, unitnum);
00089     }
00090   }
00091 
00092 }
00093 
00094 #ifdef WORDS_INT
00095 
00103 static void
00104 rdn_wordid(FILE *fp, void *buf, int unitnum, boolean need_conv)
00105 {
00106   int i;
00107   unsigned short *s;
00108   WORD_ID *t;
00109   WORD_ID d;
00110 
00111   if (need_conv) {
00112     /* read unsigned short units */
00113     rdn(fp, buf, sizeof(unsigned short), unitnum);
00114     /* convert them to WORD_ID (integer) */
00115     for(i=unitnum-1;i>=0;i--) {
00116       s = (unsigned short *)buf + i;
00117       t = (WORD_ID *)buf + i;
00118       d = *s;
00119       *t = d;
00120     }
00121   } else {
00122     /* read as usual */
00123     rdn(fp, buf, sizeof(WORD_ID), unitnum);
00124   }
00125 }
00126 #endif
00127 
00133 static void
00134 check_header(FILE *fp)
00135 {
00136   char buf[BINGRAM_HDSIZE], *p;
00137   rdn(fp, buf, 1, BINGRAM_HDSIZE);
00138   
00139   p = buf;
00140 #ifdef WORDS_INT
00141   need_conv = FALSE;
00142 #endif
00143 
00144   /* version check */
00145   if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00146     /* bingram file made by mkbingram before 3.4.2 */
00147     file_version = 3;
00148     p += strlen(BINGRAM_IDSTR) + 1;
00149   } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00150     /* bingram file made by mkbingram later than 3.5 */
00151     file_version = 4;
00152     p += strlen(BINGRAM_IDSTR_V4) + 1;
00153   } else {
00154     /* not a bingram file */
00155     j_printerr("Error: invalid header, you probably use an old bingram\n");
00156     j_error("Error: if so, please re-make with newer mkbingram that comes with Julius-2.0 or later\n");
00157   }
00158   /* word size check (for bingram build by mkbingram 3.3p5 and later */
00159   if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00160     p += strlen(BINGRAM_SIZESTR_HEAD);
00161     if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00162       /* word size does not match (int / short) */
00163 #ifdef WORDS_INT
00164       if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {
00165         /* this is 2-byte word ID, will convert while reading */
00166         j_printerr("\nWarning: 2-bytes bingram, converting to 4 bytes\n");
00167         need_conv = TRUE;
00168         p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;
00169       } else {
00170         j_error("\nError: unknown word byte size!\n");
00171       }
00172 #else
00173       if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {
00174         /*** 4bytes to 2bytes not implemented, just terminate here... ***/
00175         j_printerr("\nError: cannot handle 4-bytes bingram\n");
00176         j_error("Error: please use Julius compiled with --enable-words-int\n");
00177         //p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1;
00178       } else {
00179         j_error("\nError: unknown word byte size!\n");
00180       }
00181 #endif
00182     } else {
00183       p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00184     }
00185 
00186     /* byte order check (v4 (rev.3.5) and later) */
00187     if (file_version == 4) {
00188       if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00189         j_error("\nError: no information for byte order??\n");
00190       }
00191       p += strlen(BINGRAM_BYTEORDER_HEAD);
00192       if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00193         /* file endian and running endian is different, need swapping */
00194         need_swap = TRUE;
00195       } else {
00196         need_swap = FALSE;
00197       }
00198       p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00199     }
00200   } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */
00201 
00202   /* in case of V3 bingram file, the unit size of word_id and its byte order
00203      cannot be determined from the header.  In that case, we assume 
00204      byteorder to be a BIG ENDIAN.  The word_id unit size (2byte in normal,
00205      or 4byte if bingram generated with mkbingram with --enable-words-int)
00206      will be automagically detected.
00207      */
00208 
00209   if (file_version != 4) {
00210     /* assume input as big endian */
00211 #ifdef WORDS_BIGENDIAN
00212     need_swap = FALSE;
00213 #else
00214     need_swap = TRUE;
00215 #endif
00216   }
00217     
00218   /*j_printf("%s",buf);*/
00219 }
00220 
00229 boolean
00230 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00231 {
00232   int i,n,len;
00233   char *w, *p;
00234   NNID *n3_bgn;
00235   NNID d, ntmp;
00236 #ifdef WORDS_INT
00237   unsigned short *buf;
00238 #endif
00239 
00240 #ifdef WORDS_INT
00241   /* reset retry flag */
00242   words_int_retry = FALSE;
00243 
00244   /* when retrying, it restarts from here with words_int_retry = TRUE */
00245  ngram_read_bin_start:
00246 
00247 #endif
00248   
00249   ndata->from_bin = TRUE;
00250 
00251   /* check initial header */
00252   check_header(fp);
00253 
00254 #ifdef WORDS_INT
00255   /* in retry mode, force word_id conversion  */
00256   if (words_int_retry) need_conv = TRUE;
00257 #endif
00258   
00259 #ifdef WORDS_INT
00260   if (need_conv) j_printerr("(wordid conv)..");
00261 #endif
00262 
00263   /* read total info and set max_word_num */
00264   for(n=0;n<MAX_N;n++) {
00265     rdn(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00266     if (file_version == 4 && ndata->ngram_num[n] >= NNIDMAX) {
00267       j_error("Error: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->ngram_num[n], NNIDMAX);
00268     }
00269   }
00270   ndata->max_word_num = ndata->ngram_num[0];
00271   if (file_version == 4) rdn(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00272 
00273   /* version requirement check */
00274   switch(file_version) {
00275   case 4:
00276     ndata->version = 4;
00277     break;
00278   case 3:
00279     if (ndata->ngram_num[2] >= NNIDMAX) {
00280       j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);
00281       ndata->version = 3;
00282     } else {
00283       ndata->version = 4;       /* will be converted to v4 later */
00284     }
00285     break;
00286   }
00287 
00288   /* read wname */
00289   rdn(fp, &len, sizeof(int), 1);
00290   w = mymalloc(len);
00291   rdn(fp, w, 1, len);
00292   /* assign... */
00293   ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
00294   p = w; i = 0;
00295   while (p < w + len) {
00296     ndata->wname[i++] = p;
00297     while(*p != '\0') p++;
00298     p++;
00299   }
00300   if (i != ndata->ngram_num[0]) {
00301     j_error("wname error??\n");
00302   }
00303   /* malloc 1-gram */
00304   ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00305   ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00306   ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00307   ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[0]);
00308   ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[0]);
00309 
00310   /* read 1-gram */
00311   j_printerr("1-gram.");
00312   rdn(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00313   j_printerr(".");
00314   rdn(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00315   j_printerr(".");
00316   rdn(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00317   j_printerr(".");
00318   rdn(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00319   j_printerr(".");
00320 #ifdef WORDS_INT
00321   rdn_wordid(fp, ndata->n2_num, ndata->ngram_num[0], need_conv);
00322 #else
00323   rdn(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00324 #endif
00325 
00326 #ifdef WORDS_INT
00327   {
00328     /* check if we are wrongly reading word_id=2byte bingram
00329        (if bingram version >= 4, this should not be happen because
00330         header correctly tells the word_id byte size.  This will 
00331         occur only if matches all the conditions below:
00332         - you run Julius with --enable-words-int,
00333         - you use old bingram of version <= 3, and
00334         - you use bingram file converted without --enable-words-int
00335      */
00336     WORD_ID w;
00337     for(w=0;w<ndata->ngram_num[0];w++) {
00338       if (ndata->n2_num[w] > ndata->ngram_num[0]) {
00339         if (words_int_retry) {
00340           j_error("\nError: retry failed, wrong bingram format\n");
00341         }
00342         j_printerr("\nWarning: incorrect data, may be a 2-byte v3 bingram, retry with converion\n");
00343         free(ndata->wname[0]);
00344         free(ndata->wname);
00345         free(ndata->p);
00346         free(ndata->bo_wt_lr);
00347         free(ndata->bo_wt_rl);
00348         free(ndata->n2_bgn);
00349         free(ndata->n2_num);
00350         myfrewind(fp);
00351         words_int_retry = TRUE;
00352         goto ngram_read_bin_start;
00353       }
00354     }
00355   }
00356 #endif
00357 
00358   /* malloc the rest */
00359   ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00360   ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00361   ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00362   if (file_version == 4) {
00363     ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00364     ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00365     ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->bigram_bo_num);
00366     ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->bigram_bo_num);
00367     ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->bigram_bo_num);
00368     ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->bigram_bo_num);
00369   } else {
00370     ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00371     ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00372     if (ndata->version == 4) {
00373       ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00374       ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00375       n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00376     } else {
00377       ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00378     }
00379   }
00380       
00381   ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[2]);
00382   ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[2]);
00383   
00384   /* read 2-gram*/
00385   j_printerr("2-gram.");
00386 #ifdef WORDS_INT
00387   rdn_wordid(fp, ndata->n2tonid, ndata->ngram_num[1], need_conv);
00388 #else
00389   rdn(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00390 #endif
00391   j_printerr(".");
00392   rdn(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00393   j_printerr(".");
00394   rdn(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00395   j_printerr(".");
00396   if (file_version == 4) {
00397     rdn(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00398     rdn(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00399     j_printerr(".");
00400     rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00401     j_printerr(".");
00402     rdn(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00403     rdn(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00404     j_printerr(".");
00405 #ifdef WORDS_INT
00406     rdn_wordid(fp, ndata->n3_num, ndata->bigram_bo_num, need_conv);
00407 #else
00408     rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00409 #endif
00410   } else {
00411     rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00412     j_printerr(".");
00413     if (ndata->version == 4) {
00414       rdn(fp, n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00415       for(d=0;d<ndata->ngram_num[1];d++) {
00416         if (n3_bgn[d] == NNID_INVALID) {
00417           ndata->n3_bgn_lower[d] = 0;
00418           ndata->n3_bgn_upper[d] = NNID_INVALID_UPPER;
00419         } else {
00420           ntmp = n3_bgn[d] & 0xffff;
00421           ndata->n3_bgn_lower[d] = ntmp;
00422           ntmp = n3_bgn[d] >> 16;
00423           ndata->n3_bgn_upper[d] = ntmp;
00424         }
00425       }
00426     } else {
00427       rdn(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00428     }
00429     j_printerr(".");
00430 #ifdef WORDS_INT
00431     rdn_wordid(fp, ndata->n3_num, ndata->ngram_num[1], need_conv);
00432 #else
00433     rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00434 #endif
00435   }
00436 
00437   /* read 3-gram*/
00438   j_printerr("3-gram.");
00439 #ifdef WORDS_INT
00440   rdn_wordid(fp, ndata->n3tonid, ndata->ngram_num[2], need_conv);
00441 #else
00442   rdn(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00443 #endif
00444   j_printerr(".");
00445   rdn(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00446 
00447   /* make word search tree for later lookup */
00448   j_printerr("indexing...");
00449   ngram_make_lookup_tree(ndata);
00450 
00451   /* compact the 2-gram back-off and 3-gram links */
00452   if (file_version != 4 && ndata->version == 4) {
00453     free(n3_bgn);
00454     ngram_compact_bigram_context(ndata);
00455   }
00456   
00457   /* set unknown id */
00458   set_unknown_id(ndata);
00459   
00460   return TRUE;
00461 }

Generated on Tue Dec 26 16:16:33 2006 for Julius by  doxygen 1.5.0