libsent/src/ngram/ngram_write_bin.c

Go to the documentation of this file.
00001 
00045 /*
00046  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00047  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00048  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00049  * All rights reserved
00050  */
00051 
00052 #include <sent/stddefs.h>
00053 #include <sent/ngram2.h>
00054 
00055 static boolean need_swap; 
00056 
00065 static void
00066 wrt(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00067 {
00068   if (need_swap == TRUE && unitbyte != 1) {
00069     swap_bytes((char *)buf, unitbyte, unitnum);
00070   }
00071   if (myfwrite(buf, unitbyte, unitnum, fp) < (size_t)unitnum) {
00072     perror("write_ngram_bin: wrt");
00073     j_error("write failed\n");
00074   }
00075   if (need_swap == TRUE && unitbyte != 1) {
00076     swap_bytes((char *)buf, unitbyte, unitnum);
00077   }
00078 }
00079 
00088 static void
00089 write_header(FILE *fp, char *str, int version)
00090 {
00091   char buf[BINGRAM_HDSIZE];
00092   int i, totallen;
00093   for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00094   switch(version) {
00095   case 4:
00096     totallen = strlen(BINGRAM_IDSTR_V4) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00097     break;
00098   case 3:
00099     totallen = strlen(BINGRAM_IDSTR) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(str);
00100     break;
00101   }
00102   if (totallen >= BINGRAM_HDSIZE) {
00103     j_printerr("Warning: user header too long, last will be truncated\n");
00104     i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00105     str[i] = '\0';
00106   }
00107   switch(version) {
00108   case 4:
00109     sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V4, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00110     break;
00111   case 3:
00112     sprintf(buf, "%s\n%s%s\n%s", BINGRAM_IDSTR, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, str);
00113     break;
00114   }
00115   wrt(fp, buf, 1, BINGRAM_HDSIZE);
00116 }
00117 
00127 boolean
00128 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00129 {
00130   int i,n,len;
00131 
00132   /* write initial header */
00133   write_header(fp, headerstr, ndata->version);
00134 
00135   /* set swap requirement */
00136   if (ndata->version == 4) {
00137     need_swap = FALSE;
00138   } else {
00139 #ifdef WORDS_BIGENDIAN
00140     need_swap = FALSE;
00141 #else
00142     need_swap = TRUE;
00143 #endif
00144   } 
00145 
00146   /* write total info */
00147   for(n=0;n<MAX_N;n++) {
00148     wrt(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00149     /*j_printf("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
00150   }
00151   if (ndata->version == 4) {
00152     wrt(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00153   }
00154   j_printf("wrote total info\n");
00155   /* unk_*, isopen, max_word_num are set after read, so need not save */
00156 
00157   /* write wname */
00158   len = 0;
00159   for(i=0;i<ndata->ngram_num[0];i++) {
00160     len += strlen(ndata->wname[i]) + 1;
00161   }
00162   wrt(fp, &len, sizeof(int), 1);
00163   for(i=0;i<ndata->ngram_num[0];i++) {
00164     wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
00165   }
00166   j_printf("wrote wnames (%d bytes)\n", len + sizeof(int));
00167   
00168   /* write 1-gram */
00169   wrt(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00170   wrt(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00171   wrt(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00172   wrt(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00173   wrt(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00174   j_printf("wrote 1-gram (%d KB)\n",
00175            ((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)) * ndata->ngram_num[0]) / 1024);
00176   
00177   /* write 2-gram*/
00178   wrt(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00179   wrt(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00180   wrt(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00181   switch (ndata->version) {
00182   case 4:
00183     wrt(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00184     wrt(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00185     wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00186     wrt(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00187     wrt(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00188     wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00189     j_printf("wrote 2-gram (%d KB)\n",
00190              ((sizeof(LOGPROB)*2 + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->ngram_num[1] + (sizeof(LOGPROB) + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->bigram_bo_num) / 1024);
00191     break;
00192   case 3:
00193     wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00194     wrt(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00195     wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00196     j_printf("wrote 2-gram (%d KB)\n",
00197              ((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)*2) * ndata->ngram_num[1]) / 1024);
00198     break;
00199   }
00200   
00201 
00202   /* write 3-gram*/
00203   wrt(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00204   wrt(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00205   j_printf("wrote 3-gram (%d KB)\n",
00206            ((sizeof(LOGPROB) + sizeof(WORD_ID)) * ndata->ngram_num[2]) / 1024);
00207 
00208   return TRUE;
00209 }

Generated on Tue Dec 26 16:16:33 2006 for Julius by  doxygen 1.5.0