libsent/src/ngram/ngram_write_bin.c

Go to the documentation of this file.
00001 
00046 /*
00047  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00048  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00049  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00050  * All rights reserved
00051  */
00052 
00053 #include <sent/stddefs.h>
00054 #include <sent/ngram2.h>
00055 
00056 static boolean need_swap; /* checked by wrtfunc(): when TRUE, units wider than one byte are byte-swapped around the write; ngram_write_bin() sets it to FALSE after writing the header */
00057 
/* Write through wrtfunc() and make the ENCLOSING function return FALSE when
 * the write fails.  The body is wrapped in do { } while (0) so that
 * "wrt(...);" behaves as exactly one statement: a bare "if" expansion would
 * capture a following "else" of the caller's own if-statement. */
#define wrt(A,B,C,D) do { if (wrtfunc(A,B,C,D) == FALSE) return FALSE; } while (0)
00059 
/* Running total of bytes written through wrtfunc() since the last reset. */
static int count;

/**
 * Reset the written-byte counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the current value of the written-byte counter.
 *
 * @return the number of bytes accumulated since the last call to
 * reset_wrt_counter().
 */
int
get_wrt_counter(void)
{
  return count;
}
00071      
00072 
00081 static boolean
00082 wrtfunc(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00083 {
00084   if (need_swap == TRUE && unitbyte != 1) {
00085     swap_bytes((char *)buf, unitbyte, unitnum);
00086   }
00087   if (myfwrite(buf, unitbyte, unitnum, fp) < (size_t)unitnum) {
00088     jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
00089     return FALSE;
00090   }
00091   if (need_swap == TRUE && unitbyte != 1) {
00092     swap_bytes((char *)buf, unitbyte, unitnum);
00093   }
00094   count += unitbyte * unitnum;
00095   return TRUE;
00096 }
00097 
00106 static boolean
00107 write_header(FILE *fp, char *str)
00108 {
00109   char buf[BINGRAM_HDSIZE];
00110   int i, totallen;
00111 
00112   for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00113   totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00114   if (totallen >= BINGRAM_HDSIZE) {
00115     jlog("Warning: write_bingram: header too long, last will be truncated\n");
00116     i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00117     str[i] = '\0';
00118   }
00119   sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00120   wrt(fp, buf, 1, BINGRAM_HDSIZE);
00121 
00122   return TRUE;
00123 }
00124 
00134 boolean
00135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00136 {
00137   int i,n,len;
00138   NGRAM_TUPLE_INFO *t;
00139 
00140   reset_wrt_counter();
00141 
00142   /* write initial header */
00143   if (write_header(fp, headerstr) == FALSE) return FALSE;
00144 
00145   /* swap not needed any more */
00146   need_swap = FALSE;
00147 
00148   /* write some header info */
00149   wrt(fp, &(ndata->n), sizeof(int), 1);
00150   wrt(fp, &(ndata->dir), sizeof(int), 1);
00151   wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00152 
00153   /* write total info */
00154   for(n=0;n<ndata->n;n++) {
00155     wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00156     /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
00157   }
00158 
00159   /* unk_*, isopen, max_word_num are set after read, so need not save */
00160 
00161   /* write wname */
00162   len = 0;
00163   for(i=0;i<ndata->max_word_num;i++) {
00164     len += strlen(ndata->wname[i]) + 1;
00165   }
00166   wrt(fp, &len, sizeof(int), 1);
00167   for(i=0;i<ndata->max_word_num;i++) {
00168     wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
00169   }
00170 
00171   /* write N-gram */
00172   for(n=0;n<ndata->n;n++) {
00173     t = &(ndata->d[n]);
00174 
00175     wrt(fp, &(t->is24bit), sizeof(boolean), 1);
00176     wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
00177     wrt(fp, &(t->bgnlistlen), sizeof(int), 1);
00178     wrt(fp, &(t->context_num), sizeof(int), 1);
00179     if (n > 0) {
00180       if (t->is24bit) {
00181         wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00182         wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00183       } else {
00184         wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00185       }
00186       wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00187       wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00188     }
00189     wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00190     if (t->bo_wt) {
00191       i = 1;
00192       wrt(fp, &i, sizeof(int), 1);
00193       wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00194     } else {
00195       i = 0;
00196       wrt(fp, &i, sizeof(int), 1);
00197     }
00198     if (t->nnid2ctid_upper) {
00199       i = 1;
00200       wrt(fp, &i, sizeof(int), 1);
00201       wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00202       wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00203     } else {
00204       i = 0;
00205       wrt(fp, &i, sizeof(int), 1);
00206     }
00207 
00208   }
00209 
00210   /* write additional LR 2-gram */
00211   if (ndata->bo_wt_1) {
00212     i = 1;
00213     wrt(fp, &i, sizeof(int), 1);
00214     wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00215   } else {
00216     i = 0;
00217     wrt(fp, &i, sizeof(int), 1);
00218   }
00219   if (ndata->p_2) {
00220     i = 1;
00221     wrt(fp, &i, sizeof(int), 1);
00222     wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00223   } else {
00224     i = 0;
00225     wrt(fp, &i, sizeof(int), 1);
00226   }
00227 
00228   len = get_wrt_counter();
00229   jlog("Stat: ngram_write_bin: wrote %d bytes (%.1f MB)\n", len, len / 1048576.0);
00230   return TRUE;
00231 }

Generated on Tue Dec 18 15:59:56 2007 for Julius by  doxygen 1.5.4