00001
00046
00047
00048
00049
00050
00051
00052
00053 #include <sent/stddefs.h>
00054 #include <sent/ngram2.h>
00055
00056 static boolean need_swap;
00057
00058 #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE
00059
/* Running total of bytes written; wrtfunc() adds to it after each
 * successful write. */
static unsigned int count;

/* Clear the written-byte total back to zero. */
void
reset_wrt_counter()
{
  count = 0U;
}

/* Report the number of bytes accumulated since the last reset. */
static unsigned int
get_wrt_counter()
{
  const unsigned int total = count;

  return total;
}
00071
00072
00081 static boolean
00082 wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
00083 {
00084 if (need_swap == TRUE && unitbyte != 1) {
00085 swap_bytes((char *)buf, unitbyte, unitnum);
00086 }
00087 if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
00088 jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
00089 return FALSE;
00090 }
00091 if (need_swap == TRUE && unitbyte != 1) {
00092 swap_bytes((char *)buf, unitbyte, unitnum);
00093 }
00094 count += unitbyte * unitnum;
00095 return TRUE;
00096 }
00097
00106 static boolean
00107 write_header(FILE *fp, char *str)
00108 {
00109 char buf[BINGRAM_HDSIZE];
00110 int i, totallen;
00111
00112 for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00113 totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00114 if (totallen >= BINGRAM_HDSIZE) {
00115 jlog("Warning: write_bingram: header too long, last will be truncated\n");
00116 i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00117 str[i] = '\0';
00118 }
00119 sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00120 wrt(fp, buf, 1, BINGRAM_HDSIZE);
00121
00122 return TRUE;
00123 }
00124
00134 boolean
00135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00136 {
00137 int i,n;
00138 unsigned int len;
00139 int wlen;
00140 NGRAM_TUPLE_INFO *t;
00141
00142 reset_wrt_counter();
00143
00144
00145 if (write_header(fp, headerstr) == FALSE) return FALSE;
00146
00147
00148 need_swap = FALSE;
00149
00150
00151 wrt(fp, &(ndata->n), sizeof(int), 1);
00152 wrt(fp, &(ndata->dir), sizeof(int), 1);
00153 wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00154
00155
00156 for(n=0;n<ndata->n;n++) {
00157 wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00158
00159 }
00160
00161
00162
00163
00164 wlen = 0;
00165 for(i=0;i<ndata->max_word_num;i++) {
00166 wlen += strlen(ndata->wname[i]) + 1;
00167 }
00168 wrt(fp, &wlen, sizeof(int), 1);
00169 for(i=0;i<ndata->max_word_num;i++) {
00170 wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1);
00171 }
00172
00173
00174 for(n=0;n<ndata->n;n++) {
00175 t = &(ndata->d[n]);
00176
00177 wrt(fp, &(t->is24bit), sizeof(boolean), 1);
00178 wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
00179 wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
00180 wrt(fp, &(t->context_num), sizeof(NNID), 1);
00181 if (n > 0) {
00182 if (t->is24bit) {
00183 wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00184 wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00185 } else {
00186 wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00187 }
00188 wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00189 wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00190 }
00191 wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00192 if (t->bo_wt) {
00193 i = 1;
00194 wrt(fp, &i, sizeof(int), 1);
00195 wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00196 } else {
00197 i = 0;
00198 wrt(fp, &i, sizeof(int), 1);
00199 }
00200 if (t->nnid2ctid_upper) {
00201 i = 1;
00202 wrt(fp, &i, sizeof(int), 1);
00203 wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00204 wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00205 } else {
00206 i = 0;
00207 wrt(fp, &i, sizeof(int), 1);
00208 }
00209
00210 }
00211
00212
00213 if (ndata->bo_wt_1) {
00214 i = 1;
00215 wrt(fp, &i, sizeof(int), 1);
00216 wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00217 } else {
00218 i = 0;
00219 wrt(fp, &i, sizeof(int), 1);
00220 }
00221 if (ndata->p_2) {
00222 i = 1;
00223 wrt(fp, &i, sizeof(int), 1);
00224 wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00225 } else {
00226 i = 0;
00227 wrt(fp, &i, sizeof(int), 1);
00228 }
00229
00230 len = get_wrt_counter();
00231 jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0);
00232 return TRUE;
00233 }