00001
00046
00047
00048
00049
00050
00051
00052
00053 #include <sent/stddefs.h>
00054 #include <sent/ngram2.h>
00055
00056 static boolean need_swap;
00057
00058 #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE
00059
/* Number of bytes accumulated by wrtfunc() since the last reset.
   NOTE(review): this is an int, so it cannot represent outputs larger than
   INT_MAX bytes -- confirm that is acceptable for the expected file sizes. */
static int count;

/**
 * Reset the written-byte counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the current value of the written-byte counter.
 *
 * @return the number of bytes counted since the last reset_wrt_counter().
 */
int
get_wrt_counter(void)
{
  return count;
}
00071
00072
00081 static boolean
00082 wrtfunc(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00083 {
00084 if (need_swap == TRUE && unitbyte != 1) {
00085 swap_bytes((char *)buf, unitbyte, unitnum);
00086 }
00087 if (myfwrite(buf, unitbyte, unitnum, fp) < (size_t)unitnum) {
00088 jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
00089 return FALSE;
00090 }
00091 if (need_swap == TRUE && unitbyte != 1) {
00092 swap_bytes((char *)buf, unitbyte, unitnum);
00093 }
00094 count += unitbyte * unitnum;
00095 return TRUE;
00096 }
00097
00106 static boolean
00107 write_header(FILE *fp, char *str)
00108 {
00109 char buf[BINGRAM_HDSIZE];
00110 int i, totallen;
00111
00112 for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00113 totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00114 if (totallen >= BINGRAM_HDSIZE) {
00115 jlog("Warning: write_bingram: header too long, last will be truncated\n");
00116 i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00117 str[i] = '\0';
00118 }
00119 sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00120 wrt(fp, buf, 1, BINGRAM_HDSIZE);
00121
00122 return TRUE;
00123 }
00124
00134 boolean
00135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00136 {
00137 int i,n,len;
00138 NGRAM_TUPLE_INFO *t;
00139
00140 reset_wrt_counter();
00141
00142
00143 if (write_header(fp, headerstr) == FALSE) return FALSE;
00144
00145
00146 need_swap = FALSE;
00147
00148
00149 wrt(fp, &(ndata->n), sizeof(int), 1);
00150 wrt(fp, &(ndata->dir), sizeof(int), 1);
00151 wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00152
00153
00154 for(n=0;n<ndata->n;n++) {
00155 wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00156
00157 }
00158
00159
00160
00161
00162 len = 0;
00163 for(i=0;i<ndata->max_word_num;i++) {
00164 len += strlen(ndata->wname[i]) + 1;
00165 }
00166 wrt(fp, &len, sizeof(int), 1);
00167 for(i=0;i<ndata->max_word_num;i++) {
00168 wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1);
00169 }
00170
00171
00172 for(n=0;n<ndata->n;n++) {
00173 t = &(ndata->d[n]);
00174
00175 wrt(fp, &(t->is24bit), sizeof(boolean), 1);
00176 wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
00177 wrt(fp, &(t->bgnlistlen), sizeof(int), 1);
00178 wrt(fp, &(t->context_num), sizeof(int), 1);
00179 if (n > 0) {
00180 if (t->is24bit) {
00181 wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00182 wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00183 } else {
00184 wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00185 }
00186 wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00187 wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00188 }
00189 wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00190 if (t->bo_wt) {
00191 i = 1;
00192 wrt(fp, &i, sizeof(int), 1);
00193 wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00194 } else {
00195 i = 0;
00196 wrt(fp, &i, sizeof(int), 1);
00197 }
00198 if (t->nnid2ctid_upper) {
00199 i = 1;
00200 wrt(fp, &i, sizeof(int), 1);
00201 wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00202 wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00203 } else {
00204 i = 0;
00205 wrt(fp, &i, sizeof(int), 1);
00206 }
00207
00208 }
00209
00210
00211 if (ndata->bo_wt_1) {
00212 i = 1;
00213 wrt(fp, &i, sizeof(int), 1);
00214 wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00215 } else {
00216 i = 0;
00217 wrt(fp, &i, sizeof(int), 1);
00218 }
00219 if (ndata->p_2) {
00220 i = 1;
00221 wrt(fp, &i, sizeof(int), 1);
00222 wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00223 } else {
00224 i = 0;
00225 wrt(fp, &i, sizeof(int), 1);
00226 }
00227
00228 len = get_wrt_counter();
00229 jlog("Stat: ngram_write_bin: wrote %d bytes (%.1f MB)\n", len, len / 1048576.0);
00230 return TRUE;
00231 }