00001
00045
00046
00047
00048
00049
00050
00051
00052 #include <sent/stddefs.h>
00053 #include <sent/ngram2.h>
00054
/* Byte-swap flag consulted by wrt(): when TRUE, data units wider than one
   byte are byte-swapped before being written (and swapped back afterwards).
   ngram_write_bin() sets it before writing any multi-byte data. */
00055 static boolean need_swap;
00056
00065 static void
00066 wrt(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00067 {
00068 if (need_swap == TRUE && unitbyte != 1) {
00069 swap_bytes((char *)buf, unitbyte, unitnum);
00070 }
00071 if (myfwrite(buf, unitbyte, unitnum, fp) < (size_t)unitnum) {
00072 perror("write_ngram_bin: wrt");
00073 j_error("write failed\n");
00074 }
00075 if (need_swap == TRUE && unitbyte != 1) {
00076 swap_bytes((char *)buf, unitbyte, unitnum);
00077 }
00078 }
00079
00088 static void
00089 write_header(FILE *fp, char *str, int version)
00090 {
00091 char buf[BINGRAM_HDSIZE];
00092 int i, totallen;
00093 for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00094 switch(version) {
00095 case 4:
00096 totallen = strlen(BINGRAM_IDSTR_V4) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00097 break;
00098 case 3:
00099 totallen = strlen(BINGRAM_IDSTR) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(str);
00100 break;
00101 }
00102 if (totallen >= BINGRAM_HDSIZE) {
00103 j_printerr("Warning: user header too long, last will be truncated\n");
00104 i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00105 str[i] = '\0';
00106 }
00107 switch(version) {
00108 case 4:
00109 sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V4, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00110 break;
00111 case 3:
00112 sprintf(buf, "%s\n%s%s\n%s", BINGRAM_IDSTR, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, str);
00113 break;
00114 }
00115 wrt(fp, buf, 1, BINGRAM_HDSIZE);
00116 }
00117
00127 boolean
00128 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00129 {
00130 int i,n,len;
00131
00132
00133 write_header(fp, headerstr, ndata->version);
00134
00135
00136 if (ndata->version == 4) {
00137 need_swap = FALSE;
00138 } else {
00139 #ifdef WORDS_BIGENDIAN
00140 need_swap = FALSE;
00141 #else
00142 need_swap = TRUE;
00143 #endif
00144 }
00145
00146
00147 for(n=0;n<MAX_N;n++) {
00148 wrt(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00149
00150 }
00151 if (ndata->version == 4) {
00152 wrt(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00153 }
00154 j_printf("wrote total info\n");
00155
00156
00157
00158 len = 0;
00159 for(i=0;i<ndata->ngram_num[0];i++) {
00160 len += strlen(ndata->wname[i]) + 1;
00161 }
00162 wrt(fp, &len, sizeof(int), 1);
00163 for(i=0;i<ndata->ngram_num[0];i++) {
00164 wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1);
00165 }
00166 j_printf("wrote wnames (%d bytes)\n", len + sizeof(int));
00167
00168
00169 wrt(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00170 wrt(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00171 wrt(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00172 wrt(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00173 wrt(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00174 j_printf("wrote 1-gram (%d KB)\n",
00175 ((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)) * ndata->ngram_num[0]) / 1024);
00176
00177
00178 wrt(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00179 wrt(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00180 wrt(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00181 switch (ndata->version) {
00182 case 4:
00183 wrt(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00184 wrt(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00185 wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00186 wrt(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00187 wrt(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00188 wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00189 j_printf("wrote 2-gram (%d KB)\n",
00190 ((sizeof(LOGPROB)*2 + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->ngram_num[1] + (sizeof(LOGPROB) + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->bigram_bo_num) / 1024);
00191 break;
00192 case 3:
00193 wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00194 wrt(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00195 wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00196 j_printf("wrote 2-gram (%d KB)\n",
00197 ((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)*2) * ndata->ngram_num[1]) / 1024);
00198 break;
00199 }
00200
00201
00202
00203 wrt(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00204 wrt(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00205 j_printf("wrote 3-gram (%d KB)\n",
00206 ((sizeof(LOGPROB) + sizeof(WORD_ID)) * ndata->ngram_num[2]) / 1024);
00207
00208 return TRUE;
00209 }