00001
00103
00104
00105
00106
00107
00108
00109
00110 #ifndef __SENT_NGRAM2_H__
00111 #define __SENT_NGRAM2_H__
00112
00113 #include <sent/stddefs.h>
00114 #include <sent/ptree.h>
00115
/* Number of NGRAM_TUPLE_INFO slots in NGRAM_INFO (its member is declared as
 * d[MAX_N]).  Presumably this is the largest N-gram order that can be held --
 * confirm against the reader code. */
00116 #define MAX_N 10
00117
/*
 * N-gram entry ID types and their limits.
 *
 * NNID is a plain int.  NNID_UPPER (unsigned char, at least 8 bits) and
 * NNID_LOWER (unsigned short, at least 16 bits) are used in pairs by the
 * *_upper / *_lower arrays of NGRAM_TUPLE_INFO; presumably they hold the high
 * and low parts of an ID packed into 24 bits -- confirm against the code that
 * reads and writes those arrays.
 */
00118 typedef unsigned char NNID_UPPER;
00119 typedef unsigned short NNID_LOWER;
00120 typedef int NNID;
/* Presumably the "no such entry" marker for an NNID value -- confirm. */
00121 #define NNID_INVALID -1
/* 255 is the largest value of an 8-bit NNID_UPPER; presumably the invalid
 * marker used when an ID is stored in split upper/lower form -- confirm. */
00122 #define NNID_INVALID_UPPER 255
/* 16711680 == 255 * 65536 (0xFF0000).  NOTE(review): looks like the bound on
 * IDs representable in split form without colliding with
 * NNID_INVALID_UPPER -- confirm. */
00123 #define NNIDMAX 16711680
00124
00125
/*
 * NGRAM_TUPLE_INFO: the data tables kept for one N-gram order.  NGRAM_INFO
 * holds an array of MAX_N of these (its `d` member).
 *
 * NOTE(review): only the field types are visible in this header.  The
 * per-field notes below are read off the names and are assumptions to
 * confirm against the code that fills these tables.
 */
00129 typedef struct {
/* Presumably the number of tuples stored for this order. */
00130 int totalnum;
/* Presumably selects the split bgn_upper/bgn_lower form over the plain
 * `bgn` array. */
00131 boolean is24bit;
/* Presumably the length of the bgn* and num arrays. */
00132 int bgnlistlen;
/* Index arrays: split form (NNID_UPPER + NNID_LOWER) and plain NNID form.
 * Presumably the first-entry ID of each context's run of tuples. */
00133 NNID_UPPER *bgn_upper;
00134 NNID_LOWER *bgn_lower;
00135 NNID *bgn;
/* Presumably the number of tuples in each run that the bgn* arrays point at. */
00136 WORD_ID *num;
00137
/* Name reads as "NNID -> word ID": presumably a word ID per tuple. */
00138 WORD_ID *nnid2wid;
/* Presumably the log probability of each tuple. */
00139 LOGPROB *prob;
00140
/* Presumably the number of entries in bo_wt. */
00141 int context_num;
/* Presumably back-off weights. */
00142 LOGPROB *bo_wt;
/* Presumably TRUE when the nnid2ctid_* mapping below is in use (see
 * ngram_compact_context()). */
00143 boolean ct_compaction;
/* Name reads as "NNID -> context ID", stored in split upper/lower form. */
00144 NNID_UPPER *nnid2ctid_upper;
00145 NNID_LOWER *nnid2ctid_lower;
00146
00147 } NGRAM_TUPLE_INFO;
00148
/*
 * NGRAM_INFO: top-level N-gram model object.  It is the first argument of
 * every function declared in this header except ngram_info_new(), which
 * returns a pointer to one.
 *
 * The struct carries the tag __ngram_info__ because the bigram_prob member
 * must name the struct type before the NGRAM_INFO typedef is complete.
 *
 * NOTE(review): only the field types are visible in this header.  The
 * per-field notes below are read off the names and are assumptions to
 * confirm against the implementation.
 */
00157 typedef struct __ngram_info__ {
/* Presumably the order N of the loaded model (at most MAX_N). */
00158 int n;
/* Presumably a direction code for the model; the value encoding is not
 * visible here (see the `dir` argument of init_ngram_arpa()). */
00159 int dir;
/* Presumably TRUE when the model was loaded from a binary file. */
00160 boolean from_bin;
/* Flag about the bigram index layout; meaning not visible here. */
00161 boolean bigram_index_reversed;
/* Presumably the vocabulary size (number of entries in wname). */
00162 WORD_ID max_word_num;
/* Array of C strings; presumably the word name for each word ID. */
00163 char **wname;
/* PATNODE tree root (type from sent/ptree.h); presumably the word-name index
 * built by ngram_make_lookup_tree(). */
00164 PATNODE *root;
/* Presumably the word ID of the unknown-word entry (see set_unknown_id()). */
00165 WORD_ID unk_id;
/* Presumably a count associated with unknown words, and its logarithm. */
00166 int unk_num;
00167 LOGPROB unk_num_log;
/* Presumably whether the model is open-vocabulary. */
00168 boolean isopen;
00169
/* One table set per slot, MAX_N slots in total. */
00170 NGRAM_TUPLE_INFO d[MAX_N];
00171
00172
/* Extra LOGPROB arrays; names suggest back-off weights for 1-grams and
 * probabilities for 2-grams -- confirm how they relate to d[]. */
00173 LOGPROB *bo_wt_1;
00174 LOGPROB *p_2;
/* Pointer to a function taking (model, WORD_ID, WORD_ID) and returning a
 * LOGPROB; presumably installed by bi_prob_func_set(). */
00175 LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID);
00176 } NGRAM_INFO;
00177
00178
00179
00180
/*
 * String constants for the binary N-gram file format.
 * NOTE(review): how each string is laid out in the file is not visible in
 * this header -- confirm against ngram_read_bin() / ngram_write_bin().
 */
/* Identification strings; the literals carry version tags v3, v4 and v5. */
00182 #define BINGRAM_IDSTR "julius_bingram_v3"
00184 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"
00186 #define BINGRAM_IDSTR_V5 "julius_bingram_v5"
/* Presumably the size of the file header, in bytes. */
00188 #define BINGRAM_HDSIZE 512
/* Key and the two possible values of the word-ID size tag. */
00190 #define BINGRAM_SIZESTR_HEAD "word="
00192 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
00194 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
/* BINGRAM_SIZESTR_BODY is the size tag for this build: the 4-byte string when
 * WORDS_INT is defined, otherwise the 2-byte string. */
00195 #ifdef WORDS_INT
00196 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00197 #else
00198 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00199 #endif
/* Key of the byte-order tag. */
00201 #define BINGRAM_BYTEORDER_HEAD "byteorder="
/* Byte-order tag for this build: "BE" when WORDS_BIGENDIAN is defined,
 * otherwise "LE". */
00203 #ifdef WORDS_BIGENDIAN
00204 #define BINGRAM_NATURAL_BYTEORDER "BE"
00205 #else
00206 #define BINGRAM_NATURAL_BYTEORDER "LE"
00207 #endif
00208
00209
/*
 * N-gram lookup and probability accessors.
 * NOTE(review): only the signatures are visible in this header; the
 * behavioral notes are inferred from names and must be confirmed against
 * the implementation.
 *
 *  - search_ngram:     takes an order n and a WORD_ID array w, returns an
 *                      NNID (presumably NNID_INVALID when nothing matches).
 *  - ngram_prob:       same arguments, returns a LOGPROB.
 *  - uni_prob:         LOGPROB for a single word ID.
 *  - bi_prob:          LOGPROB for a pair of word IDs.
 *  - bi_prob_func_set: presumably sets the bigram_prob function pointer of
 *                      ndata.
 */
00210 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w);
00211 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w);
00212 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
00213 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
00214 void bi_prob_func_set(NGRAM_INFO *ndata);
00215
/*
 * Stream-level readers and writer.  Each takes an already opened FILE* and
 * returns a boolean (presumably TRUE on success -- confirm).
 *
 *  - ngram_read_arpa: presumably parses ARPA-format text into ndata; the
 *                     meaning of `addition` is not visible here (compare
 *                     init_ngram_arpa_additional()).
 *  - ngram_read_bin:  presumably loads the binary format.
 *  - ngram_write_bin: presumably writes the binary format; header_str is
 *                     presumably text to embed in the file header.
 */
00216 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition);
00217 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
00218 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);
00219
/* Presumably compacts the context data of the order-n table (see the
 * ct_compaction and nnid2ctid_* fields of NGRAM_TUPLE_INFO); returns a
 * boolean, presumably TRUE on success -- confirm. */
00220 boolean ngram_compact_context(NGRAM_INFO *ndata, int n);
00221
/*
 * Word-name lookup.
 * NOTE(review): semantics inferred from names -- confirm.
 *
 *  - ngram_make_lookup_tree: presumably builds the name index held in
 *                            ndata->root.
 *  - ngram_lookup_word:      returns a WORD_ID for the string wordstr.
 *  - make_ngram_ref:         returns a WORD_ID for a string; how it differs
 *                            from ngram_lookup_word is not visible here.
 */
00222 void ngram_make_lookup_tree(NGRAM_INFO *ndata);
00223 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
00224 WORD_ID make_ngram_ref(NGRAM_INFO *, char *);
00225
/*
 * Object lifecycle and file-level initialization.
 * NOTE(review): semantics inferred from names -- confirm.
 *
 *  - ngram_info_new:             returns a pointer to an NGRAM_INFO,
 *                                presumably newly allocated and to be
 *                                released with ngram_info_free().
 *  - ngram_info_free:            presumably releases an NGRAM_INFO.
 *  - init_ngram_bin:             presumably loads the binary file named by
 *                                ngram_file into ndata.
 *  - init_ngram_arpa:            presumably loads the ARPA file named by
 *                                ngram_file; `dir` presumably ends up in
 *                                ndata->dir.
 *  - init_ngram_arpa_additional: presumably loads a second ARPA file into an
 *                                already initialized ndata.
 *  - set_unknown_id:             presumably sets ndata->unk_id.
 *
 * The three init_* functions return a boolean (presumably TRUE on success).
 *
 * NOTE(review): ngram_info_new() has an empty parameter list, which before
 * C23 declares a function with unspecified parameters, not a prototype.
 * If the definition takes no arguments, `(void)` would let the compiler
 * check calls.
 */
00226 NGRAM_INFO *ngram_info_new();
00227 void ngram_info_free(NGRAM_INFO *ngram);
00228 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
00229 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir);
00230 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file);
00231 void set_unknown_id(NGRAM_INFO *ndata);
00232
/* Presumably writes a human-readable summary of ndata to the stream fp. */
00233 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);
00234
/* WORD_INFO is presumably declared by sent/vocabulary.h, which is included
 * here, directly above the only declaration in this header that uses it. */
00235 #include <sent/vocabulary.h>
/* Presumably builds the mapping between the words in winfo and the N-gram
 * entries in ndata -- confirm. */
00236 void make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);
00237
00238 #endif