00001
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088 #ifndef __SENT_NGRAM2_H__
00089 #define __SENT_NGRAM2_H__
00090
00091 #include <sent/stddefs.h>
00092 #include <sent/ptree.h>
00093
00094
/* Length of the per-order entry-count array NGRAM_INFO.ngram_num[]. */
00096 #define MAX_N 3
00097
/*
 * N-gram entry ID types.
 * NNID is a plain int.  NNID_UPPER / NNID_LOWER are the element types of the
 * paired *_upper / *_lower arrays in NGRAM_INFO; presumably an ID is stored
 * there split into an upper and a lower part -- TODO confirm against the
 * code that fills those arrays.
 */
00098 typedef unsigned char NNID_UPPER;
00099 typedef unsigned short NNID_LOWER;
00100 typedef int NNID;
/*
 * NOTE(review): presumably the "no such entry" value for an NNID.  The
 * replacement list is an unparenthesized negative literal; `(-1)` would be
 * the conventional spelling.
 */
00101 #define NNID_INVALID -1
/* Presumably the "invalid" marker for an NNID_UPPER element -- TODO confirm. */
00102 #define NNID_INVALID_UPPER 255
/*
 * 16711680 == 255 * 65536 (0xFF0000).  Presumably the upper bound on IDs
 * that can be stored in the split upper/lower form -- TODO confirm.
 */
00103 #define NNIDMAX 16711680
00104
00105
/*
 * NGRAM_INFO: all data of one N-gram language model held in memory.
 *
 * NOTE(review): the per-member notes below are inferred only from the member
 * names and types visible in this header; confirm each against the code that
 * fills the structure.  Member order and types are left exactly as they were.
 */
00113 typedef struct {
00114 int version; /* version number; its use is not visible in this header */
00115 boolean from_bin; /* presumably TRUE when loaded from a binary N-gram file */
00116 WORD_ID max_word_num; /* presumably the N-gram vocabulary size */
00117 NNID ngram_num[MAX_N]; /* one count per N-gram order (MAX_N slots) */
00118 NNID bigram_bo_num; /* presumably the number of bigram back-off entries */
00119
00126 WORD_ID unk_id; /* presumably the word ID of the unknown-word entry; cf. set_unknown_id() */
00127 int unk_num; /* a count related to unknown words -- TODO confirm exact meaning */
00128 LOGPROB unk_num_log; /* presumably the logarithm of unk_num */
00129 boolean isopen; /* presumably TRUE for an open vocabulary -- TODO confirm */
00130
00131
00132 char **wname; /* array of word name strings */
00133 PATNODE *root; /* root of a PATNODE tree; cf. ngram_make_lookup_tree() */
00134
00135
/* Presumably 1-gram data, indexed by word ID -- TODO confirm. */
00136 LOGPROB *p; /* presumably 1-gram log probabilities */
00137 LOGPROB *bo_wt_lr; /* presumably back-off weights, "lr" direction */
00138 LOGPROB *bo_wt_rl; /* presumably back-off weights, "rl" direction */
00139 NNID *n2_bgn; /* presumably first 2-gram entry ID per context word */
00140 WORD_ID *n2_num; /* presumably number of 2-gram entries per context word */
00141
00142
/* Presumably 2-gram data, indexed by 2-gram entry ID -- TODO confirm. */
00143 WORD_ID *n2tonid; /* presumably maps a 2-gram entry to a word ID */
00144 LOGPROB *p_lr; /* presumably 2-gram log probabilities, "lr" direction */
00145 LOGPROB *p_rl; /* presumably 2-gram log probabilities, "rl" direction */
00146 NNID_UPPER *n2bo_upper; /* upper part of a split ID; pairs with n2bo_lower */
00147 NNID_LOWER *n2bo_lower; /* lower part of a split ID; pairs with n2bo_upper */
00148
00149
/* Presumably 3-gram context data -- TODO confirm indexing. */
00150 LOGPROB *bo_wt_rrl; /* presumably back-off weights used for 3-grams */
00151 NNID *n3_bgn; /* presumably first 3-gram entry ID per context, unsplit form */
00152 NNID_UPPER *n3_bgn_upper; /* upper part of a split ID; pairs with n3_bgn_lower */
00153 NNID_LOWER *n3_bgn_lower; /* lower part of a split ID; pairs with n3_bgn_upper */
00154 WORD_ID *n3_num; /* presumably number of 3-gram entries per context */
00155
00156
/* Presumably 3-gram data, indexed by 3-gram entry ID -- TODO confirm. */
00157 WORD_ID *n3tonid; /* presumably maps a 3-gram entry to a word ID */
00158 LOGPROB *p_rrl; /* presumably 3-gram log probabilities */
00159 } NGRAM_INFO;
00160
00161
00162
00163
/*
 * String constants and sizes for the binary N-gram ("bingram") file format.
 * NOTE(review): how each string is placed in the file is not visible here;
 * see ngram_read_bin() / ngram_write_bin().  The literals are runtime data
 * and must not be altered.
 */
/* Identification string carrying the "v3" format tag. */
00165 #define BINGRAM_IDSTR "julius_bingram_v3"
/* Identification string carrying the "v4" format tag. */
00167 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"
/* Presumably the header size in bytes -- TODO confirm. */
00169 #define BINGRAM_HDSIZE 512
/* Key that introduces the word-ID size qualifier. */
00171 #define BINGRAM_SIZESTR_HEAD "word="
/* Qualifier value naming a 4-byte int word ID. */
00173 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
/* Qualifier value naming a 2-byte unsigned short word ID. */
00175 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
/* BINGRAM_SIZESTR_BODY is the 4-byte string when WORDS_INT is defined, otherwise the 2-byte string. */
00176 #ifdef WORDS_INT
00177 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00178 #else
00179 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00180 #endif
/* Key that introduces the byte-order qualifier. */
00182 #define BINGRAM_BYTEORDER_HEAD "byteorder="
/* BINGRAM_NATURAL_BYTEORDER is "BE" when WORDS_BIGENDIAN is defined, otherwise "LE". */
00184 #ifdef WORDS_BIGENDIAN
00185 #define BINGRAM_NATURAL_BYTEORDER "BE"
00186 #else
00187 #define BINGRAM_NATURAL_BYTEORDER "LE"
00188 #endif
00189
00190
00191
/*
 * N-gram entry search and probability look-up.
 * NOTE(review): semantics are inferred from the names; the definitions are
 * not part of this header.
 */
/* Presumably returns the entry ID of the 2-gram (w_l, w_r), or NNID_INVALID if none -- TODO confirm. */
00192 NNID search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r);
00193
/* Presumably the 1-gram log probability of word w. */
00194 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
/* Presumably the 2-gram log probability for (w1, w2), "lr" direction. */
00195 LOGPROB bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
/* Presumably the 2-gram log probability for (w1, w2), "rl" direction. */
00196 LOGPROB bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
/* Presumably the 3-gram log probability for (w1, w2, w3), "rl" direction. */
00197 LOGPROB tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3);
00198
/*
 * Reading and writing N-gram data on an already opened stream.
 * NOTE(review): the boolean results presumably report success; confirm the
 * TRUE/FALSE convention against the definitions.
 */
/* Reads ARPA-format data from fp into ndata; `direction` presumably selects the "lr" or "rl" model -- TODO confirm values. */
00199 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction);
/* Presumably determines and stores the unknown-word ID (NGRAM_INFO.unk_id) -- TODO confirm. */
00200 void set_unknown_id(NGRAM_INFO *ndata);
/* Reads binary-format N-gram data from fp into ndata. */
00201 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
/* Writes ndata to fp in binary format; header_str is presumably user text stored in the file header -- TODO confirm. */
00202 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);
00203
/*
 * Word-name look-up in the N-gram vocabulary.
 * NOTE(review): semantics are inferred from the names; confirm against the
 * definitions.
 */
/* Presumably builds the name-to-ID index tree stored in NGRAM_INFO.root. */
00204 void ngram_make_lookup_tree(NGRAM_INFO *ndata);
/* Presumably returns the N-gram word ID whose name equals wordstr. */
00205 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
/*
 * Presumably resolves wordstr to an N-gram word ID for cross-referencing.
 * Parameter names were added so that every prototype in this header is
 * self-describing; they mirror ngram_lookup_word() and do not change the type.
 */
00206 WORD_ID make_ngram_ref(NGRAM_INFO *ndata, char *wordstr);
00207
/*
 * Creation, release and top-level loading of an NGRAM_INFO.
 * NOTE(review): ownership rules are inferred from the names; confirm that an
 * object obtained from ngram_info_new() is released with ngram_info_free().
 */
/*
 * Presumably allocates and returns a new, empty NGRAM_INFO.
 * Declared with (void): an empty parameter list in C declares a function with
 * unspecified parameters, which disables argument checking at call sites.
 */
00208 NGRAM_INFO *ngram_info_new(void);
/* Presumably releases ngram and the data it owns. */
00209 void ngram_info_free(NGRAM_INFO *ngram);
/* Presumably loads the binary N-gram file named ngram_file into ndata. */
00210 void init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
/* Presumably loads ARPA files into ndata: lrfile for the "lr" model, rlfile for the "rl" model. */
00211 void init_ngram_arpa(NGRAM_INFO *ndata, char *lrfile, char *rlfile);
00212
/* Presumably shrinks the stored bigram context data of ndata in place -- TODO confirm what is compacted. */
00213 void ngram_compact_bigram_context(NGRAM_INFO *ndata);
00214
/* Presumably outputs a summary of ndata; the destination is not visible in this header. */
00215 void print_ngram_info(NGRAM_INFO *ndata);
00216
/* Included only at this point; presumably supplies WORD_INFO for the prototype below -- TODO confirm. */
00217 #include <sent/vocabulary.h>
/* Presumably builds the mapping between the words of winfo and the N-gram vocabulary of ndata -- TODO confirm. */
00218 void make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);
00219
00220 #endif