00001
00103
00104
00105
00106
00107
00108
00109
00110 #ifndef __SENT_NGRAM2_H__
00111 #define __SENT_NGRAM2_H__
00112
00113 #include <sent/stddefs.h>
00114 #include <sent/ptree.h>
00115
/* Upper bound on the N-gram order; it sizes the per-order array d[] in NGRAM_INFO. */
00116 #define MAX_N 10
00117
/* Full-width N-gram entry ID (unsigned int; the constants below assume it is 32 bits wide). */
00118 typedef unsigned int NNID;
/* All-ones 32-bit value; going by its name, the "no such entry" marker for an NNID. */
00119 #define NNID_INVALID 0xffffffff
/* One less than NNID_INVALID; presumably the largest usable full-width ID. */
00120 #define NNID_MAX 0xfffffffe
00121
/*
 * Split ID representation: an unsigned char "upper" part plus an unsigned
 * short "lower" part.  NGRAM_TUPLE_INFO keeps them in parallel arrays
 * (bgn_upper/bgn_lower and nnid2ctid_upper/nnid2ctid_lower).
 * NOTE(review): presumably upper = high 8 bits and lower = low 16 bits of a
 * 24-bit ID -- confirm against the code that packs and unpacks them.
 */
00122 typedef unsigned char NNID_UPPER;
00123 typedef unsigned short NNID_LOWER;
/* 255 (all ones in an 8-bit unsigned char); by its name, the "no such entry" marker for the upper part. */
00124 #define NNID_INVALID_UPPER 255
/* 16711679 == 255 * 65536 - 1 == 0xFEFFFF: the largest 24-bit value whose top byte differs from NNID_INVALID_UPPER. */
00125 #define NNID_MAX_24 16711679
00126
/*
 * Default strings for special vocabulary words.
 * NOTE(review): the roles below are inferred from the macro names and the
 * conventional ARPA markers; the code that uses them is not in this header.
 */
/* Default beginning-of-sentence word string. */
00128 #define BEGIN_WORD_DEFAULT "<s>"
/* Default end-of-sentence word string. */
00130 #define END_WORD_DEFAULT "</s>"
/* Default unknown-word string (lower-case spelling). */
00132 #define UNK_WORD_DEFAULT "<unk>"
/* Alternative unknown-word string (upper-case spelling). */
00133 #define UNK_WORD_DEFAULT2 "<UNK>"
/* Presumably the maximum accepted length of an unknown-word string -- TODO confirm whether the terminator is counted. */
00135 #define UNK_WORD_MAXLEN 30
00136
/*
 * Per-order table of N-gram tuples.  NGRAM_INFO embeds one of these for each
 * order in its d[MAX_N] array.
 *
 * NOTE(review): the code that fills and reads these arrays is not part of
 * this header, so the field notes below are inferred from names and types;
 * confirm them against the reader and lookup implementation.
 */
00141 typedef struct {
/* Presumably the number of tuples stored for this order. */
00142 NNID totalnum;
/* Flag; presumably TRUE when indices use the split NNID_UPPER/NNID_LOWER arrays rather than the full-width bgn array. */
00143 boolean is24bit;
/* Presumably the element count of the bgn* and num arrays. */
00144 NNID bgnlistlen;
/* Upper (unsigned char) part of per-context start indices; counterpart of bgn_lower. */
00145 NNID_UPPER *bgn_upper;
/* Lower (unsigned short) part of per-context start indices; counterpart of bgn_upper. */
00146 NNID_LOWER *bgn_lower;
/* Full-width per-context start indices; presumably used when is24bit is FALSE. */
00147 NNID *bgn;
/* Presumably the number of tuples that follow each context, parallel to bgn*. */
00148 WORD_ID *num;
00149
/* Maps a tuple ID to a word ID (per the name); presumably the last word of the tuple. */
00150 WORD_ID *nnid2wid;
/* Per-tuple log probabilities (LOGPROB); presumably indexed by tuple ID. */
00151 LOGPROB *prob;
00152
/* Presumably the number of tuples that serve as a context for the next higher order. */
00153 NNID context_num;
/* Back-off weights (per the name); NOTE(review): array length likely depends on ct_compaction -- confirm. */
00154 LOGPROB *bo_wt;
/* Flag; presumably TRUE when contexts are addressed through the nnid2ctid_* mapping. */
00155 boolean ct_compaction;
/* Upper (unsigned char) part of the tuple-ID to context-ID mapping. */
00156 NNID_UPPER *nnid2ctid_upper;
/* Lower (unsigned short) part of the tuple-ID to context-ID mapping. */
00157 NNID_LOWER *nnid2ctid_lower;
00158
00159 } NGRAM_TUPLE_INFO;
00160
/*
 * Top-level N-gram language model holder: model-wide settings, the word
 * name table, and one NGRAM_TUPLE_INFO per order in d[].  The struct tag
 * __ngram_info__ exists so the bigram_prob callback can take a pointer to
 * the enclosing struct.
 *
 * NOTE(review): field semantics are not established by this header; the
 * notes below are inferred from names and types and should be confirmed
 * against the implementation.
 */
00169 typedef struct __ngram_info__ {
/* Presumably the order N of the loaded model (at most MAX_N, given the size of d[]). */
00170 int n;
/* Presumably the model direction (forward or backward); the valid constants are defined elsewhere. */
00171 int dir;
/* Flag; presumably TRUE when the model was loaded from a binary file rather than ARPA text. */
00172 boolean from_bin;
/* Flag; by its name, records that the 2-gram index is stored in reversed order -- TODO confirm when it is set. */
00173 boolean bigram_index_reversed;
/* Flag; by its name, requests swapping the begin/end-of-sentence words -- TODO confirm. */
00174 boolean bos_eos_swap;
/* Presumably the vocabulary size of this N-gram. */
00175 WORD_ID max_word_num;
/* Array of word name strings; presumably indexed by N-gram word ID. */
00176 char **wname;
/* Root of a PATNODE tree (type from sent/ptree.h); presumably the name-to-ID index built by ngram_make_lookup_tree(). */
00177 PATNODE *root;
/* Presumably the word ID of the unknown word (see set_unknown_id()). */
00178 WORD_ID unk_id;
/* Count related to unknown words -- NOTE(review): exactly what is counted is not visible here. */
00179 int unk_num;
/* Presumably the logarithm of unk_num, cached as a LOGPROB -- TODO confirm base and use. */
00180 LOGPROB unk_num_log;
/* Flag; presumably TRUE when the model is treated as open-vocabulary. */
00181 boolean isopen;
00182
/* Per-order tuple tables; presumably d[i] holds the (i+1)-gram entries -- confirm the indexing. */
00183 NGRAM_TUPLE_INFO d[MAX_N];
00184
00185
/* Extra back-off weight array; NOTE(review): likely an additional 1-gram table for a second direction -- confirm. */
00186 LOGPROB *bo_wt_1;
/* Extra probability array; NOTE(review): likely additional 2-gram probabilities paired with bo_wt_1 -- confirm. */
00187 LOGPROB *p_2;
/* Callback computing a bigram probability from this model and two word IDs; presumably installed by bi_prob_func_set(). */
00188 LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID);
00189 } NGRAM_INFO;
00190
00191
00192
00193
/*
 * String constants for the binary N-gram file header.
 * NOTE(review): how the reader and writer lay these out inside the header is
 * not visible here; the roles below follow from the macro names and values.
 */
/* Identification string for format version 3. */
00195 #define BINGRAM_IDSTR "julius_bingram_v3"
/* Identification string for format version 4. */
00197 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"
/* Identification string for format version 5. */
00199 #define BINGRAM_IDSTR_V5 "julius_bingram_v5"
/* Presumably the header size in bytes. */
00201 #define BINGRAM_HDSIZE 512
/* Key that introduces the word-ID size description. */
00203 #define BINGRAM_SIZESTR_HEAD "word="
/* Size description for a 4-byte (int) word ID. */
00205 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
/* Size description for a 2-byte (unsigned short) word ID. */
00207 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
/* BINGRAM_SIZESTR_BODY is the description matching this build: 4-byte when WORDS_INT is defined, 2-byte otherwise. */
00208 #ifdef WORDS_INT
00209 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00210 #else
00211 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00212 #endif
/* Key that introduces the byte-order description. */
00214 #define BINGRAM_BYTEORDER_HEAD "byteorder="
/* BINGRAM_NATURAL_BYTEORDER is "BE" when WORDS_BIGENDIAN is defined, "LE" otherwise. */
00216 #ifdef WORDS_BIGENDIAN
00217 #define BINGRAM_NATURAL_BYTEORDER "BE"
00218 #else
00219 #define BINGRAM_NATURAL_BYTEORDER "LE"
00220 #endif
00221
00222
/*
 * Tuple search and probability queries.
 * NOTE(review): the definitions live in other files; the descriptions below
 * are inferred from the signatures and should be confirmed there.
 */
/* Look up the tuple of n word IDs in w; returns its NNID (presumably NNID_INVALID when not found). */
00223 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w);
/* Log probability for the n word IDs in w; presumably applies back-off when the tuple is missing. */
00224 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w);
/* Log probability of the single word w. */
00225 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
/* Log probability for the word pair (w1, w2); NOTE(review): which word is the context is not visible here. */
00226 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
/* Presumably selects and stores the ndata->bigram_prob callback. */
00227 void bi_prob_func_set(NGRAM_INFO *ndata);
00228
/*
 * Model reading, writing and context compaction.
 * NOTE(review): the boolean results presumably mean TRUE on success; the
 * definitions are in other files.
 */
/* Read an ARPA-format model from fp into ndata; "addition" presumably marks a supplementary model loaded on top of an existing one. */
00229 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition);
/* Read a binary-format model from fp into ndata. */
00230 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
/* Write ndata to fp in binary format; header_str is presumably extra text stored in the file header. */
00231 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);
00232
/* Presumably builds the compacted context index (the ct_compaction / nnid2ctid_* fields) for the n-gram table. */
00233 boolean ngram_compact_context(NGRAM_INFO *ndata, int n);
00234
/*
 * Word name to word ID lookup.
 * NOTE(review): inferred from names and signatures; confirm in the definitions.
 */
/* Presumably builds the name index stored in ndata->root from the word names. */
00235 void ngram_make_lookup_tree(NGRAM_INFO *ndata);
/* Return the N-gram word ID for wordstr; NOTE(review): the not-found result is not visible here. */
00236 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
/* Resolve a word string to an N-gram word ID for cross-referencing; presumably falls back to the unknown word -- confirm. */
00237 WORD_ID make_ngram_ref(NGRAM_INFO *, char *);
00238
/*
 * Model lifecycle and file-level initialization.
 * NOTE(review): the definitions live in other files; the descriptions below
 * are inferred from the signatures, and the boolean results presumably mean
 * TRUE on success.
 */
/*
 * Create a new NGRAM_INFO; presumably released with ngram_info_free().
 * Declared with (void) so this is a real prototype: an empty parameter list
 * would leave the arguments unchecked in pre-C23 C.
 */
00239 NGRAM_INFO *ngram_info_new(void);
/* Release an NGRAM_INFO and, presumably, everything it owns. */
00240 void ngram_info_free(NGRAM_INFO *ngram);
/* Load the binary N-gram file named ngram_file into ndata. */
00241 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
/* Load the ARPA N-gram file named ngram_file into ndata; dir presumably selects the model direction. */
00242 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir);
/* Load a supplementary ARPA 2-gram file named bigram_file into an already initialized ndata. */
00243 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file);
/* Presumably sets ndata->unk_id to the ID of the word named str. */
00244 void set_unknown_id(NGRAM_INFO *ndata, char *str);
00245
/* Write a description of ndata to the stream fp; NOTE(review): the exact contents are defined elsewhere. */
00246 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);
00247
00248 #include <sent/vocabulary.h>
/*
 * Coupling between the N-gram and the recognition dictionary (WORD_INFO
 * comes from sent/vocabulary.h).
 * NOTE(review): behavior is inferred from names; confirm in the definitions.
 */
/* Presumably maps every dictionary word in winfo to an N-gram entry of ndata; the boolean result presumably means TRUE on success. */
00249 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);
/* Presumably adjusts unigram probabilities of models produced by SRILM -- TODO confirm what is corrected. */
00250 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo);
00251
00252 #endif