libsent/include/sent/ngram2.h

Go to the documentation of this file.
00001 
00103 /*
00104  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00105  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00106  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00107  * All rights reserved
00108  */
00109 
00110 #ifndef __SENT_NGRAM2_H__
00111 #define __SENT_NGRAM2_H__
00112 
00113 #include <sent/stddefs.h>
00114 #include <sent/ptree.h>
00115 
00116 #define MAX_N 10                /* Length of the NGRAM_INFO::d[] table; presumably the largest N-gram order supported -- confirm */
00117 
00118 typedef unsigned char NNID_UPPER; /* Element type of the *_upper index arrays (bgn_upper, nnid2ctid_upper); per the name, the upper part of a split entry ID */
00119 typedef unsigned short NNID_LOWER; /* Element type of the *_lower index arrays (bgn_lower, nnid2ctid_lower); per the name, the lower part of a split entry ID */
00120 typedef int NNID;              /* N-gram entry ID held in a single int (element type of bgn, return type of search_ngram()) */
00121 #define NNID_INVALID -1         /* Presumably the "no such entry" value for an NNID -- confirm against search_ngram() */
00122 #define NNID_INVALID_UPPER 255  /* Presumably the "no such entry" marker stored in an NNID_UPPER slot -- confirm */
00123 #define NNIDMAX 16711680        /* Equals 255 * 65536 (0xFF0000); presumably the largest ID usable with the split upper/lower index -- confirm */
00124 
00125 
/*
 * Per-order N-gram table.  NGRAM_INFO holds MAX_N of these in its d[] member.
 * The index data exists in two forms: a plain NNID array (bgn) and a split
 * NNID_UPPER / NNID_LOWER pair (bgn_upper / bgn_lower).
 * NOTE(review): field meanings below are inferred from names and types only;
 * verify against the reader/writer implementation before relying on them.
 */
00129 typedef struct {
00130   int totalnum;                 /* presumably the number of tuples stored for this order */
00131   boolean is24bit;              /* presumably TRUE when the split bgn_upper/bgn_lower index is used instead of bgn -- confirm */
00132   int bgnlistlen;               /* presumably the length of the bgn* and num arrays -- confirm */
00133   NNID_UPPER *bgn_upper;        /* split index: NNID_UPPER part of each begin ID */
00134   NNID_LOWER *bgn_lower;        /* split index: NNID_LOWER part of each begin ID */
00135   NNID *bgn;                    /* begin ID stored as a whole NNID */
00136   WORD_ID *num;         /* per-entry count; assumed to pair with bgn* (begin + length) -- confirm */
00137 
00138   WORD_ID *nnid2wid;            /* per the name, maps an entry ID to a WORD_ID */
00139   LOGPROB *prob;                /* LOGPROB array; presumably one log probability per tuple */
00140 
00141   int context_num;              /* presumably the number of entries in bo_wt -- confirm */
00142   LOGPROB *bo_wt;               /* LOGPROB array; presumably back-off weights -- confirm */
00143   boolean ct_compaction;        /* flag; presumably related to ngram_compact_context() and the nnid2ctid_* maps -- confirm */
00144   NNID_UPPER *nnid2ctid_upper;  /* per the name, entry ID -> context ID map, NNID_UPPER part */
00145   NNID_LOWER *nnid2ctid_lower;  /* per the name, entry ID -> context ID map, NNID_LOWER part */
00146 
00147 } NGRAM_TUPLE_INFO;
00148 
/*
 * Top-level N-gram model object.  It is the first argument of every function
 * declared in this header and embeds one NGRAM_TUPLE_INFO per order in d[].
 * The struct tag is needed because the bigram_prob member refers to the
 * struct type before the NGRAM_INFO typedef name exists.
 * NOTE(review): field meanings below are inferred from names and types only.
 */
00157 typedef struct __ngram_info__ {
00158   int n;                        /* presumably the N-gram order in use (d[] has MAX_N slots) -- confirm */
00159   int dir;                      /* presumably a direction code; init_ngram_arpa() also takes an int dir -- confirm values */
00160   boolean from_bin;             /* presumably TRUE when the model was loaded from a binary file -- confirm */
00161   boolean bigram_index_reversed;                /* flag; meaning not visible here -- see the loader */
00162   WORD_ID max_word_num;         /* presumably the vocabulary size (length of wname) -- confirm */
00163   char **wname;                 /* array of word name strings */
00164   PATNODE *root;                /* PATNODE root from <sent/ptree.h>; presumably the index built by ngram_make_lookup_tree() -- confirm */
00165   WORD_ID unk_id;               /* presumably the word ID of the unknown-word entry; see set_unknown_id() */
00166   int unk_num;                  /* count associated with the unknown word; exact meaning not visible here */
00167   LOGPROB unk_num_log;          /* presumably a log-domain value derived from unk_num -- confirm */
00168   boolean isopen;               /* flag; presumably marks an open-vocabulary model -- confirm */
00169 
00170   NGRAM_TUPLE_INFO d[MAX_N];    /* one tuple table per N-gram order */
00171 
00172   /* for pass1 */
00173   LOGPROB *bo_wt_1;             /* LOGPROB array used by the first pass; contents not visible here */
00174   LOGPROB *p_2;                 /* LOGPROB array used by the first pass; contents not visible here */
00175   LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); /* bigram probability callback; presumably installed by bi_prob_func_set() -- confirm */
00176 } NGRAM_INFO;
00177 
00178 
00179 /* Definitions for binary N-gram */
00180 
/*
 * String constants and sizes for the binary N-gram file header.
 * The literals are part of the on-disk format: do not edit them.
 */
00182 #define BINGRAM_IDSTR "julius_bingram_v3" /* format identifier string, version 3 */
00184 #define BINGRAM_IDSTR_V4 "julius_bingram_v4" /* format identifier string, version 4 */
00186 #define BINGRAM_IDSTR_V5 "julius_bingram_v5" /* format identifier string, version 5 */
00188 #define BINGRAM_HDSIZE 512 /* header size; presumably in bytes -- confirm against ngram_write_bin() */
00190 #define BINGRAM_SIZESTR_HEAD "word=" /* key that precedes the word-ID size tag */
00192 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)" /* size tag for a 4-byte word ID */
00194 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)" /* size tag for a 2-byte word ID */
00195 #ifdef WORDS_INT /* build-time switch; selects which size tag this build uses */
00196 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00197 #else
00198 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00199 #endif
00201 #define BINGRAM_BYTEORDER_HEAD "byteorder=" /* key that precedes the byte-order tag */
00203 #ifdef WORDS_BIGENDIAN /* build-time switch; selects this build's byte-order tag */
00204 #define BINGRAM_NATURAL_BYTEORDER "BE"
00205 #else
00206 #define BINGRAM_NATURAL_BYTEORDER "LE"
00207 #endif
00208 
00209 /* function declarations */
/*
 * Lookup and probability accessors.  All take the model as ndata.
 * NOTE(review): behaviour is described from the signatures only; the
 * implementations are not visible from this header.
 */
00210 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w); /* find the entry for the n words in w; presumably returns NNID_INVALID when absent -- confirm */
00211 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w); /* LOGPROB for the n words in w; back-off handling not visible here */
00212 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w); /* LOGPROB for the single word w */
00213 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2); /* LOGPROB for the word pair (w1, w2); argument order semantics to be confirmed */
00214 void bi_prob_func_set(NGRAM_INFO *ndata); /* presumably installs NGRAM_INFO::bigram_prob -- confirm */
00215 
/*
 * Stream-level readers and writer.  Each works on an already opened FILE
 * and returns a boolean; presumably TRUE on success -- confirm.
 */
00216 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition); /* read ARPA-format data from fp into ndata; 'addition' presumably marks a supplementary file (cf. init_ngram_arpa_additional) -- confirm */
00217 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata); /* read binary-format data from fp into ndata */
00218 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str); /* write ndata to fp in binary format; header_str presumably goes into the file header -- confirm */
00219 
/* Index maintenance and word lookup.  NOTE(review): semantics inferred from names; confirm in the implementation. */
00220 boolean ngram_compact_context(NGRAM_INFO *ndata, int n); /* presumably compacts the context index of order n (cf. NGRAM_TUPLE_INFO::ct_compaction); returns a boolean status */
00221 
00222 void ngram_make_lookup_tree(NGRAM_INFO *ndata); /* presumably builds the word-name index kept in NGRAM_INFO::root */
00223 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr); /* map the word string wordstr to a WORD_ID; the not-found result is not visible here */
00224 WORD_ID make_ngram_ref(NGRAM_INFO *, char *); /* takes a model and a string, returns a WORD_ID; parameters are unnamed in this declaration */
00225 
/*
 * Create a new NGRAM_INFO and return a pointer to it.  Takes no arguments.
 * NOTE(review): presumably the result is released with ngram_info_free() --
 * confirm ownership in the implementation.
 *
 * Declared with (void) so this is a real prototype: with an empty parameter
 * list the compiler would not diagnose calls that pass stray arguments.
 */
00226 NGRAM_INFO *ngram_info_new(void);
/*
 * Model teardown, file-name level loaders, and diagnostics.
 * NOTE(review): semantics inferred from names and signatures; confirm in the implementation.
 */
00227 void ngram_info_free(NGRAM_INFO *ngram); /* release the model ngram; presumably the counterpart of ngram_info_new() */
00228 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file); /* load ndata from the binary file named ngram_file; returns a boolean status */
00229 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir); /* load ndata from the ARPA file named ngram_file; dir presumably matches NGRAM_INFO::dir -- confirm */
00230 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file); /* presumably merges an extra ARPA file (bigram_file) into an already loaded ndata -- confirm */
00231 void set_unknown_id(NGRAM_INFO *ndata); /* presumably fills in NGRAM_INFO::unk_id -- confirm */
00232 
00233 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata); /* write a description of ndata to the stream fp */
00234 
/* WORD_INFO is not known before this point, so its header is included here, right before the one declaration that uses it. */
00235 #include <sent/vocabulary.h>
00236 void make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo); /* presumably links the vocabulary winfo to entries of ndata -- confirm direction and side effects */
00237 
00238 #endif /* __SENT_NGRAM2_H__ */

Generated on Tue Dec 18 15:59:53 2007 for Julius by doxygen 1.5.4