libsent/include/sent/ngram2.h

Go to the documentation of this file.
00001 
00076 /*
00077  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00078  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00079  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00080  * All rights reserved
00081  */
00082 
00083 /***** revision 3 *****/
00084 /* ngram2.h .. n-gram language model for speech recognition */
00085 /* Third revision: sequential allocation */
00086 /* for disk-LM and rapid read-in */
00087 
00088 #ifndef __SENT_NGRAM2_H__
00089 #define __SENT_NGRAM2_H__
00090 
00091 #include <sent/stddefs.h>
00092 #include <sent/ptree.h>
00093 
00094 
00096 #define MAX_N 3  /* Length of NGRAM_INFO.ngram_num; the struct below holds 1-gram, 2-gram and 3-gram tables. */
00097 
00098 typedef unsigned char NNID_UPPER;  /* Element type of the *_upper arrays (n2bo_upper, n3_bgn_upper); presumably the high part of an ID stored in two pieces -- confirm. */
00099 typedef unsigned short NNID_LOWER;  /* Element type of the *_lower arrays (n2bo_lower, n3_bgn_lower); presumably the low part of a split ID -- confirm. */
00100 typedef int NNID;               /* N-gram entry ID held in a single int. */
00101 #define NNID_INVALID -1         /* Sentinel in NNID form; presumably means "no such entry" -- confirm against the implementation. */
00102 #define NNID_INVALID_UPPER 255  /* Sentinel in NNID_UPPER form; presumably marks an invalid split ID -- confirm. */
00103 #define NNIDMAX 16711680        /* Equals 255 * 65536 (0xFF0000); presumably the limit of IDs representable in split upper/lower form -- confirm. */
00104 
00105 
00113 typedef struct {  /* Data of one N-gram language model: sizes, unknown-word settings, vocabulary, and 1-/2-/3-gram tables. */
00114   int version;                  /* Version number; NOTE(review): which version (file format vs. data layout) is not visible here. */
00115   boolean from_bin;             /* Presumably set when the model was read from a binary N-gram file -- confirm. */
00116   WORD_ID max_word_num;         /* Number of words; word IDs (nid) run 0 .. max_word_num-1. */
00117   NNID ngram_num[MAX_N];        /* Entry count per order: [0] 1-gram, [1] 2-gram, [2] 3-gram. */
00118   NNID bigram_bo_num;           /* Presumably the number of 2-gram entries that have back-off data -- confirm. */
00119 
00126   WORD_ID unk_id;               /* Presumably the word ID of the unknown-word entry (cf. set_unknown_id()) -- confirm. */
00127   int unk_num;                  /* Presumably a count of unknown words -- confirm what is counted. */
00128   LOGPROB unk_num_log;          /* Presumably the logarithm of unk_num -- confirm base and use. */
00129   boolean isopen;               /* Presumably flags an open vocabulary (unknown words present) -- confirm. */
00130 
00131   /* basic data (nid: 0 - max_word_num-1) */
00132   char **wname;                 /* Array of word name strings; presumably indexed by nid. */
00133   PATNODE *root;                /* Root node of a PATNODE tree (sent/ptree.h); presumably the name-to-nid index built by ngram_make_lookup_tree() -- confirm. */
00134   
00135   /* 1-gram ( nid: 0 - ngram_num[0]-1 ) */
00136   LOGPROB *p;                   /* 1-gram values; presumably log probabilities indexed by nid. */
00137   LOGPROB *bo_wt_lr;            /* Presumably back-off weights for the "lr" 2-gram, indexed by nid -- confirm. */
00138   LOGPROB *bo_wt_rl;            /* Presumably back-off weights for the "rl" 2-gram, indexed by nid -- confirm. */
00139   NNID *n2_bgn;                 /* Presumably the first 2-gram index belonging to each nid -- confirm. */
00140   WORD_ID *n2_num;              /* Presumably the number of 2-gram entries belonging to each nid -- confirm. */
00141   
00142   /* 2-gram (n2: 0 - ngram_num[1] - 1) */
00143   WORD_ID *n2tonid;             /* Presumably maps a 2-gram index (n2) to a word ID -- confirm which word of the pair. */
00144   LOGPROB *p_lr;                /* Presumably "lr" 2-gram log probabilities, indexed by n2 -- confirm. */
00145   LOGPROB *p_rl;                /* Presumably "rl" 2-gram log probabilities, indexed by n2 -- confirm. */
00146   NNID_UPPER *n2bo_upper;       /* Presumably the upper part of each 2-gram's index into the back-off arrays below -- confirm. */
00147   NNID_LOWER *n2bo_lower;       /* Presumably the lower part of the same index -- confirm. */
00148   
00149   /* 2-gram back-off values (separated from rev.3.5) */
00150   LOGPROB *bo_wt_rrl;           /* Presumably back-off weights used for the 3-gram -- confirm indexing. */
00151   NNID *n3_bgn;                 /* Presumably the first 3-gram index for each entry, in unsplit NNID form -- confirm. */
00152   NNID_UPPER *n3_bgn_upper;     /* Presumably the upper part of the same start index in split form -- confirm. */
00153   NNID_LOWER *n3_bgn_lower;     /* Presumably the lower part of the same start index in split form -- confirm. */
00154   WORD_ID *n3_num;              /* Presumably the number of 3-gram entries for each context -- confirm. */
00155   
00156   /* 3-gram (n3: 0 - ngram_num[2] - 1) */
00157   WORD_ID *n3tonid;             /* Presumably maps a 3-gram index (n3) to a word ID -- confirm. */
00158   LOGPROB *p_rrl;               /* Presumably 3-gram log probabilities, indexed by n3 -- confirm. */
00159 } NGRAM_INFO;
00160 
00161 
00162 /* Definitions for binary N-gram */
00163 
00165 #define BINGRAM_IDSTR "julius_bingram_v3"  /* Identifier string for the binary N-gram format, "v3" variant. */
00167 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"  /* Identifier string for the binary N-gram format, "v4" variant. */
00169 #define BINGRAM_HDSIZE 512  /* Presumably the size of the binary N-gram header in bytes -- confirm. */
00171 #define BINGRAM_SIZESTR_HEAD "word="  /* Key of the word-size tag; presumably followed by one of the BODY strings below -- confirm. */
00173 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"  /* Tag value naming a 4-byte (int) word size. */
00175 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"  /* Tag value naming a 2-byte (unsigned short) word size. */
00176 #ifdef WORDS_INT
00177 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE  /* Build with WORDS_INT defined: use the 4-byte tag. */
00178 #else
00179 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE  /* Build without WORDS_INT: use the 2-byte tag. */
00180 #endif
00182 #define BINGRAM_BYTEORDER_HEAD "byteorder="  /* Key of the byte-order tag; presumably followed by "BE" or "LE" -- confirm. */
00184 #ifdef WORDS_BIGENDIAN
00185 #define BINGRAM_NATURAL_BYTEORDER "BE"  /* Build with WORDS_BIGENDIAN defined: byte order string is "BE". */
00186 #else
00187 #define BINGRAM_NATURAL_BYTEORDER "LE"  /* Build without WORDS_BIGENDIAN: byte order string is "LE". */
00188 #endif
00189 
00190 
00191 /* function declaration */
00192 NNID search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r);  /* Presumably looks up the 2-gram entry for the word pair (w_l, w_r) and returns its ID -- confirm, including the not-found value. */
00193 // NNID search_trigram(NGRAM_INFO *ndata,  NNID n2, WORD_ID wkey);
00194 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);  /* Presumably returns the 1-gram log probability of word w -- confirm. */
00195 LOGPROB bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);  /* Presumably returns the "lr" 2-gram log probability for (w1, w2) -- confirm argument order and back-off handling. */
00196 LOGPROB bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);  /* Presumably returns the "rl" 2-gram log probability for (w1, w2) -- confirm argument order and back-off handling. */
00197 LOGPROB tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3);  /* Presumably returns the "rl" 3-gram log probability for (w1, w2, w3) -- confirm argument order and back-off handling. */
00198 
00199 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction);  /* Presumably reads an ARPA-format N-gram from fp into ndata; NOTE(review): valid values of direction and the meaning of the return value are not visible here. */
00200 void set_unknown_id(NGRAM_INFO *ndata);  /* Presumably determines and stores ndata->unk_id -- confirm. */
00201 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);  /* Presumably reads a binary N-gram from fp into ndata -- confirm the return convention. */
00202 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);  /* Presumably writes ndata to fp in binary N-gram form with header_str placed in the header -- confirm. */
00203 
00204 void ngram_make_lookup_tree(NGRAM_INFO *ndata);  /* Presumably builds the word-name index stored in ndata->root -- confirm. */
00205 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);  /* Presumably returns the word ID whose name is wordstr -- confirm the not-found value. */
00206 WORD_ID make_ngram_ref(NGRAM_INFO *, char *);  /* Parameters are unnamed in this declaration; NOTE(review): presumably maps a word string to an N-gram word ID -- confirm how it differs from ngram_lookup_word(). */
00207 
00208 NGRAM_INFO *ngram_info_new();
00209 void ngram_info_free(NGRAM_INFO *ngram);
00210 void init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
00211 void init_ngram_arpa(NGRAM_INFO *ndata, char *lrfile, char *rlfile);
00212 
00213 void ngram_compact_bigram_context(NGRAM_INFO *ndata);  /* NOTE(review): presumably converts the 2-gram context/back-off data in ndata to a more compact form (cf. the n3_bgn vs. n3_bgn_upper/n3_bgn_lower fields) -- confirm against the implementation. */
00214 
00215 void print_ngram_info(NGRAM_INFO *ndata);  /* Presumably outputs a summary of ndata -- confirm the output destination. */
00216 
00217 #include <sent/vocabulary.h>  /* Included here, just before the one declaration that uses WORD_INFO; presumably this header declares that type -- confirm. */
00218 void make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);  /* Presumably links the words in winfo to their N-gram entries in ndata -- confirm which structure is modified. */
00219 
00220 #endif /* __SENT_NGRAM2_H__ */

Generated on Tue Dec 26 16:16:33 2006 for Julius by  doxygen 1.5.0