libsent/include/sent/ngram2.h

説明を見る。
00001 
00103 /*
00104  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00105  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00106  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00107  * All rights reserved
00108  */
00109 
00110 #ifndef __SENT_NGRAM2_H__
00111 #define __SENT_NGRAM2_H__
00112 
00113 #include <sent/stddefs.h>
00114 #include <sent/ptree.h>
00115 
/* Array bound for NGRAM_INFO.d[], which holds one NGRAM_TUPLE_INFO per slot. */
00116 #define MAX_N 10                
00117 
/* N-gram entry ID type and its limits.  The limit values below need 32 bits;
 * NOTE(review): this assumes unsigned int is at least that wide on every
 * supported target -- confirm. */
00118 typedef unsigned int NNID;            
00119 #define NNID_INVALID 0xffffffff  /* sentinel; presumably "no such entry" -- confirm at call sites */
00120 #define NNID_MAX 0xfffffffe     /* NNID_INVALID - 1 */
00121 
/* Element types of the paired *_upper / *_lower arrays in NGRAM_TUPLE_INFO.
 * Presumably an ID is stored as a high byte plus a low 16-bit half when
 * NGRAM_TUPLE_INFO.is24bit is set -- confirm against the implementation. */
00122 typedef unsigned char NNID_UPPER; 
00123 typedef unsigned short NNID_LOWER; 
00124 #define NNID_INVALID_UPPER 255  /* presumably the NNID_UPPER value that marks an invalid ID */
00125 #define NNID_MAX_24 16711679        /* 0xFEFFFF: largest 24-bit value whose top byte is below 255 */
00126 
/* Default strings for special words.  The roles noted here are inferred from
 * the macro names; the code that uses them is not part of this header. */
00128 #define BEGIN_WORD_DEFAULT "<s>"  /* default begin word */
00130 #define END_WORD_DEFAULT "</s>"  /* default end word */
00132 #define UNK_WORD_DEFAULT "<unk>"  /* default unknown word */
00133 #define UNK_WORD_DEFAULT2 "<UNK>"  /* upper-case spelling of the same tag */
00135 #define UNK_WORD_MAXLEN 30  /* presumably a length limit for an unknown-word string -- confirm at use site */
00136 
/*
 * Per-slot N-gram data; NGRAM_INFO embeds MAX_N of these in its d[] array.
 *
 * NOTE(review): the field descriptions below are inferred from the field
 * names and types visible in this header only.  Confirm each one against the
 * code that fills and reads the structure before relying on it.
 */
00141 typedef struct {
00142   NNID totalnum;                /* presumably the number of entries held for this slot */
00143   boolean is24bit;              /* presumably selects bgn_upper/bgn_lower instead of bgn */
00144   NNID bgnlistlen;              /* presumably the element count of the bgn* and num arrays */
00145   NNID_UPPER *bgn_upper;        /* unsigned char array; likely the high part of a split start index */
00146   NNID_LOWER *bgn_lower;        /* unsigned short array; likely the low part of a split start index */
00147   NNID *bgn;                    /* full-width NNID array; likely the unsplit start index */
00148   WORD_ID *num;         /* WORD_ID-typed array; likely a per-context entry count */
00149 
00150   WORD_ID *nnid2wid;            /* name suggests a map from NNID to word ID */
00151   LOGPROB *prob;                /* log-probability values (element type LOGPROB) */
00152 
00153   NNID context_num;             /* presumably the number of context entries */
00154   LOGPROB *bo_wt;               /* presumably back-off weights (element type LOGPROB) */
00155   boolean ct_compaction;        /* presumably set when the nnid2ctid_* tables are in use; see ngram_compact_context() */
00156   NNID_UPPER *nnid2ctid_upper;  /* name suggests NNID -> context ID map, high part */
00157   NNID_LOWER *nnid2ctid_lower;  /* name suggests NNID -> context ID map, low part */
00158 
00159 } NGRAM_TUPLE_INFO;
00160 
/*
 * Top-level N-gram language model container.  It is the handle passed to
 * every function declared in this header.
 *
 * NOTE(review): the field descriptions below are inferred from names and
 * types only; the code that populates them is not visible here.
 */
00169 typedef struct __ngram_info__ {
00170   int n;                        /* presumably the N-gram order; d[] provides MAX_N slots */
00171   int dir;                      /* presumably a direction flag; see the dir argument of init_ngram_arpa() */
00172   boolean from_bin;             /* presumably set when the model came from a binary file */
00173   boolean bigram_index_reversed;                /* meaning not visible here -- confirm in the binary reader */
00174   boolean bos_eos_swap;         /* meaning not visible here; name suggests swapping begin/end words */
00175   WORD_ID max_word_num;         /* presumably the vocabulary size */
00176   char **wname;                 /* array of strings; presumably word names indexed by word ID */
00177   PATNODE *root;                /* tree root; presumably built by ngram_make_lookup_tree() */
00178   WORD_ID unk_id;               /* presumably the word ID of the unknown word; see set_unknown_id() */
00179   int unk_num;                  /* a count related to unknown words -- exact meaning to be confirmed */
00180   LOGPROB unk_num_log;          /* presumably a logarithm of unk_num */
00181   boolean isopen;               /* meaning not visible here -- confirm */
00182 
00183   NGRAM_TUPLE_INFO d[MAX_N];    /* fixed array of MAX_N per-slot tables */
00184 
00185   /* Fields for pass1 (presumably a first recognition pass -- confirm) */
00186   LOGPROB *bo_wt_1;             /* LOGPROB array; name suggests back-off weights */
00187   LOGPROB *p_2;                 /* LOGPROB array; name suggests 2-gram probabilities */
00188   LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); /* function pointer; presumably installed by bi_prob_func_set() */
00189 } NGRAM_INFO;
00190 
00191 
00192 /* Definitions for binary N-gram */
00193 
/* String constants for the binary N-gram file header.  Three identifier
 * strings are defined, for versions 3, 4 and 5.
 * NOTE(review): how these strings are laid out in the file is not visible in
 * this header -- see ngram_read_bin() / ngram_write_bin(). */
00195 #define BINGRAM_IDSTR "julius_bingram_v3"
00197 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"
00199 #define BINGRAM_IDSTR_V5 "julius_bingram_v5"
00201 #define BINGRAM_HDSIZE 512  /* presumably the header size in bytes -- confirm */
00203 #define BINGRAM_SIZESTR_HEAD "word="
00205 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
00207 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
/* BINGRAM_SIZESTR_BODY is the 4-byte string when WORDS_INT is defined,
 * otherwise the 2-byte string. */
00208 #ifdef WORDS_INT
00209 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00210 #else
00211 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00212 #endif
00214 #define BINGRAM_BYTEORDER_HEAD "byteorder="
/* BINGRAM_NATURAL_BYTEORDER is "BE" when WORDS_BIGENDIAN is defined,
 * otherwise "LE". */
00216 #ifdef WORDS_BIGENDIAN
00217 #define BINGRAM_NATURAL_BYTEORDER "BE"
00218 #else
00219 #define BINGRAM_NATURAL_BYTEORDER "LE"
00220 #endif
00221 
00222 /* function declaration */
/* Search and probability functions.
 * NOTE(review): behaviour is inferred from names and signatures; the
 * definitions are not in this header.  In search_ngram() and ngram_prob(),
 * w presumably points to n word IDs -- confirm element order and count. */
00223 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w);
00224 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w);
00225 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
00226 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
/* Presumably sets NGRAM_INFO.bigram_prob -- confirm. */
00227 void bi_prob_func_set(NGRAM_INFO *ndata);
00228 
/* Stream-level readers and writer.  Each takes an already opened FILE * and
 * returns a boolean; presumably TRUE on success -- confirm.
 * NOTE(review): the meaning of `addition` and `header_str` is not visible
 * in this header. */
00229 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition);
00230 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
00231 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);
00232 
/* Presumably operates on the entry ndata->d[] selected by n and relates to
 * the ct_compaction / nnid2ctid_* fields -- confirm, including whether n is
 * zero- or one-based. */
00233 boolean ngram_compact_context(NGRAM_INFO *ndata, int n);
00234 
/* Word-name lookup helpers.  Names suggest that the tree built here is the
 * one referenced by NGRAM_INFO.root and that the lookups map a word string
 * to a WORD_ID; the result for an unknown string is not visible here. */
00235 void ngram_make_lookup_tree(NGRAM_INFO *ndata);
00236 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
00237 WORD_ID make_ngram_ref(NGRAM_INFO *, char *);
00238 
/*
 * Creation and release of an NGRAM_INFO.
 *
 * ngram_info_new() takes no arguments and returns a pointer to an
 * NGRAM_INFO; presumably a newly allocated object that the caller later
 * hands to ngram_info_free() -- confirm ownership in the implementation.
 *
 * The parameter list is written as (void) so that this is a real prototype:
 * before C23 an empty () leaves the arguments unspecified, and the compiler
 * would accept calls that pass arguments by mistake.
 */
00239 NGRAM_INFO *ngram_info_new(void);
00240 void ngram_info_free(NGRAM_INFO *ngram);
/* File-name-level loaders.  Each takes a path string and returns a boolean;
 * presumably TRUE on success -- confirm.  Presumably these open the file and
 * call the matching ngram_read_*() function.
 * NOTE(review): the accepted values of `dir` are defined elsewhere. */
00241 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
00242 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir);
00243 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file);
/* Presumably resolves `str` to a word ID and stores it in ndata->unk_id --
 * confirm. */
00244 void set_unknown_id(NGRAM_INFO *ndata, char *str);
00245 
/* Presumably writes a summary of ndata to the stream fp; the output format
 * is not visible here. */
00246 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);
00247 
/* The declarations below need WORD_INFO, presumably provided by the header
 * included here.
 * NOTE(review): both functions relate an N-gram model to a vocabulary
 * (winfo); their exact effect is not visible in this header. */
00248 #include <sent/vocabulary.h>
00249 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);
00250 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo);
00251 
00252 #endif /* __SENT_NGRAM2_H__ */

Juliusに対してThu Jul 23 12:16:23 2009に生成されました。  doxygen 1.5.1