#include <sent/stddefs.h>
#include <sent/ptree.h>
#include <sent/vocabulary.h>
Include dependency graph for ngram2.h:
This graph shows which files directly or indirectly include this file:
Go to the source code of this file.
Defines | |
#define | MAX_N 3 |
Maximum number of N (now fixed to trigram). | |
#define | NNID_INVALID -1 |
Value to indicate no id. | |
#define | NNID_INVALID_UPPER 255 |
Value to indicate no id at NNID_UPPER. | |
#define | NNIDMAX 16711680 |
Allowed maximum number of NNID (255*65536). | |
#define | BINGRAM_IDSTR "julius_bingram_v3" |
Header string to identify version of bingram (v3: <= rev.3.4.2). | |
#define | BINGRAM_IDSTR_V4 "julius_bingram_v4" |
Header string to identify version of bingram (v4: >= rev.3.5). | |
#define | BINGRAM_HDSIZE 512 |
Bingram header size in bytes. | |
#define | BINGRAM_SIZESTR_HEAD "word=" |
Bingram header info string to identify the unit byte (head). | |
#define | BINGRAM_SIZESTR_BODY "2byte(unsigned short)" |
Bingram header info string to identify the unit byte (body). | |
#define | BINGRAM_BYTEORDER_HEAD "byteorder=" |
Bingram header info string to identify the byte order (head) (v4). | |
#define | BINGRAM_NATURAL_BYTEORDER "LE" |
Bingram header info string to identify the byte order (body) (v4). | |
Typedefs | |
typedef unsigned char | NNID_UPPER |
Type definition for N-gram word ID. | |
typedef unsigned short | NNID_LOWER |
Type definition for N-gram word ID. | |
typedef int | NNID |
Type definition for N-gram word ID. | |
Functions | |
NNID | search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r) |
LOGPROB | uni_prob (NGRAM_INFO *ndata, WORD_ID w) |
LOGPROB | bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) |
LOGPROB | bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) |
LOGPROB | tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3) |
boolean | ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction) |
void | set_unknown_id (NGRAM_INFO *ndata) |
Set unknown word ID to the N-gram data. | |
boolean | ngram_read_bin (FILE *fp, NGRAM_INFO *ndata) |
boolean | ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str) |
void | ngram_make_lookup_tree (NGRAM_INFO *ndata) |
WORD_ID | ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr) |
WORD_ID | make_ngram_ref (NGRAM_INFO *, char *) |
NGRAM_INFO * | ngram_info_new () |
void | ngram_info_free (NGRAM_INFO *ngram) |
void | init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file) |
void | init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile) |
void | ngram_compact_bigram_context (NGRAM_INFO *ndata) |
void | print_ngram_info (NGRAM_INFO *ndata) |
void | make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo) |
Julius uses a left-to-right word bigram and a reversed (right-to-left) trigram. Two input file formats are supported: the ARPA standard format and the Julius binary format. When using the ARPA format for recognition, the bigram file and the reverse trigram file should be specified separately, and their coherence will be checked by Julius. When using the binary format, the two models are gathered in one file, and data loading is much faster than with the ARPA format. A model in either format is stored in the same structure, NGRAM_INFO.
The first three requirements can be fulfilled easily if you train the forward bigram and the reverse trigram on the same training text. The last condition can be satisfied if you set a trigram cut-off value that is larger than or equal to that of the bigram. These conditions are checked when Julius or mkbingram reads in the ARPA models, and an error is output if they are not met.
From 3.5, the tuple ID on 2-gram changed from 32 bits to 24 bits, and 2-gram back-off weights are not saved if the corresponding 3-gram is empty. These changes are applied when reading the N-gram, to reduce memory size.
Definition in file ngram2.h.
|
Search for 2-gram tuple (w_l, w_r) in the 2-gram part of N-gram.
Definition at line 39 of file ngram_access.c. Referenced by add_bigram_rl(), bi_prob_lr(), bi_prob_rl(), set_trigram(), and tri_prob_rl(). |
|
Get 1-gram probability of word w.
Definition at line 154 of file ngram_access.c. Referenced by build_wchmm2(), calc_all_unigram_factoring_values(), get_nbest_uniprob(), max_successor_prob(), and pick_backtrellis_words(). |
|
Get LR 2-gram probability of word/class sequence $P(w_2 \mid w_1)$.
Definition at line 175 of file ngram_access.c. Referenced by max_successor_prob(), and max_successor_prob_iw(). |
|
Get RL 2-gram probability of word/class sequence $P(w_1 \mid w_2)$.
Definition at line 206 of file ngram_access.c. Referenced by pick_backtrellis_words(), and tri_prob_rl(). |
|
Get RL 3-gram probability of word/class sequence $P(w_1 \mid w_2, w_3)$.
Definition at line 239 of file ngram_access.c. Referenced by pick_backtrellis_words(). |
|
Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram.
Definition at line 518 of file ngram_read_arpa.c. Referenced by init_ngram_arpa(). |
|
Set unknown word ID to the N-gram data. In the CMU-Cam SLM toolkit, OOV words are always mapped to <unk>, which always appears at the very beginning of the N-gram entries, so we fix the unknown word ID at "0".
Definition at line 72 of file ngram_read_arpa.c. Referenced by ngram_read_arpa(), and ngram_read_bin(). |
|
Read an N-gram binary file and store its contents into the N-gram data.
Definition at line 163 of file ngram_read_bin.c. Referenced by init_ngram_bin(). |
|
Write the whole N-gram data in binary format.
Definition at line 128 of file ngram_write_bin.c. |
|
Make index tree for searching N-gram ID from the entry name.
Definition at line 34 of file ngram_lookup.c. Referenced by ngram_read_bin(). |
|
Look up N-gram ID by entry name.
Definition at line 64 of file ngram_lookup.c. Referenced by lookup_word(), make_ngram_ref(), and set_unknown_id(). |
|
Return N-gram ID of entry name, or unknown class ID if not found.
Definition at line 84 of file ngram_lookup.c. Referenced by make_voca_ref(). |
|
Allocate a new N-gram structure.
Definition at line 33 of file ngram_malloc.c. Referenced by initialize_ngram(). |
|
Free N-gram data.
Definition at line 48 of file ngram_malloc.c. |
|
Read and setup N-gram data from binary format file.
Definition at line 35 of file init_ngram.c. Referenced by initialize_ngram(). |
|
Read and setup N-gram data from ARPA format files of 2-gram and 3-gram.
Definition at line 60 of file init_ngram.c. Referenced by initialize_ngram(). |
|
Compact the 2-gram context information.
Definition at line 630 of file ngram_read_arpa.c. Referenced by ngram_read_arpa(), and ngram_read_bin(). |
|
Output miscellaneous information of N-gram to standard output.
Definition at line 97 of file ngram_util.c. Referenced by print_info(). |
|
Make correspondence between word dictionary and N-gram vocabulary.
Definition at line 100 of file init_ngram.c. Referenced by initialize_ngram(). |