#include <sent/stddefs.h>
#include <sent/ptree.h>
#include <sent/vocabulary.h>
Include dependency graph for ngram2.h:
This graph shows which files directly or indirectly include this file:
Go to the source code of this file.
Data Structures | |
struct | NGRAM_INFO |
Main N-gram structure. More... | |
Defines | |
#define | MAX_N 3 |
Maximum number of N (now fixed to trigram). | |
#define | NNID_INVALID -1 |
Value to indicate no id. | |
#define | NNID_INVALID_UPPER 255 |
Value to indicate no id at NNID_UPPER. | |
#define | NNIDMAX 16711680 |
Allowed maximum number of NNID (255*65536). | |
#define | BINGRAM_IDSTR "julius_bingram_v3" |
Header string to identify version of bingram (v3: <= rev.3.4.2). | |
#define | BINGRAM_IDSTR_V4 "julius_bingram_v4" |
Header string to identify version of bingram (v4: >= rev.3.5). | |
#define | BINGRAM_HDSIZE 512 |
Bingram header size in bytes. | |
#define | BINGRAM_SIZESTR_HEAD "word=" |
Bingram header info string to identify the unit byte (head). | |
#define | BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)" |
Bingram header string that indicates 4 bytes unit. | |
#define | BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)" |
Bingram header string that indicates 2 bytes unit. | |
#define | BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE |
#define | BINGRAM_BYTEORDER_HEAD "byteorder=" |
Bingram header info string to identify the byte order (head) (v4). | |
#define | BINGRAM_NATURAL_BYTEORDER "LE" |
Bingram header info string to identify the byte order (body) (v4). | |
Typedefs | |
typedef unsigned char | NNID_UPPER |
Type definition for N-gram word ID. | |
typedef unsigned short | NNID_LOWER |
Type definition for N-gram word ID. | |
typedef int | NNID |
Type definition for N-gram word ID. | |
Functions | |
NNID | search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r) |
LOGPROB | uni_prob (NGRAM_INFO *ndata, WORD_ID w) |
LOGPROB | bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) |
LOGPROB | bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2) |
LOGPROB | tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3) |
boolean | ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction) |
void | set_unknown_id (NGRAM_INFO *ndata) |
Set unknown word ID to the N-gram data. | |
boolean | ngram_read_bin (FILE *fp, NGRAM_INFO *ndata) |
boolean | ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str) |
void | ngram_make_lookup_tree (NGRAM_INFO *ndata) |
WORD_ID | ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr) |
WORD_ID | make_ngram_ref (NGRAM_INFO *, char *) |
NGRAM_INFO * | ngram_info_new () |
void | ngram_info_free (NGRAM_INFO *ngram) |
void | init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file) |
void | init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile) |
void | ngram_compact_bigram_context (NGRAM_INFO *ndata) |
void | print_ngram_info (NGRAM_INFO *ndata) |
void | make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo) |
Julius uses left-to-right word bigram and reversed (right-to-left) trigram. Two input file format of ARPA standard format and Julius Binary format is supported. When using the ARPA format for recognition, the bigram file and reverse trigram file should be specified separately, and their coherence will be checked by Julius. When using the binary format, the two models are gathered in one file, and the data loading will be much faster than ARPA format. Model in either format will be stored in the same structure NGRAM_INFO.
The first three requirements can be fullfilled easily if you train the forward bigram and reverse trigram on the same training text. The last condition can be qualified if you set a cut-off value of trigram which is larger or equal to that of bigram. These conditions are checked when Julius or mkbingram reads in the ARPA models, and output error if not cleared.
From 3.5, tuple ID on 2-gram changed from 32bit to 24bit, and 2-gram back-off weights will not be saved if the corresponding 3-gram is empty. They will be performed when reading N-gram to reduce memory size.
Definition in file ngram2.h.
NNID search_bigram | ( | NGRAM_INFO * | ndata, | |
WORD_ID | w_l, | |||
WORD_ID | w_r | |||
) |
Search for 2-gram tuple (w_l, w_r) in the 2-gram part of N-gram.
ndata | [in] word/class N-gram | |
w_l | [in] left word/class ID in N-gram | |
w_r | [in] right word/class ID in N-gram |
Definition at line 39 of file ngram_access.c.
Referenced by add_bigram_rl(), bi_prob_lr(), bi_prob_rl(), and tri_prob_rl().
LOGPROB uni_prob | ( | NGRAM_INFO * | ndata, | |
WORD_ID | w | |||
) |
Get 1-gram probability of in log10.
ndata | [in] word/class N-gram | |
w | [in] word/class ID in N-gram |
Definition at line 154 of file ngram_access.c.
LOGPROB bi_prob_lr | ( | NGRAM_INFO * | ndata, | |
WORD_ID | w1, | |||
WORD_ID | w2 | |||
) |
Get LR 2-gram probability of word/class sequence in log10
ndata | [in] word/class N-gram | |
w1 | [in] left word/class ID in N-gram | |
w2 | [in] right word/class ID in N-gram (to compute probability) |
Definition at line 175 of file ngram_access.c.
LOGPROB bi_prob_rl | ( | NGRAM_INFO * | ndata, | |
WORD_ID | w1, | |||
WORD_ID | w2 | |||
) |
Get RL 2-gram probability of word/class sequence in log10.
ndata | [in] word/class N-gram | |
w1 | [in] left word/class ID in N-gram (to compute probability) | |
w2 | [in] right word/class ID in N-gram |
Definition at line 206 of file ngram_access.c.
Referenced by tri_prob_rl().
LOGPROB tri_prob_rl | ( | NGRAM_INFO * | ndata, | |
WORD_ID | w1, | |||
WORD_ID | w2, | |||
WORD_ID | w3 | |||
) |
Get RL 3-gram probability of word/class sequence in log10.
ndata | [in] word/class N-gram | |
w1 | [in] left word/class ID in N-gram (to compute probability) | |
w2 | [in] middle word/class ID in N-gram | |
w3 | [in] right word/class ID in N-gram |
Definition at line 239 of file ngram_access.c.
boolean ngram_read_arpa | ( | FILE * | fp, | |
NGRAM_INFO * | ndata, | |||
int | direction | |||
) |
Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram.
fp | [in] file pointer | |
ndata | [out] N-gram data to store the read data | |
direction | [in] specify whether this is LR 2-gram or RL 3-gram |
Definition at line 518 of file ngram_read_arpa.c.
Referenced by init_ngram_arpa().
void set_unknown_id | ( | NGRAM_INFO * | ndata | ) |
Set unknown word ID to the N-gram data.
In CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which always appear at the very beginning of N-gram entry, so we fix the unknown word ID at "0".
ndata | [out] N-gram data to set unknown word ID. |
Definition at line 72 of file ngram_read_arpa.c.
Referenced by ngram_read_arpa().
boolean ngram_read_bin | ( | FILE * | fp, | |
NGRAM_INFO * | ndata | |||
) |
Read a N-gram binary file and store to data.
fp | [in] file pointer | |
ndata | [out] N-gram data to store the read data |
Definition at line 230 of file ngram_read_bin.c.
Referenced by init_ngram_bin().
boolean ngram_write_bin | ( | FILE * | fp, | |
NGRAM_INFO * | ndata, | |||
char * | headerstr | |||
) |
Write a whole N-gram data in binary format.
fp | [in] file pointer | |
ndata | [in] N-gram data to write | |
headerstr | [in] user header string |
Definition at line 128 of file ngram_write_bin.c.
void ngram_make_lookup_tree | ( | NGRAM_INFO * | ndata | ) |
Make index tree for searching N-gram ID from the entry name.
ndata | [in] N-gram data |
Definition at line 34 of file ngram_lookup.c.
WORD_ID ngram_lookup_word | ( | NGRAM_INFO * | ndata, | |
char * | wordstr | |||
) |
Look up N-gram ID by entry name.
ndata | [in] N-gram data | |
wordstr | [in] entry name to search |
Definition at line 64 of file ngram_lookup.c.
Referenced by lookup_word(), make_ngram_ref(), and set_unknown_id().
WORD_ID make_ngram_ref | ( | NGRAM_INFO * | ndata, | |
char * | wstr | |||
) |
Return N-gram ID of entry name, or unknown class ID if not found.
ndata | [in] N-gram data | |
wstr | [in] entry name to search |
Definition at line 84 of file ngram_lookup.c.
Referenced by make_voca_ref().
NGRAM_INFO* ngram_info_new | ( | ) |
Allocate a new N-gram structure.
Definition at line 33 of file ngram_malloc.c.
void ngram_info_free | ( | NGRAM_INFO * | ndata | ) |
void init_ngram_bin | ( | NGRAM_INFO * | ndata, | |
char * | bin_ngram_file | |||
) |
Read and setup N-gram data from binary format file.
ndata | [out] pointer to N-gram data structure to store the data | |
bin_ngram_file | [in] file name of the binary N-gram |
Definition at line 35 of file init_ngram.c.
void init_ngram_arpa | ( | NGRAM_INFO * | ndata, | |
char * | ngram_lr_file, | |||
char * | ngram_rl_file | |||
) |
Read and setup N-gram data from ARPA format files of 2-gram and 3-gram.
ndata | [out] pointer to N-gram data structure to store the data | |
ngram_lr_file | [in] file name of ARPA 2-gram file | |
ngram_rl_file | [in] file name of ARPA reverse 3-gram file |
Definition at line 60 of file init_ngram.c.
void ngram_compact_bigram_context | ( | NGRAM_INFO * | ndata | ) |
Compact the 2-gram context information.
ndata | [i/o] N-gram data |
Definition at line 630 of file ngram_read_arpa.c.
Referenced by ngram_read_arpa().
void print_ngram_info | ( | NGRAM_INFO * | ndata | ) |
Output misccelaneous information of N-gram to standard output.
ndata | [in] N-gram data |
Definition at line 97 of file ngram_util.c.
Referenced by print_info().
void make_voca_ref | ( | NGRAM_INFO * | ndata, | |
WORD_INFO * | winfo | |||
) |
Make correspondence between word dictionary and N-gram vocabulary.
ndata | [i/o] word/class N-gram, the unknown word information will be set. | |
winfo | [i/o] word dictionary, the word-to-ngram-entry mapping will be done here. |
Definition at line 100 of file init_ngram.c.