libsent/include/sent/ngram2.h File Reference

Definitions for word N-gram. More...

#include <sent/stddefs.h>
#include <sent/ptree.h>
#include <sent/vocabulary.h>

Include dependency graph for ngram2.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct NGRAM_INFO

Main N-gram structure. More...

Defines

#define MAX_N   3

Maximum number of N (now fixed to trigram).

#define NNID_INVALID   -1

Value to indicate no id.

#define NNID_INVALID_UPPER   255

Value to indicate no id at NNID_UPPER.

#define NNIDMAX   16711680

Allowed maximum number of NNID (255*65536).

#define BINGRAM_IDSTR   "julius_bingram_v3"

Header string to identify version of bingram (v3: <= rev.3.4.2).

#define BINGRAM_IDSTR_V4   "julius_bingram_v4"

Header string to identify version of bingram (v4: >= rev.3.5).

#define BINGRAM_HDSIZE   512

Bingram header size in bytes.

#define BINGRAM_SIZESTR_HEAD   "word="

Bingram header info string to identify the unit byte (head).

#define BINGRAM_SIZESTR_BODY_4BYTE   "4byte(int)"

Bingram header string that indicates 4 bytes unit.

#define BINGRAM_SIZESTR_BODY_2BYTE   "2byte(unsigned short)"

Bingram header string that indicates 2 bytes unit.

#define BINGRAM_SIZESTR_BODY   BINGRAM_SIZESTR_BODY_2BYTE

#define BINGRAM_BYTEORDER_HEAD   "byteorder="

Bingram header info string to identify the byte order (head) (v4).

#define BINGRAM_NATURAL_BYTEORDER   "LE"

Bingram header info string to identify the byte order (body) (v4).

Typedefs

typedef unsigned char NNID_UPPER

Type definition for N-gram word ID.

typedef unsigned short NNID_LOWER

Type definition for N-gram word ID.

typedef int NNID

Type definition for N-gram word ID.

Functions

NNID search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)

LOGPROB uni_prob (NGRAM_INFO *ndata, WORD_ID w)

LOGPROB bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)

LOGPROB bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)

LOGPROB tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)

boolean ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction)

void set_unknown_id (NGRAM_INFO *ndata)

Set unknown word ID to the N-gram data.

boolean ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)

boolean ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)

void ngram_make_lookup_tree (NGRAM_INFO *ndata)

WORD_ID ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)

WORD_ID make_ngram_ref (NGRAM_INFO *, char *)

NGRAM_INFO * ngram_info_new ()

void ngram_info_free (NGRAM_INFO *ngram)

void init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)

void init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile)

void ngram_compact_bigram_context (NGRAM_INFO *ndata)

void print_ngram_info (NGRAM_INFO *ndata)

void make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)

Detailed Description

Definitions for word N-gram.

Author:: Akinobu LEE

Date:: Fri Feb 11 15:04:02 2005

This file defines a structure for word N-gram language model.

Julius uses a left-to-right word bigram and a reversed (right-to-left) trigram. Two input file formats are supported: the ARPA standard format and the Julius binary format. When using the ARPA format for recognition, the bigram file and the reverse trigram file should be specified separately, and their coherence will be checked by Julius. When using the binary format, the two models are gathered in one file, and data loading will be much faster than with the ARPA format. A model in either format will be stored in the same structure, NGRAM_INFO.

See also:: mkbingram

For memory efficiency when holding a huge word N-gram in memory, Julius merges the two language models into one structure. So the forward bigram and reverse trigram should meet the following requirements:

their vocabularies should be the same.
their unigram probabilities of each word should be the same.
the same bigram tuple sets are defined.
the bigram tuples for context word sequences of existing trigram tuples should exist in both.

The first three requirements can be fulfilled easily if you train the forward bigram and reverse trigram on the same training text. The last condition can be satisfied if you set a cut-off value for the trigram that is larger than or equal to that of the bigram. These conditions are checked when Julius or mkbingram reads in the ARPA models, and an error is output if they are not met.

From rev. 3.5, the tuple ID on 2-grams was changed from 32 bits to 24 bits, and 2-gram back-off weights are not saved if the corresponding 3-gram is empty. These changes are applied when reading the N-gram, to reduce memory size.

Revision: 1.6

Definition in file ngram2.h.

Function Documentation

NNID search_bigram	(	NGRAM_INFO *	ndata,
		WORD_ID	w_l,
		WORD_ID	w_r
	)

Search for 2-gram tuple (w_l, w_r) in the 2-gram part of N-gram.

Parameters:

	ndata	[in] word/class N-gram
	w_l	[in] left word/class ID in N-gram
	w_r	[in] right word/class ID in N-gram

Returns:: corresponding index to the 2-gram data part if found, or NNID_INVALID if the tuple does not exist in 2-gram.

Definition at line 39 of file ngram_access.c.

Referenced by add_bigram_rl(), bi_prob_lr(), bi_prob_rl(), and tri_prob_rl().

LOGPROB uni_prob	(	NGRAM_INFO *	ndata,
		WORD_ID	w
	)

Get 1-gram probability of $w$ in log10.

Parameters:

	ndata	[in] word/class N-gram
	w	[in] word/class ID in N-gram

Returns:: log10 probability $\log p(w)$ .

Definition at line 154 of file ngram_access.c.

LOGPROB bi_prob_lr	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2
	)

Get LR 2-gram probability of word/class sequence $(w_1, w_2)$ in log10.

Parameters:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram
	w2	[in] right word/class ID in N-gram (to compute probability)

Returns:: log10 probability $\log p(w_2|w_1)$ .

Definition at line 175 of file ngram_access.c.

LOGPROB bi_prob_rl	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2
	)

Get RL 2-gram probability of word/class sequence $(w_1, w_2)$ in log10.

Parameters:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram (to compute probability)
	w2	[in] right word/class ID in N-gram

Returns:: log10 probability $\log p(w_1|w_2)$ .

Definition at line 206 of file ngram_access.c.

Referenced by tri_prob_rl().

LOGPROB tri_prob_rl	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2,
		WORD_ID	w3
	)

Get RL 3-gram probability of word/class sequence $(w_1, w_2, w_3)$ in log10.

Parameters:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram (to compute probability)
	w2	[in] middle word/class ID in N-gram
	w3	[in] right word/class ID in N-gram

Returns:: log10 probability $\log p(w_1|w_2, w_3)$ .

Definition at line 239 of file ngram_access.c.

boolean ngram_read_arpa	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		int	direction
	)

Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram.

Parameters:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data
	direction	[in] specify whether this is LR 2-gram or RL 3-gram

Returns:: TRUE on success, FALSE on failure.

Definition at line 518 of file ngram_read_arpa.c.

Referenced by init_ngram_arpa().

void set_unknown_id ( NGRAM_INFO * ndata )

Set unknown word ID to the N-gram data.

In the CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which always appears at the very beginning of the N-gram entries, so we fix the unknown word ID at "0".

Parameters:

ndata

[out] N-gram data to set unknown word ID.

Definition at line 72 of file ngram_read_arpa.c.

Referenced by ngram_read_arpa().

boolean ngram_read_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata
	)

Read an N-gram binary file and store to data.

Parameters:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data

Returns:: TRUE on success, FALSE on failure.

Definition at line 230 of file ngram_read_bin.c.

Referenced by init_ngram_bin().

boolean ngram_write_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		char *	headerstr
	)

Write a whole N-gram data in binary format.

Parameters:

	fp	[in] file pointer
	ndata	[in] N-gram data to write
	headerstr	[in] user header string

Returns:: TRUE on success, FALSE on failure.

Definition at line 128 of file ngram_write_bin.c.

void ngram_make_lookup_tree ( NGRAM_INFO * ndata )

Make index tree for searching N-gram ID from the entry name.

Parameters:

ndata

[in] N-gram data

Definition at line 34 of file ngram_lookup.c.

WORD_ID ngram_lookup_word	(	NGRAM_INFO *	ndata,
		char *	wordstr
	)

Look up N-gram ID by entry name.

Parameters:

	ndata	[in] N-gram data
	wordstr	[in] entry name to search

Returns:: the found class/word ID, or WORD_INVALID if not found.

Definition at line 64 of file ngram_lookup.c.

Referenced by lookup_word(), make_ngram_ref(), and set_unknown_id().

WORD_ID make_ngram_ref	(	NGRAM_INFO *	ndata,
		char *	wstr
	)

Return N-gram ID of entry name, or unknown class ID if not found.

Parameters:

	ndata	[in] N-gram data
	wstr	[in] entry name to search

Returns:: the found class/word ID, or unknown ID if not found.

Definition at line 84 of file ngram_lookup.c.

Referenced by make_voca_ref().

NGRAM_INFO* ngram_info_new ( )

Allocate a new N-gram structure.

Returns:: pointer to the newly allocated structure.

Definition at line 33 of file ngram_malloc.c.

void ngram_info_free ( NGRAM_INFO * ndata )

Free N-gram data.

Parameters:

ndata

[in] N-gram data

Definition at line 48 of file ngram_malloc.c.

void init_ngram_bin	(	NGRAM_INFO *	ndata,
		char *	bin_ngram_file
	)

Read and setup N-gram data from binary format file.

Parameters:

	ndata	[out] pointer to N-gram data structure to store the data
	bin_ngram_file	[in] file name of the binary N-gram

Definition at line 35 of file init_ngram.c.

void init_ngram_arpa	(	NGRAM_INFO *	ndata,
		char *	ngram_lr_file,
		char *	ngram_rl_file
	)

Read and setup N-gram data from ARPA format files of 2-gram and 3-gram.

Parameters:

	ndata	[out] pointer to N-gram data structure to store the data
	ngram_lr_file	[in] file name of ARPA 2-gram file
	ngram_rl_file	[in] file name of ARPA reverse 3-gram file

Definition at line 60 of file init_ngram.c.

void ngram_compact_bigram_context ( NGRAM_INFO * ndata )

Compact the 2-gram context information.

Parameters:

ndata

[i/o] N-gram data

Definition at line 630 of file ngram_read_arpa.c.

Referenced by ngram_read_arpa().

void print_ngram_info ( NGRAM_INFO * ndata )

Output miscellaneous information of N-gram to standard output.

Parameters:

ndata

[in] N-gram data

Definition at line 97 of file ngram_util.c.

Referenced by print_info().

void make_voca_ref	(	NGRAM_INFO *	ndata,
		WORD_INFO *	winfo
	)

Make correspondence between word dictionary and N-gram vocabulary.

Parameters:

	ndata	[i/o] word/class N-gram, the unknown word information will be set.
	winfo	[i/o] word dictionary, the word-to-ngram-entry mapping will be done here.

Definition at line 100 of file init_ngram.c.

Generated on Tue Dec 26 12:54:05 2006 for Julian by

1.5.0


Data Structures
struct	NGRAM_INFO
	Main N-gram structure. More...
Defines
#define	MAX_N 3
	Maximum number of N (now fixed to trigram).
#define	NNID_INVALID -1
	Value to indicate no id.
#define	NNID_INVALID_UPPER 255
	Value to indicate no id at NNID_UPPER.
#define	NNIDMAX 16711680
	Allowed maximum number of NNID (255*65536).
#define	BINGRAM_IDSTR "julius_bingram_v3"
	Header string to identify version of bingram (v3: <= rev.3.4.2).
#define	BINGRAM_IDSTR_V4 "julius_bingram_v4"
	Header string to identify version of bingram (v4: >= rev.3.5).
#define	BINGRAM_HDSIZE 512
	Bingram header size in bytes.
#define	BINGRAM_SIZESTR_HEAD "word="
	Bingram header info string to identify the unit byte (head).
#define	BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
	Bingram header string that indicates 4 bytes unit.
#define	BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
	Bingram header string that indicates 2 bytes unit.
#define	BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
#define	BINGRAM_BYTEORDER_HEAD "byteorder="
	Bingram header info string to identify the byte order (head) (v4).
#define	BINGRAM_NATURAL_BYTEORDER "LE"
	Bingram header info string to identify the byte order (body) (v4).
Typedefs
typedef unsigned char	NNID_UPPER
	Type definition for N-gram word ID.
typedef unsigned short	NNID_LOWER
	Type definition for N-gram word ID.
typedef int	NNID
	Type definition for N-gram word ID.
Functions
NNID	search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)
LOGPROB	uni_prob (NGRAM_INFO *ndata, WORD_ID w)
LOGPROB	bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
LOGPROB	bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
LOGPROB	tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)
boolean	ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction)
void	set_unknown_id (NGRAM_INFO *ndata)
	Set unknown word ID to the N-gram data.
boolean	ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)
boolean	ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)
void	ngram_make_lookup_tree (NGRAM_INFO *ndata)
WORD_ID	ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)
WORD_ID	make_ngram_ref (NGRAM_INFO *, char *)
NGRAM_INFO *	ngram_info_new ()
void	ngram_info_free (NGRAM_INFO *ngram)
void	init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)
void	init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile)
void	ngram_compact_bigram_context (NGRAM_INFO *ndata)
void	print_ngram_info (NGRAM_INFO *ndata)
void	make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)