libsent/include/sent/ngram2.h File Reference

Definitions for word N-gram. More...

#include <sent/stddefs.h>
#include <sent/ptree.h>
#include <sent/vocabulary.h>

Go to the source code of this file.

Data Structures

struct NGRAM_TUPLE_INFO

N-gram entries for an m-gram (1 <= m <= N). More...

struct __ngram_info__

Main N-gram structure. More...

Defines

#define MAX_N   10

Maximum number of N for N-gram.

#define NNID_INVALID   0xffffffff

Value to indicate no id (full).

#define NNID_MAX   0xfffffffe

Value of maximum value (full).

#define NNID_INVALID_UPPER   255

Value to indicate no id at NNID_UPPER (24bit).

#define NNID_MAX_24   16711679

Allowed maximum number of id (255*65536-1) (24bit).

#define BEGIN_WORD_DEFAULT   "<s>"

Default word string of beginning-of-sentence word.

#define END_WORD_DEFAULT   "</s>"

Default word string of end-of-sentence word.

#define UNK_WORD_DEFAULT   "<unk>"

Default word string of unknown word for open vocabulary.

#define UNK_WORD_DEFAULT2   "<UNK>"

#define UNK_WORD_MAXLEN   30

Maximum length of unknown word string.

#define BINGRAM_IDSTR   "julius_bingram_v3"

Header string to identify version of bingram (v3: <= rev.3.4.2).

#define BINGRAM_IDSTR_V4   "julius_bingram_v4"

Header string to identify version of bingram (v4: <= rev.3.5.3).

#define BINGRAM_IDSTR_V5   "julius_bingram_v5"

Header string to identify version of bingram (v5: >= rev.4.0).

#define BINGRAM_HDSIZE   512

Bingram header size in bytes.

#define BINGRAM_SIZESTR_HEAD   "word="

Bingram header info string to identify the unit byte (head).

#define BINGRAM_SIZESTR_BODY_4BYTE   "4byte(int)"

Bingram header string that indicates 4 bytes unit.

#define BINGRAM_SIZESTR_BODY_2BYTE   "2byte(unsigned short)"

Bingram header string that indicates 2 bytes unit.

#define BINGRAM_SIZESTR_BODY   BINGRAM_SIZESTR_BODY_2BYTE

#define BINGRAM_BYTEORDER_HEAD   "byteorder="

Bingram header info string to identify the byte order (head) (v4).

#define BINGRAM_NATURAL_BYTEORDER   "LE"

Bingram header info string to identify the byte order (body) (v4).

Typedefs

typedef unsigned int NNID

Type definition for N-gram entry ID (full).

typedef unsigned char NNID_UPPER

N-gram entry ID (24bit: upper bit).

typedef unsigned short NNID_LOWER

N-gram entry ID (24bit: lower bit).

typedef __ngram_info__ NGRAM_INFO

Main N-gram structure.

Functions

NNID search_ngram (NGRAM_INFO *ndata, int n, WORD_ID *w)

Search for N-tuples.

LOGPROB ngram_prob (NGRAM_INFO *ndata, int n, WORD_ID *w)

Get N-gram probability of the last word w_n, given context w_1^n-1.

LOGPROB uni_prob (NGRAM_INFO *ndata, WORD_ID w)

Get 1-gram probability of $w$ in log10.

LOGPROB bi_prob (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)

Get 2-gram probability. This function is not used in Julius, since each function of bi_prob_* will be called directly from the search.

void bi_prob_func_set (NGRAM_INFO *ndata)

Determine which bi-gram computation function is to be used according to the N-gram type, and set a pointer to the proper function in the N-gram data.

boolean ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, boolean addition)

Read in one ARPA N-gram file.

boolean ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)

Read a N-gram binary file and store to data.

boolean ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)

Write a whole N-gram data in binary format.

boolean ngram_compact_context (NGRAM_INFO *ndata, int n)

Compaction of back-off elements in N-gram data.

void ngram_make_lookup_tree (NGRAM_INFO *ndata)

Make index tree for searching N-gram ID from the entry name.

WORD_ID ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)

Look up N-gram ID by entry name.

WORD_ID make_ngram_ref (NGRAM_INFO *, char *)

Return N-gram ID of entry name, or unknown class ID if not found.

NGRAM_INFO * ngram_info_new ()

Allocate a new N-gram structure.

void ngram_info_free (NGRAM_INFO *ngram)

Free N-gram data.

boolean init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)

Read and setup N-gram data from binary format file.

boolean init_ngram_arpa (NGRAM_INFO *ndata, char *ngram_file, int dir)

Read and setup N-gram data from ARPA format file.

boolean init_ngram_arpa_additional (NGRAM_INFO *ndata, char *bigram_file)

Read additional LR 2-gram for 1st pass.

void set_unknown_id (NGRAM_INFO *ndata, char *str)

Set unknown word ID to the N-gram data.

void print_ngram_info (FILE *fp, NGRAM_INFO *ndata)

Output miscellaneous information of N-gram to standard output.

boolean make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)

Make correspondence between word dictionary and N-gram vocabulary.

void fix_uniprob_srilm (NGRAM_INFO *ndata, WORD_INFO *winfo)

Fix unigram probability of BOS / EOS word.

Detailed Description

Definitions for word N-gram.

This file defines a structure for the word N-gram language model. Julius now supports N-gram for arbitrary N (the maximum value of N is defined as MAX_N, and N should be >= 2).

Both directions, forward (left-to-right) N-gram and backward (right-to-left) N-gram, are supported. Since the final recognition process is done in the right-to-left direction, using a backward N-gram is recommended.

A forward 2-gram is necessary for the 1st recognition pass. If a forward N-gram is specified, Julius simply uses its 2-gram part for the 1st pass. If only a backward N-gram is specified, Julius calculates the forward probability from the defined backward N-gram by the equation "P(w_2|w_1) = P(w_1|w_2) * P(w_2) / P(w_1)". If both a forward N-gram and a backward N-gram are specified, Julius uses the 2-gram part of the forward N-gram at the 1st pass, and uses the backward N-gram at the 2nd pass as the main LM. Note that the last behavior is the same as in previous versions (<=3.5.x).

ARPA standard format and Julius binary format are supported. The binary format can be loaded much faster at startup, so it is recommended to use the binary format by converting from an ARPA format N-gram beforehand. All combinations of N-gram (forward only, backward only, forward 2-gram + backward N-gram) are supported.

See also:: mkbingram

To hold a huge word N-gram in memory efficiently, Julius merges the two language models into one structure. So the forward bigram and reverse trigram should meet the following requirements:

their vocabularies should be the same.
their unigram probabilities of each word should be the same.
the same bigram tuple sets are defined.
the bigram tuples for context word sequences of existing trigram tuples should exist in both.

The first three requirements can be fulfilled easily if you train the forward bigram and reverse trigram on the same training text. The last condition can be satisfied if you set a cut-off value for the trigram that is larger than or equal to that of the bigram. These conditions are checked when Julius or mkbingram reads in the ARPA models, and an error is output if they are not met.

From 3.5, the tuple ID on 2-gram changed from 32bit to 24bit, and 2-gram back-off weights will not be saved if the corresponding 3-gram is empty. These conversions are performed when reading the N-gram, to reduce memory size.

Author:: Akinobu LEE

Date:: Fri Feb 11 15:04:02 2005

Revision: 1.6

Definition in file ngram2.h.

Typedef Documentation

typedef struct __ngram_info__ NGRAM_INFO

Main N-gram structure.

Bigrams and trigrams are stored in the form of sequential lists. They are grouped by the same context, and referred to from the context ((N-1)-gram) data by the beginning ID and the number of entries.

Function Documentation

NNID search_ngram	(	NGRAM_INFO *	ndata,
		int	n,
		WORD_ID *	w
	)

Search for N-tuples.

Parameters:

	ndata	[in] word/class N-gram
	n	[in] N of N-gram (= number of words in w)
	w	[in] word sequence

Returns:

Definition at line 103 of file ngram_access.c.

Referenced by add_bigram().

LOGPROB ngram_prob	(	NGRAM_INFO *	ndata,
		int	n,
		WORD_ID *	w
	)

Get N-gram probability of the last word w_n, given context w_1^n-1.

Parameters:

	ndata	[in] word/class N-gram
	n	[in] N of N-gram (= number of words in w)
	w	[in] word sequence

Returns:

Definition at line 135 of file ngram_access.c.

Referenced by ngram_forw2back(), and ngram_prob().

LOGPROB uni_prob	(	NGRAM_INFO *	ndata,
		WORD_ID	w
	)

Get 1-gram probability of $w$ in log10.

Parameters:

	ndata	[in] word/class N-gram
	w	[in] word/class ID in N-gram

Returns:: log10 probability $\log p(w)$ .

Definition at line 229 of file ngram_access.c.

Referenced by get_nbest_uniprob(), and max_successor_prob().

LOGPROB bi_prob	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2
	)

Get 2-gram probability. This function is not used in Julius, since each function of bi_prob_* will be called directly from the search.

Parameters:

	ndata	[in] N-gram data that holds the 2-gram
	w1	[in] left context word
	w2	[in] right target word

Returns:: the log N-gram probability P(w2|w1)

Definition at line 419 of file ngram_access.c.

void bi_prob_func_set ( NGRAM_INFO * ndata )

Determine which bi-gram computation function is to be used according to the N-gram type, and set a pointer to the proper function in the N-gram data.

Parameters:

ndata

[i/o] N-gram information to use

Definition at line 449 of file ngram_access.c.

Referenced by ngram_read_bin().

boolean ngram_read_arpa	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		boolean	addition
	)

Read in one ARPA N-gram file.

Supported combinations are LR 2-gram, RL 3-gram and LR 3-gram.

Parameters:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data
	addition	[in] TRUE if going to read additional 2-gram

Returns:: TRUE on success, FALSE on failure.

Definition at line 525 of file ngram_read_arpa.c.

Referenced by init_ngram_arpa(), and init_ngram_arpa_additional().

boolean ngram_read_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata
	)

Read a N-gram binary file and store to data.

Parameters:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data

Returns:: TRUE on success, FALSE on failure.

Definition at line 597 of file ngram_read_bin.c.

Referenced by init_ngram_bin().

boolean ngram_write_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		char *	headerstr
	)

Write a whole N-gram data in binary format.

Parameters:

	fp	[in] file pointer
	ndata	[in] N-gram data to write
	headerstr	[in] user header string

Returns:: TRUE on success, FALSE on failure

Definition at line 135 of file ngram_write_bin.c.

boolean ngram_compact_context	(	NGRAM_INFO *	ndata,
		int	n
	)

Compaction of back-off elements in N-gram data.

Parameters:

	ndata	[i/o] N-gram information
	n	[in] N of N-gram

Returns:: TRUE on success, or FALSE on failure.

Definition at line 39 of file ngram_compact_context.c.

void ngram_make_lookup_tree ( NGRAM_INFO * ndata )

Make index tree for searching N-gram ID from the entry name.

Parameters:

ndata

[in] N-gram data

Definition at line 35 of file ngram_lookup.c.

Referenced by ngram_read_bin().

WORD_ID ngram_lookup_word	(	NGRAM_INFO *	ndata,
		char *	wordstr
	)

Look up N-gram ID by entry name.

Parameters:

	ndata	[in] N-gram data
	wordstr	[in] entry name to search

Returns:: the found class/word ID, or WORD_INVALID if not found.

Definition at line 65 of file ngram_lookup.c.

Referenced by add_bigram(), add_unigram(), make_ngram_ref(), and set_unknown_id().

WORD_ID make_ngram_ref	(	NGRAM_INFO *	ndata,
		char *	wstr
	)

Return N-gram ID of entry name, or unknown class ID if not found.

Parameters:

	ndata	[in] N-gram data
	wstr	[in] entry name to search

Returns:: the found class/word ID, or unknown ID if not found.

Definition at line 85 of file ngram_lookup.c.

Referenced by make_voca_ref().

NGRAM_INFO* ngram_info_new ( )

Allocate a new N-gram structure.

Returns:: pointer to the newly allocated structure.

Definition at line 34 of file ngram_malloc.c.

Referenced by initialize_ngram().

void ngram_info_free ( NGRAM_INFO * ndata )

Free N-gram data.

Parameters:

ndata

[in] N-gram data

Definition at line 68 of file ngram_malloc.c.

Referenced by initialize_ngram(), and j_process_lm_free().

boolean init_ngram_bin	(	NGRAM_INFO *	ndata,
		char *	bin_ngram_file
	)

Read and setup N-gram data from binary format file.

Parameters:

	ndata	[out] pointer to N-gram data structure to store the data
	bin_ngram_file	[in] file name of the binary N-gram

Definition at line 36 of file init_ngram.c.

Referenced by initialize_ngram().

boolean init_ngram_arpa	(	NGRAM_INFO *	ndata,
		char *	ngram_file,
		int	dir
	)

Read and setup N-gram data from ARPA format file.

Parameters:

	ndata	[out] pointer to N-gram data structure to store the data
	ngram_file	[in] file name of ARPA (reverse) 3-gram file
	dir	[in] direction (DIR_LR \| DIR_RL)

Definition at line 65 of file init_ngram.c.

Referenced by initialize_ngram().

boolean init_ngram_arpa_additional	(	NGRAM_INFO *	ndata,
		char *	bigram_file
	)

Read additional LR 2-gram for 1st pass.

Parameters:

	ndata	[out] pointer to N-gram data structure to store the data
	bigram_file	[in] file name of ARPA 2-gram file

Definition at line 98 of file init_ngram.c.

Referenced by initialize_ngram().

void set_unknown_id	(	NGRAM_INFO *	ndata,
		char *	str
	)

Set unknown word ID to the N-gram data.

Parameters:

	ndata	[out] N-gram data to set unknown word ID.
	str	[in] word name string of unknown word

Definition at line 169 of file init_ngram.c.

Referenced by initialize_ngram().

void print_ngram_info	(	FILE *	fp,
		NGRAM_INFO *	ndata
	)

Output miscellaneous information of N-gram to standard output.

Parameters:

	fp	[in] file pointer
	ndata	[in] N-gram data

Definition at line 79 of file ngram_util.c.

Referenced by print_engine_info().

boolean make_voca_ref	(	NGRAM_INFO *	ndata,
		WORD_INFO *	winfo
	)

Make correspondence between word dictionary and N-gram vocabulary.

Parameters:

	ndata	[i/o] word/class N-gram, the unknown word information will be set.
	winfo	[i/o] word dictionary, the word-to-ngram-entry mapping will be done here.

Definition at line 127 of file init_ngram.c.

Referenced by initialize_ngram().

void fix_uniprob_srilm	(	NGRAM_INFO *	ndata,
		WORD_INFO *	winfo
	)

Fix unigram probability of BOS / EOS word.

This function checks the probabilities of the BOS / EOS words, and if one of them is set to "-99", gives it the same value as the other one. This is the case when the LM is trained by SRILM, which assigns a unigram probability of "-99" to the beginning-of-sentence word, and causes search in the reverse direction to fail.

Parameters:

	ndata	[i/o] N-gram data
	winfo	[i/o] Vocabulary information

Definition at line 206 of file init_ngram.c.

Referenced by initialize_ngram().

Generated on Thu Jul 23 12:14:19 2009 for Julius by

1.5.1


Data Structures
struct	NGRAM_TUPLE_INFO
	N-gram entries for an m-gram (1 <= m <= N). More...
struct	__ngram_info__
	Main N-gram structure. More...
Defines
#define	MAX_N 10
	Maximum number of N for N-gram.
#define	NNID_INVALID 0xffffffff
	Value to indicate no id (full).
#define	NNID_MAX 0xfffffffe
	Value of maximum value (full).
#define	NNID_INVALID_UPPER 255
	Value to indicate no id at NNID_UPPER (24bit).
#define	NNID_MAX_24 16711679
	Allowed maximum number of id (255*65536-1) (24bit).
#define	BEGIN_WORD_DEFAULT "<s>"
	Default word string of beginning-of-sentence word.
#define	END_WORD_DEFAULT "</s>"
	Default word string of end-of-sentence word.
#define	UNK_WORD_DEFAULT "<unk>"
	Default word string of unknown word for open vocabulary.
#define	UNK_WORD_DEFAULT2 "<UNK>"
#define	UNK_WORD_MAXLEN 30
	Maximum length of unknown word string.
#define	BINGRAM_IDSTR "julius_bingram_v3"
	Header string to identify version of bingram (v3: <= rev.3.4.2).
#define	BINGRAM_IDSTR_V4 "julius_bingram_v4"
	Header string to identify version of bingram (v4: <= rev.3.5.3).
#define	BINGRAM_IDSTR_V5 "julius_bingram_v5"
	Header string to identify version of bingram (v5: >= rev.4.0).
#define	BINGRAM_HDSIZE 512
	Bingram header size in bytes.
#define	BINGRAM_SIZESTR_HEAD "word="
	Bingram header info string to identify the unit byte (head).
#define	BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
	Bingram header string that indicates 4 bytes unit.
#define	BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
	Bingram header string that indicates 2 bytes unit.
#define	BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
#define	BINGRAM_BYTEORDER_HEAD "byteorder="
	Bingram header info string to identify the byte order (head) (v4).
#define	BINGRAM_NATURAL_BYTEORDER "LE"
	Bingram header info string to identify the byte order (body) (v4).
Typedefs
typedef unsigned int	NNID
	Type definition for N-gram entry ID (full).
typedef unsigned char	NNID_UPPER
	N-gram entry ID (24bit: upper bit).
typedef unsigned short	NNID_LOWER
	N-gram entry ID (24bit: lower bit).
typedef __ngram_info__	NGRAM_INFO
	Main N-gram structure.
Functions
NNID	search_ngram (NGRAM_INFO *ndata, int n, WORD_ID *w)
	Search for N-tuples.
LOGPROB	ngram_prob (NGRAM_INFO *ndata, int n, WORD_ID *w)
	Get N-gram probability of the last word w_n, given context w_1^n-1.
LOGPROB	uni_prob (NGRAM_INFO *ndata, WORD_ID w)
	Get 1-gram probability of $w$ in log10.
LOGPROB	bi_prob (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
	Get 2-gram probability. This function is not used in Julius, since each function of bi_prob_* will be called directly from the search.
void	bi_prob_func_set (NGRAM_INFO *ndata)
	Determine which bi-gram computation function is to be used according to the N-gram type, and set a pointer to the proper function in the N-gram data.
boolean	ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, boolean addition)
	Read in one ARPA N-gram file.
boolean	ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)
	Read a N-gram binary file and store to data.
boolean	ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)
	Write a whole N-gram data in binary format.
boolean	ngram_compact_context (NGRAM_INFO *ndata, int n)
	Compaction of back-off elements in N-gram data.
void	ngram_make_lookup_tree (NGRAM_INFO *ndata)
	Make index tree for searching N-gram ID from the entry name.
WORD_ID	ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)
	Look up N-gram ID by entry name.
WORD_ID	make_ngram_ref (NGRAM_INFO *, char *)
	Return N-gram ID of entry name, or unknown class ID if not found.
NGRAM_INFO *	ngram_info_new ()
	Allocate a new N-gram structure.
void	ngram_info_free (NGRAM_INFO *ngram)
	Free N-gram data.
boolean	init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)
	Read and setup N-gram data from binary format file.
boolean	init_ngram_arpa (NGRAM_INFO *ndata, char *ngram_file, int dir)
	Read and setup N-gram data from ARPA format file.
boolean	init_ngram_arpa_additional (NGRAM_INFO *ndata, char *bigram_file)
	Read additional LR 2-gram for 1st pass.
void	set_unknown_id (NGRAM_INFO *ndata, char *str)
	Set unknown word ID to the N-gram data.
void	print_ngram_info (FILE *fp, NGRAM_INFO *ndata)
	Output miscellaneous information of N-gram to standard output.
boolean	make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)
	Make correspondence between word dictionary and N-gram vocabulary.
void	fix_uniprob_srilm (NGRAM_INFO *ndata, WORD_INFO *winfo)
	Fix unigram probability of BOS / EOS word.