libsent/include/sent/ngram2.h

単語N-gram言語モデルの定義 [詳細]

#include <sent/stddefs.h>
#include <sent/ptree.h>
#include <sent/vocabulary.h>

ngram2.hのインクルード依存関係図

このグラフは、どのファイルから直接、間接的にインクルードされているかを示しています。

ソースコードを見る。

データ構造

struct NGRAM_INFO

Main N-gram structure [詳細]

マクロ定義

#define MAX_N   3

Maximum number of N (now fixed to trigram)

#define NNID_INVALID   -1

Value to indicate no id

#define NNID_INVALID_UPPER   255

Value to indicate no id at NNID_UPPER

#define NNIDMAX   16711680

Allowed maximum number of NNID (255*65536)

#define BINGRAM_IDSTR   "julius_bingram_v3"

Header string to identify version of bingram (v3: <= rev.3.4.2)

#define BINGRAM_IDSTR_V4   "julius_bingram_v4"

Header string to identify version of bingram (v4: >= rev.3.5)

#define BINGRAM_HDSIZE   512

Bingram header size in bytes

#define BINGRAM_SIZESTR_HEAD   "word="

Bingram header info string to identify the unit byte (head)

#define BINGRAM_SIZESTR_BODY_4BYTE   "4byte(int)"

Bingram header string that indicates 4 bytes unit

#define BINGRAM_SIZESTR_BODY_2BYTE   "2byte(unsigned short)"

Bingram header string that indicates 2 bytes unit

#define BINGRAM_SIZESTR_BODY   BINGRAM_SIZESTR_BODY_2BYTE

#define BINGRAM_BYTEORDER_HEAD   "byteorder="

Bingram header info string to identify the byte order (head) (v4)

#define BINGRAM_NATURAL_BYTEORDER   "LE"

Bingram header info string to identify the byte order (body) (v4)

型定義

typedef unsigned char NNID_UPPER

Type definition for N-gram word ID

typedef unsigned short NNID_LOWER

Type definition for N-gram word ID

typedef int NNID

Type definition for N-gram word ID

関数

NNID search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)

LOGPROB uni_prob (NGRAM_INFO *ndata, WORD_ID w)

LOGPROB bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)

LOGPROB bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)

LOGPROB tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)

boolean ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction)

void set_unknown_id (NGRAM_INFO *ndata)

Set unknown word ID to the N-gram data.

boolean ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)

boolean ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)

void ngram_make_lookup_tree (NGRAM_INFO *ndata)

WORD_ID ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)

WORD_ID make_ngram_ref (NGRAM_INFO *, char *)

NGRAM_INFO * ngram_info_new ()

void ngram_info_free (NGRAM_INFO *ngram)

void init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)

void init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile)

void ngram_compact_bigram_context (NGRAM_INFO *ndata)

void print_ngram_info (NGRAM_INFO *ndata)

void make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)

説明

単語N-gram言語モデルの定義

作者:: Akinobu LEE

日付:: Fri Feb 11 15:04:02 2005

このファイルには単語N-gram言語モデルを格納するための構造体定義が含まれています．

Julius では，前向き2-gramと後向き3-gram を用います．入力ファイル形式は ARPA形式とJulius独自のバイナリ形式の２つをサポートしています．前者の場合，前向き2-gram と後向き3-gram をそれぞれ別々のファイルとして指定します．後者の場合，それらが統合された１つのバイナリファイルを読み込みます．読み込みは後者のほうが高速です．なお，Julius 内部では，どちらも同じ構造体 NGRAM_INFO に格納されます．

参照:: mkbingram

NGRAM_INFO ではメモリ量節約のため，これらを一つの構造体で表現しています．このことから，Julius は使用するこれら２つの言語モデルが以下を満たすことを要求します．

語彙が同一であること
各語彙の1-gram確率が同一であること
同じ 2-gram tuple 集合が定義されていること
3-gram のコンテキストである単語組の2-gramが定義されていること

上記の前提のほとんどは，これらの２つのN-gramを同一のコーパスから学習することで満たされます．最後の条件については，3-gram のカットオフ値に 2-gram のカットオフ値と同値かそれ以上の値を指定すればOKです．与えられたN-gramが上記を満たさない場合，Julius はエラーを出します．

Revision: 1.6

ngram2.h で定義されています。

関数

NNID search_bigram	(	NGRAM_INFO *	ndata,
		WORD_ID	w_l,
		WORD_ID	w_r
	)

Search for 2-gram tuple (w_l, w_r) in the 2-gram part of N-gram.

引数:

	ndata	[in] word/class N-gram
	w_l	[in] left word/class ID in N-gram
	w_r	[in] right word/class ID in N-gram

戻り値:: corresponding index to the 2-gram data part if found, or NNID_INVALID if the tuple does not exist in 2-gram.

ngram_access.c の 39 行で定義されています。

参照元 add_bigram_rl()・bi_prob_lr()・bi_prob_rl()・tri_prob_rl().

LOGPROB uni_prob	(	NGRAM_INFO *	ndata,
		WORD_ID	w
	)

Get 1-gram probability of $w$ in log10.

引数:

	ndata	[in] word/class N-gram
	w	[in] word/class ID in N-gram

戻り値:: log10 probability $\log p(w)$ .

ngram_access.c の 154 行で定義されています。

LOGPROB bi_prob_lr	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2
	)

Get LR 2-gram probability of word/class sequence $(w_1, w_2)$ in log10

引数:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram
	w2	[in] right word/class ID in N-gram (to compute probability)

戻り値:: log10 probability $\log p(w_2|w_1)$ .

ngram_access.c の 175 行で定義されています。

LOGPROB bi_prob_rl	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2
	)

Get RL 2-gram probability of word/class sequence $(w_1, w_2)$ in log10.

引数:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram (to compute probability)
	w2	[in] right word/class ID in N-gram

戻り値:: log10 probability $\log p(w_1|w_2)$ .

ngram_access.c の 206 行で定義されています。

参照元 tri_prob_rl().

LOGPROB tri_prob_rl	(	NGRAM_INFO *	ndata,
		WORD_ID	w1,
		WORD_ID	w2,
		WORD_ID	w3
	)

Get RL 3-gram probability of word/class sequence $(w_1, w_2, w_3)$ in log10.

引数:

	ndata	[in] word/class N-gram
	w1	[in] left word/class ID in N-gram (to compute probability)
	w2	[in] middle word/class ID in N-gram
	w3	[in] right word/class ID in N-gram

戻り値:: log10 probability $\log p(w_1|w_2, w_3)$ .

ngram_access.c の 239 行で定義されています。

boolean ngram_read_arpa	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		int	direction
	)

Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram.

引数:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data
	direction	[in] specify whether this is LR 2-gram or RL 3-gram

戻り値:: TRUE on success, FALSE on failure.

ngram_read_arpa.c の 518 行で定義されています。

参照元 init_ngram_arpa().

void set_unknown_id ( NGRAM_INFO * ndata )

Set unknown word ID to the N-gram data.

In the CMU-Cam SLM toolkit, OOV words are always mapped to UNK, which always appears at the very beginning of the N-gram entries, so we fix the unknown word ID at "0".

引数:

ndata

[out] N-gram data to set unknown word ID.

ngram_read_arpa.c の 72 行で定義されています。

参照元 ngram_read_arpa().

boolean ngram_read_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata
	)

Read an N-gram binary file and store it into ndata.

引数:

	fp	[in] file pointer
	ndata	[out] N-gram data to store the read data

戻り値:: TRUE on success, FALSE on failure.

ngram_read_bin.c の 230 行で定義されています。

参照元 init_ngram_bin().

boolean ngram_write_bin	(	FILE *	fp,
		NGRAM_INFO *	ndata,
		char *	headerstr
	)

Write the whole N-gram data in binary format.

引数:

	fp	[in] file pointer
	ndata	[in] N-gram data to write
	headerstr	[in] user header string

戻り値:: TRUE on success, FALSE on failure

ngram_write_bin.c の 128 行で定義されています。

void ngram_make_lookup_tree ( NGRAM_INFO * ndata )

Make index tree for searching N-gram ID from the entry name.

引数:

ndata

[in] N-gram data

ngram_lookup.c の 34 行で定義されています。

WORD_ID ngram_lookup_word	(	NGRAM_INFO *	ndata,
		char *	wordstr
	)

Look up N-gram ID by entry name.

引数:

	ndata	[in] N-gram data
	wordstr	[in] entry name to search

戻り値:: the found class/word ID, or WORD_INVALID if not found.

ngram_lookup.c の 64 行で定義されています。

参照元 lookup_word()・make_ngram_ref()・set_unknown_id().

WORD_ID make_ngram_ref	(	NGRAM_INFO *	ndata,
		char *	wstr
	)

Return N-gram ID of entry name, or unknown class ID if not found.

引数:

	ndata	[in] N-gram data
	wstr	[in] entry name to search

戻り値:: the found class/word ID, or unknown ID if not found.

ngram_lookup.c の 84 行で定義されています。

参照元 make_voca_ref().

NGRAM_INFO* ngram_info_new ( )

Allocate a new N-gram structure.

戻り値:: pointer to the newly allocated structure.

ngram_malloc.c の 33 行で定義されています。

void ngram_info_free ( NGRAM_INFO * ndata )

Free N-gram data.

引数:

ndata

[in] N-gram data

ngram_malloc.c の 48 行で定義されています。

void init_ngram_bin	(	NGRAM_INFO *	ndata,
		char *	bin_ngram_file
	)

Read and setup N-gram data from binary format file.

引数:

	ndata	[out] pointer to N-gram data structure to store the data
	bin_ngram_file	[in] file name of the binary N-gram

init_ngram.c の 35 行で定義されています。

void init_ngram_arpa	(	NGRAM_INFO *	ndata,
		char *	ngram_lr_file,
		char *	ngram_rl_file
	)

Read and setup N-gram data from ARPA format files of 2-gram and 3-gram.

引数:

	ndata	[out] pointer to N-gram data structure to store the data
	ngram_lr_file	[in] file name of ARPA 2-gram file
	ngram_rl_file	[in] file name of ARPA reverse 3-gram file

init_ngram.c の 60 行で定義されています。

void ngram_compact_bigram_context ( NGRAM_INFO * ndata )

Compact the 2-gram context information.

引数:

ndata

[i/o] N-gram data

ngram_read_arpa.c の 630 行で定義されています。

参照元 ngram_read_arpa().

void print_ngram_info ( NGRAM_INFO * ndata )

Output miscellaneous information of N-gram to standard output.

引数:

ndata

[in] N-gram data

ngram_util.c の 97 行で定義されています。

参照元 print_info().

void make_voca_ref	(	NGRAM_INFO *	ndata,
		WORD_INFO *	winfo
	)

Make correspondence between word dictionary and N-gram vocabulary.

引数:

	ndata	[i/o] word/class N-gram, the unknown word information will be set.
	winfo	[i/o] word dictionary, the word-to-ngram-entry mapping will be done here.

init_ngram.c の 100 行で定義されています。

Julianに対してTue Dec 26 12:57:02 2006に生成されました。

1.5.0


データ構造
struct	NGRAM_INFO
	Main N-gram structure [詳細]
マクロ定義
#define	MAX_N 3
	Maximum number of N (now fixed to trigram)
#define	NNID_INVALID -1
	Value to indicate no id
#define	NNID_INVALID_UPPER 255
	Value to indicate no id at NNID_UPPER
#define	NNIDMAX 16711680
	Allowed maximum number of NNID (255*65536)
#define	BINGRAM_IDSTR "julius_bingram_v3"
	Header string to identify version of bingram (v3: <= rev.3.4.2)
#define	BINGRAM_IDSTR_V4 "julius_bingram_v4"
	Header string to identify version of bingram (v4: >= rev.3.5)
#define	BINGRAM_HDSIZE 512
	Bingram header size in bytes
#define	BINGRAM_SIZESTR_HEAD "word="
	Bingram header info string to identify the unit byte (head)
#define	BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"
	Bingram header string that indicates 4 bytes unit
#define	BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"
	Bingram header string that indicates 2 bytes unit
#define	BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
#define	BINGRAM_BYTEORDER_HEAD "byteorder="
	Bingram header info string to identify the byte order (head) (v4)
#define	BINGRAM_NATURAL_BYTEORDER "LE"
	Bingram header info string to identify the byte order (body) (v4)
型定義
typedef unsigned char	NNID_UPPER
	Type definition for N-gram word ID
typedef unsigned short	NNID_LOWER
	Type definition for N-gram word ID
typedef int	NNID
	Type definition for N-gram word ID
関数
NNID	search_bigram (NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)
LOGPROB	uni_prob (NGRAM_INFO *ndata, WORD_ID w)
LOGPROB	bi_prob_lr (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
LOGPROB	bi_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
LOGPROB	tri_prob_rl (NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)
boolean	ngram_read_arpa (FILE *fp, NGRAM_INFO *ndata, int direction)
void	set_unknown_id (NGRAM_INFO *ndata)
	Set unknown word ID to the N-gram data.
boolean	ngram_read_bin (FILE *fp, NGRAM_INFO *ndata)
boolean	ngram_write_bin (FILE *fp, NGRAM_INFO *ndata, char *header_str)
void	ngram_make_lookup_tree (NGRAM_INFO *ndata)
WORD_ID	ngram_lookup_word (NGRAM_INFO *ndata, char *wordstr)
WORD_ID	make_ngram_ref (NGRAM_INFO *, char *)
NGRAM_INFO *	ngram_info_new ()
void	ngram_info_free (NGRAM_INFO *ngram)
void	init_ngram_bin (NGRAM_INFO *ndata, char *ngram_file)
void	init_ngram_arpa (NGRAM_INFO *ndata, char *lrfile, char *rlfile)
void	ngram_compact_bigram_context (NGRAM_INFO *ndata)
void	print_ngram_info (NGRAM_INFO *ndata)
void	make_voca_ref (NGRAM_INFO *ndata, WORD_INFO *winfo)