00001
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 #include <julius/julius.h>
00047
00048
00049 #define WCHMM_SIZE_CHECK
00050
00051
00052
00053
00054
00069 WCHMM_INFO *
00070 wchmm_new()
00071 {
00072 WCHMM_INFO *w;
00073 w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO));
00074 w->lmtype = LM_UNDEF;
00075 w->lmvar = LM_UNDEF;
00076 w->ngram = NULL;
00077 w->dfa = NULL;
00078 w->winfo = NULL;
00079 w->malloc_root = NULL;
00080 #ifdef PASS1_IWCD
00081 w->lcdset_category_root = NULL;
00082 #endif
00083 w->wrk.out_from_len = 0;
00084
00085 w->uni_prob_user = NULL;
00086 w->bi_prob_user = NULL;
00087 return w;
00088 }
00089
00102 static void
00103 wchmm_init(WCHMM_INFO *wchmm)
00104 {
00105
00106 wchmm->maxwcn = wchmm->winfo->totalstatenum / 2;
00107 wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn);
00108 wchmm->self_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00109 wchmm->next_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00110 wchmm->ac = (A_CELL2 **)mymalloc(sizeof(A_CELL2 *)*wchmm->maxwcn);
00111 wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00112 wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num);
00113 wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00114 wchmm->maxstartnum = STARTNODE_STEP;
00115 wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP);
00116 wchmm->startnum = 0;
00117 if (wchmm->category_tree) {
00118 wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP);
00119 }
00120 if (wchmm->hmminfo->multipath) {
00121 wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00122 wchmm->wrk.out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00123 wchmm->wrk.out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00124 wchmm->wrk.out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00125 wchmm->wrk.out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00126 wchmm->wrk.out_from_len = wchmm->winfo->maxwn;
00127 } else {
00128 wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num);
00129 }
00130 #ifdef PASS1_IWCD
00131 wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn);
00132 #endif
00133 #ifdef UNIGRAM_FACTORING
00134 wchmm->start2isolate = NULL;
00135 wchmm->isolatenum = 0;
00136 #endif
00137 if (!wchmm->category_tree) {
00138 wchmm->sclist = NULL;
00139 wchmm->sclist2node = NULL;
00140 #ifdef UNIGRAM_FACTORING
00141 wchmm->fscore = NULL;
00142 #endif
00143 }
00144
00145 wchmm->n = 0;
00146 }
00147
00160 static void
00161 wchmm_extend(WCHMM_INFO *wchmm)
00162 {
00163
00164 wchmm->maxwcn += wchmm->winfo->totalstatenum / 6;
00165 wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn);
00166 wchmm->self_a = (LOGPROB *)myrealloc(wchmm->self_a, sizeof(LOGPROB)*wchmm->maxwcn);
00167 wchmm->next_a = (LOGPROB *)myrealloc(wchmm->next_a, sizeof(LOGPROB)*wchmm->maxwcn);
00168 wchmm->ac = (A_CELL2 **)myrealloc(wchmm->ac, sizeof(A_CELL2 *)*wchmm->maxwcn);
00169 wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn);
00170 #ifdef PASS1_IWCD
00171 wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn);
00172 #endif
00173 }
00174
00187 static void
00188 wchmm_extend_startnode(WCHMM_INFO *wchmm)
00189 {
00190 wchmm->maxstartnum += STARTNODE_STEP;
00191 wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum);
00192 if (wchmm->category_tree) {
00193 wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->start2wid, sizeof(WORD_ID) * wchmm->maxstartnum);
00194 }
00195 }
00196
00211 void
00212 wchmm_free(WCHMM_INFO *w)
00213 {
00214 S_CELL *sc, *sctmp;
00215 int i;
00216
00217
00218 #ifdef PASS1_IWCD
00219
00220 #endif
00221
00222 mybfree2(&(w->malloc_root));
00223 if (!w->category_tree) {
00224 if (w->sclist != NULL) {
00225 for(i=1;i<w->scnum;i++) {
00226 sc = w->sclist[i];
00227 while(sc) {
00228 sctmp = sc->next;
00229 free(sc);
00230 sc = sctmp;
00231 }
00232 }
00233 free(w->sclist);
00234 }
00235 if (w->sclist2node != NULL) free(w->sclist2node);
00236 #ifdef UNIGRAM_FACTORING
00237 if (w->fscore != NULL) free(w->fscore);
00238 #endif
00239 }
00240 #ifdef UNIGRAM_FACTORING
00241 if (w->start2isolate != NULL) free(w->start2isolate);
00242 #endif
00243 #ifdef PASS1_IWCD
00244 free(w->outstyle);
00245 #endif
00246 if (w->hmminfo->multipath) {
00247 free(w->wordbegin);
00248 } else {
00249 free(w->wordend_a);
00250 }
00251 if (w->category_tree) free(w->start2wid);
00252 free(w->startnode);
00253 free(w->wordend);
00254 free(w->offset);
00255 free(w->stend);
00256 free(w->ac);
00257 free(w->next_a);
00258 free(w->self_a);
00259 free(w->state);
00260 #ifdef PASS1_IWCD
00261 if (w->category_tree) lcdset_remove_with_category_all(w);
00262 #endif
00263 if (w->wrk.out_from_len != 0) {
00264 free(w->wrk.out_from);
00265 free(w->wrk.out_from_next);
00266 free(w->wrk.out_a);
00267 free(w->wrk.out_a_next);
00268 w->wrk.out_from_len = 0;
00269 }
00270 free(w);
00271 }
00272
00273
00274
00275
00276
00277
00278 static WORD_INFO *local_winfo;
00279
00298 static int
00299 compare_wseq(WORD_ID *widx1, WORD_ID *widx2)
00300 {
00301 int len1, len2, n;
00302 int p=0;
00303
00304 len1 = local_winfo->wlen[*widx1];
00305 len2 = local_winfo->wlen[*widx2];
00306
00307 n=0;
00308
00309 while (n < len1 && n < len2 && (p = strcmp((local_winfo->wseq[*widx1][n])->name, (local_winfo->wseq[*widx2][n])->name)) == 0 ) n++;
00310 if (n < len1) {
00311 if (n < len2) {
00312
00313 return(p);
00314 } else {
00315
00316 return(1);
00317 }
00318 } else {
00319 if (n < len2) {
00320
00321 return(-1);
00322 } else {
00323
00324 return(0);
00325 }
00326 }
00327 }
00328
00347 static void
00348 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len)
00349 {
00350 local_winfo = winfo;
00351 qsort(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_wseq);
00352 }
00353
00372 static int
00373 compare_category(WORD_ID *widx1, WORD_ID *widx2)
00374 {
00375 int c1,c2;
00376 c1 = local_winfo->wton[*widx1];
00377 c2 = local_winfo->wton[*widx2];
00378 return(c1 - c2);
00379 }
00380
00397 static void
00398 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len)
00399 {
00400 local_winfo = winfo;
00401 qsort(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_category);
00402 }
00403
00404
00405
00406
00407
00408
00430 static int
00431 wchmm_check_match(WORD_INFO *winfo, int i, int j)
00432 {
00433 int k,tmplen;
00434
00435 for (tmplen=0,k=0;k<winfo->wlen[i];k++) {
00436 if (k > winfo->wlen[j]-1)
00437 break;
00438 if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name)))
00439 break;
00440 tmplen++;
00441 }
00442 return(tmplen);
00443 }
00444
00457 static void
00458 acc_init(WCHMM_INFO *wchmm, int node)
00459 {
00460 wchmm->self_a[node] = LOG_ZERO;
00461 wchmm->next_a[node] = LOG_ZERO;
00462 wchmm->ac[node] = NULL;
00463 }
00464
00481 static void
00482 add_ac(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00483 {
00484 A_CELL2 *ac2;
00485
00486 for(ac2=wchmm->ac[node];ac2;ac2=ac2->next) {
00487 if (ac2->n < A_CELL2_ALLOC_STEP) break;
00488 }
00489 if (ac2 == NULL) {
00490 ac2 = (A_CELL2 *)mybmalloc2(sizeof(A_CELL2), &(wchmm->malloc_root));
00491 ac2->n = 0;
00492 ac2->next = wchmm->ac[node];
00493 wchmm->ac[node] = ac2;
00494 }
00495 ac2->arc[ac2->n] = arc;
00496 ac2->a[ac2->n] = a;
00497 ac2->n++;
00498 }
00499
00518 static void
00519 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00520 {
00521 if (arc == node) {
00522 wchmm->self_a[node] = a;
00523 } else if (arc == node + 1) {
00524 wchmm->next_a[node] = a;
00525 } else {
00526 add_ac(wchmm, node, a, arc);
00527 }
00528 }
00529
00556 static void
00557 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp)
00558 {
00559 HMM_Logical *ltmp;
00560 int states;
00561 int k;
00562 LOGPROB prob;
00563 int oldnum;
00564
00565 if (pos < 0) {
00566
00567
00568 node[*num] = wchmm->wordbegin[w];
00569 a[*num] = 0.0;
00570 (*num)++;
00571
00572 } else {
00573
00574 ltmp = wchmm->winfo->wseq[w][pos];
00575 states = hmm_logical_state_num(ltmp);
00576
00577
00578 if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) {
00579
00580 oldnum = *num;
00581 get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE);
00582
00583 for(k=oldnum;k<*num;k++) {
00584 a[k] += (hmm_logical_trans(ltmp))->a[0][states-1];
00585 }
00586 }
00587
00588 for (k = 1; k < states - 1; k++) {
00589 prob = (hmm_logical_trans(ltmp))->a[k][states-1];
00590 if (prob != LOG_ZERO) {
00591 if (*num >= maxnum) {
00592 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00593 }
00594 node[*num] = wchmm->offset[w][pos] + k - 1;
00595 a[*num] = prob;
00596 (*num)++;
00597 }
00598 }
00599
00600
00601
00602
00603
00604 if (insert_sp) {
00605
00606 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
00607 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1];
00608 if (prob != LOG_ZERO) {
00609 if (*num >= maxnum) {
00610 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00611 }
00612 node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1;
00613 a[*num] = prob;
00614 (*num)++;
00615 }
00616 }
00617 }
00618 }
00619
00620
00621 return;
00622 }
00623
00642 static void
00643 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo)
00644 {
00645 A_CELL2 *actmp;
00646 LOGPROB a;
00647 int i, j;
00648 boolean tflag;
00649
00650
00651 for(i = tinfo->statenum - 2; i >= 0; i--) {
00652 if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) {
00653
00654 tflag = FALSE;
00655 if (to_node == from_node && wchmm->self_a[from_node] == a) {
00656 tflag = TRUE;
00657 } else if (to_node == from_node + 1 && wchmm->next_a[from_node] == a) {
00658 tflag = TRUE;
00659 } else {
00660 for (actmp = wchmm->ac[from_node]; actmp; actmp = actmp->next) {
00661 for(j=0;j<actmp->n;j++) {
00662 if (actmp->arc[j] == to_node && actmp->a[j] == a) {
00663 tflag = TRUE;
00664 break;
00665 }
00666 }
00667 if (tflag == TRUE) break;
00668 }
00669 }
00670 if (tflag) break;
00671
00672 add_wacc(wchmm, from_node, a, to_node);
00673 return;
00674 }
00675 }
00676 j_internal_error("wchmm_link_hmm: No arc to endstate?\n");
00677 }
00678
00699 static void
00700 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq)
00701 {
00702 HMM_Logical *last;
00703 int lastp;
00704
00705 last = wchmm->winfo->wseq[from_word][from_seq];
00706 lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1;
00707 wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq],
00708 hmm_logical_trans(last));
00709 }
00710
00711
00712
00713
00714
00754 static void
00755 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word)
00756 {
00757 int j, n;
00758 int n_src, n_prev;
00759 A_CELL2 *ac;
00760 HMM_Logical *lastphone;
00761
00762
00763 if (wchmm->n + 1 >= wchmm->maxwcn) {
00764 wchmm_extend(wchmm);
00765 }
00766
00767 n = wchmm->n;
00768
00769 n_src = node;
00770
00771
00772 #ifdef PASS1_IWCD
00773 {
00774 RC_INFO *rcnew;
00775 LRC_INFO *lrcnew;
00776 wchmm->outstyle[n] = wchmm->outstyle[n_src];
00777 if (wchmm->outstyle[n] == AS_RSET) {
00778
00779 rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
00780 memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO));
00781 wchmm->state[n].out.rset = rcnew;
00782 } else if (wchmm->outstyle[n] == AS_LRSET) {
00783
00784 lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
00785 memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO));
00786 wchmm->state[n].out.lrset = lrcnew;
00787 } else {
00788
00789 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC));
00790 }
00791 }
00792 #else
00793 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *));
00794 #endif
00795
00796 lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1];
00797 acc_init(wchmm, n);
00798
00799
00800 wchmm->self_a[n] = wchmm->self_a[n_src];
00801
00802
00803 if (hmm_logical_state_num(lastphone) == 3) {
00804
00805 if (wchmm->winfo->wlen[word] == 1) {
00806
00807 wchmm->offset[word][0] = n;
00808
00809 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
00810 wchmm->startnode[wchmm->startnum] = n;
00811 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
00812
00813 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
00814 }
00815 } else {
00816
00817 n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2]
00818 + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3;
00819 if(n_src == n_prev + 1) {
00820 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00821 } else {
00822 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00823 for(j=0;j<ac->n;j++) {
00824 if (ac->arc[j] == n_src) {
00825 add_wacc(wchmm, n_prev, ac->a[j], n);
00826 }
00827 }
00828 }
00829 }
00830
00831 wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n;
00832 }
00833 } else {
00834
00835 for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) {
00836 if (n_src == n_prev + 1) {
00837 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00838 } else {
00839 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00840 for(j=0;j<ac->n;j++) {
00841 if (ac->arc[j] == n_src) {
00842 add_wacc(wchmm, n_prev, ac->a[j], n);
00843 }
00844 }
00845 }
00846 }
00847 if (n_prev == n_src + 1) {
00848 add_wacc(wchmm, n, wchmm->next_a[n_src], n_prev);
00849 } else {
00850 for(ac=wchmm->ac[n_src];ac;ac=ac->next) {
00851 for(j=0;j<ac->n;j++) {
00852 if (ac->arc[j] == n_prev) {
00853 add_wacc(wchmm, n, ac->a[j], n_prev);
00854 }
00855 }
00856 }
00857 }
00858 }
00859 }
00860
00861
00862 wchmm->stend[n] = word;
00863 wchmm->wordend[word] = n;
00864
00865
00866 wchmm->n++;
00867
00868 }
00869
00884 static int
00885 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm)
00886 {
00887 int w, nlast, n, narc, narc_model;
00888 boolean *dupw;
00889 A_CELL2 *actmp;
00890 int dupcount;
00891
00892 dupcount = 0;
00893
00894 nlast = wchmm->n;
00895 dupw = (boolean *)mymalloc(sizeof(boolean) * nlast);
00896 for(n=0;n<nlast;n++) dupw[n] = FALSE;
00897
00898 for (w=0;w<wchmm->winfo->num;w++) {
00899 n = wchmm->wordend[w];
00900 if (dupw[n]) {
00901 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00902 } else {
00903
00904 {
00905
00906 HMM_Logical *lastphone;
00907 HTK_HMM_Trans *tinfo;
00908 int laststate, i;
00909 lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1];
00910 laststate = hmm_logical_state_num(lastphone) - 2;
00911 tinfo = hmm_logical_trans(lastphone);
00912 narc_model=0;
00913 for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) {
00914 if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++;
00915 }
00916
00917 narc = 0;
00918 if (wchmm->self_a[n] != LOG_ZERO) narc++;
00919 if (wchmm->next_a[n] != LOG_ZERO) narc++;
00920 for(actmp=wchmm->ac[n];actmp;actmp=actmp->next) narc += actmp->n;
00921 }
00922
00923 if (narc_model != narc) {
00924
00925
00926 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00927
00928
00929 wchmm->stend[n] = WORD_INVALID;
00930 } else {
00931
00932
00933 wchmm->stend[n] = w;
00934 }
00935
00936 dupw[n] = TRUE;
00937 }
00938 }
00939 free(dupw);
00940
00941 return(dupcount);
00942 }
00943
00944
00945
00946
00947
00972 static boolean
00973 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword, boolean enable_iwsp)
00974 {
00975 boolean ok_p;
00976 int j,k,n;
00977 int add_head, add_tail, add_to;
00978 int word_len, matchword_len;
00979 HMM_Logical *ltmp;
00980 int ato;
00981 LOGPROB prob;
00982 int ntmp;
00983 int ltmp_state_num;
00984 #ifdef PASS1_IWCD
00985 CD_Set *lcd = NULL;
00986 #endif
00987 int *out_from;
00988 int *out_from_next;
00989 LOGPROB *out_a;
00990 LOGPROB *out_a_next;
00991
00992
00993
00994 int out_num_prev, out_num_next;
00995 int kkk;
00996
00997 ok_p = TRUE;
00998 if (wchmm->hmminfo->multipath) {
00999 out_from = wchmm->wrk.out_from;
01000 out_from_next = wchmm->wrk.out_from_next;
01001 out_a = wchmm->wrk.out_a;
01002 out_a_next = wchmm->wrk.out_a_next;
01003 }
01004
01005
01006
01007
01008
01009
01010
01011
01012
01013
01014
01015 n = wchmm->n;
01016 word_len = wchmm->winfo->wlen[word];
01017 matchword_len = wchmm->winfo->wlen[matchword];
01018
01019
01020 wchmm->offset[word] = (int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root));
01021
01022
01023 add_head = matchlen;
01024 add_tail = word_len - 1;
01025 add_to = matchlen - 1;
01026
01027 if (wchmm->hmminfo->multipath) {
01028
01029 if (matchlen == 0) {
01030
01031 wchmm->wordbegin[word] = n;
01032 wchmm->stend[n] = WORD_INVALID;
01033 acc_init(wchmm, n);
01034 wchmm->state[n].out.state = NULL;
01035
01036 wchmm->startnode[wchmm->startnum] = n;
01037 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01038
01039 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01040 if (++n >= wchmm->maxwcn) wchmm_extend(wchmm);
01041 } else {
01042 wchmm->wordbegin[word] = wchmm->wordbegin[matchword];
01043 }
01044
01045
01046
01047
01048 out_num_prev = 0;
01049 if (matchlen == 0) {
01050
01051 out_from[0] = wchmm->wordbegin[word];
01052 out_a[0] = 0.0;
01053 out_num_prev = 1;
01054 } else {
01055
01056
01057 get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (enable_iwsp && add_tail - add_head + 1 <= 0) ? TRUE : FALSE);
01058
01059 }
01060 } else {
01061 if (matchlen == 0) {
01062 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
01063
01064 wchmm->startnode[wchmm->startnum] = n;
01065 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01066
01067 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01068 }
01069 }
01070 }
01071
01072 if (add_tail - add_head + 1 > 0) {
01073 ntmp = n;
01074 for (j=add_head; j <= add_tail; j++) {
01075 ltmp = wchmm->winfo->wseq[word][j];
01076 ltmp_state_num = hmm_logical_state_num(ltmp);
01077 #ifdef PASS1_IWCD
01078 if (wchmm->ccd_flag) {
01079
01080
01081 if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) {
01082 if (wchmm->category_tree) {
01083 #ifdef USE_OLD_IWCD
01084 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01085 #else
01086 lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]);
01087 if (lcd == NULL) {
01088
01089
01090
01091 jlog("WARNING: wchmm: no lcdset found for [%s::%04d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name);
01092 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01093 }
01094 #endif
01095 } else {
01096 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01097 }
01098 if (lcd == NULL) {
01099 jlog("ERROR: wchmm: at word #%d: no lcdset found for [%s]\n", word, ltmp->name);
01100 ok_p = FALSE;
01101 }
01102 }
01103 }
01104 #endif
01105 for (k = 1; k < ltmp_state_num - 1; k++) {
01106
01107 #ifdef PASS1_IWCD
01108 if (wchmm->ccd_flag) {
01109
01110 if (wchmm->winfo->wlen[word] == 1) {
01111 wchmm->outstyle[ntmp] = AS_LRSET;
01112 wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
01113 (wchmm->state[ntmp].out.lrset)->hmm = ltmp;
01114 (wchmm->state[ntmp].out.lrset)->state_loc = k;
01115 if (wchmm->category_tree) {
01116 (wchmm->state[ntmp].out.lrset)->category = wchmm->winfo->wton[word];
01117 }
01118 } else if (j == 0) {
01119 wchmm->outstyle[ntmp] = AS_RSET;
01120 wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
01121 (wchmm->state[ntmp].out.rset)->hmm = ltmp;
01122 (wchmm->state[ntmp].out.rset)->state_loc = k;
01123 } else if (j == wchmm->winfo->wlen[word] - 1) {
01124 wchmm->outstyle[ntmp] = AS_LSET;
01125 wchmm->state[ntmp].out.lset = &(lcd->stateset[k]);
01126 } else {
01127 wchmm->outstyle[ntmp] = AS_STATE;
01128 if (ltmp->is_pseudo) {
01129 jlog("WARNING: wchmm: word-internal phone should not be pseudo\n");
01130 put_voca(stdout, wchmm->winfo, word);
01131 ok_p = FALSE;
01132 }
01133 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01134 }
01135 } else {
01136
01137 if (ltmp->is_pseudo) {
01138 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01139 put_voca(stdout, wchmm->winfo, word);
01140 ok_p = FALSE;
01141 }
01142 wchmm->outstyle[ntmp] = AS_STATE;
01143 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01144 }
01145 #else
01146 if (ltmp->is_pseudo) {
01147 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01148 put_voca(stdout, wchmm->winfo, word);
01149 ok_p = FALSE;
01150 }
01151 wchmm->state[ntmp].out = ltmp->body.defined->s[k];
01152 #endif
01153
01154
01155 acc_init(wchmm, ntmp);
01156 wchmm->stend[ntmp] = WORD_INVALID;
01157 if (! wchmm->hmminfo->multipath) {
01158
01159 for (ato = 1; ato < ltmp_state_num; ato++) {
01160 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01161 if (prob != LOG_ZERO) {
01162 if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) {
01163
01164 } else {
01165 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01166 }
01167 }
01168 }
01169 }
01170
01171 ntmp++;
01172
01173 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01174 }
01175 }
01176
01177 if (wchmm->hmminfo->multipath) {
01178
01179
01180
01181
01182 ntmp = n;
01183 for (j = add_head; j <= add_tail; j++) {
01184 ltmp = wchmm->winfo->wseq[word][j];
01185 ltmp_state_num = hmm_logical_state_num(ltmp);
01186 out_num_next = 0;
01187
01188 for (ato = 1; ato < ltmp_state_num; ato++) {
01189 prob = (hmm_logical_trans(ltmp))->a[0][ato];
01190 if (prob != LOG_ZERO) {
01191
01192 if (ato == ltmp_state_num - 1) {
01193
01194 for(kkk=0; kkk<out_num_prev; kkk++) {
01195 out_from_next[out_num_next] = out_from[kkk];
01196 out_a_next[out_num_next] = out_a[kkk] + prob;
01197 out_num_next++;
01198 }
01199 } else {
01200 for(kkk=0; kkk<out_num_prev; kkk++) {
01201 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01202 }
01203 }
01204 }
01205 }
01206
01207 for(k = 1; k < ltmp_state_num - 1; k++) {
01208 for (ato = 1; ato < ltmp_state_num; ato++) {
01209 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01210 if (prob != LOG_ZERO) {
01211 if (ato == ltmp_state_num - 1) {
01212
01213 out_from_next[out_num_next] = ntmp;
01214 out_a_next[out_num_next] = prob;
01215 out_num_next++;
01216 } else {
01217 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01218 }
01219 }
01220 }
01221 ntmp++;
01222 }
01223
01224 for(kkk=0;kkk<out_num_next;kkk++) {
01225 out_from[kkk] = out_from_next[kkk];
01226 out_a[kkk] = out_a_next[kkk];
01227 }
01228 out_num_prev = out_num_next;
01229 }
01230 }
01231
01232 }
01233
01234
01235
01236
01237
01238
01239
01240 if (wchmm->hmminfo->multipath && enable_iwsp && add_tail - add_head + 1 > 0) {
01241 int ntmp_bak;
01242
01243
01244 ntmp_bak = ntmp;
01245 if (wchmm->hmminfo->sp->is_pseudo) {
01246 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01247 wchmm->outstyle[ntmp] = AS_LSET;
01248 wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]);
01249 acc_init(wchmm, ntmp);
01250 wchmm->stend[ntmp] = WORD_INVALID;
01251 ntmp++;
01252 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01253 }
01254 } else {
01255 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01256 wchmm->outstyle[ntmp] = AS_STATE;
01257 wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k];
01258 acc_init(wchmm, ntmp);
01259 wchmm->stend[ntmp] = WORD_INVALID;
01260 ntmp++;
01261 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01262 }
01263 }
01264 ntmp = ntmp_bak;
01265
01266 out_num_next = 0;
01267 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01268 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato];
01269 if (prob != LOG_ZERO) {
01270
01271
01272 prob += wchmm->hmminfo->iwsp_penalty;
01273 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01274
01275 for(kkk=0; kkk<out_num_prev; kkk++) {
01276 out_from_next[out_num_next] = out_from[kkk];
01277 out_a_next[out_num_next] = out_a[kkk] + prob;
01278 out_num_next++;
01279 }
01280 } else {
01281
01282 for(kkk=0; kkk<out_num_prev; kkk++) {
01283 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01284 }
01285 }
01286 }
01287 }
01288
01289 if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) {
01290
01291
01292 prob = 0.0;
01293 for(kkk=0; kkk<out_num_prev; kkk++) {
01294 out_from_next[out_num_next] = out_from[kkk];
01295 out_a_next[out_num_next] = out_a[kkk] + prob;
01296 out_num_next++;
01297 }
01298 }
01299
01300 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01301 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01302 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato];
01303 if (prob != LOG_ZERO) {
01304 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01305 out_from_next[out_num_next] = ntmp;
01306 out_a_next[out_num_next] = prob;
01307 out_num_next++;
01308 } else {
01309 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01310 }
01311 }
01312 }
01313 ntmp++;
01314 }
01315
01316 for(kkk=0;kkk<out_num_next;kkk++) {
01317 out_from[kkk] = out_from_next[kkk];
01318 out_a[kkk] = out_a_next[kkk];
01319 }
01320 out_num_prev = out_num_next;
01321
01322 }
01323
01324
01325 for (j=0;j<word_len;j++) {
01326 if (j < add_head) {
01327 wchmm->offset[word][j] = wchmm->offset[matchword][j];
01328 } else if (add_tail < j) {
01329 wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)];
01330 } else {
01331 wchmm->offset[word][j] = n;
01332 n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2;
01333 }
01334 }
01335
01336
01337 if (wchmm->hmminfo->multipath) {
01338
01339
01340
01341 if (enable_iwsp && add_tail - add_head + 1 > 0) {
01342 n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2;
01343 if (n != ntmp) j_internal_error("wchmm_add_word: cannot match\n");
01344 }
01345
01346
01347 wchmm->wordend[word] = n;
01348 wchmm->stend[n] = word;
01349 acc_init(wchmm, n);
01350 wchmm->state[n].out.state = NULL;
01351
01352
01353 for(k = 0; k < out_num_prev; k++) {
01354 add_wacc(wchmm, out_from[k], out_a[k], n);
01355 }
01356 n++;
01357 if (n >= wchmm->maxwcn) wchmm_extend(wchmm);
01358
01359 if (matchlen == 0) {
01360
01361
01362 out_num_prev = 0;
01363 get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, enable_iwsp);
01364 for(k=0;k<out_num_prev;k++) {
01365 if (out_from[k] == wchmm->wordbegin[word]) {
01366 jlog("ERROR: *** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n");
01367 jlog("ERROR: Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]);
01368 jlog("ERROR: All HMMs in the word:\n ");
01369 for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) {
01370 jlog("%s ", wchmm->winfo->wseq[word][kkk]->name);
01371 }
01372 jlog("\n");
01373 jlog("ERROR: has transitions from initial state to final state.\n");
01374 jlog("ERROR: This type of word skipping is not supported.\n");
01375 ok_p = FALSE;
01376 }
01377 }
01378 }
01379
01380 wchmm->n = n;
01381
01382 } else {
01383
01384 wchmm->n = n;
01385 k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1;
01386 wchmm->wordend[word] = k;
01387 wchmm->stend[k] = word;
01388
01389 if (matchlen != 0 && add_tail - add_head + 1 > 0) {
01390
01391
01392 wchmm_link_subword(wchmm, matchword,add_to,word,add_head);
01393 }
01394
01395 }
01396
01397 return(ok_p);
01398
01399 }
01400
01401
01402
01403
01404
01419 static void
01420 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm)
01421 {
01422 WORD_ID w;
01423 HTK_HMM_Trans *tr;
01424 LOGPROB a;
01425
01426 for (w=0;w<wchmm->winfo->num;w++) {
01427 tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]);
01428 a = tr->a[tr->statenum-2][tr->statenum-1];
01429 wchmm->wordend_a[w] = a;
01430 }
01431 }
01432
01433 #ifdef SEPARATE_BY_UNIGRAM
01434
01435
01436
01437
01438
01457 static int
01458 compare_prob(LOGPROB *a, LOGPROB *b)
01459 {
01460 if (*a < *b) return (1);
01461 if (*a > *b) return (-1);
01462 return(0);
01463 }
01464
01483 static LOGPROB
01484 get_nbest_uniprob(WCHMM_INFO *wchmm, int n)
01485 {
01486 LOGPROB *u_p;
01487 WORD_ID w;
01488 LOGPROB x;
01489 WORD_INFO *winfo;
01490 NGRAM_INFO *ngram;
01491
01492 winfo = wchmm->winfo;
01493 ngram = wchmm->ngram;
01494
01495 if (n < 1) n = 1;
01496 if (n > winfo->num) n = winfo->num;
01497
01498
01499 u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num);
01500 for(w=0;w<winfo->num;w++) {
01501 if (ngram) {
01502 x = uni_prob(ngram, winfo->wton[w])
01503 #ifdef CLASS_NGRAM
01504 + winfo->cprob[w]
01505 #endif
01506 ;
01507 } else {
01508 x = LOG_ZERO;
01509 }
01510 if (wchmm->lmvar == LM_NGRAM_USER) {
01511 x = (*(wchmm->uni_prob_user))(wchmm->winfo, w, x);
01512 }
01513 u_p[w] = x;
01514 }
01515
01516
01517 qsort(u_p, winfo->num, sizeof(LOGPROB),
01518 (int (*)(const void *,const void *))compare_prob);
01519
01520
01521 x = u_p[n-1];
01522 free(u_p);
01523 return(x);
01524 }
01525
01526 #endif
01527
01528
01529
01530
01531
01532 #define COUNT_STEP 500
01533
01534
01555 boolean
01556 build_wchmm(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01557 {
01558 int i,j;
01559 int matchword=0, sharelen=0, maxsharelen=0;
01560 int num_duplicated;
01561 #ifdef SEPARATE_BY_UNIGRAM
01562 LOGPROB separate_thres;
01563 LOGPROB p;
01564 #endif
01565 boolean ok_p;
01566
01567
01568
01569 if (wchmm->winfo == NULL
01570 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01571 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01572 ) {
01573 jlog("ERROR: wchmm: linguistic info not available!!\n");
01574 return FALSE;
01575 }
01576
01577 ok_p = TRUE;
01578
01579 #ifdef SEPARATE_BY_UNIGRAM
01580
01581
01582 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01583 #endif
01584
01585 #ifdef PASS1_IWCD
01586 #ifndef USE_OLD_IWCD
01587 if (wchmm->category_tree) {
01588 if (wchmm->ccd_flag) {
01589
01590 lcdset_register_with_category_all(wchmm);
01591 }
01592 }
01593 #endif
01594 #endif
01595
01596
01597
01598 wchmm_init(wchmm);
01599
01600
01601 wchmm->separated_word_count=0;
01602
01603 jlog("STAT: wchmm: Building HMM lexicon tree (left-to-right)\n");
01604 for (i=0;i<wchmm->winfo->num;i++) {
01605
01606 if (wchmm->lmtype == LM_PROB) {
01607 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01608
01609
01610
01611 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01612 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01613 ok_p = FALSE;
01614 }
01615 continue;
01616 }
01617 #ifndef NO_SEPARATE_SHORT_WORD
01618 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01619
01620
01621 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01622 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01623 ok_p = FALSE;
01624 }
01625 wchmm->separated_word_count++;
01626 continue;
01627 }
01628 #endif
01629 #ifdef SEPARATE_BY_UNIGRAM
01630 if (wchmm->ngram) {
01631 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01632 #ifdef CLASS_NGRAM
01633 + wchmm->winfo->cprob[i]
01634 #endif
01635 ;
01636 } else {
01637 p = LOG_ZERO;
01638 }
01639 if (wchmm->lmvar == LM_NGRAM_USER) {
01640 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01641 }
01642 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01643
01644
01645 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01646 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01647 ok_p = FALSE;
01648 }
01649 wchmm->separated_word_count++;
01650 continue;
01651 }
01652 #endif
01653 }
01654
01655
01656 maxsharelen=0;
01657 for (j=0;j<i;j++) {
01658 if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01659 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue;
01660 }
01661 sharelen = wchmm_check_match(wchmm->winfo, i, j);
01662 if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) {
01663
01664
01665 maxsharelen = sharelen;
01666 matchword = j;
01667 break;
01668 }
01669 if (sharelen > maxsharelen) {
01670 matchword = j;
01671 maxsharelen = sharelen;
01672 }
01673 }
01674 if (wchmm_add_word(wchmm, i, maxsharelen, matchword, lmconf->enable_iwsp) == FALSE) {
01675 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01676 ok_p = FALSE;
01677 }
01678 }
01679
01680 #if 0
01681
01682 for (i=0;i<wchmm->winfo->num;i++) {
01683 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01684 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01685 ok_p = FALSE;
01686 }
01687 }
01688 #endif
01689 jlog("STAT: %5d words ended (%6d nodes)\n",i,wchmm->n);
01690
01691 if (! wchmm->hmminfo->multipath) {
01692
01693 num_duplicated = wchmm_duplicate_leafnode(wchmm);
01694 jlog("STAT: %d leaf nodes are made unshared\n", num_duplicated);
01695
01696
01697 wchmm_calc_wordend_arc(wchmm);
01698 }
01699
01700
01701 check_wchmm(wchmm);
01702
01703
01704 if (!wchmm->category_tree) {
01705 make_successor_list(wchmm);
01706
01707 #ifdef UNIGRAM_FACTORING
01708 if (wchmm->lmtype == LM_PROB) {
01709
01710
01711 calc_all_unigram_factoring_values(wchmm);
01712 jlog("STAT: 1-gram factoring values has been pre-computed\n");
01713 }
01714 #endif
01715
01716 if (wchmm->hmminfo->multipath) {
01717
01718 adjust_sc_index(wchmm);
01719 }
01720
01721 #ifdef UNIGRAM_FACTORING
01722 if (wchmm->lmtype == LM_PROB) {
01723
01724 make_iwcache_index(wchmm);
01725 }
01726 #endif
01727
01728 }
01729
01730 jlog("STAT: done\n");
01731
01732 return ok_p;
01733 }
01734
01760 boolean
01761 build_wchmm2(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01762 {
01763 int i,j, last_i;
01764 int num_duplicated;
01765 WORD_ID *windex;
01766 #ifdef SEPARATE_BY_UNIGRAM
01767 LOGPROB separate_thres;
01768 LOGPROB p;
01769 #endif
01770 boolean ok_p;
01771 boolean ret;
01772
01773
01774
01775 if (wchmm->winfo == NULL
01776 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01777 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01778 ) {
01779 jlog("ERROR: wchmm: linguistic info not available!!\n");
01780 return FALSE;
01781 }
01782
01783 ok_p = TRUE;
01784
01785 wchmm->separated_word_count = 0;
01786
01787 jlog("STAT: Building HMM lexicon tree\n");
01788
01789 if (wchmm->lmtype == LM_PROB) {
01790 #ifdef SEPARATE_BY_UNIGRAM
01791
01792
01793 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01794 #endif
01795 }
01796
01797 #ifdef PASS1_IWCD
01798 #ifndef USE_OLD_IWCD
01799 if (wchmm->category_tree) {
01800 if (wchmm->ccd_flag) {
01801
01802
01803
01804 lcdset_register_with_category_all(wchmm);
01805 }
01806 }
01807 #endif
01808 #endif
01809
01810
01811 wchmm_init(wchmm);
01812
01813
01814 windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num);
01815 for(i=0;i<wchmm->winfo->num;i++) windex[i] = i;
01816
01817 if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01818
01819
01820 wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num);
01821 {
01822 int last_cate;
01823 last_i = 0;
01824 last_cate = wchmm->winfo->wton[windex[0]];
01825 for(i = 1;i<wchmm->winfo->num;i++) {
01826 if (wchmm->winfo->wton[windex[i]] != last_cate) {
01827 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i);
01828 last_cate = wchmm->winfo->wton[windex[i]];
01829 last_i = i;
01830 }
01831 }
01832 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i);
01833 }
01834
01835 } else {
01836
01837
01838 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num);
01839
01840 }
01841
01842
01843
01844
01845
01846
01847
01848
01849
01850
01851
01852
01853
01854 last_i = WORD_INVALID;
01855 for (j=0;j<wchmm->winfo->num;j++) {
01856 i = windex[j];
01857
01858 if (wchmm->lmtype == LM_PROB) {
01859
01860
01861 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01862
01863 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01864 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01865 ok_p = FALSE;
01866 }
01867 continue;
01868 }
01869 #ifndef NO_SEPARATE_SHORT_WORD
01870
01871 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01872 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01873 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01874 ok_p = FALSE;
01875 }
01876 wchmm->separated_word_count++;
01877 continue;
01878 }
01879 #endif
01880 #ifdef SEPARATE_BY_UNIGRAM
01881 if (wchmm->ngram) {
01882 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01883 #ifdef CLASS_NGRAM
01884 + wchmm->winfo->cprob[i]
01885 #endif
01886 ;
01887 } else {
01888 p = LOG_ZERO;
01889 }
01890 if (wchmm->lmvar == LM_NGRAM_USER) {
01891 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01892 }
01893
01894 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01895 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01896 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01897 ok_p = FALSE;
01898 }
01899 wchmm->separated_word_count++;
01900 continue;
01901 }
01902 #endif
01903 }
01904
01905 if (last_i == WORD_INVALID) {
01906 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01907 } else {
01908
01909 if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01910 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) {
01911 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01912 } else {
01913 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01914 }
01915 } else {
01916 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01917 }
01918 }
01919 if (ret == FALSE) {
01920 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01921 ok_p = FALSE;
01922 }
01923 last_i = i;
01924
01925 }
01926
01927
01928
01929
01930 free(windex);
01931
01932 if (wchmm->hmminfo->multipath) {
01933 jlog("STAT: lexicon size: %d nodes\n", wchmm->n);
01934 } else {
01935
01936 jlog("STAT: lexicon size: %d", wchmm->n);
01937 num_duplicated = wchmm_duplicate_leafnode(wchmm);
01938 jlog("+%d=%d\n", num_duplicated, wchmm->n);
01939 }
01940
01941 if (! wchmm->hmminfo->multipath) {
01942
01943 wchmm_calc_wordend_arc(wchmm);
01944 }
01945
01946
01947 check_wchmm(wchmm);
01948
01949
01950 if (!wchmm->category_tree) {
01951 make_successor_list(wchmm);
01952
01953 #ifdef UNIGRAM_FACTORING
01954 if (wchmm->lmtype == LM_PROB) {
01955
01956 calc_all_unigram_factoring_values(wchmm);
01957 jlog("STAT: 1-gram factoring values has been pre-computed\n");
01958 }
01959 #endif
01960 if (wchmm->hmminfo->multipath) {
01961
01962 adjust_sc_index(wchmm);
01963 }
01964 #ifdef UNIGRAM_FACTORING
01965 if (wchmm->lmtype == LM_PROB) {
01966
01967 make_iwcache_index(wchmm);
01968 }
01969 #endif
01970
01971 }
01972
01973
01974
01975 #ifdef WCHMM_SIZE_CHECK
01976 if (debug2_flag) {
01977
01978 jlog("STAT: --- memory size of word lexicon ---\n");
01979 jlog("STAT: wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n);
01980 jlog("STAT: %9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n);
01981 {
01982 int count1 = 0;
01983 int count2 = 0;
01984 int count3 = 0;
01985 for(i=0;i<wchmm->n;i++) {
01986 if (wchmm->self_a[i] != LOG_ZERO) count1++;
01987 if (wchmm->next_a[i] != LOG_ZERO) count2++;
01988 if (wchmm->ac[i] != NULL) count3++;
01989 }
01990 jlog("STAT: %9d bytes: wchmm->self_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count1 / (float)wchmm->n);
01991 jlog("STAT: %9d bytes: wchmm->next_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count2 / (float)wchmm->n);
01992 jlog("STAT: %9d bytes: wchmm->ac[node] (%4.1f%% used)\n", sizeof(A_CELL2 *) * wchmm->n, 100.0 * count3 / (float)wchmm->n);
01993 }
01994 jlog("STAT: %9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n);
01995 {
01996 int w,count;
01997 count = 0;
01998 for(w=0;w<wchmm->winfo->num;w++) {
01999 count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *);
02000 }
02001 jlog("STAT: %9d bytes: wchmm->offset[w][]\n", count);
02002 }
02003 if (wchmm->hmminfo->multipath) {
02004 jlog("STAT: %9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int));
02005 }
02006 jlog("STAT: %9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int));
02007 jlog("STAT: %9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int));
02008 if (wchmm->category_tree) {
02009 jlog("STAT: %9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID));
02010 }
02011 #ifdef UNIGRAM_FACTORING
02012 if (wchmm->lmtype == LM_PROB) {
02013 jlog("STAT: %9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int));
02014 }
02015 #endif
02016 if (!wchmm->hmminfo->multipath) {
02017 jlog("STAT: %9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB));
02018 }
02019 #ifdef PASS1_IWCD
02020 jlog("STAT: %9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char));
02021 {
02022 int c;
02023 c = 0;
02024 for(i=0;i<wchmm->n;i++) {
02025 switch(wchmm->outstyle[i]) {
02026 case AS_RSET:
02027 c += sizeof(RC_INFO);
02028 break;
02029 case AS_LRSET:
02030 c += sizeof(LRC_INFO);
02031 break;
02032 }
02033 }
02034 if (c > 0) jlog("STAT: %9d bytes: wchmm->out (RC_INFO / LRC_INFO)\n", c);
02035 }
02036 #endif
02037 if (!wchmm->category_tree) {
02038 jlog("STAT: %9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *));
02039 jlog("STAT: %9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int));
02040 #ifdef UNIGRAM_FACTORING
02041 if (wchmm->lmtype == LM_PROB) {
02042 jlog("STAT: %9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB));
02043 }
02044 #endif
02045 }
02046
02047 {
02048 int count, n;
02049 A_CELL2 *ac;
02050 count = 0;
02051 for(n=0;n<wchmm->n;n++) {
02052 for(ac=wchmm->ac[n];ac;ac=ac->next) {
02053 count += sizeof(A_CELL2);
02054 }
02055 }
02056 jlog("STAT: %9d bytes: A_CELL2\n", count);
02057 }
02058 if (!wchmm->category_tree) {
02059 jlog("STAT: %9d bytes: sclist\n", wchmm->scnum * sizeof(S_CELL *));
02060 jlog("STAT: %9d bytes: sclist2node\n", wchmm->scnum * sizeof(int));
02061 }
02062
02063 }
02064
02065 #endif
02066
02067
02068 return ok_p;
02069
02070 }
02071
02072
02087 void
02088 print_wchmm_info(WCHMM_INFO *wchmm)
02089 {
02090 int n,i, rootnum;
02091
02092 if (wchmm->hmminfo->multipath) {
02093 rootnum = wchmm->startnum;
02094 } else {
02095 if (wchmm->lmtype == LM_PROB) {
02096 rootnum = wchmm->startnum + 1;
02097 } else if (wchmm->lmtype == LM_DFA) {
02098 rootnum = wchmm->startnum;
02099 }
02100 }
02101
02102 jlog(" Lexicon tree:\n");
02103 jlog("\t total node num = %6d\n", wchmm->n);
02104 if (wchmm->lmtype == LM_PROB) {
02105 jlog("\t root node num = %6d\n", rootnum);
02106 #ifdef NO_SEPARATE_SHORT_WORD
02107 #ifdef SEPARATE_BY_UNIGRAM
02108 jlog("\t(%d hi-freq. words are separated from tree lexicon)\n", wchmm->separated_word_count);
02109 #else
02110 jlog(" (no words are separated from tree)\n");
02111 #endif
02112 #else
02113 jlog(" (%d short words (<= %d phonemes) are separated from tree)\n", wchmm->separated_word_count, SHORT_WORD_LEN);
02114 #endif
02115 }
02116 if (wchmm->lmtype == LM_DFA) {
02117 jlog("\t root node num = %6d\n", rootnum);
02118 }
02119 for(n=0,i=0;i<wchmm->n;i++) {
02120 if (wchmm->stend[i] != WORD_INVALID) n++;
02121 }
02122 jlog("\t leaf node num = %6d\n", n);
02123 if (!wchmm->category_tree) {
02124 jlog("\t fact. node num = %6d\n", wchmm->scnum - 1);
02125 }
02126 }
02127
02128