00001
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 #include <julius/julius.h>
00047
00048
00049 #define WCHMM_SIZE_CHECK
00050
00051
00052
00053
00054
00069 WCHMM_INFO *
00070 wchmm_new()
00071 {
00072 WCHMM_INFO *w;
00073 w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO));
00074 w->lmtype = LM_UNDEF;
00075 w->lmvar = LM_UNDEF;
00076 w->ngram = NULL;
00077 w->dfa = NULL;
00078 w->winfo = NULL;
00079 w->malloc_root = NULL;
00080 #ifdef PASS1_IWCD
00081 w->lcdset_category_root = NULL;
00082 w->lcdset_mroot = NULL;
00083 #endif
00084 w->wrk.out_from_len = 0;
00085
00086 w->uni_prob_user = NULL;
00087 w->bi_prob_user = NULL;
00088 return w;
00089 }
00090
00103 static void
00104 wchmm_init(WCHMM_INFO *wchmm)
00105 {
00106
00107 wchmm->maxwcn = wchmm->winfo->totalstatenum / 2;
00108 wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn);
00109 wchmm->self_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00110 wchmm->next_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00111 wchmm->ac = (A_CELL2 **)mymalloc(sizeof(A_CELL2 *)*wchmm->maxwcn);
00112 wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00113 wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num);
00114 wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00115 wchmm->maxstartnum = STARTNODE_STEP;
00116 wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP);
00117 wchmm->startnum = 0;
00118 if (wchmm->category_tree) {
00119 wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP);
00120 }
00121 if (wchmm->hmminfo->multipath) {
00122 wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00123 wchmm->wrk.out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00124 wchmm->wrk.out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00125 wchmm->wrk.out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00126 wchmm->wrk.out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00127 wchmm->wrk.out_from_len = wchmm->winfo->maxwn;
00128 } else {
00129 wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num);
00130 }
00131 #ifdef PASS1_IWCD
00132 wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn);
00133 #endif
00134 #ifdef UNIGRAM_FACTORING
00135 wchmm->start2isolate = NULL;
00136 wchmm->isolatenum = 0;
00137 #endif
00138 if (!wchmm->category_tree) {
00139 wchmm->sclist = NULL;
00140 wchmm->sclist2node = NULL;
00141 #ifdef UNIGRAM_FACTORING
00142 wchmm->fscore = NULL;
00143 #endif
00144 }
00145
00146 wchmm->n = 0;
00147 }
00148
00161 static void
00162 wchmm_extend(WCHMM_INFO *wchmm)
00163 {
00164
00165 wchmm->maxwcn += wchmm->winfo->totalstatenum / 6;
00166 wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn);
00167 wchmm->self_a = (LOGPROB *)myrealloc(wchmm->self_a, sizeof(LOGPROB)*wchmm->maxwcn);
00168 wchmm->next_a = (LOGPROB *)myrealloc(wchmm->next_a, sizeof(LOGPROB)*wchmm->maxwcn);
00169 wchmm->ac = (A_CELL2 **)myrealloc(wchmm->ac, sizeof(A_CELL2 *)*wchmm->maxwcn);
00170 wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn);
00171 #ifdef PASS1_IWCD
00172 wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn);
00173 #endif
00174 }
00175
00188 static void
00189 wchmm_extend_startnode(WCHMM_INFO *wchmm)
00190 {
00191 wchmm->maxstartnum += STARTNODE_STEP;
00192 wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum);
00193 if (wchmm->category_tree) {
00194 wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->start2wid, sizeof(WORD_ID) * wchmm->maxstartnum);
00195 }
00196 }
00197
00212 void
00213 wchmm_free(WCHMM_INFO *w)
00214 {
00215 S_CELL *sc, *sctmp;
00216 int i;
00217
00218
00219 #ifdef PASS1_IWCD
00220
00221 #endif
00222
00223 mybfree2(&(w->malloc_root));
00224 if (!w->category_tree) {
00225 if (w->sclist != NULL) {
00226 for(i=1;i<w->scnum;i++) {
00227 sc = w->sclist[i];
00228 while(sc) {
00229 sctmp = sc->next;
00230 free(sc);
00231 sc = sctmp;
00232 }
00233 }
00234 free(w->sclist);
00235 }
00236 if (w->sclist2node != NULL) free(w->sclist2node);
00237 #ifdef UNIGRAM_FACTORING
00238 if (w->fscore != NULL) free(w->fscore);
00239 #endif
00240 }
00241 #ifdef UNIGRAM_FACTORING
00242 if (w->start2isolate != NULL) free(w->start2isolate);
00243 #endif
00244 #ifdef PASS1_IWCD
00245 free(w->outstyle);
00246 #endif
00247 if (w->hmminfo->multipath) {
00248 free(w->wordbegin);
00249 } else {
00250 free(w->wordend_a);
00251 }
00252 if (w->category_tree) free(w->start2wid);
00253 free(w->startnode);
00254 free(w->wordend);
00255 free(w->offset);
00256 free(w->stend);
00257 free(w->ac);
00258 free(w->next_a);
00259 free(w->self_a);
00260 free(w->state);
00261 #ifdef PASS1_IWCD
00262 if (w->category_tree) lcdset_remove_with_category_all(w);
00263 #endif
00264 if (w->wrk.out_from_len != 0) {
00265 free(w->wrk.out_from);
00266 free(w->wrk.out_from_next);
00267 free(w->wrk.out_a);
00268 free(w->wrk.out_a_next);
00269 w->wrk.out_from_len = 0;
00270 }
00271 free(w);
00272 }
00273
00274
00275
00276
00277
00278
00297 static int
00298 compare_wseq(WORD_ID *widx1, WORD_ID *widx2, WORD_INFO *winfo)
00299 {
00300 int len1, len2, n;
00301 int p=0;
00302
00303 len1 = winfo->wlen[*widx1];
00304 len2 = winfo->wlen[*widx2];
00305
00306 n=0;
00307
00308 while (n < len1 && n < len2 && (p = strcmp((winfo->wseq[*widx1][n])->name, (winfo->wseq[*widx2][n])->name)) == 0 ) n++;
00309 if (n < len1) {
00310 if (n < len2) {
00311
00312 return(p);
00313 } else {
00314
00315 return(1);
00316 }
00317 } else {
00318 if (n < len2) {
00319
00320 return(-1);
00321 } else {
00322
00323 return(0);
00324 }
00325 }
00326 }
00327
00346 static void
00347 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len)
00348 {
00349 qsort_reentrant(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *, void *))compare_wseq, winfo);
00350 }
00351
00370 static int
00371 compare_category(WORD_ID *widx1, WORD_ID *widx2, WORD_INFO *winfo)
00372 {
00373 int c1,c2;
00374 c1 = winfo->wton[*widx1];
00375 c2 = winfo->wton[*widx2];
00376 return(c1 - c2);
00377 }
00378
00395 static void
00396 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len)
00397 {
00398 qsort_reentrant(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *, void *))compare_category, winfo);
00399 }
00400
00401
00402
00403
00404
00405
00427 static int
00428 wchmm_check_match(WORD_INFO *winfo, int i, int j)
00429 {
00430 int k,tmplen;
00431
00432 for (tmplen=0,k=0;k<winfo->wlen[i];k++) {
00433 if (k > winfo->wlen[j]-1)
00434 break;
00435 if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name)))
00436 break;
00437 tmplen++;
00438 }
00439 return(tmplen);
00440 }
00441
00454 static void
00455 acc_init(WCHMM_INFO *wchmm, int node)
00456 {
00457 wchmm->self_a[node] = LOG_ZERO;
00458 wchmm->next_a[node] = LOG_ZERO;
00459 wchmm->ac[node] = NULL;
00460 }
00461
00478 static void
00479 add_ac(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00480 {
00481 A_CELL2 *ac2;
00482
00483 for(ac2=wchmm->ac[node];ac2;ac2=ac2->next) {
00484 if (ac2->n < A_CELL2_ALLOC_STEP) break;
00485 }
00486 if (ac2 == NULL) {
00487 ac2 = (A_CELL2 *)mybmalloc2(sizeof(A_CELL2), &(wchmm->malloc_root));
00488 ac2->n = 0;
00489 ac2->next = wchmm->ac[node];
00490 wchmm->ac[node] = ac2;
00491 }
00492 ac2->arc[ac2->n] = arc;
00493 ac2->a[ac2->n] = a;
00494 ac2->n++;
00495 }
00496
00515 static void
00516 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00517 {
00518 if (arc == node) {
00519 wchmm->self_a[node] = a;
00520 } else if (arc == node + 1) {
00521 wchmm->next_a[node] = a;
00522 } else {
00523 add_ac(wchmm, node, a, arc);
00524 }
00525 }
00526
00553 static void
00554 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp)
00555 {
00556 HMM_Logical *ltmp;
00557 int states;
00558 int k;
00559 LOGPROB prob;
00560 int oldnum;
00561
00562 if (pos < 0) {
00563
00564
00565 node[*num] = wchmm->wordbegin[w];
00566 a[*num] = 0.0;
00567 (*num)++;
00568
00569 } else {
00570
00571 ltmp = wchmm->winfo->wseq[w][pos];
00572 states = hmm_logical_state_num(ltmp);
00573
00574
00575 if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) {
00576
00577 oldnum = *num;
00578 get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE);
00579
00580 for(k=oldnum;k<*num;k++) {
00581 a[k] += (hmm_logical_trans(ltmp))->a[0][states-1];
00582 }
00583 }
00584
00585 for (k = 1; k < states - 1; k++) {
00586 prob = (hmm_logical_trans(ltmp))->a[k][states-1];
00587 if (prob != LOG_ZERO) {
00588 if (*num >= maxnum) {
00589 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00590 }
00591 node[*num] = wchmm->offset[w][pos] + k - 1;
00592 a[*num] = prob;
00593 (*num)++;
00594 }
00595 }
00596
00597
00598
00599
00600
00601 if (insert_sp) {
00602
00603 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
00604 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1];
00605 if (prob != LOG_ZERO) {
00606 if (*num >= maxnum) {
00607 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00608 }
00609 node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1;
00610 a[*num] = prob;
00611 (*num)++;
00612 }
00613 }
00614 }
00615 }
00616
00617
00618 return;
00619 }
00620
00639 static void
00640 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo)
00641 {
00642 A_CELL2 *actmp;
00643 LOGPROB a;
00644 int i, j;
00645 boolean tflag;
00646
00647
00648 for(i = tinfo->statenum - 2; i >= 0; i--) {
00649 if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) {
00650
00651 tflag = FALSE;
00652 if (to_node == from_node && wchmm->self_a[from_node] == a) {
00653 tflag = TRUE;
00654 } else if (to_node == from_node + 1 && wchmm->next_a[from_node] == a) {
00655 tflag = TRUE;
00656 } else {
00657 for (actmp = wchmm->ac[from_node]; actmp; actmp = actmp->next) {
00658 for(j=0;j<actmp->n;j++) {
00659 if (actmp->arc[j] == to_node && actmp->a[j] == a) {
00660 tflag = TRUE;
00661 break;
00662 }
00663 }
00664 if (tflag == TRUE) break;
00665 }
00666 }
00667 if (tflag) break;
00668
00669 add_wacc(wchmm, from_node, a, to_node);
00670 return;
00671 }
00672 }
00673 j_internal_error("wchmm_link_hmm: No arc to endstate?\n");
00674 }
00675
00696 static void
00697 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq)
00698 {
00699 HMM_Logical *last;
00700 int lastp;
00701
00702 last = wchmm->winfo->wseq[from_word][from_seq];
00703 lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1;
00704 wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq],
00705 hmm_logical_trans(last));
00706 }
00707
00708
00709
00710
00711
00751 static void
00752 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word)
00753 {
00754 int j, n;
00755 int n_src, n_prev;
00756 A_CELL2 *ac;
00757 HMM_Logical *lastphone;
00758
00759
00760 if (wchmm->n + 1 >= wchmm->maxwcn) {
00761 wchmm_extend(wchmm);
00762 }
00763
00764 n = wchmm->n;
00765
00766 n_src = node;
00767
00768
00769 #ifdef PASS1_IWCD
00770 {
00771 RC_INFO *rcnew;
00772 LRC_INFO *lrcnew;
00773 wchmm->outstyle[n] = wchmm->outstyle[n_src];
00774 if (wchmm->outstyle[n] == AS_RSET) {
00775
00776 rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
00777 memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO));
00778 wchmm->state[n].out.rset = rcnew;
00779 } else if (wchmm->outstyle[n] == AS_LRSET) {
00780
00781 lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
00782 memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO));
00783 wchmm->state[n].out.lrset = lrcnew;
00784 } else {
00785
00786 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC));
00787 }
00788 }
00789 #else
00790 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *));
00791 #endif
00792
00793 lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1];
00794 acc_init(wchmm, n);
00795
00796
00797 wchmm->self_a[n] = wchmm->self_a[n_src];
00798
00799
00800 if (hmm_logical_state_num(lastphone) == 3) {
00801
00802 if (wchmm->winfo->wlen[word] == 1) {
00803
00804 wchmm->offset[word][0] = n;
00805
00806 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
00807 wchmm->startnode[wchmm->startnum] = n;
00808 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
00809
00810 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
00811 }
00812 } else {
00813
00814 n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2]
00815 + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3;
00816 if(n_src == n_prev + 1) {
00817 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00818 } else {
00819 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00820 for(j=0;j<ac->n;j++) {
00821 if (ac->arc[j] == n_src) {
00822 add_wacc(wchmm, n_prev, ac->a[j], n);
00823 }
00824 }
00825 }
00826 }
00827
00828 wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n;
00829 }
00830 } else {
00831
00832 for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) {
00833 if (n_src == n_prev + 1) {
00834 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00835 } else {
00836 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00837 for(j=0;j<ac->n;j++) {
00838 if (ac->arc[j] == n_src) {
00839 add_wacc(wchmm, n_prev, ac->a[j], n);
00840 }
00841 }
00842 }
00843 }
00844 if (n_prev == n_src + 1) {
00845 add_wacc(wchmm, n, wchmm->next_a[n_src], n_prev);
00846 } else {
00847 for(ac=wchmm->ac[n_src];ac;ac=ac->next) {
00848 for(j=0;j<ac->n;j++) {
00849 if (ac->arc[j] == n_prev) {
00850 add_wacc(wchmm, n, ac->a[j], n_prev);
00851 }
00852 }
00853 }
00854 }
00855 }
00856 }
00857
00858
00859 wchmm->stend[n] = word;
00860 wchmm->wordend[word] = n;
00861
00862
00863 wchmm->n++;
00864
00865 }
00866
00881 static int
00882 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm)
00883 {
00884 int w, nlast, n, narc, narc_model;
00885 boolean *dupw;
00886 A_CELL2 *actmp;
00887 int dupcount;
00888
00889 dupcount = 0;
00890
00891 nlast = wchmm->n;
00892 dupw = (boolean *)mymalloc(sizeof(boolean) * nlast);
00893 for(n=0;n<nlast;n++) dupw[n] = FALSE;
00894
00895 for (w=0;w<wchmm->winfo->num;w++) {
00896 n = wchmm->wordend[w];
00897 if (dupw[n]) {
00898 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00899 } else {
00900
00901 {
00902
00903 HMM_Logical *lastphone;
00904 HTK_HMM_Trans *tinfo;
00905 int laststate, i;
00906 lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1];
00907 laststate = hmm_logical_state_num(lastphone) - 2;
00908 tinfo = hmm_logical_trans(lastphone);
00909 narc_model=0;
00910 for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) {
00911 if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++;
00912 }
00913
00914 narc = 0;
00915 if (wchmm->self_a[n] != LOG_ZERO) narc++;
00916 if (wchmm->next_a[n] != LOG_ZERO) narc++;
00917 for(actmp=wchmm->ac[n];actmp;actmp=actmp->next) narc += actmp->n;
00918 }
00919
00920 if (narc_model != narc) {
00921
00922
00923 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00924
00925
00926 wchmm->stend[n] = WORD_INVALID;
00927 } else {
00928
00929
00930 wchmm->stend[n] = w;
00931 }
00932
00933 dupw[n] = TRUE;
00934 }
00935 }
00936 free(dupw);
00937
00938 return(dupcount);
00939 }
00940
00941
00942
00943
00944
00969 static boolean
00970 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword, boolean enable_iwsp)
00971 {
00972 boolean ok_p;
00973 int j,k,n;
00974 int add_head, add_tail, add_to;
00975 int word_len, matchword_len;
00976 HMM_Logical *ltmp;
00977 int ato;
00978 LOGPROB prob;
00979 int ntmp;
00980 int ltmp_state_num;
00981 #ifdef PASS1_IWCD
00982 CD_Set *lcd = NULL;
00983 #endif
00984 int *out_from;
00985 int *out_from_next;
00986 LOGPROB *out_a;
00987 LOGPROB *out_a_next;
00988
00989
00990
00991 int out_num_prev, out_num_next;
00992 int kkk;
00993
00994 ok_p = TRUE;
00995 if (wchmm->hmminfo->multipath) {
00996 out_from = wchmm->wrk.out_from;
00997 out_from_next = wchmm->wrk.out_from_next;
00998 out_a = wchmm->wrk.out_a;
00999 out_a_next = wchmm->wrk.out_a_next;
01000 }
01001
01002
01003
01004
01005
01006
01007
01008
01009
01010
01011
01012 n = wchmm->n;
01013 word_len = wchmm->winfo->wlen[word];
01014 matchword_len = wchmm->winfo->wlen[matchword];
01015
01016
01017 wchmm->offset[word] = (int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root));
01018
01019
01020 add_head = matchlen;
01021 add_tail = word_len - 1;
01022 add_to = matchlen - 1;
01023
01024 if (wchmm->hmminfo->multipath) {
01025
01026 if (matchlen == 0) {
01027
01028 wchmm->wordbegin[word] = n;
01029 wchmm->stend[n] = WORD_INVALID;
01030 acc_init(wchmm, n);
01031 wchmm->state[n].out.state = NULL;
01032
01033 wchmm->startnode[wchmm->startnum] = n;
01034 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01035
01036 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01037 if (++n >= wchmm->maxwcn) wchmm_extend(wchmm);
01038 } else {
01039 wchmm->wordbegin[word] = wchmm->wordbegin[matchword];
01040 }
01041
01042
01043
01044
01045 out_num_prev = 0;
01046 if (matchlen == 0) {
01047
01048 out_from[0] = wchmm->wordbegin[word];
01049 out_a[0] = 0.0;
01050 out_num_prev = 1;
01051 } else {
01052
01053
01054 get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (enable_iwsp && add_tail - add_head + 1 <= 0) ? TRUE : FALSE);
01055
01056 }
01057 } else {
01058 if (matchlen == 0) {
01059 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
01060
01061 wchmm->startnode[wchmm->startnum] = n;
01062 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01063
01064 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01065 }
01066 }
01067 }
01068
01069 if (add_tail - add_head + 1 > 0) {
01070 ntmp = n;
01071 for (j=add_head; j <= add_tail; j++) {
01072 ltmp = wchmm->winfo->wseq[word][j];
01073 ltmp_state_num = hmm_logical_state_num(ltmp);
01074 #ifdef PASS1_IWCD
01075 if (wchmm->ccd_flag) {
01076
01077
01078 if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) {
01079 if (wchmm->category_tree) {
01080 #ifdef USE_OLD_IWCD
01081 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01082 #else
01083 lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]);
01084 if (lcd == NULL) {
01085
01086
01087
01088 jlog("WARNING: wchmm: no lcdset found for [%s::%04d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name);
01089 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01090 }
01091 #endif
01092 } else {
01093 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01094 }
01095 if (lcd == NULL) {
01096 jlog("ERROR: wchmm: at word #%d: no lcdset found for [%s]\n", word, ltmp->name);
01097 ok_p = FALSE;
01098 }
01099 }
01100 }
01101 #endif
01102 for (k = 1; k < ltmp_state_num - 1; k++) {
01103
01104 #ifdef PASS1_IWCD
01105 if (wchmm->ccd_flag) {
01106
01107 if (wchmm->winfo->wlen[word] == 1) {
01108 wchmm->outstyle[ntmp] = AS_LRSET;
01109 wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
01110 (wchmm->state[ntmp].out.lrset)->hmm = ltmp;
01111 (wchmm->state[ntmp].out.lrset)->state_loc = k;
01112 if (wchmm->category_tree) {
01113 (wchmm->state[ntmp].out.lrset)->category = wchmm->winfo->wton[word];
01114 }
01115 } else if (j == 0) {
01116 wchmm->outstyle[ntmp] = AS_RSET;
01117 wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
01118 (wchmm->state[ntmp].out.rset)->hmm = ltmp;
01119 (wchmm->state[ntmp].out.rset)->state_loc = k;
01120 } else if (j == wchmm->winfo->wlen[word] - 1) {
01121 wchmm->outstyle[ntmp] = AS_LSET;
01122 wchmm->state[ntmp].out.lset = &(lcd->stateset[k]);
01123 } else {
01124 wchmm->outstyle[ntmp] = AS_STATE;
01125 if (ltmp->is_pseudo) {
01126 jlog("WARNING: wchmm: word-internal phone should not be pseudo\n");
01127 put_voca(stdout, wchmm->winfo, word);
01128 ok_p = FALSE;
01129 }
01130 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01131 }
01132 } else {
01133
01134 if (ltmp->is_pseudo) {
01135 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01136 put_voca(stdout, wchmm->winfo, word);
01137 ok_p = FALSE;
01138 }
01139 wchmm->outstyle[ntmp] = AS_STATE;
01140 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01141 }
01142 #else
01143 if (ltmp->is_pseudo) {
01144 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01145 put_voca(stdout, wchmm->winfo, word);
01146 ok_p = FALSE;
01147 }
01148 wchmm->state[ntmp].out = ltmp->body.defined->s[k];
01149 #endif
01150
01151
01152 acc_init(wchmm, ntmp);
01153 wchmm->stend[ntmp] = WORD_INVALID;
01154 if (! wchmm->hmminfo->multipath) {
01155
01156 for (ato = 1; ato < ltmp_state_num; ato++) {
01157 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01158 if (prob != LOG_ZERO) {
01159 if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) {
01160
01161 } else {
01162 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01163 }
01164 }
01165 }
01166 }
01167
01168 ntmp++;
01169
01170 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01171 }
01172 }
01173
01174 if (wchmm->hmminfo->multipath) {
01175
01176
01177
01178
01179 ntmp = n;
01180 for (j = add_head; j <= add_tail; j++) {
01181 ltmp = wchmm->winfo->wseq[word][j];
01182 ltmp_state_num = hmm_logical_state_num(ltmp);
01183 out_num_next = 0;
01184
01185 for (ato = 1; ato < ltmp_state_num; ato++) {
01186 prob = (hmm_logical_trans(ltmp))->a[0][ato];
01187 if (prob != LOG_ZERO) {
01188
01189 if (ato == ltmp_state_num - 1) {
01190
01191 for(kkk=0; kkk<out_num_prev; kkk++) {
01192 out_from_next[out_num_next] = out_from[kkk];
01193 out_a_next[out_num_next] = out_a[kkk] + prob;
01194 out_num_next++;
01195 }
01196 } else {
01197 for(kkk=0; kkk<out_num_prev; kkk++) {
01198 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01199 }
01200 }
01201 }
01202 }
01203
01204 for(k = 1; k < ltmp_state_num - 1; k++) {
01205 for (ato = 1; ato < ltmp_state_num; ato++) {
01206 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01207 if (prob != LOG_ZERO) {
01208 if (ato == ltmp_state_num - 1) {
01209
01210 out_from_next[out_num_next] = ntmp;
01211 out_a_next[out_num_next] = prob;
01212 out_num_next++;
01213 } else {
01214 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01215 }
01216 }
01217 }
01218 ntmp++;
01219 }
01220
01221 for(kkk=0;kkk<out_num_next;kkk++) {
01222 out_from[kkk] = out_from_next[kkk];
01223 out_a[kkk] = out_a_next[kkk];
01224 }
01225 out_num_prev = out_num_next;
01226 }
01227 }
01228
01229 }
01230
01231
01232
01233
01234
01235
01236
01237 if (wchmm->hmminfo->multipath && enable_iwsp && add_tail - add_head + 1 > 0) {
01238 int ntmp_bak;
01239
01240
01241 ntmp_bak = ntmp;
01242 if (wchmm->hmminfo->sp->is_pseudo) {
01243 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01244 wchmm->outstyle[ntmp] = AS_LSET;
01245 wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]);
01246 acc_init(wchmm, ntmp);
01247 wchmm->stend[ntmp] = WORD_INVALID;
01248 ntmp++;
01249 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01250 }
01251 } else {
01252 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01253 wchmm->outstyle[ntmp] = AS_STATE;
01254 wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k];
01255 acc_init(wchmm, ntmp);
01256 wchmm->stend[ntmp] = WORD_INVALID;
01257 ntmp++;
01258 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01259 }
01260 }
01261 ntmp = ntmp_bak;
01262
01263 out_num_next = 0;
01264 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01265 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato];
01266 if (prob != LOG_ZERO) {
01267
01268
01269 prob += wchmm->hmminfo->iwsp_penalty;
01270 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01271
01272 for(kkk=0; kkk<out_num_prev; kkk++) {
01273 out_from_next[out_num_next] = out_from[kkk];
01274 out_a_next[out_num_next] = out_a[kkk] + prob;
01275 out_num_next++;
01276 }
01277 } else {
01278
01279 for(kkk=0; kkk<out_num_prev; kkk++) {
01280 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01281 }
01282 }
01283 }
01284 }
01285
01286 if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) {
01287
01288
01289 prob = 0.0;
01290 for(kkk=0; kkk<out_num_prev; kkk++) {
01291 out_from_next[out_num_next] = out_from[kkk];
01292 out_a_next[out_num_next] = out_a[kkk] + prob;
01293 out_num_next++;
01294 }
01295 }
01296
01297 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01298 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01299 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato];
01300 if (prob != LOG_ZERO) {
01301 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01302 out_from_next[out_num_next] = ntmp;
01303 out_a_next[out_num_next] = prob;
01304 out_num_next++;
01305 } else {
01306 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01307 }
01308 }
01309 }
01310 ntmp++;
01311 }
01312
01313 for(kkk=0;kkk<out_num_next;kkk++) {
01314 out_from[kkk] = out_from_next[kkk];
01315 out_a[kkk] = out_a_next[kkk];
01316 }
01317 out_num_prev = out_num_next;
01318
01319 }
01320
01321
01322 for (j=0;j<word_len;j++) {
01323 if (j < add_head) {
01324 wchmm->offset[word][j] = wchmm->offset[matchword][j];
01325 } else if (add_tail < j) {
01326 wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)];
01327 } else {
01328 wchmm->offset[word][j] = n;
01329 n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2;
01330 }
01331 }
01332
01333
01334 if (wchmm->hmminfo->multipath) {
01335
01336
01337
01338 if (enable_iwsp && add_tail - add_head + 1 > 0) {
01339 n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2;
01340 if (n != ntmp) j_internal_error("wchmm_add_word: cannot match\n");
01341 }
01342
01343
01344 wchmm->wordend[word] = n;
01345 wchmm->stend[n] = word;
01346 acc_init(wchmm, n);
01347 wchmm->state[n].out.state = NULL;
01348
01349
01350 for(k = 0; k < out_num_prev; k++) {
01351 add_wacc(wchmm, out_from[k], out_a[k], n);
01352 }
01353 n++;
01354 if (n >= wchmm->maxwcn) wchmm_extend(wchmm);
01355
01356 if (matchlen == 0) {
01357
01358
01359 out_num_prev = 0;
01360 get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, enable_iwsp);
01361 for(k=0;k<out_num_prev;k++) {
01362 if (out_from[k] == wchmm->wordbegin[word]) {
01363 jlog("ERROR: *** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n");
01364 jlog("ERROR: Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]);
01365 jlog("ERROR: All HMMs in the word:\n ");
01366 for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) {
01367 jlog("%s ", wchmm->winfo->wseq[word][kkk]->name);
01368 }
01369 jlog("\n");
01370 jlog("ERROR: has transitions from initial state to final state.\n");
01371 jlog("ERROR: This type of word skipping is not supported.\n");
01372 ok_p = FALSE;
01373 }
01374 }
01375 }
01376
01377 wchmm->n = n;
01378
01379 } else {
01380
01381 wchmm->n = n;
01382 k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1;
01383 wchmm->wordend[word] = k;
01384 wchmm->stend[k] = word;
01385
01386 if (matchlen != 0 && add_tail - add_head + 1 > 0) {
01387
01388
01389 wchmm_link_subword(wchmm, matchword,add_to,word,add_head);
01390 }
01391
01392 }
01393
01394 return(ok_p);
01395
01396 }
01397
01398
01399
01400
01401
01416 static void
01417 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm)
01418 {
01419 WORD_ID w;
01420 HTK_HMM_Trans *tr;
01421 LOGPROB a;
01422
01423 for (w=0;w<wchmm->winfo->num;w++) {
01424 tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]);
01425 a = tr->a[tr->statenum-2][tr->statenum-1];
01426 wchmm->wordend_a[w] = a;
01427 }
01428 }
01429
01430 #ifdef SEPARATE_BY_UNIGRAM
01431
01432
01433
01434
01435
01454 static int
01455 compare_prob(LOGPROB *a, LOGPROB *b)
01456 {
01457 if (*a < *b) return (1);
01458 if (*a > *b) return (-1);
01459 return(0);
01460 }
01461
01480 static LOGPROB
01481 get_nbest_uniprob(WCHMM_INFO *wchmm, int n)
01482 {
01483 LOGPROB *u_p;
01484 WORD_ID w;
01485 LOGPROB x;
01486 WORD_INFO *winfo;
01487 NGRAM_INFO *ngram;
01488
01489 winfo = wchmm->winfo;
01490 ngram = wchmm->ngram;
01491
01492 if (n < 1) n = 1;
01493 if (n > winfo->num) n = winfo->num;
01494
01495
01496 u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num);
01497 for(w=0;w<winfo->num;w++) {
01498 if (ngram) {
01499 x = uni_prob(ngram, winfo->wton[w])
01500 #ifdef CLASS_NGRAM
01501 + winfo->cprob[w]
01502 #endif
01503 ;
01504 } else {
01505 x = LOG_ZERO;
01506 }
01507 if (wchmm->lmvar == LM_NGRAM_USER) {
01508 x = (*(wchmm->uni_prob_user))(wchmm->winfo, w, x);
01509 }
01510 u_p[w] = x;
01511 }
01512
01513
01514 qsort(u_p, winfo->num, sizeof(LOGPROB),
01515 (int (*)(const void *,const void *))compare_prob);
01516
01517
01518 x = u_p[n-1];
01519 free(u_p);
01520 return(x);
01521 }
01522
01523 #endif
01524
01525
01526
01527
01528
01529 #define COUNT_STEP 500
01530
01531
01552 boolean
01553 build_wchmm(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01554 {
01555 int i,j;
01556 int matchword=0, sharelen=0, maxsharelen=0;
01557 int num_duplicated;
01558 #ifdef SEPARATE_BY_UNIGRAM
01559 LOGPROB separate_thres;
01560 LOGPROB p;
01561 #endif
01562 boolean ok_p;
01563
01564
01565
01566 if (wchmm->winfo == NULL
01567 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01568 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01569 ) {
01570 jlog("ERROR: wchmm: linguistic info not available!!\n");
01571 return FALSE;
01572 }
01573
01574 ok_p = TRUE;
01575
01576 #ifdef SEPARATE_BY_UNIGRAM
01577
01578
01579 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01580 #endif
01581
01582 #ifdef PASS1_IWCD
01583 #ifndef USE_OLD_IWCD
01584 if (wchmm->category_tree) {
01585 if (wchmm->ccd_flag) {
01586
01587 lcdset_register_with_category_all(wchmm);
01588 }
01589 }
01590 #endif
01591 #endif
01592
01593
01594
01595 wchmm_init(wchmm);
01596
01597
01598 wchmm->separated_word_count=0;
01599
01600 jlog("STAT: wchmm: Building HMM lexicon tree (left-to-right)\n");
01601 for (i=0;i<wchmm->winfo->num;i++) {
01602
01603 if (wchmm->lmtype == LM_PROB) {
01604 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01605
01606
01607
01608 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01609 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01610 ok_p = FALSE;
01611 }
01612 continue;
01613 }
01614 #ifndef NO_SEPARATE_SHORT_WORD
01615 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01616
01617
01618 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01619 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01620 ok_p = FALSE;
01621 }
01622 wchmm->separated_word_count++;
01623 continue;
01624 }
01625 #endif
01626 #ifdef SEPARATE_BY_UNIGRAM
01627 if (wchmm->ngram) {
01628 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01629 #ifdef CLASS_NGRAM
01630 + wchmm->winfo->cprob[i]
01631 #endif
01632 ;
01633 } else {
01634 p = LOG_ZERO;
01635 }
01636 if (wchmm->lmvar == LM_NGRAM_USER) {
01637 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01638 }
01639 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01640
01641
01642 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01643 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01644 ok_p = FALSE;
01645 }
01646 wchmm->separated_word_count++;
01647 continue;
01648 }
01649 #endif
01650 }
01651
01652
01653 maxsharelen=0;
01654 for (j=0;j<i;j++) {
01655 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) {
01656 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue;
01657 }
01658 sharelen = wchmm_check_match(wchmm->winfo, i, j);
01659 if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) {
01660
01661
01662 maxsharelen = sharelen;
01663 matchword = j;
01664 break;
01665 }
01666 if (sharelen > maxsharelen) {
01667 matchword = j;
01668 maxsharelen = sharelen;
01669 }
01670 }
01671 if (wchmm_add_word(wchmm, i, maxsharelen, matchword, lmconf->enable_iwsp) == FALSE) {
01672 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01673 ok_p = FALSE;
01674 }
01675 }
01676
01677 #if 0
01678
01679 for (i=0;i<wchmm->winfo->num;i++) {
01680 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01681 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01682 ok_p = FALSE;
01683 }
01684 }
01685 #endif
01686 jlog("STAT: %5d words ended (%6d nodes)\n",i,wchmm->n);
01687
01688 if (! wchmm->hmminfo->multipath) {
01689
01690 num_duplicated = wchmm_duplicate_leafnode(wchmm);
01691 jlog("STAT: %d leaf nodes are made unshared\n", num_duplicated);
01692
01693
01694 wchmm_calc_wordend_arc(wchmm);
01695 }
01696
01697
01698 check_wchmm(wchmm);
01699
01700
01701 if (!wchmm->category_tree) {
01702
01703 #ifdef UNIGRAM_FACTORING
01704 if (wchmm->lmtype == LM_PROB) {
01705
01706 make_successor_list_unigram_factoring(wchmm);
01707 jlog("STAT: 1-gram factoring values has been pre-computed\n");
01708 } else {
01709 make_successor_list(wchmm);
01710 }
01711 #else
01712 make_successor_list(wchmm);
01713 #endif
01714
01715 if (wchmm->hmminfo->multipath) {
01716
01717 adjust_sc_index(wchmm);
01718 }
01719
01720 #ifdef UNIGRAM_FACTORING
01721 if (wchmm->lmtype == LM_PROB) {
01722
01723 make_iwcache_index(wchmm);
01724 }
01725 #endif
01726
01727
01728 if (wchmm->sclist2node != NULL) {
01729 free(wchmm->sclist2node);
01730 wchmm->sclist2node = NULL;
01731 }
01732
01733 }
01734
01735 jlog("STAT: done\n");
01736
01737 return ok_p;
01738 }
01739
01765 boolean
01766 build_wchmm2(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01767 {
01768 int i,j, last_i;
01769 int num_duplicated;
01770 WORD_ID *windex;
01771 #ifdef SEPARATE_BY_UNIGRAM
01772 LOGPROB separate_thres;
01773 LOGPROB p;
01774 #endif
01775 boolean ok_p;
01776 boolean ret;
01777
01778
01779
01780 if (wchmm->winfo == NULL
01781 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01782 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01783 ) {
01784 jlog("ERROR: wchmm: linguistic info not available!!\n");
01785 return FALSE;
01786 }
01787
01788 ok_p = TRUE;
01789
01790 wchmm->separated_word_count = 0;
01791
01792 jlog("STAT: Building HMM lexicon tree\n");
01793
01794 if (wchmm->lmtype == LM_PROB) {
01795 #ifdef SEPARATE_BY_UNIGRAM
01796
01797
01798 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01799 #endif
01800 }
01801
01802 #ifdef PASS1_IWCD
01803 #ifndef USE_OLD_IWCD
01804 if (wchmm->category_tree) {
01805 if (wchmm->ccd_flag) {
01806
01807
01808
01809 lcdset_register_with_category_all(wchmm);
01810 }
01811 }
01812 #endif
01813 #endif
01814
01815
01816 wchmm_init(wchmm);
01817
01818
01819 windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num);
01820 for(i=0;i<wchmm->winfo->num;i++) windex[i] = i;
01821
01822 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) {
01823
01824
01825 wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num);
01826 {
01827 int last_cate;
01828 last_i = 0;
01829 last_cate = wchmm->winfo->wton[windex[0]];
01830 for(i = 1;i<wchmm->winfo->num;i++) {
01831 if (wchmm->winfo->wton[windex[i]] != last_cate) {
01832 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i);
01833 last_cate = wchmm->winfo->wton[windex[i]];
01834 last_i = i;
01835 }
01836 }
01837 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i);
01838 }
01839
01840 } else {
01841
01842
01843 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num);
01844
01845 }
01846
01847
01848
01849
01850
01851
01852
01853
01854
01855
01856
01857
01858
01859 last_i = WORD_INVALID;
01860 for (j=0;j<wchmm->winfo->num;j++) {
01861 i = windex[j];
01862
01863 if (wchmm->lmtype == LM_PROB) {
01864
01865
01866 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01867
01868 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01869 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01870 ok_p = FALSE;
01871 }
01872 continue;
01873 }
01874 #ifndef NO_SEPARATE_SHORT_WORD
01875
01876 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01877 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01878 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01879 ok_p = FALSE;
01880 }
01881 wchmm->separated_word_count++;
01882 continue;
01883 }
01884 #endif
01885 #ifdef SEPARATE_BY_UNIGRAM
01886 if (wchmm->ngram) {
01887 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01888 #ifdef CLASS_NGRAM
01889 + wchmm->winfo->cprob[i]
01890 #endif
01891 ;
01892 } else {
01893 p = LOG_ZERO;
01894 }
01895 if (wchmm->lmvar == LM_NGRAM_USER) {
01896 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01897 }
01898
01899 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01900 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01901 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01902 ok_p = FALSE;
01903 }
01904 wchmm->separated_word_count++;
01905 continue;
01906 }
01907 #endif
01908 }
01909
01910 if (last_i == WORD_INVALID) {
01911 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01912 } else {
01913
01914 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) {
01915 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) {
01916 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01917 } else {
01918 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01919 }
01920 } else {
01921 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01922 }
01923 }
01924 if (ret == FALSE) {
01925 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01926 ok_p = FALSE;
01927 }
01928 last_i = i;
01929
01930 }
01931
01932
01933
01934
01935 free(windex);
01936
01937 if (wchmm->hmminfo->multipath) {
01938 jlog("STAT: lexicon size: %d nodes\n", wchmm->n);
01939 } else {
01940
01941 jlog("STAT: lexicon size: %d", wchmm->n);
01942 num_duplicated = wchmm_duplicate_leafnode(wchmm);
01943 jlog("+%d=%d\n", num_duplicated, wchmm->n);
01944 }
01945
01946 if (! wchmm->hmminfo->multipath) {
01947
01948 wchmm_calc_wordend_arc(wchmm);
01949 }
01950
01951
01952 check_wchmm(wchmm);
01953
01954
01955 if (!wchmm->category_tree) {
01956
01957 #ifdef UNIGRAM_FACTORING
01958 if (wchmm->lmtype == LM_PROB) {
01959
01960 make_successor_list_unigram_factoring(wchmm);
01961 jlog("STAT: 1-gram factoring values has been pre-computed\n");
01962 } else {
01963 make_successor_list(wchmm);
01964 }
01965 #else
01966 make_successor_list(wchmm);
01967 #endif
01968 if (wchmm->hmminfo->multipath) {
01969
01970 adjust_sc_index(wchmm);
01971 }
01972 #ifdef UNIGRAM_FACTORING
01973 if (wchmm->lmtype == LM_PROB) {
01974
01975 make_iwcache_index(wchmm);
01976 }
01977 #endif
01978
01979
01980 if (wchmm->sclist2node != NULL) {
01981 free(wchmm->sclist2node);
01982 wchmm->sclist2node = NULL;
01983 }
01984
01985 }
01986
01987
01988
01989 #ifdef WCHMM_SIZE_CHECK
01990 if (debug2_flag) {
01991
01992 jlog("STAT: --- memory size of word lexicon ---\n");
01993 jlog("STAT: wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n);
01994 jlog("STAT: %9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n);
01995 {
01996 int count1 = 0;
01997 int count2 = 0;
01998 int count3 = 0;
01999 for(i=0;i<wchmm->n;i++) {
02000 if (wchmm->self_a[i] != LOG_ZERO) count1++;
02001 if (wchmm->next_a[i] != LOG_ZERO) count2++;
02002 if (wchmm->ac[i] != NULL) count3++;
02003 }
02004 jlog("STAT: %9d bytes: wchmm->self_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count1 / (float)wchmm->n);
02005 jlog("STAT: %9d bytes: wchmm->next_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count2 / (float)wchmm->n);
02006 jlog("STAT: %9d bytes: wchmm->ac[node] (%4.1f%% used)\n", sizeof(A_CELL2 *) * wchmm->n, 100.0 * count3 / (float)wchmm->n);
02007 }
02008 jlog("STAT: %9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n);
02009 {
02010 int w,count;
02011 count = 0;
02012 for(w=0;w<wchmm->winfo->num;w++) {
02013 count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *);
02014 }
02015 jlog("STAT: %9d bytes: wchmm->offset[w][]\n", count);
02016 }
02017 if (wchmm->hmminfo->multipath) {
02018 jlog("STAT: %9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int));
02019 }
02020 jlog("STAT: %9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int));
02021 jlog("STAT: %9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int));
02022 if (wchmm->category_tree) {
02023 jlog("STAT: %9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID));
02024 }
02025 #ifdef UNIGRAM_FACTORING
02026 if (wchmm->lmtype == LM_PROB) {
02027 jlog("STAT: %9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int));
02028 }
02029 #endif
02030 if (!wchmm->hmminfo->multipath) {
02031 jlog("STAT: %9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB));
02032 }
02033 #ifdef PASS1_IWCD
02034 jlog("STAT: %9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char));
02035 {
02036 int c;
02037 c = 0;
02038 for(i=0;i<wchmm->n;i++) {
02039 switch(wchmm->outstyle[i]) {
02040 case AS_RSET:
02041 c += sizeof(RC_INFO);
02042 break;
02043 case AS_LRSET:
02044 c += sizeof(LRC_INFO);
02045 break;
02046 }
02047 }
02048 if (c > 0) jlog("STAT: %9d bytes: wchmm->out (RC_INFO / LRC_INFO)\n", c);
02049 }
02050 #endif
02051 if (!wchmm->category_tree) {
02052 jlog("STAT: %9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *));
02053 jlog("STAT: %9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int));
02054 #ifdef UNIGRAM_FACTORING
02055 if (wchmm->lmtype == LM_PROB) {
02056 jlog("STAT: %9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB));
02057 }
02058 #endif
02059 }
02060
02061 {
02062 int count, n;
02063 A_CELL2 *ac;
02064 count = 0;
02065 for(n=0;n<wchmm->n;n++) {
02066 for(ac=wchmm->ac[n];ac;ac=ac->next) {
02067 count += sizeof(A_CELL2);
02068 }
02069 }
02070 jlog("STAT: %9d bytes: A_CELL2\n", count);
02071 }
02072 if (!wchmm->category_tree) {
02073 jlog("STAT: %9d bytes: sclist\n", wchmm->scnum * sizeof(S_CELL *));
02074 jlog("STAT: %9d bytes: sclist2node\n", wchmm->scnum * sizeof(int));
02075 }
02076
02077 }
02078
02079 #endif
02080
02081
02082 return ok_p;
02083
02084 }
02085
02086
02101 void
02102 print_wchmm_info(WCHMM_INFO *wchmm)
02103 {
02104 int n,i, rootnum;
02105
02106 if (wchmm->hmminfo->multipath) {
02107 rootnum = wchmm->startnum;
02108 } else {
02109 if (wchmm->lmtype == LM_PROB) {
02110 rootnum = wchmm->startnum + 1;
02111 } else if (wchmm->lmtype == LM_DFA) {
02112 rootnum = wchmm->startnum;
02113 }
02114 }
02115
02116 jlog(" Lexicon tree:\n");
02117 jlog("\t total node num = %6d\n", wchmm->n);
02118 if (wchmm->lmtype == LM_PROB) {
02119 jlog("\t root node num = %6d\n", rootnum);
02120 #ifdef NO_SEPARATE_SHORT_WORD
02121 #ifdef SEPARATE_BY_UNIGRAM
02122 jlog("\t(%d hi-freq. words are separated from tree lexicon)\n", wchmm->separated_word_count);
02123 #else
02124 jlog(" (no words are separated from tree)\n");
02125 #endif
02126 #else
02127 jlog(" (%d short words (<= %d phonemes) are separated from tree)\n", wchmm->separated_word_count, SHORT_WORD_LEN);
02128 #endif
02129 }
02130 if (wchmm->lmtype == LM_DFA) {
02131 jlog("\t root node num = %6d\n", rootnum);
02132 }
02133 for(n=0,i=0;i<wchmm->n;i++) {
02134 if (wchmm->stend[i] != WORD_INVALID) n++;
02135 }
02136 jlog("\t leaf node num = %6d\n", n);
02137 if (!wchmm->category_tree) {
02138 jlog("\t fact. node num = %6d\n", wchmm->scnum - 1);
02139 }
02140 }
02141
02142