00001
00159
00160
00161
00162
00163
00164
00165
00166 #include <julius/julius.h>
00167
00168
00169
00190 static void
00191 add_successor(WCHMM_INFO *wchmm, int node, WORD_ID w)
00192 {
00193 S_CELL *sctmp, *sc;
00194
00195
00196 sctmp=(S_CELL *) mymalloc(sizeof(S_CELL));
00197
00198 sctmp->word = w;
00199
00200 if (wchmm->state[node].scid == 0) {
00201 j_internal_error("add_successor: sclist id not assigned to branch node?\n");
00202 }
00203 sc = wchmm->sclist[wchmm->state[node].scid];
00204 if (sc == NULL || sctmp->word < sc->word) {
00205 sctmp->next = sc;
00206 wchmm->sclist[wchmm->state[node].scid] = sctmp;
00207 } else {
00208 for(;sc;sc=sc->next) {
00209 if (sc->next == NULL || sctmp->word < (sc->next)->word) {
00210 if (sctmp->word == sc->word) break;
00211 sctmp->next = sc->next;
00212 sc->next = sctmp;
00213 break;
00214 }
00215 }
00216 }
00217 }
00218
00239 static boolean
00240 match_successor(WCHMM_INFO *wchmm, int node1, int node2)
00241 {
00242 S_CELL *sc1,*sc2;
00243
00244
00245 if (wchmm->state[node1].scid == 0 || wchmm->state[node2].scid == 0) {
00246 j_internal_error("match_successor: sclist id not assigned to branch node?\n");
00247 }
00248 sc1 = wchmm->sclist[wchmm->state[node1].scid];
00249 sc2 = wchmm->sclist[wchmm->state[node2].scid];
00250 for (;;) {
00251 if (sc1 == NULL || sc2 == NULL) {
00252 if (sc1 == NULL && sc2 == NULL) {
00253 return TRUE;
00254 } else {
00255 return FALSE;
00256 }
00257 } else if (sc1->word != sc2->word) {
00258 return FALSE;
00259 }
00260 sc1 = sc1->next;
00261 sc2 = sc2->next;
00262 }
00263 }
00264
00279 static void
00280 free_successor(WCHMM_INFO *wchmm, int scid)
00281 {
00282 S_CELL *sc;
00283 S_CELL *sctmp;
00284
00285
00286 sc = wchmm->sclist[scid];
00287 while (sc != NULL) {
00288 sctmp = sc;
00289 sc = sc->next;
00290 free(sctmp);
00291 }
00292 }
00293
00308 static void
00309 compaction_successor(WCHMM_INFO *wchmm)
00310 {
00311 int src, dst;
00312
00313 dst = 1;
00314 for(src=1;src<wchmm->scnum;src++) {
00315 if (wchmm->state[wchmm->sclist2node[src]].scid <= 0) {
00316
00317 continue;
00318 }
00319 if (dst != src) {
00320 wchmm->sclist[dst] = wchmm->sclist[src];
00321 wchmm->sclist2node[dst] = wchmm->sclist2node[src];
00322 wchmm->state[wchmm->sclist2node[dst]].scid = dst;
00323 }
00324 dst++;
00325 }
00326 if (debug2_flag) {
00327 jlog("DEBUG: successor list shrinked from %d to %d\n", wchmm->scnum, dst);
00328 }
00329 wchmm->scnum = dst;
00330 }
00331
00346 static void
00347 shrink_successor(WCHMM_INFO *wchmm)
00348 {
00349 wchmm->sclist = (S_CELL **)myrealloc(wchmm->sclist, sizeof(S_CELL *) * wchmm->scnum);
00350 wchmm->sclist2node = (int *)myrealloc(wchmm->sclist2node, sizeof(int) * wchmm->scnum);
00351 }
00352
00369 void
00370 make_successor_list(WCHMM_INFO *wchmm)
00371 {
00372 int node;
00373 WORD_ID w;
00374 int i;
00375 boolean *freemark;
00376 int s;
00377
00378 jlog("STAT: make successor lists for factoring\n");
00379
00380
00381
00382 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0;
00383
00384 s = 1;
00385 for (w=0;w<wchmm->winfo->num;w++) {
00386 for (i=0;i<wchmm->winfo->wlen[w];i++) {
00387 if (wchmm->state[wchmm->offset[w][i]].scid == 0) {
00388 wchmm->state[wchmm->offset[w][i]].scid = s;
00389 s++;
00390 }
00391 }
00392 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
00393 wchmm->state[wchmm->wordend[w]].scid = s;
00394 s++;
00395 }
00396 }
00397 wchmm->scnum = s;
00398 if (debug2_flag) {
00399 jlog("DEBUG: initial successor list size = %d\n", wchmm->scnum);
00400 }
00401
00402 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum);
00403 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL;
00404 wchmm->sclist2node = (int *)mymalloc(sizeof(int) * wchmm->scnum);
00405
00406 freemark = (boolean *)mymalloc(sizeof(boolean) * wchmm->scnum);
00407 for (i=1;i<wchmm->scnum;i++) freemark[i] = FALSE;
00408
00409
00410 for (w=0;w<wchmm->winfo->num;w++) {
00411
00412 for (i=0;i<wchmm->winfo->wlen[w];i++) {
00413 wchmm->sclist2node[wchmm->state[wchmm->offset[w][i]].scid] = wchmm->offset[w][i];
00414 add_successor(wchmm, wchmm->offset[w][i], w);
00415 }
00416
00417 wchmm->sclist2node[wchmm->state[wchmm->wordend[w]].scid] = wchmm->wordend[w];
00418 add_successor(wchmm, wchmm->wordend[w], w);
00419 }
00420
00421
00422
00423
00424 for (w=0;w<wchmm->winfo->num;w++) {
00425 node = wchmm->wordend[w];
00426 i = wchmm->winfo->wlen[w]-1;
00427 while (i >= 0) {
00428 if (node == wchmm->offset[w][i]) {
00429
00430 i--;
00431 continue;
00432 }
00433 if (match_successor(wchmm, node, wchmm->offset[w][i])) {
00434 freemark[wchmm->state[node].scid] = TRUE;
00435 }
00436
00437
00438
00439
00440
00441 node = wchmm->offset[w][i];
00442 i--;
00443 }
00444 }
00445
00446 for (i=1;i<wchmm->scnum;i++) {
00447 if (freemark[i] == TRUE) {
00448 free_successor(wchmm, i);
00449
00450 wchmm->state[wchmm->sclist2node[i]].scid = 0;
00451 }
00452 }
00453
00454 compaction_successor(wchmm);
00455
00456 free(freemark);
00457
00458 jlog("STAT: done\n");
00459 }
00460
00479 void
00480 adjust_sc_index(WCHMM_INFO *wchmm)
00481 {
00482 WORD_ID w;
00483 int i,j,k;
00484 HMM_Logical *ltmp;
00485 int ltmp_state_num;
00486 int ato;
00487 LOGPROB prob;
00488 int node, scid;
00489 A_CELL2 *ac;
00490
00491
00492 for(w=0;w<wchmm->winfo->num;w++) {
00493 for(k=0;k<wchmm->winfo->wlen[w];k++) {
00494 node = wchmm->offset[w][k];
00495 scid = wchmm->state[node].scid;
00496 if (scid == 0) continue;
00497 ltmp = wchmm->winfo->wseq[w][k];
00498 ltmp_state_num = hmm_logical_state_num(ltmp);
00499 if ((hmm_logical_trans(ltmp))->a[0][ltmp_state_num-1] != LOG_ZERO) {
00500 j = k + 1;
00501 if (j == wchmm->winfo->wlen[w]) {
00502 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
00503 jlog("STAT: word %d: factoring node copied for skip phone\n", w);
00504 wchmm->state[wchmm->wordend[w]].scid = scid;
00505 }
00506 } else {
00507 if (wchmm->state[wchmm->offset[w][j]].scid == 0) {
00508 jlog("STAT: word %d: factoring node copied for skip phone\n", w);
00509 wchmm->state[wchmm->offset[w][j]].scid = scid;
00510 }
00511 }
00512 }
00513 for(ato=1;ato<ltmp_state_num;ato++) {
00514 prob = (hmm_logical_trans(ltmp))->a[0][ato];
00515 if (prob != LOG_ZERO) {
00516 wchmm->state[node+ato-1].scid = scid;
00517 }
00518 }
00519 }
00520 }
00521
00522
00523 for(i=0;i<wchmm->startnum;i++) {
00524 node = wchmm->startnode[i];
00525 if (wchmm->state[node].out.state != NULL) {
00526 j_internal_error("adjust_sc_index: outprob exist in word-head node??\n");
00527 }
00528 if (wchmm->next_a[node] != LOG_ZERO) {
00529 if (wchmm->state[node+1].scid != 0) {
00530 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[node+1].scid) {
00531 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n");
00532 }
00533 wchmm->state[node].scid = wchmm->state[node+1].scid;
00534 wchmm->state[node+1].scid = 0;
00535 }
00536 }
00537 for(ac=wchmm->ac[node];ac;ac=ac->next) {
00538 for(k=0;k<ac->n;k++) {
00539 if (wchmm->state[ac->arc[k]].scid != 0) {
00540 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[ac->arc[k]].scid) {
00541 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n");
00542 }
00543 wchmm->state[node].scid = wchmm->state[ac->arc[k]].scid;
00544 wchmm->state[ac->arc[k]].scid = 0;
00545 }
00546 }
00547 }
00548 }
00549 }
00550
00551
00552
00553
00554
00573 void
00574 max_successor_cache_init(WCHMM_INFO *wchmm)
00575 {
00576 int i;
00577 LM_PROB_CACHE *l;
00578 WORD_ID wnum;
00579
00580
00581 shrink_successor(wchmm);
00582
00583
00584 l = &(wchmm->lmcache);
00585
00586 l->probcache = (LOGPROB *) mymalloc(sizeof(LOGPROB) * wchmm->scnum);
00587 l->lastwcache = (WORD_ID *) mymalloc(sizeof(WORD_ID) * wchmm->scnum);
00588 for (i=0;i<wchmm->scnum;i++) {
00589 l->lastwcache[i] = WORD_INVALID;
00590 }
00591
00592 if (wchmm->ngram) {
00593 wnum = wchmm->ngram->max_word_num;
00594 } else {
00595 wnum = wchmm->winfo->num;
00596 }
00597 #ifdef HASH_CACHE_IW
00598 l->iw_cache_num = wnum * jconf.search.pass1.iw_cache_rate / 100;
00599 if (l->iw_cache_num < 10) l->iw_cache_num = 10;
00600 #else
00601 l->iw_cache_num = wnum;
00602 #endif
00603 l->iw_sc_cache = (LOGPROB **)mymalloc(sizeof(LOGPROB *) * l->iw_cache_num);
00604 for (i=0;i<l->iw_cache_num;i++) {
00605 l->iw_sc_cache[i] = NULL;
00606 }
00607 #ifdef HASH_CACHE_IW
00608 l->iw_lw_cache = (WORD_ID *)mymalloc(sizeof(WORD_ID) * l->iw_cache_num);
00609 for (i=0;i<l->iw_cache_num;i++) {
00610 l->iw_lw_cache[i] = WORD_INVALID;
00611 }
00612 #endif
00613 }
00614
00627 static void
00628 max_successor_prob_iw_free(WCHMM_INFO *wchmm)
00629 {
00630 int i;
00631 LM_PROB_CACHE *l;
00632 l = &(wchmm->lmcache);
00633 for (i=0;i<l->iw_cache_num;i++) {
00634 if (l->iw_sc_cache[i] != NULL) free(l->iw_sc_cache[i]);
00635 l->iw_sc_cache[i] = NULL;
00636 }
00637 }
00638
00655 void
00656 max_successor_cache_free(WCHMM_INFO *wchmm)
00657 {
00658 free(wchmm->lmcache.probcache);
00659 free(wchmm->lmcache.lastwcache);
00660 max_successor_prob_iw_free(wchmm);
00661 free(wchmm->lmcache.iw_sc_cache);
00662 #ifdef HASH_CACHE_IW
00663 free(wchmm->lmcache.iw_lw_cache);
00664 #endif
00665 }
00666
00667 #ifdef UNIGRAM_FACTORING
00668
00709 void
00710 make_iwcache_index(WCHMM_INFO *wchmm)
00711 {
00712 int i, node, num;
00713
00714 wchmm->start2isolate = (int *)mymalloc(sizeof(int) * wchmm->startnum);
00715 num = 0;
00716 for(i=0;i<wchmm->startnum;i++) {
00717 node = wchmm->startnode[i];
00718 if (wchmm->state[node].scid >= 0) {
00719 wchmm->start2isolate[i] = num;
00720 num++;
00721 } else {
00722 wchmm->start2isolate[i] = -1;
00723 }
00724 }
00725 wchmm->isolatenum = num;
00726 }
00727
00772 void
00773 calc_all_unigram_factoring_values(WCHMM_INFO *wchmm)
00774 {
00775 S_CELL *sc, *sctmp;
00776 LOGPROB tmpprob, maxprob;
00777 int i, n;
00778
00779
00780 n = 0;
00781 for (i=1;i<wchmm->scnum;i++) {
00782 sc = wchmm->sclist[i];
00783 if (sc == NULL) {
00784 j_internal_error("call_all_unigram_factoring_values: sclist has no sc?\n");
00785 }
00786 if (sc->next != NULL) {
00787
00788 n++;
00789 }
00790 }
00791 wchmm->fsnum = n + 1;
00792
00793 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum);
00794
00795 n = 1;
00796 for (i=1;i<wchmm->scnum;i++) {
00797 sc = wchmm->sclist[i];
00798 if (sc->next != NULL) {
00799 maxprob = LOG_ZERO;
00800 for (sctmp = sc; sctmp; sctmp = sctmp->next) {
00801 if (wchmm->ngram) {
00802 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[sctmp->word])
00803 #ifdef CLASS_NGRAM
00804 + wchmm->winfo->cprob[sctmp->word]
00805 #endif
00806 ;
00807 } else {
00808 tmpprob = LOG_ZERO;
00809 }
00810 if (wchmm->lmvar == LM_NGRAM_USER) {
00811 tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, sctmp->word, tmpprob);
00812 }
00813 if (maxprob < tmpprob) maxprob = tmpprob;
00814 }
00815 wchmm->fscore[n] = maxprob;
00816 free_successor(wchmm, i);
00817 wchmm->state[wchmm->sclist2node[i]].scid = - n;
00818 n++;
00819 }
00820 }
00821
00822 compaction_successor(wchmm);
00823 }
00824
00825 #else
00826
00849 static LOGPROB
00850 calc_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
00851 {
00852 S_CELL *sc;
00853 LOGPROB tmpprob, maxprob;
00854 WORD_ID lw;
00855
00856 maxprob = LOG_ZERO;
00857 if (wchmm->ngram) {
00858 lw = wchmm->winfo->wton[lastword];
00859 }
00860
00861 for (sc = wchmm->sclist[wchmm->state[node].scid]; sc; sc = sc->next) {
00862 if (wchmm->ngram) {
00863 tmpprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, lw , wchmm->winfo->wton[sc->word])
00864 #ifdef CLASS_NGRAM
00865 + wchmm->winfo->cprob[sc->word]
00866 #endif
00867 ;
00868 } else {
00869 tmpprob = LOG_ZERO;
00870 }
00871 if (wchmm->lmvar == LM_NGRAM_USER) {
00872 tmpprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, sc->word, tmpprob);
00873 }
00874 if (maxprob < tmpprob) maxprob = tmpprob;
00875 }
00876
00877 return(maxprob);
00878 }
00879
00880 #endif
00881
00924 LOGPROB
00925 max_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
00926 {
00927 LOGPROB maxprob;
00928 WORD_ID last_nword, w;
00929 int scid;
00930 LM_PROB_CACHE *l;
00931
00932 l = &(wchmm->lmcache);
00933
00934 if (lastword != WORD_INVALID) {
00935 if (wchmm->ngram) {
00936 last_nword = wchmm->winfo->wton[lastword];
00937 } else {
00938 last_nword = lastword;
00939 }
00940 scid = wchmm->state[node].scid;
00941 #ifdef UNIGRAM_FACTORING
00942 if (scid < 0) {
00943
00944 return(wchmm->fscore[(- scid)]);
00945 } else {
00946
00947
00948 if (last_nword != l->lastwcache[scid]) {
00949
00950 w = (wchmm->sclist[scid])->word;
00951 if (wchmm->ngram) {
00952 maxprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])
00953 #ifdef CLASS_NGRAM
00954 + wchmm->winfo->cprob[w]
00955 #endif
00956 ;
00957 } else {
00958 maxprob = LOG_ZERO;
00959 }
00960 if (wchmm->lmvar == LM_NGRAM_USER) {
00961 maxprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, maxprob);
00962 }
00963 l->lastwcache[scid] = last_nword;
00964 l->probcache[scid] = maxprob;
00965 return(maxprob);
00966 } else {
00967
00968 return (l->probcache[scid]);
00969 }
00970 }
00971 #else
00972
00973 if (last_nword != l->lastwcache[scid]) {
00974 maxprob = calc_successor_prob(wchmm, lastword, node);
00975
00976 l->lastwcache[scid] = last_nword;
00977 l->probcache[scid] = maxprob;
00978 return(maxprob);
00979 } else {
00980 return (l->probcache[scid]);
00981 }
00982 #endif
00983 } else {
00984 return(0.0);
00985 #if 0
00986 maxprob = LOG_ZERO;
00987 for (sc=wchmm->state[node].sc;sc;sc=sc->next) {
00988 tmpprob = uni_prob(wchmm->ngram, sc->word);
00989 if (maxprob < tmpprob) maxprob = tmpprob;
00990 }
00991 return(maxprob);
00992 #endif
00993 }
00994
00995 }
00996
01031 LOGPROB *
01032 max_successor_prob_iw(WCHMM_INFO *wchmm, WORD_ID lastword)
01033 {
01034 int i, j, x, node;
01035 int last_nword;
01036 WORD_ID w;
01037 LM_PROB_CACHE *l;
01038 LOGPROB p;
01039
01040 l = &(wchmm->lmcache);
01041
01042 if (wchmm->ngram) {
01043 last_nword = wchmm->winfo->wton[lastword];
01044 } else {
01045 last_nword = lastword;
01046 }
01047
01048 #ifdef HASH_CACHE_IW
01049 x = last_nword % l->iw_cache_num;
01050 if (l->iw_lw_cache[x] == last_nword) {
01051 return(l->iw_sc_cache[x]);
01052 }
01053 #else
01054 if (l->iw_sc_cache[last_nword] != NULL) {
01055 return(l->iw_sc_cache[last_nword]);
01056 }
01057 x = last_nword;
01058
01059 #endif
01060
01061 if (l->iw_sc_cache[x] == NULL) {
01062 #ifdef UNIGRAM_FACTORING
01063 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
01064 #else
01065 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
01066 #endif
01067 if (l->iw_sc_cache[x] == NULL) {
01068
01069 max_successor_prob_iw_free(wchmm);
01070 jlog("STAT: inter-word LM cache (%dMB) rehashed\n",
01071 (l->iw_cache_num *
01072 #ifdef UNIGRAM_FACTORING
01073 wchmm->isolatenum
01074 #else
01075 wchmm->startnum
01076 #endif
01077 ) / 1000 * sizeof(LOGPROB) / 1000);
01078 #ifdef UNIGRAM_FACTORING
01079 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
01080 #else
01081 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
01082 #endif
01083 if (l->iw_sc_cache[x] == NULL) {
01084 j_internal_error("max_successor_prob_iw: cannot malloc\n");
01085 }
01086 }
01087 }
01088
01089
01090 #ifdef UNIGRAM_FACTORING
01091 for (j=0;j<wchmm->startnum;j++) {
01092 i = wchmm->start2isolate[j];
01093 if (i == -1) continue;
01094 node = wchmm->startnode[j];
01095 if (wchmm->state[node].scid <= 0) {
01096
01097 j_internal_error("max_successor_prob_iw: isolated (not shared) tree root node has unigram factoring value??\n");
01098 } else {
01099 w = (wchmm->sclist[wchmm->state[node].scid])->word;
01100 if (wchmm->ngram) {
01101 p = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])
01102 #ifdef CLASS_NGRAM
01103 + wchmm->winfo->cprob[w]
01104 #endif
01105 ;
01106 } else {
01107 p = LOG_ZERO;
01108 }
01109 if (wchmm->lmvar == LM_NGRAM_USER) {
01110 p = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, p);
01111 }
01112 l->iw_sc_cache[x][i] = p;
01113 }
01114 }
01115 #else
01116 for (i=0;i<wchmm->startnum;i++) {
01117 node = wchmm->startnode[i];
01118 l->iw_sc_cache[x][i] = calc_successor_prob(wchmm, lastword, node);
01119 }
01120 #endif
01121 #ifdef HASH_CACHE_IW
01122 l->iw_lw_cache[x] = last_nword;
01123 #endif
01124
01125 return(l->iw_sc_cache[x]);
01126 }
01127
01177 boolean
01178 can_succeed(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
01179 {
01180 int lc;
01181 S_CELL *sc;
01182
01183
01184
01185 if (lastword == WORD_INVALID) {
01186 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
01187 if (dfa_cp_begin(wchmm->dfa, sc->word) == TRUE) return(TRUE);
01188 }
01189 return(FALSE);
01190 } else {
01191 lc = wchmm->winfo->wton[lastword];
01192 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
01193 if (dfa_cp(wchmm->dfa, lc, sc->word) == TRUE) return(TRUE);
01194 }
01195 return(FALSE);
01196 }
01197 }
01198
01199