Main Page | Modules | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

wchmm.c

Go to the documentation of this file.
00001 
00036 /*
00037  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00038  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00039  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology, Nagoya Institute of Technology
00040  * All rights reserved
00041  */
00042 
00043 /* wchmm = word conjunction HMM = lexicon tree */
00044 
00045 #include <julius.h>
00046 
00047 
00048 static int dupcount = 0;        
00049 
00050 #undef WCHMM_SIZE_CHECK         
00051 
00052 
00053 /**************************************************************/
00054 /*********** Initialization of tree lexicon *******************/
00055 /**************************************************************/
00056 
00069 WCHMM_INFO *
00070 wchmm_new()
00071 {
00072   WCHMM_INFO *w;
00073   w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO));
00074 #ifdef USE_NGRAM
00075   w->ngram = NULL;
00076 #endif
00077 #ifdef USE_DFA
00078   w->dfa = NULL;
00079 #endif
00080   w->winfo = NULL;
00081   w->malloc_root = NULL;
00082 #ifdef PASS1_IWCD
00083 #ifdef CATEGORY_TREE
00084   w->lcdset_category_root = NULL;
00085 #endif /* CATEGORY_TREE */
00086 #endif /* PASS1_IWCD */
00087   return w;
00088 }
00089 
00102 static void
00103 wchmm_init(WCHMM_INFO *wchmm)
00104 {
00105   wchmm->maxwcn = MAXWCNSTEP;
00106   wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn);
00107 #ifdef MULTIPATH_VERSION
00108   wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00109 #else
00110   wchmm->ststart = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00111 #endif
00112   wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00113   wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num);
00114   wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00115 #ifdef MULTIPATH_VERSION
00116   wchmm->maxstartnum = STARTNODE_STEP;
00117   wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP);
00118   wchmm->startnum = 0;
00119 # ifdef CATEGORY_TREE
00120   wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP);
00121 # endif
00122 #else
00123   wchmm->startnode = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00124   wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num);
00125 #endif /* MULTIPATH_VERSION */
00126 #ifdef PASS1_IWCD
00127   wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn);
00128 #endif
00129 #ifdef UNIGRAM_FACTORING
00130   wchmm->start2isolate = NULL;
00131   wchmm->isolatenum = 0;
00132 #endif
00133 #ifndef CATEGORY_TREE
00134   wchmm->sclist = NULL;
00135   wchmm->sclist2node = NULL;
00136 #ifdef UNIGRAM_FACTORING
00137   wchmm->fscore = NULL;
00138 #endif
00139 #endif /* CATEGORY_TREE */
00140   wchmm->n = 0;
00141 }
00142 
00155 static void
00156 wchmm_extend(WCHMM_INFO *wchmm)
00157 {
00158   wchmm->maxwcn += MAXWCNSTEP;
00159   wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn);
00160 #ifndef MULTIPATH_VERSION
00161   wchmm->ststart = (WORD_ID *)myrealloc(wchmm->ststart, sizeof(WORD_ID)*wchmm->maxwcn);
00162 #endif
00163   wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn);
00164 #ifdef PASS1_IWCD
00165   wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn);
00166 #endif
00167 }
00168 
00169 #ifdef MULTIPATH_VERSION
00170 
00182 static void
00183 wchmm_extend_startnode(WCHMM_INFO *wchmm)
00184 {
00185   wchmm->maxstartnum += STARTNODE_STEP;
00186   wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum);
00187 #ifdef CATEGORY_TREE
00188   wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->startnode, sizeof(WORD_ID) * wchmm->maxstartnum);
00189 #endif
00190 }
00191 #endif /* MULTIPATH_VERSION */
00192 
00205 void
00206 wchmm_free(WCHMM_INFO *w)
00207 {
00208   S_CELL *sc, *sctmp;
00209   int i;
00210   /* wchmm->state[i].ac malloced by mybmalloc2() */
00211   /* wchmm->offset[][] malloced by mybmalloc2() */
00212 #ifdef PASS1_IWCD
00213   /* LRC_INFO, RC_INFO in wchmm->state[i].outsty malloced by mybmalloc2() */
00214 #endif
00215   /* they all will be freed by a single mybfree2() call */
00216   mybfree2(&(w->malloc_root));
00217 #ifndef CATEGORY_TREE
00218   if (w->sclist != NULL) {
00219     for(i=1;i<w->scnum;i++) {
00220       sc = w->sclist[i];
00221       while(sc) {
00222         sctmp = sc->next;
00223         free(sc);
00224         sc = sctmp;
00225       }
00226     }
00227     free(w->sclist);
00228   }
00229   if (w->sclist2node != NULL) free(w->sclist2node);
00230 #ifdef UNIGRAM_FACTORING
00231   if (w->fscore != NULL) free(w->fscore);
00232 #endif
00233 #endif
00234 #ifdef UNIGRAM_FACTORING
00235   if (w->start2isolate != NULL) free(w->start2isolate);
00236 #endif
00237 #ifdef PASS1_IWCD
00238   free(w->outstyle);
00239 #endif
00240 #ifndef MULTIPATH_VERSION
00241   free(w->wordend_a);
00242 #endif
00243   free(w->startnode);
00244 #ifdef MULTIPATH_VERSION
00245 # ifdef CATEGORY_TREE
00246   free(w->start2wid);
00247 # endif
00248 #endif /* MULTIPATH_VERSION */
00249   free(w->wordend);
00250   free(w->offset);
00251   free(w->stend);
00252 #ifdef MULTIPATH_VERSION
00253   free(w->wordbegin);
00254 #else
00255   free(w->ststart);
00256 #endif
00257   free(w->state);
00258 #ifdef PASS1_IWCD
00259 #ifdef CATEGORY_TREE
00260   lcdset_remove_with_category_all(w);
00261 #endif /* CATEGORY_TREE */
00262 #endif /* PASS1_IWCD */
00263   free(w);
00264 }
00265 
00266 
00267 /**************************************************************/
00268 /*********** Word sort functions for tree construction ********/
00269 /**************************************************************/
00270 
00271 static WORD_INFO *local_winfo;  
00272 
00291 static int
00292 compare_wseq(WORD_ID *widx1, WORD_ID *widx2)
00293 {
00294   int len1, len2, n;
00295   int p=0;
00296   
00297   len1 = local_winfo->wlen[*widx1];
00298   len2 = local_winfo->wlen[*widx2];
00299 
00300   n=0;
00301   /*  while (n < len1 && n < len2 && (p = (int)winfo->wseq[*widx1][n] - (int)winfo->wseq[*widx2][n]) == 0 ) n++;*/
00302   while (n < len1 && n < len2 && (p = strcmp((local_winfo->wseq[*widx1][n])->name, (local_winfo->wseq[*widx2][n])->name)) == 0 ) n++;
00303   if (n < len1) {
00304     if (n < len2) {
00305       /* differ */
00306       return(p);
00307     } else {
00308       /* 2 is part of 1 */
00309       return(1);
00310     }
00311   } else {
00312     if (n < len2) {
00313       /* 1 is part of 2 */
00314       return(-1);
00315     } else {
00316       /* same */
00317       return(0);
00318     }
00319   }
00320 }
00321 
00340 static void
00341 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len)
00342 {
00343   local_winfo = winfo;
00344   qsort(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_wseq);
00345 }
00346 
00347 #ifdef CATEGORY_TREE
00348 
00366 static int
00367 compare_category(WORD_ID *widx1, WORD_ID *widx2)
00368 {
00369   int c1,c2;
00370   c1 = local_winfo->wton[*widx1];
00371   c2 = local_winfo->wton[*widx2];
00372   return(c1 - c2);
00373 }
00374 
00391 static void
00392 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len)
00393 {
00394   local_winfo = winfo;
00395   qsort(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_category);
00396 }
00397 #endif /* CATEGORY_TREE */
00398   
00399 
00400 /**********************************************************************/
00401 /************** Subroutines to link part of words  ********************/
00402 /**********************************************************************/
00403 
00404 /* 
00405    返り値: 共有音素数 */
00406 /* compare 2 words 'i', 'j' from start phoneme, and return the number
00407    of sharable phonemes. */
00429 static int
00430 wchmm_check_match(WORD_INFO *winfo, int i, int j)
00431 {
00432   int k,tmplen;
00433 
00434   for (tmplen=0,k=0;k<winfo->wlen[i];k++) {
00435     if (k > winfo->wlen[j]-1)
00436       break;
00437     if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name)))
00438       break;
00439     tmplen++;
00440   }
00441   return(tmplen);
00442 }
00443 
00462 static void
00463 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00464 {
00465   A_CELL *ac;
00466   ac       = (A_CELL *) mybmalloc2(sizeof(A_CELL), &(wchmm->malloc_root));
00467   ac->a    = a;
00468   ac->arc  = arc;
00469   ac->next = wchmm->state[node].ac;
00470   wchmm->state[node].ac   = ac;
00471 }
00472 
00473 #ifdef MULTIPATH_VERSION
00474 
00475 /* MULTIPATH_VERSION: homophone processing is not needed */
00476 
00503 static void
00504 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp)
00505 {
00506   HMM_Logical *ltmp;
00507   int states;
00508   int k, ato;
00509   LOGPROB prob;
00510   int oldnum;
00511 
00512   if (pos < 0) {
00513     
00514     /* set the word-beginning node, and return */
00515     node[*num] = wchmm->wordbegin[w];
00516     a[*num] = 0.0;
00517     (*num)++;
00518     
00519   } else {
00520 
00521     ltmp = wchmm->winfo->wseq[w][pos];
00522     states = hmm_logical_state_num(ltmp);
00523 
00524     /* check initial->final state */
00525     if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) {
00526       /* recursive call for previous phone */
00527       oldnum = *num;
00528       get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE); /* previous phone should not be an sp-inserted phone */
00529       /* add probability of the skip transition to all the previous ones */
00530       for(k=oldnum;k<*num;k++) {
00531         a[k] += (hmm_logical_trans(ltmp))->a[0][states-1];
00532       }
00533     }
00534     /* add to list the arcs from output state to final state */
00535     for (k = 1; k < states - 1; k++) {
00536       prob = (hmm_logical_trans(ltmp))->a[k][states-1];
00537       if (prob != LOG_ZERO) {
00538         if (*num >= maxnum) {
00539           j_error("Maximum outtrans list num exceeded! (%d)\n", maxnum);
00540         }
00541         node[*num] = wchmm->offset[w][pos] + k - 1;
00542         a[*num] = prob;
00543         (*num)++;
00544       }
00545     }
00546     /* for -iwsp, add outgoing arc from the tail sp model
00547        only if need_sp == TRUE.
00548        need_sp should be TRUE only when the connecting [pos] phone is also an end phone of the to-be-added word (i.e. homophone word)
00549      */
00550     /*  */
00551     if (enable_iwsp && insert_sp) {
00552       /* consider sp */
00553       for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
00554         prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1];
00555         if (prob != LOG_ZERO) {
00556           if (*num >= maxnum) {
00557             j_error("Maximum outtrans list num exceeded! (%d)\n", maxnum);
00558           }
00559           node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1;
00560           a[*num] = prob;
00561           (*num)++;
00562         }
00563       }
00564     }
00565   }
00566   /*printf("   %d(%s)-%d:\"%s\", num=%d\n", w, wchmm->winfo->woutput[w], pos,
00567     (pos < 0) ? "BGN" : wchmm->winfo->wseq[w][pos]->name, *num);*/
00568   return;
00569 }  
00570 
00571 #else  /* ~MULTIPATH_VERSION */
00572 
00591 static void
00592 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo)
00593 {     
00594   A_CELL *actmp;
00595   LOGPROB a, atmp;
00596   int i;
00597   boolean tflag;
00598 
00599   /* get transition probability to outer state in tinfo */
00600   for(i = tinfo->statenum - 2; i >= 0; i--) {
00601     if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) { /* found */
00602       atmp = a;
00603       /* check if the arc already exist */
00604       tflag = FALSE;
00605       for (actmp = wchmm->state[from_node].ac; actmp; actmp = actmp->next) {
00606         if (actmp->arc == to_node && actmp->a == atmp) {
00607           tflag = TRUE;
00608           break;
00609         }
00610       }
00611       if (tflag) break;
00612       /* add the arc to wchmm */
00613       add_wacc(wchmm, from_node, atmp, to_node);
00614       return;                   /* exit function here */
00615     }
00616   }      
00617   j_error("Error: No arc to endstate?\n");
00618 }
00619 
00640 static void
00641 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq)
00642 {     
00643   HMM_Logical *last;
00644   int lastp;
00645 
00646   last = wchmm->winfo->wseq[from_word][from_seq];
00647   lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1;
00648   wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq],
00649                  hmm_logical_trans(last));
00650 }
00651 
00652 /**************************************************************/
00653 /******** homophone processing: duplicating leaf nodes ********/
00654 /**************************************************************/
00655 
00695 static void
00696 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word) /* source node, new word */
00697 {
00698   int n;
00699   int n_src, n_prev;
00700   A_CELL        *ac;
00701   HMM_Logical *lastphone;
00702 
00703   /* 1 state will newly created: expand tree if needed */
00704   if (wchmm->n + 1 >= wchmm->maxwcn) {
00705     wchmm_extend(wchmm);
00706   }
00707   /* n: the target new node to which 'node' is copied */
00708   n = wchmm->n;
00709 
00710   n_src = node;
00711 
00712   /* copy output probability info */
00713 #ifdef PASS1_IWCD
00714   {
00715     RC_INFO *rcnew;
00716     LRC_INFO *lrcnew;
00717     wchmm->outstyle[n] = wchmm->outstyle[n_src];
00718     if (wchmm->outstyle[n] == AS_RSET) {
00719       /* duplicate RC_INFO because it has its own cache */
00720       rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
00721       memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO));
00722       wchmm->state[n].out.rset = rcnew;
00723     } else if (wchmm->outstyle[n] == AS_LRSET) {
00724       /* duplicate LRC_INFO because it has its own cache */
00725       lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
00726       memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO));
00727       wchmm->state[n].out.lrset = lrcnew;
00728     } else {
00729       /* share same info, simply copy the pointer */
00730       memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC));
00731     }
00732   }
00733 #else  /* ~PASS1_IWCD */
00734   memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *));
00735 #endif
00736 
00737   lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1];
00738   wchmm->state[n].ac = NULL;
00739 
00740   /* add self transition arc */
00741   for(ac=wchmm->state[n_src].ac; ac; ac=ac->next) {
00742     if (ac->arc == n_src) {
00743       add_wacc(wchmm, n, ac->a, n);
00744     }
00745   }
00746   
00747   /* copy transition arcs whose destination is the source node to new node */
00748   if (hmm_logical_state_num(lastphone) == 3) { /* = 1 state */
00749     /* phone with only 1 state should be treated carefully */
00750     if (wchmm->winfo->wlen[word] == 1) { /* word consists of only this phone */
00751       /* no arcs need to be copied: this is also a start node of a word */
00752       wchmm->ststart[n] = word;
00753       wchmm->offset[word][0] = n;
00754     } else {
00755       /* copy arcs from the last state of the previous phone */
00756       n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2]
00757         + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3;
00758       for (ac=wchmm->state[n_prev].ac; ac; ac=ac->next) {
00759         if (ac->arc == n_src) {
00760           add_wacc(wchmm, n_prev, ac->a, n);
00761         }
00762       }
00763       /* also update the last offset (== wordend in this case) */
00764       wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n;
00765       /* the new node should not be a start node of a word */
00766       wchmm->ststart[n] = WORD_INVALID;
00767     }
00768   } else {                      /* phone with more than 2 states */
00769     /* copy arcs from/to the source node to new node */
00770     for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) {
00771       for (ac=wchmm->state[n_prev].ac; ac; ac=ac->next) {
00772         if (ac->arc == n_src) {
00773           add_wacc(wchmm, n_prev, ac->a, n);
00774         }
00775       }
00776       for (ac=wchmm->state[n_src].ac; ac; ac=ac->next) {
00777         if (ac->arc == n_prev) {
00778           add_wacc(wchmm, n, ac->a, n_prev);
00779         }
00780       }
00781     }
00782     /* the new node should not be a start node of a word */
00783     wchmm->ststart[n] = WORD_INVALID;
00784   }
00785 
00786   /* map word <-> node */
00787   wchmm->stend[n]   = word;     /* 'n' is an end node of word 'word' */
00788   wchmm->wordend[word] = n;     /* the word end node of 'word' is 'n' */
00789 
00790   /* new state has been created: increment the size */
00791   wchmm->n++;
00792   
00793 }
00794 
00809 static void
00810 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm)
00811 {
00812   int w, nlast, n, narc, narc_model;
00813   boolean *dupw;                /* node marker */
00814   A_CELL *actmp;
00815 
00816   nlast = wchmm->n;
00817   dupw = (boolean *)mymalloc(sizeof(boolean) * nlast);
00818   for(n=0;n<nlast;n++) dupw[n] = FALSE; /* initialize all marker */
00819 
00820   for (w=0;w<wchmm->winfo->num;w++) {
00821     n = wchmm->wordend[w];
00822     if (dupw[n]) {              /* if already marked (2nd time or later */
00823       wchmm_duplicate_state(wchmm, n, w); dupcount++; /* duplicate */
00824     } else {                    /* if not marked yet (1st time) */
00825       /* try to find an arc outside the word */
00826       {
00827         /* count number of model-internal arc from the last state */
00828         HMM_Logical *lastphone;
00829         HTK_HMM_Trans *tinfo;
00830         int laststate, i;
00831         lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1];
00832         laststate = hmm_logical_state_num(lastphone) - 2;
00833         tinfo = hmm_logical_trans(lastphone);
00834         narc_model=0;
00835         for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) {
00836           if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++;
00837         }
00838         /* count number of actual arc from the last state in the tree */
00839         narc = 0;
00840         for(actmp=wchmm->state[n].ac;actmp;actmp=actmp->next) narc++;
00841       }
00842       /* if both number does not match, it means it is not a single word tail */
00843       if (narc_model != narc) {
00844         /* word 'w' is embedded as part of other words at this node 'n' */
00845         /* duplicate this node now */
00846         wchmm_duplicate_state(wchmm, n, w); dupcount++;
00847         /* as new node has been assigned as word end node of word 'w',
00848            reset this source node as it is not the word end node */
00849         wchmm->stend[n] = WORD_INVALID;
00850       } else {
00851         /* no arc to other node found, it means it is a single word tail */
00852         /* as this is first time, only make sure that this node is word end of [w] */
00853         wchmm->stend[n] = w;
00854       }
00855       /* mark node 'n' */
00856       dupw[n] = TRUE;
00857     }
00858   }
00859   free(dupw);
00860 }
00861 
00862 #endif /* ~MULTIPATH_VERSION */
00863 
00864 /**************************************************************/
00865 /*************** add a word to wchmm lexicon tree *************/
00866 /**************************************************************/
00867 
00890 static void
00891 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword)
00892 {
00893   int   j,k,n;
00894   int   add_head, add_tail, add_to;
00895   int   word_len, matchword_len;
00896   HMM_Logical *ltmp;
00897   int ato;
00898   LOGPROB prob;
00899   int ntmp;
00900   int ltmp_state_num;
00901 #ifdef PASS1_IWCD
00902   CD_Set *lcd = NULL;
00903 #endif
00904   
00905 #ifdef MULTIPATH_VERSION
00906   int *out_from, *out_from_next;
00907   LOGPROB *out_a, *out_a_next;
00908   int out_num_prev, out_num_next;
00909   int kkk;
00910 #endif
00911   
00912 /* 
00913  *   if (matchlen > 0) {
00914  *     printf("--\n");
00915  *     put_voca(wchmm->winfo, word);
00916  *     put_voca(wchmm->winfo, matchword);
00917  *     printf("matchlen=%d\n", matchlen);
00918  *   }
00919  */
00920   
00921   /* variable abbreviations */
00922   n = wchmm->n;
00923   word_len      = wchmm->winfo->wlen[word];
00924   matchword_len = wchmm->winfo->wlen[matchword];
00925 
00926   /* malloc phone offset area */
00927   if((wchmm->offset[word]=(int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root)))==NULL){ 
00928     j_error("malloc failed at wchmm_add_word()\n");
00929   }
00930   
00931   /* allocate unshared (new) part */
00932   add_head = matchlen;
00933   add_tail = word_len - 1;
00934   add_to   = matchlen - 1;
00935 
00936 #ifdef MULTIPATH_VERSION
00937   /* make word-beginning node if needed */
00938   if (matchlen == 0) {
00939     /* create word-beginning node */
00940     wchmm->wordbegin[word] = n;
00941     wchmm->stend[n] = WORD_INVALID;
00942     wchmm->state[n].ac = NULL;
00943     wchmm->state[n].out.state = NULL;
00944     /* index the new word-beginning node as startnode (old ststart) */
00945     wchmm->startnode[wchmm->startnum] = n;
00946 #ifdef CATEGORY_TREE
00947     wchmm->start2wid[wchmm->startnum] = word;
00948 #endif
00949     /* expand data area if necessary */
00950     if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
00951     if (++n >= wchmm->maxwcn) wchmm_extend(wchmm);
00952   } else {
00953     wchmm->wordbegin[word] = wchmm->wordbegin[matchword];
00954   }
00955 
00956   /* now n is at beginning of output state */
00957 
00958   /* prepare work area for skip arcs */
00959   out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00960   out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00961   out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00962   out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00963 
00964   /* store the initial outgoing arcs to out_from[] and out_a[] */
00965   out_num_prev = 0;
00966   if (matchlen == 0) {
00967     /* set the word-beginning node */
00968     out_from[0] = wchmm->wordbegin[word];
00969     out_a[0] = 0.0;
00970     out_num_prev = 1;
00971   } else {
00972     /*printf("%d(%s)\n", word, wchmm->winfo->woutput[word]);*/
00973     /* on -iwsp, trailing sp is needed only when no phone will be created */
00974     get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (add_tail - add_head + 1 > 0) ? FALSE : TRUE);
00975     /*printf("NUM=%d\n", out_num_prev);*/
00976   }
00977   
00978 #endif /* MULTIPATH_VERSION */
00979   
00980   if (add_tail - add_head + 1 > 0) { /* there are new phones to be created */
00981       ntmp = n;
00982       for (j=add_head; j <= add_tail; j++) { /* for each new phones */
00983         ltmp = wchmm->winfo->wseq[word][j];
00984         ltmp_state_num = hmm_logical_state_num(ltmp);
00985 #ifdef PASS1_IWCD
00986         if (ccd_flag) {
00987           /* in the triphone lexicon tree, the last phone of a word has
00988              left-context cdset */
00989           if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) {
00990 #ifdef CATEGORY_TREE
00991             if (! old_iwcd_flag) {
00992               lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]);
00993               if (lcd == NULL) {
00994                 /* no category-aware cdset found.  This is case when no word
00995                    can follow this word grammatically.
00996                    so fallback to normal state */
00997                 j_printerr("Warning: no lcdset found for [%s::%d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name);
00998                 lcd = lcdset_lookup_by_hmmname(hmminfo, ltmp->name);
00999               }
01000             } else {
01001               lcd = lcdset_lookup_by_hmmname(hmminfo, ltmp->name);
01002             }
01003 #else
01004             lcd = lcdset_lookup_by_hmmname(hmminfo, ltmp->name);
01005 #endif
01006             if (lcd == NULL) {
01007               j_error("Error: no lcdset found for [%s]\n",ltmp->name);
01008             }
01009           }
01010         }
01011 #endif /* PASS1_IWCD */
01012         for (k = 1; k < ltmp_state_num - 1; k++) { /* for each state in the phone */
01013           /* set state output prob info */
01014 #ifdef PASS1_IWCD
01015           if (ccd_flag) {
01016             /* output info of triphones needs special handling */
01017             if (wchmm->winfo->wlen[word] == 1) { /* word with only 1 phone */
01018               wchmm->outstyle[ntmp] = AS_LRSET;
01019               wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
01020               (wchmm->state[ntmp].out.lrset)->hmm       = ltmp;
01021               (wchmm->state[ntmp].out.lrset)->state_loc = k;
01022 #ifdef CATEGORY_TREE
01023               (wchmm->state[ntmp].out.lrset)->category  = wchmm->winfo->wton[word];
01024 #endif
01025             } else if (j == 0) {        /* head phone of a word */
01026               wchmm->outstyle[ntmp] = AS_RSET;
01027               wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
01028               (wchmm->state[ntmp].out.rset)->hmm       = ltmp;
01029               (wchmm->state[ntmp].out.rset)->state_loc = k;
01030             } else if (j == wchmm->winfo->wlen[word] - 1) { /* last phone of a word */
01031               wchmm->outstyle[ntmp] = AS_LSET;
01032               wchmm->state[ntmp].out.lset = &(lcd->stateset[k]);
01033             } else {
01034               wchmm->outstyle[ntmp] = AS_STATE;
01035               if (ltmp->is_pseudo) {
01036                 j_printerr("Warning: word-internal phone should not be pseudo\n");
01037                 put_voca(wchmm->winfo, word);
01038               }
01039               wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01040             }
01041           } else {
01042             /* monophone */
01043             if (ltmp->is_pseudo) {
01044               j_printerr("InternalError: CDSET phoneme exist in monophone?\n");
01045               put_voca(wchmm->winfo, word);
01046             }
01047             wchmm->outstyle[ntmp] = AS_STATE;
01048             wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01049           }
01050 #else  /* ~PASS1_IWCD */
01051           if (ltmp->is_pseudo) {
01052             j_printerr("InternalError: CDSET phone exist in monophone?\n");
01053             put_voca(wchmm->winfo, word);
01054           }
01055           wchmm->state[ntmp].out = ltmp->body.defined->s[k];
01056 #endif /* PASS1_IWCD */
01057           
01058           /* initialize other info */
01059           wchmm->state[ntmp].ac = NULL;
01060           wchmm->stend[ntmp] = WORD_INVALID;
01061 #ifndef MULTIPATH_VERSION
01062           wchmm->ststart[ntmp] = WORD_INVALID;
01063           /* make transition arc from HMM transition info */
01064           for (ato = 1; ato < ltmp_state_num; ato++) {
01065             prob = (hmm_logical_trans(ltmp))->a[k][ato];
01066             if
01067               (prob != LOG_ZERO)
01068               {
01069               if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) {
01070                 /* arc outside new part will be handled later */
01071               } else {
01072                 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01073               }
01074             }
01075           }
01076 #endif
01077           
01078           ntmp++;
01079           /* expand wchmm if neccesary */
01080           if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01081         } /* end of state loop */
01082       } /* end of phone loop */
01083 
01084 #ifdef MULTIPATH_VERSION
01085           
01086       /* On multipath version, the skip transition should be handled! */
01087       
01088       /* make transition arc from HMM transition info */
01089       ntmp = n;
01090       for (j = add_head; j <= add_tail; j++) {
01091         ltmp = wchmm->winfo->wseq[word][j];
01092         ltmp_state_num = hmm_logical_state_num(ltmp);
01093         out_num_next = 0;
01094         /* arc from initial state ... need arc expansion from precious phone */
01095         for (ato = 1; ato < ltmp_state_num; ato++) {
01096           prob = (hmm_logical_trans(ltmp))->a[0][ato];
01097           if (prob != LOG_ZERO) {
01098             /* expand arc from previous HMM */
01099             if (ato == ltmp_state_num - 1) {
01100               /* to final state ... just register states for next expansion */
01101               for(kkk=0; kkk<out_num_prev; kkk++) {
01102                 out_from_next[out_num_next] = out_from[kkk];
01103                 out_a_next[out_num_next] = out_a[kkk] + prob;
01104                 out_num_next++;
01105               }
01106             } else {
01107               for(kkk=0; kkk<out_num_prev; kkk++) {
01108                 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01109               }
01110             }
01111           }
01112         } /* end of state loop */
01113         /* from outprob state */
01114         for(k = 1; k < ltmp_state_num - 1; k++) {
01115           for (ato = 1; ato < ltmp_state_num; ato++) {
01116             prob = (hmm_logical_trans(ltmp))->a[k][ato];
01117             if (prob != LOG_ZERO) {
01118               if (ato == ltmp_state_num - 1) {
01119                 /* to final state ... register states for next expansion */
01120                 out_from_next[out_num_next] = ntmp;
01121                 out_a_next[out_num_next] = prob;
01122                 out_num_next++;
01123               } else {
01124                 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01125               }
01126             }
01127           }
01128           ntmp++;
01129         } /* end of state loop */
01130         /* swap out list for next phone */
01131         for(kkk=0;kkk<out_num_next;kkk++) {
01132           out_from[kkk] = out_from_next[kkk];
01133           out_a[kkk] = out_a_next[kkk];
01134         }
01135         out_num_prev = out_num_next;
01136       } /* end of phone loop */
01137 
01138 #endif /* MULTIPATH_VERSION */
01139       
01140   } /* new phone node creation loop for this word */
01141 
01142 
01143 #ifdef MULTIPATH_VERSION
01144   /*************************/
01145   /* Short Pause appending */
01146   /*************************/
01147   
01148   /* if -iwsp, add noise model to the end of word at ntmp */
01149   if (enable_iwsp && add_tail - add_head + 1 > 0) { /* there are new phones to be created */
01150     int ntmp_bak;
01151     
01152     /* set short pause state info */
01153     ntmp_bak = ntmp;
01154     if (hmminfo->sp->is_pseudo) {
01155       for(k = 1;k < hmm_logical_state_num(hmminfo->sp) - 1; k++) {
01156         wchmm->outstyle[ntmp] = AS_LSET;
01157         wchmm->state[ntmp].out.lset = &(hmminfo->sp->body.pseudo->stateset[k]);
01158         wchmm->state[ntmp].ac = NULL;
01159         wchmm->stend[ntmp] = WORD_INVALID;
01160         ntmp++;
01161         if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01162       }
01163     } else {
01164       for(k = 1;k < hmm_logical_state_num(hmminfo->sp) - 1; k++) {
01165         wchmm->outstyle[ntmp] = AS_STATE;
01166         wchmm->state[ntmp].out.state = hmminfo->sp->body.defined->s[k];
01167         wchmm->state[ntmp].ac = NULL;
01168         wchmm->stend[ntmp] = WORD_INVALID;
01169         ntmp++;
01170         if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01171       }
01172     }
01173     ntmp = ntmp_bak;
01174     /* connect incoming arcs from previous phone */
01175     out_num_next = 0;
01176     for (ato = 1; ato < hmm_logical_state_num(hmminfo->sp); ato++) {
01177       prob = hmm_logical_trans(hmminfo->sp)->a[0][ato];
01178       if (prob != LOG_ZERO) {
01179         /* to control short pause insertion, transition probability toward
01180          the word-end short pause will be given a penalty */
01181         prob += hmminfo->iwsp_penalty;
01182         if (ato == hmm_logical_state_num(hmminfo->sp) - 1) {
01183           /* model has a model skip transition, just inherit them to next */
01184           for(kkk=0; kkk<out_num_prev; kkk++) {
01185             out_from_next[out_num_next] = out_from[kkk];
01186             out_a_next[out_num_next] = out_a[kkk] + prob;
01187             out_num_next++;
01188           }
01189         } else {
01190           /* connect incoming arcs from previous phone to this phone */
01191           for(kkk=0; kkk<out_num_prev; kkk++) {
01192             add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01193           }
01194         }
01195       }
01196     }
01197     /* if short pause model doesn't have a model skip transition, also add it */
01198     if (hmm_logical_trans(hmminfo->sp)->a[0][hmm_logical_state_num(hmminfo->sp)-1] == LOG_ZERO) {
01199       /* to make insertion sp model to have no effect on the original path,
01200          the skip transition probability should be 0.0 (=100%) */
01201       prob = 0.0;
01202       for(kkk=0; kkk<out_num_prev; kkk++) {
01203         out_from_next[out_num_next] = out_from[kkk];
01204         out_a_next[out_num_next] = out_a[kkk] + prob;
01205         out_num_next++;
01206       }
01207     }
01208     /* connect arcs within model, and store new outgoing arcs for wordend node */
01209     for (k = 1; k < hmm_logical_state_num(hmminfo->sp) - 1; k++) {
01210       for (ato = 1; ato < hmm_logical_state_num(hmminfo->sp); ato++) {
01211         prob = hmm_logical_trans(hmminfo->sp)->a[k][ato];
01212         if (prob != LOG_ZERO) {
01213           if (ato == hmm_logical_state_num(hmminfo->sp) - 1) {
01214             out_from_next[out_num_next] = ntmp;
01215             out_a_next[out_num_next] = prob;
01216             out_num_next++;
01217           } else {
01218             add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01219           }
01220         }
01221       }
01222       ntmp++;
01223     }
01224     /* swap work area for next */
01225     for(kkk=0;kkk<out_num_next;kkk++) {
01226       out_from[kkk] = out_from_next[kkk];
01227       out_a[kkk] = out_a_next[kkk];
01228     }
01229     out_num_prev = out_num_next;
01230   }
01231 
01232 #endif /* MULTIPATH_VERSION */
01233           
01234   /* make mapping: word <-> node on wchmm */
01235   for (j=0;j<word_len;j++) {
01236     if (j < add_head) { /* shared part */
01237       wchmm->offset[word][j] = wchmm->offset[matchword][j];
01238     } else if (add_tail < j) { /* shared tail part (should not happen..) */
01239       wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)];
01240     } else {                    /* newly created part */
01241       wchmm->offset[word][j] = n;
01242       n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2;
01243     }
01244   }
01245 #ifdef MULTIPATH_VERSION
01246 
01247   /* paranoia check if the short-pause addition has been done well */
01248   if (enable_iwsp && add_tail - add_head + 1 > 0) {
01249     n += hmm_logical_state_num(hmminfo->sp) - 2;
01250     if (n != ntmp) j_error("Algorithm Error! cannot match\n");
01251   }
01252   
01253   /* create word-end node */
01254   wchmm->wordend[word] = n;     /* tail node of 'word' is 'n' */
01255   wchmm->stend[n] = word;       /* node 'k' is a tail node of 'word' */
01256   wchmm->state[n].ac = NULL;
01257   wchmm->state[n].out.state = NULL;
01258   
01259   /* connect the final outgoing arcs in out_from[] to the word end node */
01260   for(k = 0; k < out_num_prev; k++) {
01261     add_wacc(wchmm, out_from[k], out_a[k], n);
01262   }
01263   n++;
01264   if (n >= wchmm->maxwcn) wchmm_extend(wchmm);
01265 
01266   if (matchlen == 0) {
01267     /* check if the new word has whole word-skipping transition */
01268     /* (use out_from and out_num_prev temporary) */
01269     out_num_prev = 0;
01270     get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, TRUE);
01271     for(k=0;k<out_num_prev;k++) {
01272       if (out_from[k] == wchmm->wordbegin[word]) {
01273         j_printerr("\n*** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n");
01274         j_printerr("  Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]);
01275         j_printerr("  All HMMs in the word:\n    ");
01276         for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) {
01277           j_printerr("%s ", wchmm->winfo->wseq[word][kkk]->name);
01278         }
01279         j_printerr("\n  has transitions from initial state to final state.\n");
01280         j_error("  This type of word skipping is not supported.\n");
01281       }
01282     }
01283   }
01284   
01285   wchmm->n = n;
01286   free(out_from);
01287   free(out_from_next);
01288   free(out_a);
01289   free(out_a_next);
01290   
01291 #else  /* ~MULTIPATH_VERSION */
01292   
01293   wchmm->n = n;
01294   wchmm->ststart[wchmm->offset[word][0]] = word; /* word head */
01295   k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1;
01296   wchmm->wordend[word] = k;     /* tail node of 'word' is 'k' */
01297   wchmm->stend[k] = word;       /* node 'k' is a tail node of 'word' */
01298   
01299   if (matchlen != 0 && add_tail - add_head + 1 > 0) {
01300     /* new part has been created in the above procedure: */
01301     /* now make link from shared part to the new part */
01302     wchmm_link_subword(wchmm, matchword,add_to,word,add_head);  
01303   }
01304   
01305 #endif /* ~MULTIPATH_VERSION */
01306   
01307 }
01308 
01309 /**************************************************************/
01310 /**** calculate overall info (after wchmm has been built) *****/
01311 /**************************************************************/
01312 
01313 #ifndef MULTIPATH_VERSION
01314 
01328 static void
01329 wchmm_index_ststart(WCHMM_INFO *wchmm)
01330 {
01331   int n;
01332   int id;
01333 
01334   id = 0;
01335   for (n=0;n<wchmm->n;n++) {
01336     if (wchmm->ststart[n] != WORD_INVALID) {
01337 #ifdef USE_NGRAM
01338       /* 先頭単語の始端へは遷移させないので,インデックスに含めない */
01339       /* exclude silence model on beginning of a sentence from the index:
01340          It cannot come after other words */
01341       if (wchmm->ststart[n] == wchmm->winfo->head_silwid) continue;
01342 #endif
01343       wchmm->startnode[id] = n;
01344       id++;
01345       if (id > wchmm->winfo->num) {
01346         j_printerr("Error: start node num exceeded %d\n", wchmm->winfo->num);
01347       }
01348     }
01349   }
01350   wchmm->startnum = id;         /* total num */
01351 }
01352 
01366 static void
01367 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm)
01368 {
01369   WORD_ID w;
01370   HTK_HMM_Trans *tr;
01371   LOGPROB a;
01372 
01373   for (w=0;w<wchmm->winfo->num;w++) {
01374     tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]);
01375     a = tr->a[tr->statenum-2][tr->statenum-1];
01376     wchmm->wordend_a[w] = a;
01377   }
01378 }
01379 
01380 #endif /* ~MULTIPATH_VERSION */
01381 
01382 /********************************************************************/
01383 /****** for separation (linearization) of high-frequent words *******/
01384 /********************************************************************/
01385 #ifdef USE_NGRAM
01386 #ifdef SEPARATE_BY_UNIGRAM
01387 
01406 static int
01407 compare_prob(LOGPROB *a, LOGPROB *b)
01408 {
01409   if (*a < *b)  return (1);
01410   if (*a > *b)  return (-1);
01411   return(0);
01412 }
01413 
01432 static LOGPROB
01433 get_nbest_uniprob(WORD_INFO *winfo, int n)
01434 {
01435   LOGPROB *u_p;
01436   WORD_ID w;
01437   LOGPROB x;
01438 
01439   if (n < 1) n = 1;
01440   if (n > winfo->num) n = winfo->num;
01441 
01442   /* store all unigram probability to u_p[] */
01443   u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num);
01444   for(w=0;w<winfo->num;w++) u_p[w] =
01445                               uni_prob(ngram, winfo->wton[w])
01446 #ifdef CLASS_NGRAM
01447                               + winfo->cprob[w]
01448 #endif
01449                               ;
01450   /* sort them downward */
01451   qsort(u_p, winfo->num, sizeof(LOGPROB),
01452         (int (*)(const void *,const void *))compare_prob);
01453 
01454   /* return the Nth value */
01455   x = u_p[n-1];
01456   free(u_p);
01457   return(x);
01458 }
01459 
01460 #endif
01461 #endif /* USE_NGRAM */
01462 
01463 
01464 /**********************************************************/
01465 /****** MAKE WCHMM (LEXICON TREE) --- main function *******/
01466 /**********************************************************/
01467 static int separated_word_count; 
01468 
01469 #ifdef CATEGORY_TREE
01470 
01471 #define COUNT_STEP 500         
01472 
01473 
01490 void
01491 build_wchmm(WCHMM_INFO *wchmm)
01492 {
01493   int i,j;
01494   int matchword=0, sharelen=0, maxsharelen=0;
01495   int counter = COUNT_STEP;
01496 #ifdef SEPARATE_BY_UNIGRAM
01497   LOGPROB separate_thres;
01498 #endif
01499 
01500   /* lingustic infos must be set before build_wchmm() is called */
01501   /* check if necessary lingustic info is already assigned (for debug) */
01502   if (wchmm->winfo == NULL
01503 #ifdef USE_NGRAM
01504       || wchmm->ngram == NULL
01505 #endif
01506 #ifdef USE_DFA
01507       || wchmm->dfa == NULL
01508 #endif
01509       ) {
01510     j_error("InternalError: build_wchmm: lingustic info not available!!\n");
01511   }
01512   
01513 #ifdef SEPARATE_BY_UNIGRAM
01514   /* 上位[separate_wnum]番目の1-gramスコアを求める */
01515   /* 1-gramスコアがこの値以上のものは木から分ける */
01516   separate_thres = get_nbest_uniprob(wchmm->winfo, separate_wnum);
01517 #endif
01518 
01519 #ifdef PASS1_IWCD
01520 #ifdef CATEGORY_TREE
01521   if (ccd_flag && !old_iwcd_flag) {
01522     /* 全てのカテゴリID付き lcd_set を作成 */
01523     lcdset_register_with_category_all(wchmm, hmminfo, winfo, dfa);
01524   }
01525 #endif /* CATEGORY_TREE */
01526 #endif /* PASS1_IWCD */
01527   
01528 
01529   /* wchmmを初期化 */
01530   wchmm_init(wchmm);
01531 
01532   /* カウンタリセット */
01533   separated_word_count=0;
01534 
01535   j_printerr("Building HMM lexicon tree (left-to-right)...\n");
01536   for (i=0;i<wchmm->winfo->num;i++) {
01537     if (verbose_flag) {
01538       if (i >= counter) {
01539        j_printerr("\r %5d words proceeded (%6d nodes)",i,wchmm->n);
01540        counter += COUNT_STEP;
01541       }
01542     }
01543 #ifdef USE_NGRAM
01544     if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01545       /* 先頭/末尾の無音モデルは木構造化せず,
01546        * 先頭の無音単語の先頭への遷移,末尾単語の末尾からの遷移は作らない*/
01547       wchmm_add_word(wchmm, i,0,0); /* sharelen=0でそのまま */
01548       continue;
01549     }
01550 #ifndef NO_SEPARATE_SHORT_WORD
01551     if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01552       /* 長さの短い単語を木構造化しない(ここでは1音節) */
01553       wchmm_add_word(wchmm, i,0,0); /* sharelen=0でそのまま */
01554       separated_word_count++;
01555       continue;
01556     }
01557 #endif
01558 #ifdef SEPARATE_BY_UNIGRAM
01559     if (
01560        uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01561 #ifdef CLASS_NGRAM
01562        + wchmm->winfo->cprob[i]
01563 #endif
01564        >= separate_thres && separated_word_count < separate_wnum) {
01565       /* 頻度の高い単語を木構造化しない */
01566       /* separate_thres は上位separate_wnum番目のスコア */
01567       wchmm_add_word(wchmm, i,0,0);
01568       separated_word_count++;
01569       continue;
01570     }
01571 #endif
01572 #endif /* USE_NGRAM */
01573     /* 最も長く音素を共有出来る単語を探す */
01574     maxsharelen=0;
01575     for (j=0;j<i;j++) {
01576 #ifdef CATEGORY_TREE
01577       if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue;
01578 #endif
01579       sharelen = wchmm_check_match(wchmm->winfo, i, j);
01580       if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) {
01581        /* word に同音語が存在する */
01582        /* 必ず最大の長さであり,重複カウントを避けるためここで抜ける */
01583        maxsharelen = sharelen;
01584        matchword = j;
01585        break;
01586       }
01587       if (sharelen > maxsharelen) {
01588        matchword = j;
01589        maxsharelen = sharelen;
01590       }
01591     }
01592     wchmm_add_word(wchmm, i,maxsharelen,matchword);
01593   }
01594   j_printerr("\r %5d words ended     (%6d nodes)\n",i,wchmm->n);
01595 
01596 #if 0
01597   /* 木構造を作らない */
01598   for (i=0;i<wchmm->winfo->num;i++) {
01599     if (verbose_flag) {
01600       if (i >= counter) {
01601        j_printerr("  %5d words proceeded (%6d nodes)\n",i,wchmm->n);
01602        counter += COUNT_STEP;
01603       }
01604     }
01605     wchmm_add_word(wchmm, i,0,0); /* sharelen=0でそのまま */
01606   }
01607   j_printerr("  %5d words ended     (%6d nodes)\n",i,wchmm->n);
01608 #endif  
01609 
01610 #ifndef MULTIPATH_VERSION
01611   /* 同一音素系列を持つ単語同士の leaf node を2重化して区別する */
01612   wchmm_duplicate_leafnode(wchmm);
01613   VERMES("  %d leaf nodes are made unshared\n",dupcount);
01614   
01615   /* 単語の終端から外への遷移確率を求めておく */
01616   wchmm_calc_wordend_arc(wchmm);
01617 #endif
01618 
01619   /* wchmmの整合性をチェックする */
01620   check_wchmm(wchmm);
01621 
01622 #ifndef MULTIPATH_VERSION
01623   /* 単語の先頭ノード ststart の番号を 別に格納 */
01624   wchmm_index_ststart(wchmm);
01625 #endif
01626 
01627   /* factoring用に各状態に後続単語のリストを付加する */
01628 #ifndef CATEGORY_TREE
01629   make_successor_list(wchmm);
01630 #ifdef USE_NGRAM
01631 #ifdef UNIGRAM_FACTORING
01632   /* 前もってfactoring値を計算 */
01633   /* 末端以外のscは必要ないのでフリーする */
01634   calc_all_unigram_factoring_values(wchmm);
01635   j_printerr("  1-gram factoring values has been pre-computed\n");
01636 #endif /* UNIGRAM_FACTORING */
01637 #endif /* USE_NGRAM */
01638 #ifdef MULTIPATH_VERSION
01639   /* 構築された factoring 情報をスキップ遷移および文頭文法ノードにコピー */
01640   adjust_sc_index(wchmm);
01641 #endif
01642 #ifdef USE_NGRAM
01643 #ifdef UNIGRAM_FACTORING
01644   /* 単語間LMキャッシュが必要なノードのリストを作る */
01645   make_iwcache_index(wchmm);
01646 #endif /* UNIGRAM_FACTORING */
01647 #endif /* USE_NGRAM */
01648 #endif /* CATEGORY_TREE */
01649 
01650   j_printerr("done\n");
01651 
01652   /* 起動時 -check でチェックモードへ */
01653   if (wchmm_check_flag) {
01654     wchmm_check_interactive(wchmm);
01655   }
01656 }
01657 
01658 #endif /* CATEGORY_TREE */
01659 
01681 void
01682 build_wchmm2(WCHMM_INFO *wchmm)
01683 {
01684   int i,j, last_i;
01685   int count_step, counter;
01686   WORD_ID *windex;
01687 #ifdef USE_NGRAM
01688 #ifdef SEPARATE_BY_UNIGRAM
01689   LOGPROB separate_thres;
01690 #endif
01691 #endif
01692 
01693   /* lingustic infos must be set before build_wchmm() is called */
01694   /* check if necessary lingustic info is already assigned (for debug) */
01695   if (wchmm->winfo == NULL
01696 #ifdef USE_NGRAM
01697       || wchmm->ngram == NULL
01698 #endif
01699 #ifdef USE_DFA
01700       || wchmm->dfa == NULL
01701 #endif
01702       ) {
01703     j_error("InternalError: build_wchmm: lingustic info not available!!\n");
01704   }
01705   
01706   separated_word_count = 0;
01707   count_step = wchmm->winfo->num / 10;
01708   counter = count_step;
01709   
01710   j_printerr("Building HMM lexicon tree");
01711   
01712 #ifdef USE_NGRAM
01713 #ifdef SEPARATE_BY_UNIGRAM
01714   /* compute score threshold beforehand to separate words from tree */
01715   /* here we will separate best [separate_wnum] words from tree */
01716   separate_thres = get_nbest_uniprob(wchmm->winfo, separate_wnum);
01717 #endif
01718 #endif
01719 
01720 #ifdef PASS1_IWCD
01721 #ifdef CATEGORY_TREE
01722   if (ccd_flag && !old_iwcd_flag) {
01723     /* when Julian mode (category-tree) and triphone is used,
01724        make all category-indexed context-dependent phone set (cdset) here */
01725     /* these will be assigned on the last phone of each word on tree */
01726     lcdset_register_with_category_all(wchmm, hmminfo, winfo, dfa);
01727   }
01728 #endif /* CATEGORY_TREE */
01729 #endif /* PASS1_IWCD */
01730   
01731   /* initialize wchmm */
01732   wchmm_init(wchmm);
01733 
01734   /* make sorted word index ordered by phone sequence */
01735   windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num);
01736   for(i=0;i<wchmm->winfo->num;i++) windex[i] = i;
01737 #ifdef CATEGORY_TREE
01738   /* sort by category -> sort by word ID in each category */
01739   wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num);
01740   {
01741     int last_cate;
01742     last_i = 0;
01743     last_cate = wchmm->winfo->wton[windex[0]];
01744     for(i = 1;i<wchmm->winfo->num;i++) {
01745       if (wchmm->winfo->wton[windex[i]] != last_cate) {
01746         wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i);
01747         last_cate = wchmm->winfo->wton[windex[i]];
01748         last_i = i;
01749       }
01750     }
01751     wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i);
01752   }
01753 #else
01754   /* sort by word ID for whole vocabulary */
01755   wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num);
01756 #endif
01757 
01758 /* 
01759  *   {
01760  *     int i,w;
01761  *     for(i=0;i<wchmm->winfo->num;i++) {
01762  *       w = windex[i];
01763  *       printf("%d: cate=%4d wid=%4d %s\n",i, wchmm->winfo->wton[w], w, wchmm->winfo->woutput[w]);
01764  *     }
01765  *   }
01766  */
01767 
01768   /* incrementaly add words to lexicon tree */
01769   /* now for each word, the previous word (last_i) is always the most matched one */
01770   last_i = WORD_INVALID;
01771   for (j=0;j<wchmm->winfo->num;j++) {
01772     i = windex[j];
01773     if (j >= counter) {
01774       /*j_printerr("\r %5d words proceeded (%6d nodes)",j, wchmm->n);*/
01775       j_printerr(".");
01776       counter += count_step;
01777     }
01778 #ifdef USE_NGRAM
01779     /* start/end silence word should not be shared */
01780     if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01781       wchmm_add_word(wchmm, i,0,0); /* add whole word as new (sharelen=0) */
01782       continue;
01783     }
01784 #ifndef NO_SEPARATE_SHORT_WORD
01785     /* separate short words from tree */
01786     if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01787       wchmm_add_word(wchmm, i,0,0);
01788       separated_word_count++;
01789       continue;
01790     }
01791 #endif
01792 #ifdef SEPARATE_BY_UNIGRAM
01793     /* separate high-frequent words from tree (threshold = separate_thres) */
01794     if (
01795         uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01796 #ifdef CLASS_NGRAM
01797         + wchmm->winfo->cprob[i]
01798 #endif
01799         >= separate_thres && separated_word_count < separate_wnum) {
01800       wchmm_add_word(wchmm, i,0,0);
01801       separated_word_count++;
01802       continue;
01803     }
01804 #endif
01805 #endif /* USE_NGRAM */
01806     if (last_i == WORD_INVALID) { /* first word */
01807       wchmm_add_word(wchmm, i,0,0);
01808     } else {
01809       /* the previous word (last_i) is always the most matched one */
01810 #ifdef CATEGORY_TREE
01811       if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) {
01812         wchmm_add_word(wchmm, i,0,0);
01813       } else {
01814         wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i);
01815       }
01816 #else
01817       wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i);
01818 #endif
01819     }
01820     last_i = i;
01821     
01822   } /* end of add word loop */
01823   
01824   /*j_printerr("\r %5d words ended     (%6d nodes)\n",j,wchmm->n);*/
01825 
01826   /* free work area */
01827   free(windex);
01828 
01829 #ifdef MULTIPATH_VERSION
01830   j_printerr("%d nodes\n", wchmm->n);
01831 #else
01832   /* duplicate leaf nodes of homophone/embedded words */
01833   j_printerr("%d", wchmm->n);
01834   wchmm_duplicate_leafnode(wchmm);
01835   j_printerr("+%d=%d nodes\n",dupcount, wchmm->n);
01836 #endif
01837 
01838 #ifndef MULTIPATH_VERSION
01839   /* calculate transition probability of word end node to outside */
01840   wchmm_calc_wordend_arc(wchmm);
01841 #endif
01842 
01843   /* check wchmm coherence (internal debug) */
01844   check_wchmm(wchmm);
01845 
01846 #ifndef MULTIPATH_VERSION
01847   /* make index of word-beginning nodes (for inter-word transition) */
01848   wchmm_index_ststart(wchmm);
01849 #endif
01850 
01851   /* make successor list for all branch nodes for N-gram factoring */
01852 #ifndef CATEGORY_TREE
01853   make_successor_list(wchmm);
01854 #ifdef UNIGRAM_FACTORING
01855   /* for 1-gram factoring, we can compute the values before search */
01856   calc_all_unigram_factoring_values(wchmm);
01857   j_printerr("  1-gram factoring values has been pre-computed\n");
01858 #endif /* UNIGRAM_FACTORING */
01859 #ifdef MULTIPATH_VERSION
01860   /* Copy the factoring data according to the skip transitions and startword nodes */
01861   adjust_sc_index(wchmm);
01862 #endif
01863 #ifdef UNIGRAM_FACTORING
01864   /* make list of start nodes that needs inter-word LM cache */
01865   make_iwcache_index(wchmm);
01866 #endif /* UNIGRAM_FACTORING */
01867 #endif
01868 
01869   j_printerr("done\n");
01870 
01871   /* go into interactive check mode ("-check" on start) */
01872   if (wchmm_check_flag) {
01873     wchmm_check_interactive(wchmm);
01874   }
01875 
01876 #ifdef WCHMM_SIZE_CHECK
01877   /* detailed check of lexicon tree size (inaccurate!) */
01878   j_printf("--- estimated size of word lexicon ---\n");
01879   j_printf("wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n);
01880   j_printf("%9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n);
01881 #ifndef MULTIPATH_VERSION
01882   j_printf("%9d bytes: wchmm->ststart[node]\n", sizeof(WORD_ID) * wchmm->n);
01883 #endif
01884   printf("%9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n);
01885   {
01886     int w,count;
01887     count = 0;
01888     for(w=0;w<wchmm->winfo->num;w++) {
01889       count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *);
01890     }
01891     printf("%9d bytes: wchmm->offset[w][]\n", count);
01892   }
01893 #ifdef MULTIPATH_VERSION
01894   j_printf("%9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int));
01895 #endif
01896   j_printf("%9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int));
01897   j_printf("%9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int));
01898 #ifdef MULTIPATH_VERSION
01899 #ifdef CATEGORY_TREE
01900   j_printf("%9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID));
01901 #endif
01902 #endif
01903 #ifdef UNIGRAM_FACTORING
01904   printf("%9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int));
01905 #endif
01906   printf("%9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB));
01907   printf("%9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char));
01908 #ifndef CATEGORY_TREE
01909   printf("%9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *));
01910   printf("%9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int));
01911 #ifdef UNIGRAM_FACTORING
01912   printf("%9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB));
01913 #endif  
01914 #endif
01915   
01916   printf("under state[]:\n");
01917   {
01918     A_CELL *ac;
01919     int count,n;
01920     count = 0;
01921     for(n=0;n<wchmm->n;n++) {
01922       for(ac=wchmm->state[n].ac;ac;ac=ac->next) {
01923         count += sizeof(A_CELL);
01924       }
01925     }
01926     printf("\t%9d: ac\n", count);
01927   }
01928 #ifndef CATEGORY_TREE
01929   {
01930     S_CELL *sc;
01931     int count,n;
01932     count = 0;
01933     for(n=0;n<wchmm->n;n++) {
01934       for(sc=wchmm->state[n].sc;sc;sc=sc->next) {
01935         count += sizeof(S_CELL);
01936       }
01937     }
01938     printf("\t%9d: sc\n", count);
01939   }
01940 #endif
01941 #endif /* WCHMM_SIZE_CHECK */
01942 }
01943 
01944 
01957 void
01958 print_wchmm_info(WCHMM_INFO *wchmm)
01959 {
01960   int n,i, rootnum;
01961 
01962 #ifdef MULTIPATH_VERSION
01963   rootnum = wchmm->startnum;
01964 #else
01965 #ifdef USE_NGRAM
01966   rootnum = wchmm->startnum + 1;        /* including winfo->head_silwid */
01967 #else
01968   rootnum = wchmm->startnum;
01969 #endif /* USE_NGRAM */
01970 #endif /* MULTIPATH_VERSION */
01971   
01972   j_printf("Lexicon tree info:\n");
01973   j_printf("\t total node num = %6d\n", wchmm->n);
01974 #ifdef USE_NGRAM
01975   j_printf("\t  root node num = %6d\n", rootnum);
01976 #ifdef NO_SEPARATE_SHORT_WORD
01977 #ifdef SEPARATE_BY_UNIGRAM
01978   j_printf("\t(%d hi-freq. words are separated from tree lexicon)\n", separated_word_count);
01979 #else
01980   j_printf(" (no words are separated from tree)\n");
01981 #endif /* SEPARATE_BY_UNIGRAM */
01982 #else
01983   j_printf(" (%d short words (<= %d phonemes) are separated from tree)\n", separated_word_count, SHORT_WORD_LEN);
01984 #endif /* NO_SEPARATE_SHORT_WORD */
01985 #else /* USE_NGRAM */
01986   j_printf("\t  root node num = %6d\n", rootnum);
01987 #endif /* USE_NGRAM */
01988   for(n=0,i=0;i<wchmm->n;i++) {
01989     if (wchmm->stend[i] != WORD_INVALID) n++;
01990   }
01991   j_printf("\t  leaf node num = %6d\n", n);
01992 #ifndef CATEGORY_TREE
01993   j_printf("\t fact. node num = %6d\n", wchmm->scnum - 1);
01994 #endif /* CATEGORY_TREE */
01995 }

Generated on Tue Mar 28 16:01:39 2006 for Julius by  doxygen 1.4.2