libjulius/src/wchmm.c

Go to the documentation of this file.
00001 
00037 /*
00038  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00039  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00040  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00041  * All rights reserved
00042  */
00043 
00044 /* wchmm = word conjunction HMM = lexicon tree */
00045 
00046 #include <julius/julius.h>
00047 
00048 
00049 #define WCHMM_SIZE_CHECK                
00050 
00051 /**************************************************************/
00052 /*********** Initialization of tree lexicon *******************/
00053 /**************************************************************/
00054 
00069 WCHMM_INFO *
00070 wchmm_new()
00071 {
00072   WCHMM_INFO *w;
00073   w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO));
00074   w->lmtype = LM_UNDEF;
00075   w->lmvar  = LM_UNDEF;
00076   w->ngram = NULL;
00077   w->dfa = NULL;
00078   w->winfo = NULL;
00079   w->malloc_root = NULL;
00080 #ifdef PASS1_IWCD
00081   w->lcdset_category_root = NULL;
00082 #endif /* PASS1_IWCD */
00083   w->wrk.out_from_len = 0;
00084   /* reset user function entry point */
00085   w->uni_prob_user = NULL;
00086   w->bi_prob_user = NULL;
00087   return w;
00088 }
00089 
00102 static void
00103 wchmm_init(WCHMM_INFO *wchmm)
00104 {
00105   /* the resulting tree size is typically half of total state num */
00106   wchmm->maxwcn = wchmm->winfo->totalstatenum / 2;
00107   wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn);
00108   wchmm->self_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00109   wchmm->next_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn);
00110   wchmm->ac = (A_CELL2 **)mymalloc(sizeof(A_CELL2 *)*wchmm->maxwcn);
00111   wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00112   wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num);
00113   wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00114   wchmm->maxstartnum = STARTNODE_STEP;
00115   wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP);
00116   wchmm->startnum = 0;
00117   if (wchmm->category_tree) {
00118     wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP);
00119   }
00120   if (wchmm->hmminfo->multipath) {
00121     wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00122     wchmm->wrk.out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00123     wchmm->wrk.out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00124     wchmm->wrk.out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00125     wchmm->wrk.out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00126     wchmm->wrk.out_from_len = wchmm->winfo->maxwn;
00127   } else {
00128     wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num);
00129   }
00130 #ifdef PASS1_IWCD
00131   wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn);
00132 #endif
00133 #ifdef UNIGRAM_FACTORING
00134   wchmm->start2isolate = NULL;
00135   wchmm->isolatenum = 0;
00136 #endif
00137   if (!wchmm->category_tree) {
00138     wchmm->sclist = NULL;
00139     wchmm->sclist2node = NULL;
00140 #ifdef UNIGRAM_FACTORING
00141     wchmm->fscore = NULL;
00142 #endif
00143   }
00144 
00145   wchmm->n = 0;
00146 }
00147 
00160 static void
00161 wchmm_extend(WCHMM_INFO *wchmm)
00162 {
00163   /* practical value! */
00164   wchmm->maxwcn += wchmm->winfo->totalstatenum / 6;
00165   wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn);
00166   wchmm->self_a = (LOGPROB *)myrealloc(wchmm->self_a, sizeof(LOGPROB)*wchmm->maxwcn);
00167   wchmm->next_a = (LOGPROB *)myrealloc(wchmm->next_a, sizeof(LOGPROB)*wchmm->maxwcn);
00168   wchmm->ac = (A_CELL2 **)myrealloc(wchmm->ac, sizeof(A_CELL2 *)*wchmm->maxwcn);
00169   wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn);
00170 #ifdef PASS1_IWCD
00171   wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn);
00172 #endif
00173 }
00174 
00187 static void
00188 wchmm_extend_startnode(WCHMM_INFO *wchmm)
00189 {
00190   wchmm->maxstartnum += STARTNODE_STEP;
00191   wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum);
00192   if (wchmm->category_tree) {
00193     wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->start2wid, sizeof(WORD_ID) * wchmm->maxstartnum);
00194   }
00195 }
00196 
00211 void
00212 wchmm_free(WCHMM_INFO *w)
00213 {
00214   S_CELL *sc, *sctmp;
00215   int i;
00216   /* wchmm->state[i].ac malloced by mybmalloc2() */
00217   /* wchmm->offset[][] malloced by mybmalloc2() */
00218 #ifdef PASS1_IWCD
00219   /* LRC_INFO, RC_INFO in wchmm->state[i].outsty malloced by mybmalloc2() */
00220 #endif
00221   /* they all will be freed by a single mybfree2() call */
00222   mybfree2(&(w->malloc_root));
00223   if (!w->category_tree) {
00224     if (w->sclist != NULL) {
00225       for(i=1;i<w->scnum;i++) {
00226         sc = w->sclist[i];
00227         while(sc) {
00228           sctmp = sc->next;
00229           free(sc);
00230           sc = sctmp;
00231         }
00232       }
00233       free(w->sclist);
00234     }
00235     if (w->sclist2node != NULL) free(w->sclist2node);
00236 #ifdef UNIGRAM_FACTORING
00237     if (w->fscore != NULL) free(w->fscore);
00238 #endif
00239   }
00240 #ifdef UNIGRAM_FACTORING
00241   if (w->start2isolate != NULL) free(w->start2isolate);
00242 #endif
00243 #ifdef PASS1_IWCD
00244   free(w->outstyle);
00245 #endif
00246   if (w->hmminfo->multipath) {
00247     free(w->wordbegin);
00248   } else {
00249     free(w->wordend_a);
00250   }
00251   if (w->category_tree) free(w->start2wid);
00252   free(w->startnode);
00253   free(w->wordend);
00254   free(w->offset);
00255   free(w->stend);
00256   free(w->ac);
00257   free(w->next_a);
00258   free(w->self_a);
00259   free(w->state);
00260 #ifdef PASS1_IWCD
00261   if (w->category_tree) lcdset_remove_with_category_all(w);
00262 #endif /* PASS1_IWCD */
00263   if (w->wrk.out_from_len != 0) {
00264     free(w->wrk.out_from);
00265     free(w->wrk.out_from_next);
00266     free(w->wrk.out_a);
00267     free(w->wrk.out_a_next);
00268     w->wrk.out_from_len = 0;
00269   }
00270   free(w);
00271 }
00272 
00273 
00274 /**************************************************************/
00275 /*********** Word sort functions for tree construction ********/
00276 /**************************************************************/
00277 
00278 static WORD_INFO *local_winfo;  
00279 
00298 static int
00299 compare_wseq(WORD_ID *widx1, WORD_ID *widx2)
00300 {
00301   int len1, len2, n;
00302   int p=0;
00303   
00304   len1 = local_winfo->wlen[*widx1];
00305   len2 = local_winfo->wlen[*widx2];
00306 
00307   n=0;
00308   /*  while (n < len1 && n < len2 && (p = (int)winfo->wseq[*widx1][n] - (int)winfo->wseq[*widx2][n]) == 0 ) n++;*/
00309   while (n < len1 && n < len2 && (p = strcmp((local_winfo->wseq[*widx1][n])->name, (local_winfo->wseq[*widx2][n])->name)) == 0 ) n++;
00310   if (n < len1) {
00311     if (n < len2) {
00312       /* differ */
00313       return(p);
00314     } else {
00315       /* 2 is part of 1 */
00316       return(1);
00317     }
00318   } else {
00319     if (n < len2) {
00320       /* 1 is part of 2 */
00321       return(-1);
00322     } else {
00323       /* same */
00324       return(0);
00325     }
00326   }
00327 }
00328 
00347 static void
00348 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len)
00349 {
00350   local_winfo = winfo;
00351   qsort(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_wseq);
00352 }
00353 
00372 static int
00373 compare_category(WORD_ID *widx1, WORD_ID *widx2)
00374 {
00375   int c1,c2;
00376   c1 = local_winfo->wton[*widx1];
00377   c2 = local_winfo->wton[*widx2];
00378   return(c1 - c2);
00379 }
00380 
00397 static void
00398 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len)
00399 {
00400   local_winfo = winfo;
00401   qsort(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_category);
00402 }
00403   
00404 
00405 /**********************************************************************/
00406 /************** Subroutines to link part of words  ********************/
00407 /**********************************************************************/
00408 
00430 static int
00431 wchmm_check_match(WORD_INFO *winfo, int i, int j)
00432 {
00433   int k,tmplen;
00434 
00435   for (tmplen=0,k=0;k<winfo->wlen[i];k++) {
00436     if (k > winfo->wlen[j]-1)
00437       break;
00438     if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name)))
00439       break;
00440     tmplen++;
00441   }
00442   return(tmplen);
00443 }
00444 
00457 static void
00458 acc_init(WCHMM_INFO *wchmm, int node)
00459 {
00460   wchmm->self_a[node] = LOG_ZERO;
00461   wchmm->next_a[node] = LOG_ZERO;
00462   wchmm->ac[node] = NULL;
00463 }
00464 
00481 static void
00482 add_ac(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00483 {
00484   A_CELL2 *ac2;
00485 
00486   for(ac2=wchmm->ac[node];ac2;ac2=ac2->next) {
00487     if (ac2->n < A_CELL2_ALLOC_STEP) break;
00488   }
00489   if (ac2 == NULL) {
00490     ac2 = (A_CELL2 *)mybmalloc2(sizeof(A_CELL2), &(wchmm->malloc_root));
00491     ac2->n = 0;
00492     ac2->next = wchmm->ac[node];
00493     wchmm->ac[node] = ac2;
00494   }
00495   ac2->arc[ac2->n] = arc;
00496   ac2->a[ac2->n]   = a;
00497   ac2->n++;
00498 }
00499 
00518 static void
00519 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00520 {
00521   if (arc == node) {
00522     wchmm->self_a[node] = a;
00523   } else if (arc == node + 1) {
00524     wchmm->next_a[node] = a;
00525   } else {
00526     add_ac(wchmm, node, a, arc);
00527   }
00528 }
00529 
00556 static void
00557 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp)
00558 {
00559   HMM_Logical *ltmp;
00560   int states;
00561   int k;
00562   LOGPROB prob;
00563   int oldnum;
00564 
00565   if (pos < 0) {
00566     
00567     /* set the word-beginning node, and return */
00568     node[*num] = wchmm->wordbegin[w];
00569     a[*num] = 0.0;
00570     (*num)++;
00571     
00572   } else {
00573 
00574     ltmp = wchmm->winfo->wseq[w][pos];
00575     states = hmm_logical_state_num(ltmp);
00576 
00577     /* check initial->final state */
00578     if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) {
00579       /* recursive call for previous phone */
00580       oldnum = *num;
00581       get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE); /* previous phone should not be an sp-inserted phone */
00582       /* add probability of the skip transition to all the previous ones */
00583       for(k=oldnum;k<*num;k++) {
00584         a[k] += (hmm_logical_trans(ltmp))->a[0][states-1];
00585       }
00586     }
00587     /* add to list the arcs from output state to final state */
00588     for (k = 1; k < states - 1; k++) {
00589       prob = (hmm_logical_trans(ltmp))->a[k][states-1];
00590       if (prob != LOG_ZERO) {
00591         if (*num >= maxnum) {
00592           j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00593         }
00594         node[*num] = wchmm->offset[w][pos] + k - 1;
00595         a[*num] = prob;
00596         (*num)++;
00597       }
00598     }
00599     /* for -iwsp, add outgoing arc from the tail sp model
00600        only if need_sp == TRUE.
00601        need_sp should be TRUE only when the connecting [pos] phone is also an end phone of the to-be-added word (i.e. homophone word)
00602      */
00603     /*  */
00604     if (insert_sp) {
00605       /* consider sp */
00606       for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
00607         prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1];
00608         if (prob != LOG_ZERO) {
00609           if (*num >= maxnum) {
00610             j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum);
00611           }
00612           node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1;
00613           a[*num] = prob;
00614           (*num)++;
00615         }
00616       }
00617     }
00618   }
00619   /*printf("   %d(%s)-%d:\"%s\", num=%d\n", w, wchmm->winfo->woutput[w], pos,
00620     (pos < 0) ? "BGN" : wchmm->winfo->wseq[w][pos]->name, *num);*/
00621   return;
00622 }  
00623 
00642 static void
00643 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo)
00644 {     
00645   A_CELL2 *actmp;
00646   LOGPROB a;
00647   int i, j;
00648   boolean tflag;
00649 
00650   /* get transition probability to outer state in tinfo */
00651   for(i = tinfo->statenum - 2; i >= 0; i--) {
00652     if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) { /* found */
00653       /* check if the arc already exist */
00654       tflag = FALSE;
00655       if (to_node == from_node && wchmm->self_a[from_node] == a) {
00656         tflag = TRUE;
00657       } else if (to_node == from_node + 1 && wchmm->next_a[from_node] == a) {
00658         tflag = TRUE;
00659       } else {
00660         for (actmp = wchmm->ac[from_node]; actmp; actmp = actmp->next) {
00661           for(j=0;j<actmp->n;j++) {
00662             if (actmp->arc[j] == to_node && actmp->a[j] == a) {
00663               tflag = TRUE;
00664               break;
00665             }
00666           }
00667           if (tflag == TRUE) break;
00668         }
00669       }
00670       if (tflag) break;
00671       /* add the arc to wchmm */
00672       add_wacc(wchmm, from_node, a, to_node);
00673       return;                   /* exit function here */
00674     }
00675   }      
00676   j_internal_error("wchmm_link_hmm: No arc to endstate?\n");
00677 }
00678 
00699 static void
00700 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq)
00701 {     
00702   HMM_Logical *last;
00703   int lastp;
00704 
00705   last = wchmm->winfo->wseq[from_word][from_seq];
00706   lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1;
00707   wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq],
00708                  hmm_logical_trans(last));
00709 }
00710 
00711 /**************************************************************/
00712 /******** homophone processing: duplicating leaf nodes ********/
00713 /**************************************************************/
00714 
00754 static void
00755 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word) /* source node, new word */
00756 {
00757   int j, n;
00758   int n_src, n_prev;
00759   A_CELL2       *ac;
00760   HMM_Logical *lastphone;
00761 
00762   /* 1 state will newly created: expand tree if needed */
00763   if (wchmm->n + 1 >= wchmm->maxwcn) {
00764     wchmm_extend(wchmm);
00765   }
00766   /* n: the target new node to which 'node' is copied */
00767   n = wchmm->n;
00768 
00769   n_src = node;
00770 
00771   /* copy output probability info */
00772 #ifdef PASS1_IWCD
00773   {
00774     RC_INFO *rcnew;
00775     LRC_INFO *lrcnew;
00776     wchmm->outstyle[n] = wchmm->outstyle[n_src];
00777     if (wchmm->outstyle[n] == AS_RSET) {
00778       /* duplicate RC_INFO because it has its own cache */
00779       rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
00780       memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO));
00781       wchmm->state[n].out.rset = rcnew;
00782     } else if (wchmm->outstyle[n] == AS_LRSET) {
00783       /* duplicate LRC_INFO because it has its own cache */
00784       lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
00785       memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO));
00786       wchmm->state[n].out.lrset = lrcnew;
00787     } else {
00788       /* share same info, simply copy the pointer */
00789       memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC));
00790     }
00791   }
00792 #else  /* ~PASS1_IWCD */
00793   memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *));
00794 #endif
00795 
00796   lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1];
00797   acc_init(wchmm, n);
00798 
00799   /* add self transition arc */
00800   wchmm->self_a[n] = wchmm->self_a[n_src];
00801   
00802   /* copy transition arcs whose destination is the source node to new node */
00803   if (hmm_logical_state_num(lastphone) == 3) { /* = 1 state */
00804     /* phone with only 1 state should be treated carefully */
00805     if (wchmm->winfo->wlen[word] == 1) { /* word consists of only this phone */
00806       /* no arcs need to be copied: this is also a start node of a word */
00807       wchmm->offset[word][0] = n;
00808       /* index the new word-beginning node as startnode (old ststart) */
00809       if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
00810         wchmm->startnode[wchmm->startnum] = n;
00811         if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
00812         /* expand data area if necessary */
00813         if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
00814       }
00815     } else {
00816       /* copy arcs from the last state of the previous phone */
00817       n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2]
00818         + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3;
00819       if(n_src == n_prev + 1) {
00820         add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00821       } else {
00822         for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00823           for(j=0;j<ac->n;j++) {
00824             if (ac->arc[j] == n_src) {
00825               add_wacc(wchmm, n_prev, ac->a[j], n);
00826             }
00827           }
00828         }
00829       }
00830       /* also update the last offset (== wordend in this case) */
00831       wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n;
00832     }
00833   } else {                      /* phone with more than 2 states */
00834     /* copy arcs from/to the source node to new node */
00835     for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) {
00836       if (n_src == n_prev + 1) {
00837         add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n);
00838       } else {
00839         for(ac=wchmm->ac[n_prev];ac;ac=ac->next) {
00840           for(j=0;j<ac->n;j++) {
00841             if (ac->arc[j] == n_src) {
00842               add_wacc(wchmm, n_prev, ac->a[j], n);
00843             }
00844           }
00845         }
00846       }
00847       if (n_prev == n_src + 1) {
00848         add_wacc(wchmm, n, wchmm->next_a[n_src], n_prev);
00849       } else {
00850         for(ac=wchmm->ac[n_src];ac;ac=ac->next) {
00851           for(j=0;j<ac->n;j++) {
00852             if (ac->arc[j] == n_prev) {
00853               add_wacc(wchmm, n, ac->a[j], n_prev);
00854             }
00855           }
00856         }
00857       }
00858     }
00859   }
00860 
00861   /* map word <-> node */
00862   wchmm->stend[n]   = word;     /* 'n' is an end node of word 'word' */
00863   wchmm->wordend[word] = n;     /* the word end node of 'word' is 'n' */
00864 
00865   /* new state has been created: increment the size */
00866   wchmm->n++;
00867   
00868 }
00869 
00884 static int
00885 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm)
00886 {
00887   int w, nlast, n, narc, narc_model;
00888   boolean *dupw;                /* node marker */
00889   A_CELL2 *actmp;
00890   int dupcount;
00891 
00892   dupcount = 0;
00893 
00894   nlast = wchmm->n;
00895   dupw = (boolean *)mymalloc(sizeof(boolean) * nlast);
00896   for(n=0;n<nlast;n++) dupw[n] = FALSE; /* initialize all marker */
00897 
00898   for (w=0;w<wchmm->winfo->num;w++) {
00899     n = wchmm->wordend[w];
00900     if (dupw[n]) {              /* if already marked (2nd time or later */
00901       wchmm_duplicate_state(wchmm, n, w); dupcount++; /* duplicate */
00902     } else {                    /* if not marked yet (1st time) */
00903       /* try to find an arc outside the word */
00904       {
00905         /* count number of model-internal arc from the last state */
00906         HMM_Logical *lastphone;
00907         HTK_HMM_Trans *tinfo;
00908         int laststate, i;
00909         lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1];
00910         laststate = hmm_logical_state_num(lastphone) - 2;
00911         tinfo = hmm_logical_trans(lastphone);
00912         narc_model=0;
00913         for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) {
00914           if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++;
00915         }
00916         /* count number of actual arc from the last state in the tree */
00917         narc = 0;
00918         if (wchmm->self_a[n] != LOG_ZERO) narc++;
00919         if (wchmm->next_a[n] != LOG_ZERO) narc++;
00920         for(actmp=wchmm->ac[n];actmp;actmp=actmp->next) narc += actmp->n;
00921       }
00922       /* if both number does not match, it means it is not a single word tail */
00923       if (narc_model != narc) {
00924         /* word 'w' is embedded as part of other words at this node 'n' */
00925         /* duplicate this node now */
00926         wchmm_duplicate_state(wchmm, n, w); dupcount++;
00927         /* as new node has been assigned as word end node of word 'w',
00928            reset this source node as it is not the word end node */
00929         wchmm->stend[n] = WORD_INVALID;
00930       } else {
00931         /* no arc to other node found, it means it is a single word tail */
00932         /* as this is first time, only make sure that this node is word end of [w] */
00933         wchmm->stend[n] = w;
00934       }
00935       /* mark node 'n' */
00936       dupw[n] = TRUE;
00937     }
00938   }
00939   free(dupw);
00940 
00941   return(dupcount);
00942 }
00943 
00944 /**************************************************************/
00945 /*************** add a word to wchmm lexicon tree *************/
00946 /**************************************************************/
00947 
00972 static boolean
00973 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword, boolean enable_iwsp)
00974 {
00975   boolean ok_p;
00976   int   j,k,n;
00977   int   add_head, add_tail, add_to;
00978   int   word_len, matchword_len;
00979   HMM_Logical *ltmp;
00980   int ato;
00981   LOGPROB prob;
00982   int ntmp;
00983   int ltmp_state_num;
00984 #ifdef PASS1_IWCD
00985   CD_Set *lcd = NULL;
00986 #endif
00987   int *out_from;
00988   int *out_from_next;
00989   LOGPROB *out_a;
00990   LOGPROB *out_a_next;
00991 
00992   
00993   /* for multipath handling */
00994   int out_num_prev, out_num_next;
00995   int kkk;
00996 
00997   ok_p = TRUE;
00998   if (wchmm->hmminfo->multipath) {
00999     out_from = wchmm->wrk.out_from;
01000     out_from_next = wchmm->wrk.out_from_next;
01001     out_a = wchmm->wrk.out_a;
01002     out_a_next = wchmm->wrk.out_a_next;
01003   }
01004   
01005 /* 
01006  *   if (matchlen > 0) {
01007  *     printf("--\n");
01008  *     put_voca(stdout, wchmm->winfo, word);
01009  *     put_voca(stdout, wchmm->winfo, matchword);
01010  *     printf("matchlen=%d\n", matchlen);
01011  *   }
01012  */
01013   
01014   /* variable abbreviations */
01015   n = wchmm->n;
01016   word_len      = wchmm->winfo->wlen[word];
01017   matchword_len = wchmm->winfo->wlen[matchword];
01018 
01019   /* malloc phone offset area */
01020   wchmm->offset[word] = (int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root));
01021 
01022   /* allocate unshared (new) part */
01023   add_head = matchlen;
01024   add_tail = word_len - 1;
01025   add_to   = matchlen - 1;
01026 
01027   if (wchmm->hmminfo->multipath) {
01028     /* make word-beginning node if needed */
01029     if (matchlen == 0) {
01030       /* create word-beginning node */
01031       wchmm->wordbegin[word] = n;
01032       wchmm->stend[n] = WORD_INVALID;
01033       acc_init(wchmm, n);
01034       wchmm->state[n].out.state = NULL;
01035       /* index the new word-beginning node as startnode (old ststart) */
01036       wchmm->startnode[wchmm->startnum] = n;
01037       if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01038       /* expand data area if necessary */
01039       if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01040       if (++n >= wchmm->maxwcn) wchmm_extend(wchmm);
01041     } else {
01042       wchmm->wordbegin[word] = wchmm->wordbegin[matchword];
01043     }
01044 
01045     /* now n is at beginning of output state */
01046 
01047     /* store the initial outgoing arcs to out_from[] and out_a[] */
01048     out_num_prev = 0;
01049     if (matchlen == 0) {
01050       /* set the word-beginning node */
01051       out_from[0] = wchmm->wordbegin[word];
01052       out_a[0] = 0.0;
01053       out_num_prev = 1;
01054     } else {
01055       /*printf("%d(%s)\n", word, wchmm->winfo->woutput[word]);*/
01056       /* on -iwsp, trailing sp is needed only when no phone will be created */
01057       get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (enable_iwsp && add_tail - add_head + 1 <= 0) ? TRUE : FALSE);
01058       /*printf("NUM=%d\n", out_num_prev);*/
01059     }
01060   } else { /*  end of multipath block */
01061     if (matchlen == 0) {
01062       if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) {
01063         /* index the new word-beginning node as startnode (old ststart) */
01064         wchmm->startnode[wchmm->startnum] = n;
01065         if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word;
01066         /* expand data area if necessary */
01067         if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
01068       }
01069     }
01070   }
01071   
01072   if (add_tail - add_head + 1 > 0) { /* there are new phones to be created */
01073       ntmp = n;
01074       for (j=add_head; j <= add_tail; j++) { /* for each new phones */
01075         ltmp = wchmm->winfo->wseq[word][j];
01076         ltmp_state_num = hmm_logical_state_num(ltmp);
01077 #ifdef PASS1_IWCD
01078         if (wchmm->ccd_flag) {
01079           /* in the triphone lexicon tree, the last phone of a word has
01080              left-context cdset */
01081           if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) {
01082             if (wchmm->category_tree) {
01083 #ifdef USE_OLD_IWCD
01084               lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01085 #else
01086               lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]);
01087               if (lcd == NULL) {
01088                 /* no category-aware cdset found.  This is case when no word
01089                    can follow this word grammatically.
01090                    so fallback to normal state */
01091                 jlog("WARNING: wchmm: no lcdset found for [%s::%04d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name);
01092                 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01093               }
01094 #endif
01095             } else {
01096               lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01097             }
01098             if (lcd == NULL) {
01099               jlog("ERROR: wchmm: at word #%d: no lcdset found for [%s]\n", word, ltmp->name);
01100               ok_p = FALSE;
01101             }
01102           }
01103         }
01104 #endif /* PASS1_IWCD */
01105         for (k = 1; k < ltmp_state_num - 1; k++) { /* for each state in the phone */
01106           /* set state output prob info */
01107 #ifdef PASS1_IWCD
01108           if (wchmm->ccd_flag) {
01109             /* output info of triphones needs special handling */
01110             if (wchmm->winfo->wlen[word] == 1) { /* word with only 1 phone */
01111               wchmm->outstyle[ntmp] = AS_LRSET;
01112               wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
01113               (wchmm->state[ntmp].out.lrset)->hmm       = ltmp;
01114               (wchmm->state[ntmp].out.lrset)->state_loc = k;
01115               if (wchmm->category_tree) {
01116                 (wchmm->state[ntmp].out.lrset)->category  = wchmm->winfo->wton[word];
01117               }
01118             } else if (j == 0) {        /* head phone of a word */
01119               wchmm->outstyle[ntmp] = AS_RSET;
01120               wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
01121               (wchmm->state[ntmp].out.rset)->hmm       = ltmp;
01122               (wchmm->state[ntmp].out.rset)->state_loc = k;
01123             } else if (j == wchmm->winfo->wlen[word] - 1) { /* last phone of a word */
01124               wchmm->outstyle[ntmp] = AS_LSET;
01125               wchmm->state[ntmp].out.lset = &(lcd->stateset[k]);
01126             } else {
01127               wchmm->outstyle[ntmp] = AS_STATE;
01128               if (ltmp->is_pseudo) {
01129                 jlog("WARNING: wchmm: word-internal phone should not be pseudo\n");
01130                 put_voca(stdout, wchmm->winfo, word);
01131                 ok_p = FALSE;
01132               }
01133               wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01134             }
01135           } else {
01136             /* monophone */
01137             if (ltmp->is_pseudo) {
01138               j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01139               put_voca(stdout, wchmm->winfo, word);
01140               ok_p = FALSE;
01141             }
01142             wchmm->outstyle[ntmp] = AS_STATE;
01143             wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01144           }
01145 #else  /* ~PASS1_IWCD */
01146           if (ltmp->is_pseudo) {
01147             j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n");
01148             put_voca(stdout, wchmm->winfo, word);
01149             ok_p = FALSE;
01150           }
01151           wchmm->state[ntmp].out = ltmp->body.defined->s[k];
01152 #endif /* PASS1_IWCD */
01153           
01154           /* initialize other info */
01155           acc_init(wchmm, ntmp);
01156           wchmm->stend[ntmp] = WORD_INVALID;
01157           if (! wchmm->hmminfo->multipath) {
01158             /* make transition arc from HMM transition info */
01159             for (ato = 1; ato < ltmp_state_num; ato++) {
01160               prob = (hmm_logical_trans(ltmp))->a[k][ato];
01161               if (prob != LOG_ZERO) {
01162                   if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) {
01163                     /* arc outside new part will be handled later */
01164                   } else {
01165                     add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01166                   }
01167                 }
01168             }
01169           }
01170           
01171           ntmp++;
01172           /* expand wchmm if neccesary */
01173           if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01174         } /* end of state loop */
01175       } /* end of phone loop */
01176 
01177       if (wchmm->hmminfo->multipath) {
01178           
01179         /* On multipath version, the skip transition should be handled! */
01180       
01181         /* make transition arc from HMM transition info */
01182         ntmp = n;
01183         for (j = add_head; j <= add_tail; j++) {
01184           ltmp = wchmm->winfo->wseq[word][j];
01185           ltmp_state_num = hmm_logical_state_num(ltmp);
01186           out_num_next = 0;
01187           /* arc from initial state ... need arc expansion from precious phone */
01188           for (ato = 1; ato < ltmp_state_num; ato++) {
01189             prob = (hmm_logical_trans(ltmp))->a[0][ato];
01190             if (prob != LOG_ZERO) {
01191               /* expand arc from previous HMM */
01192               if (ato == ltmp_state_num - 1) {
01193                 /* to final state ... just register states for next expansion */
01194                 for(kkk=0; kkk<out_num_prev; kkk++) {
01195                   out_from_next[out_num_next] = out_from[kkk];
01196                   out_a_next[out_num_next] = out_a[kkk] + prob;
01197                   out_num_next++;
01198                 }
01199               } else {
01200                 for(kkk=0; kkk<out_num_prev; kkk++) {
01201                   add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01202                 }
01203               }
01204             }
01205           } /* end of state loop */
01206           /* from outprob state */
01207           for(k = 1; k < ltmp_state_num - 1; k++) {
01208             for (ato = 1; ato < ltmp_state_num; ato++) {
01209               prob = (hmm_logical_trans(ltmp))->a[k][ato];
01210               if (prob != LOG_ZERO) {
01211                 if (ato == ltmp_state_num - 1) {
01212                   /* to final state ... register states for next expansion */
01213                   out_from_next[out_num_next] = ntmp;
01214                   out_a_next[out_num_next] = prob;
01215                   out_num_next++;
01216                 } else {
01217                   add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01218                 }
01219               }
01220             }
01221             ntmp++;
01222           } /* end of state loop */
01223           /* swap out list for next phone */
01224           for(kkk=0;kkk<out_num_next;kkk++) {
01225             out_from[kkk] = out_from_next[kkk];
01226             out_a[kkk] = out_a_next[kkk];
01227           }
01228           out_num_prev = out_num_next;
01229         }       /* end of phone loop */
01230       } /* end of multipath block */
01231       
01232   } /* new phone node creation loop for this word */
01233 
01234 
01235   /*************************************/
01236   /* Short Pause appending (multipath) */
01237   /*************************************/
01238   
01239   /* if -iwsp, add noise model to the end of word at ntmp */
01240   if (wchmm->hmminfo->multipath && enable_iwsp && add_tail - add_head + 1 > 0) { /* there are new phones to be created */
01241     int ntmp_bak;
01242     
01243     /* set short pause state info */
01244     ntmp_bak = ntmp;
01245     if (wchmm->hmminfo->sp->is_pseudo) {
01246       for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01247         wchmm->outstyle[ntmp] = AS_LSET;
01248         wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]);
01249         acc_init(wchmm, ntmp);
01250         wchmm->stend[ntmp] = WORD_INVALID;
01251         ntmp++;
01252         if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01253       }
01254     } else {
01255       for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01256         wchmm->outstyle[ntmp] = AS_STATE;
01257         wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k];
01258         acc_init(wchmm, ntmp);
01259         wchmm->stend[ntmp] = WORD_INVALID;
01260         ntmp++;
01261         if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01262       }
01263     }
01264     ntmp = ntmp_bak;
01265     /* connect incoming arcs from previous phone */
01266     out_num_next = 0;
01267     for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01268       prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato];
01269       if (prob != LOG_ZERO) {
01270         /* to control short pause insertion, transition probability toward
01271          the word-end short pause will be given a penalty */
01272         prob += wchmm->hmminfo->iwsp_penalty;
01273         if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01274           /* model has a model skip transition, just inherit them to next */
01275           for(kkk=0; kkk<out_num_prev; kkk++) {
01276             out_from_next[out_num_next] = out_from[kkk];
01277             out_a_next[out_num_next] = out_a[kkk] + prob;
01278             out_num_next++;
01279           }
01280         } else {
01281           /* connect incoming arcs from previous phone to this phone */
01282           for(kkk=0; kkk<out_num_prev; kkk++) {
01283             add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01284           }
01285         }
01286       }
01287     }
01288     /* if short pause model doesn't have a model skip transition, also add it */
01289     if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) {
01290       /* to make insertion sp model to have no effect on the original path,
01291          the skip transition probability should be 0.0 (=100%) */
01292       prob = 0.0;
01293       for(kkk=0; kkk<out_num_prev; kkk++) {
01294         out_from_next[out_num_next] = out_from[kkk];
01295         out_a_next[out_num_next] = out_a[kkk] + prob;
01296         out_num_next++;
01297       }
01298     }
01299     /* connect arcs within model, and store new outgoing arcs for wordend node */
01300     for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01301       for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01302         prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato];
01303         if (prob != LOG_ZERO) {
01304           if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01305             out_from_next[out_num_next] = ntmp;
01306             out_a_next[out_num_next] = prob;
01307             out_num_next++;
01308           } else {
01309             add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01310           }
01311         }
01312       }
01313       ntmp++;
01314     }
01315     /* swap work area for next */
01316     for(kkk=0;kkk<out_num_next;kkk++) {
01317       out_from[kkk] = out_from_next[kkk];
01318       out_a[kkk] = out_a_next[kkk];
01319     }
01320     out_num_prev = out_num_next;
01321 
01322   } /* end of inter-word short pause appending block */
01323 
01324   /* make mapping: word <-> node on wchmm */
01325   for (j=0;j<word_len;j++) {
01326     if (j < add_head) { /* shared part */
01327       wchmm->offset[word][j] = wchmm->offset[matchword][j];
01328     } else if (add_tail < j) { /* shared tail part (should not happen..) */
01329       wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)];
01330     } else {                    /* newly created part */
01331       wchmm->offset[word][j] = n;
01332       n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2;
01333     }
01334   }
01335 
01336 
01337   if (wchmm->hmminfo->multipath) {
01338     /* create word-end node */
01339 
01340     /* paranoia check if the short-pause addition has been done well */
01341     if (enable_iwsp && add_tail - add_head + 1 > 0) {
01342       n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2;
01343       if (n != ntmp) j_internal_error("wchmm_add_word: cannot match\n");
01344     }
01345     
01346     /* create word-end node */
01347     wchmm->wordend[word] = n;   /* tail node of 'word' is 'n' */
01348     wchmm->stend[n] = word;     /* node 'k' is a tail node of 'word' */
01349     acc_init(wchmm, n);
01350     wchmm->state[n].out.state = NULL;
01351     
01352     /* connect the final outgoing arcs in out_from[] to the word end node */
01353     for(k = 0; k < out_num_prev; k++) {
01354       add_wacc(wchmm, out_from[k], out_a[k], n);
01355     }
01356     n++;
01357     if (n >= wchmm->maxwcn) wchmm_extend(wchmm);
01358     
01359     if (matchlen == 0) {
01360       /* check if the new word has whole word-skipping transition */
01361       /* (use out_from and out_num_prev temporary) */
01362       out_num_prev = 0;
01363       get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, enable_iwsp);
01364       for(k=0;k<out_num_prev;k++) {
01365         if (out_from[k] == wchmm->wordbegin[word]) {
01366           jlog("ERROR: *** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n");
01367           jlog("ERROR:   Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]);
01368           jlog("ERROR:   All HMMs in the word:\n    ");
01369           for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) {
01370             jlog("%s ", wchmm->winfo->wseq[word][kkk]->name);
01371           }
01372           jlog("\n");
01373           jlog("ERROR:  has transitions from initial state to final state.\n");
01374           jlog("ERROR:  This type of word skipping is not supported.\n");
01375           ok_p = FALSE;
01376         }
01377       }
01378     }
01379 
01380     wchmm->n = n;
01381 
01382   } else {
01383 
01384     wchmm->n = n;
01385     k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1;
01386     wchmm->wordend[word] = k;   /* tail node of 'word' is 'k' */
01387     wchmm->stend[k] = word;     /* node 'k' is a tail node of 'word' */
01388     
01389     if (matchlen != 0 && add_tail - add_head + 1 > 0) {
01390       /* new part has been created in the above procedure: */
01391       /* now make link from shared part to the new part */
01392       wchmm_link_subword(wchmm, matchword,add_to,word,add_head);        
01393     }
01394 
01395   }
01396 
01397   return(ok_p);
01398   
01399 }
01400 
01401 /*************************************************************/
01402 /**** parse whole structure (after wchmm has been built) *****/
01403 /*************************************************************/
01404 
01419 static void
01420 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm)
01421 {
01422   WORD_ID w;
01423   HTK_HMM_Trans *tr;
01424   LOGPROB a;
01425 
01426   for (w=0;w<wchmm->winfo->num;w++) {
01427     tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]);
01428     a = tr->a[tr->statenum-2][tr->statenum-1];
01429     wchmm->wordend_a[w] = a;
01430   }
01431 }
01432 
01433 #ifdef SEPARATE_BY_UNIGRAM
01434 
01435 /********************************************************************/
01436 /****** for separation (linearization) of high-frequent words *******/
01437 /********************************************************************/
01438 
01457 static int
01458 compare_prob(LOGPROB *a, LOGPROB *b)
01459 {
01460   if (*a < *b)  return (1);
01461   if (*a > *b)  return (-1);
01462   return(0);
01463 }
01464 
01483 static LOGPROB
01484 get_nbest_uniprob(WCHMM_INFO *wchmm, int n)
01485 {
01486   LOGPROB *u_p;
01487   WORD_ID w;
01488   LOGPROB x;
01489   WORD_INFO *winfo;
01490   NGRAM_INFO *ngram;
01491 
01492   winfo = wchmm->winfo;
01493   ngram = wchmm->ngram;
01494 
01495   if (n < 1) n = 1;
01496   if (n > winfo->num) n = winfo->num;
01497 
01498   /* store all unigram probability to u_p[] */
01499   u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num);
01500   for(w=0;w<winfo->num;w++) {
01501     if (ngram) {
01502       x = uni_prob(ngram, winfo->wton[w])
01503 #ifdef CLASS_NGRAM
01504         + winfo->cprob[w]
01505 #endif
01506         ;
01507     } else {
01508       x = LOG_ZERO;
01509     }
01510     if (wchmm->lmvar == LM_NGRAM_USER) {
01511       x = (*(wchmm->uni_prob_user))(wchmm->winfo, w, x);
01512     }
01513     u_p[w] = x;
01514   }
01515 
01516   /* sort them downward */
01517   qsort(u_p, winfo->num, sizeof(LOGPROB),
01518         (int (*)(const void *,const void *))compare_prob);
01519 
01520   /* return the Nth value */
01521   x = u_p[n-1];
01522   free(u_p);
01523   return(x);
01524 }
01525 
01526 #endif
01527 
01528 /**********************************************************/
01529 /****** MAKE WCHMM (LEXICON TREE) --- main function *******/
01530 /**********************************************************/
01531 
01532 #define COUNT_STEP 500         
01533 
01534 
01555 boolean
01556 build_wchmm(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01557 {
01558   int i,j;
01559   int matchword=0, sharelen=0, maxsharelen=0;
01560   int num_duplicated;
01561 #ifdef SEPARATE_BY_UNIGRAM
01562   LOGPROB separate_thres;
01563   LOGPROB p;
01564 #endif
01565   boolean ok_p;
01566 
01567   /* lingustic infos must be set before build_wchmm() is called */
01568   /* check if necessary lingustic info is already assigned (for debug) */
01569   if (wchmm->winfo == NULL
01570       || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01571       || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01572       ) {
01573     jlog("ERROR: wchmm: linguistic info not available!!\n");
01574     return FALSE;
01575   }
01576 
01577   ok_p = TRUE;
01578   
01579 #ifdef SEPARATE_BY_UNIGRAM
01580   /* 上位[separate_wnum]番目の1-gramスコアを求める */
01581   /* 1-gramスコアがこの値以上のものは木から分ける */
01582   separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01583 #endif
01584 
01585 #ifdef PASS1_IWCD
01586 #ifndef USE_OLD_IWCD
01587   if (wchmm->category_tree) {
01588     if (wchmm->ccd_flag) {
01589       /* 全てのカテゴリID付き lcd_set を作成 */
01590       lcdset_register_with_category_all(wchmm);
01591     }
01592   }
01593 #endif
01594 #endif /* PASS1_IWCD */
01595   
01596 
01597   /* wchmmを初期化 */
01598   wchmm_init(wchmm);
01599 
01600   /* カウンタリセット */
01601   wchmm->separated_word_count=0;
01602 
01603   jlog("STAT: wchmm: Building HMM lexicon tree (left-to-right)\n");
01604   for (i=0;i<wchmm->winfo->num;i++) {
01605 
01606     if (wchmm->lmtype == LM_PROB) {
01607       if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01608         /* 先頭/末尾の無音モデルは木構造化せず,
01609          * 先頭の無音単語の先頭への遷移,末尾単語の末尾からの遷移は作らない*/
01610         /* sharelen=0でそのまま */
01611         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01612           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01613           ok_p = FALSE;
01614         }
01615         continue;
01616       }
01617 #ifndef NO_SEPARATE_SHORT_WORD
01618       if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01619         /* 長さの短い単語を木構造化しない(ここでは1音節) */
01620         /* sharelen=0でそのまま */
01621         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01622           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01623           ok_p = FALSE;
01624         }
01625         wchmm->separated_word_count++;
01626         continue;
01627       }
01628 #endif
01629 #ifdef SEPARATE_BY_UNIGRAM
01630       if (wchmm->ngram) {
01631         p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01632 #ifdef CLASS_NGRAM
01633           + wchmm->winfo->cprob[i]
01634 #endif
01635           ;
01636       } else {
01637         p = LOG_ZERO;
01638       }
01639       if (wchmm->lmvar == LM_NGRAM_USER) {
01640         p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01641       }
01642       if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01643         /* 頻度の高い単語を木構造化しない */
01644         /* separate_thres は上位separate_wnum番目のスコア */
01645         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01646           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01647           ok_p = FALSE;
01648         }
01649         wchmm->separated_word_count++;
01650         continue;
01651       }
01652 #endif
01653     }
01654 
01655     /* 最も長く音素を共有出来る単語を探す */
01656     maxsharelen=0;
01657     for (j=0;j<i;j++) {
01658       if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01659         if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue;
01660       }
01661       sharelen = wchmm_check_match(wchmm->winfo, i, j);
01662       if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) {
01663        /* word に同音語が存在する */
01664        /* 必ず最大の長さであり,重複カウントを避けるためここで抜ける */
01665        maxsharelen = sharelen;
01666        matchword = j;
01667        break;
01668       }
01669       if (sharelen > maxsharelen) {
01670        matchword = j;
01671        maxsharelen = sharelen;
01672       }
01673     }
01674     if (wchmm_add_word(wchmm, i, maxsharelen, matchword, lmconf->enable_iwsp) == FALSE) {
01675       jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01676       ok_p = FALSE;
01677     }
01678   }
01679 
01680 #if 0
01681   /* 木構造を作らない */
01682   for (i=0;i<wchmm->winfo->num;i++) {
01683     if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01684       jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01685       ok_p = FALSE;
01686     }
01687   }
01688 #endif  
01689   jlog("STAT:  %5d words ended     (%6d nodes)\n",i,wchmm->n);
01690 
01691   if (! wchmm->hmminfo->multipath) {
01692     /* 同一音素系列を持つ単語同士の leaf node を2重化して区別する */
01693     num_duplicated = wchmm_duplicate_leafnode(wchmm);
01694     jlog("STAT:  %d leaf nodes are made unshared\n", num_duplicated);
01695     
01696     /* 単語の終端から外への遷移確率を求めておく */
01697     wchmm_calc_wordend_arc(wchmm);
01698   }
01699 
01700   /* wchmmの整合性をチェックする */
01701   check_wchmm(wchmm);
01702 
01703   /* factoring用に各状態に後続単語のリストを付加する */
01704   if (!wchmm->category_tree) {
01705     make_successor_list(wchmm);
01706 
01707 #ifdef UNIGRAM_FACTORING
01708     if (wchmm->lmtype == LM_PROB) {
01709       /* 前もってfactoring値を計算 */
01710       /* 末端以外のscは必要ないのでフリーする */
01711       calc_all_unigram_factoring_values(wchmm);
01712       jlog("STAT:  1-gram factoring values has been pre-computed\n");
01713     }
01714 #endif /* UNIGRAM_FACTORING */
01715     
01716     if (wchmm->hmminfo->multipath) {
01717       /* 構築された factoring 情報をスキップ遷移および文頭文法ノードにコピー */
01718       adjust_sc_index(wchmm);
01719     }
01720     
01721 #ifdef UNIGRAM_FACTORING
01722     if (wchmm->lmtype == LM_PROB) {
01723       /* 単語間LMキャッシュが必要なノードのリストを作る */
01724       make_iwcache_index(wchmm);
01725     }
01726 #endif /* UNIGRAM_FACTORING */
01727 
01728   }
01729 
01730   jlog("STAT: done\n");
01731 
01732   return ok_p;
01733 }
01734 
01760 boolean
01761 build_wchmm2(WCHMM_INFO *wchmm, JCONF_LM *lmconf)
01762 {
01763   int i,j, last_i;
01764   int num_duplicated;
01765   WORD_ID *windex;
01766 #ifdef SEPARATE_BY_UNIGRAM
01767   LOGPROB separate_thres;
01768   LOGPROB p;
01769 #endif
01770   boolean ok_p;
01771   boolean ret;
01772 
01773   /* lingustic infos must be set before build_wchmm() is called */
01774   /* check if necessary lingustic info is already assigned (for debug) */
01775   if (wchmm->winfo == NULL
01776       || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL)
01777       || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL)
01778       ) {
01779     jlog("ERROR: wchmm: linguistic info not available!!\n");
01780     return FALSE;
01781   }
01782 
01783   ok_p = TRUE;
01784   
01785   wchmm->separated_word_count = 0;
01786   
01787   jlog("STAT: Building HMM lexicon tree\n");
01788   
01789   if (wchmm->lmtype == LM_PROB) {
01790 #ifdef SEPARATE_BY_UNIGRAM
01791     /* compute score threshold beforehand to separate words from tree */
01792     /* here we will separate best [separate_wnum] words from tree */
01793     separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum);
01794 #endif
01795   }
01796 
01797 #ifdef PASS1_IWCD
01798 #ifndef USE_OLD_IWCD
01799   if (wchmm->category_tree) {
01800     if (wchmm->ccd_flag) {
01801       /* when Julian mode (category-tree) and triphone is used,
01802          make all category-indexed context-dependent phone set (cdset) here */
01803       /* these will be assigned on the last phone of each word on tree */
01804       lcdset_register_with_category_all(wchmm);
01805     }
01806   }
01807 #endif
01808 #endif /* PASS1_IWCD */
01809 
01810  /* initialize wchmm */
01811   wchmm_init(wchmm);
01812 
01813   /* make sorted word index ordered by phone sequence */
01814   windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num);
01815   for(i=0;i<wchmm->winfo->num;i++) windex[i] = i;
01816 
01817   if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01818 
01819     /* sort by category -> sort by word ID in each category */
01820     wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num);
01821     {
01822       int last_cate;
01823       last_i = 0;
01824       last_cate = wchmm->winfo->wton[windex[0]];
01825       for(i = 1;i<wchmm->winfo->num;i++) {
01826         if (wchmm->winfo->wton[windex[i]] != last_cate) {
01827           wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i);
01828           last_cate = wchmm->winfo->wton[windex[i]];
01829           last_i = i;
01830         }
01831       }
01832       wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i);
01833     }
01834 
01835   } else {
01836 
01837     /* sort by word ID for whole vocabulary */
01838     wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num);
01839 
01840   }
01841 
01842 /* 
01843  *   {
01844  *     int i,w;
01845  *     for(i=0;i<wchmm->winfo->num;i++) {
01846  *       w = windex[i];
01847  *       printf("%d: cate=%4d wid=%4d %s\n",i, wchmm->winfo->wton[w], w, wchmm->winfo->woutput[w]);
01848  *     }
01849  *   }
01850  */
01851 
01852   /* incrementaly add words to lexicon tree */
01853   /* now for each word, the previous word (last_i) is always the most matched one */
01854   last_i = WORD_INVALID;
01855   for (j=0;j<wchmm->winfo->num;j++) {
01856     i = windex[j];
01857 
01858     if (wchmm->lmtype == LM_PROB) {
01859 
01860       /* start/end silence word should not be shared */
01861       if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01862         /* add whole word as new (sharelen=0) */
01863         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01864           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01865           ok_p = FALSE;
01866         }
01867         continue;
01868       }
01869 #ifndef NO_SEPARATE_SHORT_WORD
01870       /* separate short words from tree */
01871       if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01872         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01873           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01874           ok_p = FALSE;
01875         }
01876         wchmm->separated_word_count++;
01877         continue;
01878       }
01879 #endif
01880 #ifdef SEPARATE_BY_UNIGRAM
01881       if (wchmm->ngram) {
01882         p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01883 #ifdef CLASS_NGRAM
01884           + wchmm->winfo->cprob[i]
01885 #endif
01886           ;
01887       } else {
01888         p = LOG_ZERO;
01889       }
01890       if (wchmm->lmvar == LM_NGRAM_USER) {
01891         p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p);
01892       }
01893       /* separate high-frequent words from tree (threshold = separate_thres) */
01894       if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) {
01895         if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) {
01896           jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01897           ok_p = FALSE;
01898         }
01899         wchmm->separated_word_count++;
01900         continue;
01901       }
01902 #endif
01903     }
01904 
01905     if (last_i == WORD_INVALID) { /* first word */
01906       ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01907     } else {
01908       /* the previous word (last_i) is always the most matched one */
01909       if (wchmm->category_tree && wchmm->lmvar == LM_DFA_GRAMMAR) {
01910         if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) {
01911           ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp);
01912         } else {
01913           ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01914         }
01915       } else {
01916         ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp);
01917       }
01918     }
01919     if (ret == FALSE) {
01920       jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n");
01921       ok_p = FALSE;
01922     }
01923     last_i = i;
01924     
01925   } /* end of add word loop */
01926   
01927   /*j_printerr("\r %5d words ended     (%6d nodes)\n",j,wchmm->n);*/
01928 
01929   /* free work area */
01930   free(windex);
01931 
01932   if (wchmm->hmminfo->multipath) {
01933     jlog("STAT: lexicon size: %d nodes\n", wchmm->n);
01934   } else {
01935     /* duplicate leaf nodes of homophone/embedded words */
01936     jlog("STAT: lexicon size: %d", wchmm->n);
01937     num_duplicated = wchmm_duplicate_leafnode(wchmm);
01938     jlog("+%d=%d\n", num_duplicated, wchmm->n);
01939   }
01940 
01941   if (! wchmm->hmminfo->multipath) {
01942     /* calculate transition probability of word end node to outside */
01943     wchmm_calc_wordend_arc(wchmm);
01944   }
01945 
01946   /* check wchmm coherence (internal debug) */
01947   check_wchmm(wchmm);
01948 
01949   /* make successor list for all branch nodes for N-gram factoring */
01950   if (!wchmm->category_tree) {
01951     make_successor_list(wchmm);
01952 
01953 #ifdef UNIGRAM_FACTORING
01954     if (wchmm->lmtype == LM_PROB) {
01955       /* for 1-gram factoring, we can compute the values before search */
01956       calc_all_unigram_factoring_values(wchmm);
01957       jlog("STAT:  1-gram factoring values has been pre-computed\n");
01958     }
01959 #endif /* UNIGRAM_FACTORING */
01960     if (wchmm->hmminfo->multipath) {
01961       /* Copy the factoring data according to the skip transitions and startword nodes */
01962       adjust_sc_index(wchmm);
01963     }
01964 #ifdef UNIGRAM_FACTORING
01965     if (wchmm->lmtype == LM_PROB) {
01966       /* make list of start nodes that needs inter-word LM cache */
01967       make_iwcache_index(wchmm);
01968     }
01969 #endif /* UNIGRAM_FACTORING */
01970 
01971   }
01972 
01973   //jlog("STAT: done\n");
01974 
01975 #ifdef WCHMM_SIZE_CHECK
01976   if (debug2_flag) {
01977     /* detailed check of lexicon tree size (inaccurate!) */
01978     jlog("STAT: --- memory size of word lexicon ---\n");
01979     jlog("STAT: wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n);
01980     jlog("STAT: %9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n);
01981     {
01982       int count1 = 0;
01983       int count2 = 0;
01984       int count3 = 0;
01985       for(i=0;i<wchmm->n;i++) {
01986         if (wchmm->self_a[i] != LOG_ZERO) count1++;
01987         if (wchmm->next_a[i] != LOG_ZERO) count2++;
01988         if (wchmm->ac[i] != NULL) count3++;
01989       }
01990       jlog("STAT: %9d bytes: wchmm->self_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count1 / (float)wchmm->n);
01991       jlog("STAT: %9d bytes: wchmm->next_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count2 / (float)wchmm->n);
01992       jlog("STAT: %9d bytes: wchmm->ac[node] (%4.1f%% used)\n", sizeof(A_CELL2 *) * wchmm->n, 100.0 * count3 / (float)wchmm->n);
01993     }
01994     jlog("STAT: %9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n);
01995     {
01996       int w,count;
01997       count = 0;
01998       for(w=0;w<wchmm->winfo->num;w++) {
01999         count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *);
02000       }
02001       jlog("STAT: %9d bytes: wchmm->offset[w][]\n", count);
02002     }
02003     if (wchmm->hmminfo->multipath) {
02004       jlog("STAT: %9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int));
02005     }
02006     jlog("STAT: %9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int));
02007     jlog("STAT: %9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int));
02008     if (wchmm->category_tree) {
02009       jlog("STAT: %9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID));
02010     }
02011 #ifdef UNIGRAM_FACTORING
02012     if (wchmm->lmtype == LM_PROB) {
02013       jlog("STAT: %9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int));
02014     }
02015 #endif
02016     if (!wchmm->hmminfo->multipath) {
02017       jlog("STAT: %9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB));
02018     }
02019 #ifdef PASS1_IWCD
02020     jlog("STAT: %9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char));
02021     {
02022       int c;
02023       c = 0;
02024       for(i=0;i<wchmm->n;i++) {
02025         switch(wchmm->outstyle[i]) {
02026         case AS_RSET:
02027           c += sizeof(RC_INFO);
02028           break;
02029         case AS_LRSET:
02030           c += sizeof(LRC_INFO);
02031           break;
02032       }
02033       }
02034       if (c > 0) jlog("STAT: %9d bytes: wchmm->out (RC_INFO / LRC_INFO)\n", c);
02035     }
02036 #endif
02037     if (!wchmm->category_tree) {
02038       jlog("STAT: %9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *));
02039       jlog("STAT: %9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int));
02040 #ifdef UNIGRAM_FACTORING
02041       if (wchmm->lmtype == LM_PROB) {
02042         jlog("STAT: %9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB));
02043       }
02044 #endif  
02045     }
02046     
02047     {
02048       int count, n;
02049       A_CELL2 *ac;
02050       count = 0;
02051       for(n=0;n<wchmm->n;n++) {
02052         for(ac=wchmm->ac[n];ac;ac=ac->next) {
02053           count += sizeof(A_CELL2);
02054         }
02055       }
02056       jlog("STAT: %9d bytes: A_CELL2\n", count);
02057     }
02058     if (!wchmm->category_tree) {
02059       jlog("STAT: %9d bytes: sclist\n", wchmm->scnum * sizeof(S_CELL *));
02060       jlog("STAT: %9d bytes: sclist2node\n", wchmm->scnum * sizeof(int));
02061     }
02062 
02063   }
02064 
02065 #endif /* WCHMM_SIZE_CHECK */
02066 
02067 
02068   return ok_p;
02069 
02070 }
02071 
02072 
02087 void
02088 print_wchmm_info(WCHMM_INFO *wchmm)
02089 {
02090   int n,i, rootnum;
02091 
02092   if (wchmm->hmminfo->multipath) {
02093     rootnum = wchmm->startnum;
02094   } else {
02095     if (wchmm->lmtype == LM_PROB) {
02096       rootnum = wchmm->startnum + 1;    /* including winfo->head_silwid */
02097     } else if (wchmm->lmtype == LM_DFA) {
02098       rootnum = wchmm->startnum;
02099     }
02100   }
02101   
02102   jlog(" Lexicon tree:\n");
02103   jlog("\t total node num = %6d\n", wchmm->n);
02104   if (wchmm->lmtype == LM_PROB) {
02105     jlog("\t  root node num = %6d\n", rootnum);
02106 #ifdef NO_SEPARATE_SHORT_WORD
02107 #ifdef SEPARATE_BY_UNIGRAM
02108     jlog("\t(%d hi-freq. words are separated from tree lexicon)\n", wchmm->separated_word_count);
02109 #else
02110     jlog(" (no words are separated from tree)\n");
02111 #endif /* SEPARATE_BY_UNIGRAM */
02112 #else
02113     jlog(" (%d short words (<= %d phonemes) are separated from tree)\n", wchmm->separated_word_count, SHORT_WORD_LEN);
02114 #endif /* NO_SEPARATE_SHORT_WORD */
02115   }
02116   if (wchmm->lmtype == LM_DFA) {
02117     jlog("\t  root node num = %6d\n", rootnum);
02118   }
02119   for(n=0,i=0;i<wchmm->n;i++) {
02120     if (wchmm->stend[i] != WORD_INVALID) n++;
02121   }
02122   jlog("\t  leaf node num = %6d\n", n);
02123   if (!wchmm->category_tree) {
02124     jlog("\t fact. node num = %6d\n", wchmm->scnum - 1);
02125   }
02126 }
02127 
02128 /* end of file */

Generated on Tue Dec 18 15:59:53 2007 for Julius by  doxygen 1.5.4