libjulius/src/word_align.c

Go to the documentation of this file.
00001 
00039 /*
00040  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00041  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00042  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00043  * All rights reserved
00044  */
00045 
00046 #include <julius/julius.h>
00047 
00077 static HMM_Logical **
00078 make_phseq(WORD_ID *wseq, short num, boolean **has_sp_ret, int *num_ret, int **end_ret, int per_what, 
00079            RecogProcess *r)
00080 {
00081   HMM_Logical **ph;             /* phoneme sequence */
00082   boolean *has_sp;
00083   int k;
00084   int phnum;                    /* num of above */
00085   WORD_ID tmpw, w;
00086   int i, j, pn, st, endn;
00087   HMM_Logical *tmpp, *ret;
00088   WORD_INFO *winfo;
00089   HTK_HMM_INFO *hmminfo;
00090   boolean enable_iwsp;          /* for multipath */
00091 
00092   winfo = r->lm->winfo;
00093   hmminfo = r->am->hmminfo;
00094   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
00095 
00096   /* make ph[] from wseq[] */
00097   /* 1. calc total phone num and malloc */
00098   phnum = 0;
00099   for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]];
00100   ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum);
00101   
00102   if (hmminfo->multipath) {
00103     has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum);
00104   }
00105   /* 2. make phoneme sequence */
00106   st = 0;
00107   if (hmminfo->multipath) st++;
00108   pn = 0;
00109   endn = 0;
00110   for (w=0;w<num;w++) {
00111     tmpw = wseq[w];
00112     for (i=0;i<winfo->wlen[tmpw];i++) {
00113       tmpp = winfo->wseq[tmpw][i];
00114       /* handle cross-word context dependency */
00115       if (r->ccd_flag) {
00116         if (w > 0 && i == 0) {  /* word head */
00117           
00118           if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) {
00119             tmpp = ret;
00120           }
00121           /* if triphone not found, fallback to bi/mono-phone  */
00122           /* use pseudo phone when no bi-phone found in alignment... */
00123         }
00124         if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */
00125           if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) {
00126             tmpp = ret;
00127           }
00128         }
00129       }
00130       ph[pn] = tmpp;
00131       if (hmminfo->multipath) {
00132         if (enable_iwsp && i == winfo->wlen[tmpw] - 1) {
00133           has_sp[pn] = TRUE;
00134         } else {
00135           has_sp[pn] = FALSE;
00136         }
00137       }
00138       if (per_what == PER_STATE) {
00139         for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) {
00140           (*end_ret)[endn++] = st + j;
00141         }
00142         if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
00143           for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00144             (*end_ret)[endn++] = st + j + k;
00145           }
00146         }
00147       }
00148       st += hmm_logical_state_num(tmpp) - 2;
00149       if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
00150         st += hmm_logical_state_num(hmminfo->sp) - 2;
00151       }
00152       if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1;
00153       pn++;
00154     }
00155     if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1;
00156   }
00157   *num_ret = phnum;
00158   if (hmminfo->multipath) *has_sp_ret = has_sp;
00159   return ph;
00160 }
00161 
00162 
00185 static void
00186 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what, Sentence *s, RecogProcess *r)
00187 {
00188   HMM_Logical **phones;         /* phoneme sequence */
00189   boolean *has_sp;              /* whether phone can follow short pause */
00190   int k;
00191   int phonenum;                 /* num of above */
00192   HMM *shmm;                    /* sentence HMM */
00193   int *end_state;               /* state number of word ends */
00194   int *end_frame;               /* segmented last frame of words */
00195   LOGPROB *end_score;           /* normalized score of each words */
00196   LOGPROB allscore;             /* total score of this word sequence */
00197   WORD_ID w;
00198   int i, rlen;
00199   int end_num = 0;
00200   int *id_seq, *phloc = NULL, *stloc = NULL;
00201   int j,n,p;
00202   WORD_INFO *winfo;
00203   HTK_HMM_INFO *hmminfo;
00204   boolean enable_iwsp;          /* for multipath */
00205 
00206   winfo = r->lm->winfo;
00207   hmminfo = r->am->hmminfo;
00208   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
00209 
00210   /* initialize result storage buffer */
00211   switch(per_what) {
00212   case PER_WORD:
00213     jlog("ALIGN: === word alignment begin ===\n");
00214     end_num = wnum;
00215     phloc = (int *)mymalloc(sizeof(int)*wnum);
00216     i = 0;
00217     for(w=0;w<wnum;w++) {
00218       phloc[w] = i;
00219       i += winfo->wlen[words[w]];
00220     }
00221     break;
00222   case PER_PHONEME:
00223     jlog("ALIGN: === phoneme alignment begin ===\n");
00224     end_num = 0;
00225     for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]];
00226     break;
00227   case PER_STATE:
00228     jlog("ALIGN: === state alignment begin ===\n");
00229     end_num = 0;
00230     for(w=0;w<wnum;w++) {
00231       for (i=0;i<winfo->wlen[words[w]]; i++) {
00232         end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2;
00233       }
00234       if (hmminfo->multipath && enable_iwsp) {
00235         end_num += hmm_logical_state_num(hmminfo->sp) - 2;
00236       }
00237     }
00238     phloc = (int *)mymalloc(sizeof(int)*end_num);
00239     stloc = (int *)mymalloc(sizeof(int)*end_num);
00240     {
00241       n = 0;
00242       p = 0;
00243       for(w=0;w<wnum;w++) {
00244         for(i=0;i<winfo->wlen[words[w]]; i++) {
00245           for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) {
00246             phloc[n] = p;
00247             stloc[n] = j + 1;
00248             n++;
00249           }
00250           if (hmminfo->multipath && enable_iwsp && i == winfo->wlen[words[w]] - 1) {
00251             for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00252               phloc[n] = p;
00253               stloc[n] = j + 1 + k + end_num;
00254               n++;
00255             }
00256           }
00257           p++;
00258         }
00259       }
00260     }
00261     
00262     break;
00263   }
00264   end_state = (int *)mymalloc(sizeof(int) * end_num);
00265 
00266   /* make phoneme sequence word sequence */
00267   phones = make_phseq(words, wnum, hmminfo->multipath ? &has_sp : NULL, &phonenum, &end_state, per_what, r);
00268   /* build the sentence HMMs */
00269   shmm = new_make_word_hmm(hmminfo, phones, phonenum, hmminfo->multipath ? has_sp : NULL);
00270   if (shmm == NULL) {
00271     j_internal_error("Error: failed to make word hmm for alignment\n");
00272   }
00273 
00274   /* call viterbi segmentation function */
00275   allscore = viterbi_segment(shmm, param, r->wchmm->hmmwrk, hmminfo->multipath, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen);
00276 
00277   /* store result to s */
00278   s->align.num = rlen;
00279   s->align.unittype = per_what;
00280   s->align.begin_frame = (int *)mymalloc(sizeof(int) * rlen);
00281   s->align.end_frame   = (int *)mymalloc(sizeof(int) * rlen);
00282   s->align.avgscore    = (LOGPROB *)mymalloc(sizeof(LOGPROB) * rlen);
00283   for(i=0;i<rlen;i++) {
00284     s->align.begin_frame[i] = (i == 0) ? 0 : end_frame[i-1] + 1;
00285     s->align.end_frame[i]   = end_frame[i];
00286     s->align.avgscore[i]    = end_score[i];
00287   }
00288   switch(per_what) {
00289   case PER_WORD:
00290     s->align.w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * rlen);
00291     for(i=0;i<rlen;i++) {
00292       s->align.w[i] = words[id_seq[i]];
00293     }
00294     break;
00295   case PER_PHONEME:
00296     s->align.ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
00297     for(i=0;i<rlen;i++) {
00298       s->align.ph[i] = phones[id_seq[i]];
00299     }
00300     break;
00301   case PER_STATE:
00302     s->align.ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
00303     s->align.loc = (short *)mymalloc(sizeof(short) * rlen);
00304     if (hmminfo->multipath) s->align.is_iwsp = (boolean *)mymalloc(sizeof(boolean) * rlen);
00305     for(i=0;i<rlen;i++) {
00306       s->align.ph[i]  = phones[phloc[id_seq[i]]];
00307       if (hmminfo->multipath) {
00308         if (enable_iwsp && stloc[id_seq[i]] > end_num) {
00309           s->align.loc[i] = stloc[id_seq[i]] - end_num;
00310           s->align.is_iwsp[i] = TRUE;
00311         } else {
00312           s->align.loc[i] = stloc[id_seq[i]];
00313           s->align.is_iwsp[i] = FALSE;
00314         }
00315       } else {
00316         s->align.loc[i] = stloc[id_seq[i]];
00317       }
00318     }
00319     break;
00320   }
00321 
00322   s->align.allscore = allscore;
00323 
00324   s->align.filled = TRUE;
00325 
00326   free_hmm(shmm);
00327   free(id_seq);
00328   free(phones);
00329   if (hmminfo->multipath) free(has_sp);
00330   free(end_score);
00331   free(end_frame);
00332   free(end_state);
00333 
00334   switch(per_what) {
00335   case PER_WORD:
00336     free(phloc);
00337     break;
00338   case PER_PHONEME:
00339     break;
00340   case PER_STATE:
00341     free(phloc);
00342     free(stloc);
00343   }
00344   
00345 }
00346 
00369 void
00370 word_align(WORD_ID *words, short wnum, HTK_Param *param, Sentence *s, RecogProcess *r)
00371 {
00372   do_align(words, wnum, param, PER_WORD, s, r);
00373 }
00374 
00397 void
00398 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param, Sentence *s, RecogProcess *r)
00399 {
00400   WORD_ID *words;               /* word sequence (true order) */
00401   int w;
00402   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum);
00403   for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1];
00404   do_align(words, wnum, param, PER_WORD, s, r);
00405   free(words);
00406 }
00407 
00430 void
00431 phoneme_align(WORD_ID *words, short num, HTK_Param *param, Sentence *s, RecogProcess *r)
00432 {
00433   do_align(words, num, param, PER_PHONEME, s, r);
00434 }
00435 
00458 void
00459 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param, Sentence *s, RecogProcess *r)
00460 {
00461   WORD_ID *words;               /* word sequence (true order) */
00462   int p;
00463   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00464   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00465   do_align(words, num, param, PER_PHONEME, s, r);
00466   free(words);
00467 }
00468 
00491 void
00492 state_align(WORD_ID *words, short num, HTK_Param *param, Sentence *s, RecogProcess *r)
00493 {
00494   do_align(words, num, param, PER_STATE, s, r);
00495 }
00496 
00519 void
00520 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param, Sentence *s, RecogProcess *r)
00521 {
00522   WORD_ID *words;               /* word sequence (true order) */
00523   int p;
00524   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00525   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00526   do_align(words, num, param, PER_STATE, s, r);
00527   free(words);
00528 }
00529 
00530 /* end of file */

Generated on Tue Dec 18 15:59:53 2007 for Julius by  doxygen 1.5.4