julius/word_align.c

Go to the documentation of this file.
00001 
00038 /*
00039  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00040  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00041  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00042  * All rights reserved
00043  */
00044 
00045 #include <julius.h>
00046 
/*
 * Alignment unit selectors, passed as the per_what argument of make_phseq()
 * and do_align().  Kept as an enum (rather than macros) so the three related
 * values are grouped, visible to a debugger, and still usable as case labels.
 */
enum {
  PER_WORD = 1,                 /* one result row per word */
  PER_PHONEME = 2,              /* one result row per phoneme */
  PER_STATE = 3                 /* one result row per HMM state */
};
00050 
00076 static HMM_Logical **
00077 make_phseq(WORD_ID *wseq, short num
00078 #ifdef MULTIPATH_VERSION
00079            , boolean **has_sp_ret
00080 #endif
00081            , int *num_ret, int **end_ret, int per_what)
00082 {
00083   HMM_Logical **ph;             /* phoneme sequence */
00084 #ifdef MULTIPATH_VERSION
00085   boolean *has_sp;
00086   int k;
00087 #endif
00088   int phnum;                    /* num of above */
00089   WORD_ID tmpw, w;
00090   int i, j, pn, st, endn;
00091   HMM_Logical *tmpp, *ret;
00092 
00093   /* make ph[] from wseq[] */
00094   /* 1. calc total phone num and malloc */
00095   phnum = 0;
00096   for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]];
00097   ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum);
00098 #ifdef MULTIPATH_VERSION
00099   has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum);
00100 #endif
00101   /* 2. make phoneme sequence */
00102 #ifdef MULTIPATH_VERSION
00103   st = 1;
00104 #else
00105   st = 0;
00106 #endif
00107   pn = 0;
00108   endn = 0;
00109   for (w=0;w<num;w++) {
00110     tmpw = wseq[w];
00111     for (i=0;i<winfo->wlen[tmpw];i++) {
00112       tmpp = winfo->wseq[tmpw][i];
00113       /* handle cross-word context dependency */
00114       if (ccd_flag) {
00115         if (w > 0 && i == 0) {  /* word head */
00116           
00117           if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) {
00118             tmpp = ret;
00119           }
00120           /* if triphone not found, fallback to bi/mono-phone  */
00121           /* use pseudo phone when no bi-phone found in alignment... */
00122         }
00123         if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */
00124           if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) {
00125             tmpp = ret;
00126           }
00127         }
00128       }
00129       ph[pn] = tmpp;
00130 #ifdef MULTIPATH_VERSION
00131       if (enable_iwsp && i == winfo->wlen[tmpw] - 1) {
00132         has_sp[pn] = TRUE;
00133       } else {
00134         has_sp[pn] = FALSE;
00135       }
00136 #endif
00137       if (per_what == PER_STATE) {
00138         for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) {
00139           (*end_ret)[endn++] = st + j;
00140         }
00141 #ifdef MULTIPATH_VERSION
00142         if (enable_iwsp && has_sp[pn]) {
00143           for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00144             (*end_ret)[endn++] = st + j + k;
00145           }
00146         }
00147 #endif
00148       }
00149       st += hmm_logical_state_num(tmpp) - 2;
00150 #ifdef MULTIPATH_VERSION
00151       if (enable_iwsp && has_sp[pn]) {
00152         st += hmm_logical_state_num(hmminfo->sp) - 2;
00153       }
00154 #endif
00155       if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1;
00156       pn++;
00157     }
00158     if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1;
00159   }
00160   *num_ret = phnum;
00161 #ifdef MULTIPATH_VERSION
00162   *has_sp_ret = has_sp;
00163 #endif
00164   return ph;
00165 }
00166 
00167 
00186 static void
00187 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what)
00188 {
00189   HMM_Logical **phones;         /* phoneme sequence */
00190 #ifdef MULTIPATH_VERSION
00191   boolean *has_sp;              /* whether phone can follow short pause */
00192   int k;
00193 #endif
00194   int phonenum;                 /* num of above */
00195   HMM *shmm;                    /* sentence HMM */
00196   int *end_state;               /* state number of word ends */
00197   int *end_frame;               /* segmented last frame of words */
00198   LOGPROB *end_score;           /* normalized score of each words */
00199   LOGPROB allscore;             /* total score of this word sequence */
00200   WORD_ID w;
00201   int i, rlen;
00202   int end_num = 0;
00203   int *id_seq, *phloc = NULL, *stloc = NULL;
00204   int j,n,p;
00205 
00206   /* initialize result storage buffer */
00207   switch(per_what) {
00208   case PER_WORD:
00209     j_printf("=== word alignment begin ===\n");
00210     end_num = wnum;
00211     phloc = (int *)mymalloc(sizeof(int)*wnum);
00212     i = 0;
00213     for(w=0;w<wnum;w++) {
00214       phloc[w] = i;
00215       i += winfo->wlen[words[w]];
00216     }
00217     break;
00218   case PER_PHONEME:
00219     j_printf("=== phoneme alignment begin ===\n");
00220     end_num = 0;
00221     for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]];
00222     break;
00223   case PER_STATE:
00224     j_printf("=== state alignment begin ===\n");
00225     end_num = 0;
00226     for(w=0;w<wnum;w++) {
00227       for (i=0;i<winfo->wlen[words[w]]; i++) {
00228         end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2;
00229       }
00230 #ifdef MULTIPATH_VERSION
00231       if (enable_iwsp) {
00232         end_num += hmm_logical_state_num(hmminfo->sp) - 2;
00233       }
00234 #endif
00235     }
00236     phloc = (int *)mymalloc(sizeof(int)*end_num);
00237     stloc = (int *)mymalloc(sizeof(int)*end_num);
00238     {
00239       n = 0;
00240       p = 0;
00241       for(w=0;w<wnum;w++) {
00242         for(i=0;i<winfo->wlen[words[w]]; i++) {
00243           for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) {
00244             phloc[n] = p;
00245             stloc[n] = j + 1;
00246             n++;
00247           }
00248 #ifdef MULTIPATH_VERSION
00249           if (enable_iwsp && i == winfo->wlen[words[w]] - 1) {
00250             for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00251               phloc[n] = p;
00252               stloc[n] = j + 1 + k + end_num;
00253               n++;
00254             }
00255           }
00256 #endif 
00257           p++;
00258         }
00259       }
00260     }
00261     
00262     break;
00263   }
00264   end_state = (int *)mymalloc(sizeof(int) * end_num);
00265 
00266   /* make phoneme sequence word sequence */
00267   phones = make_phseq(words, wnum
00268 #ifdef MULTIPATH_VERSION
00269                       , &has_sp
00270 #endif
00271                       , &phonenum, &end_state, per_what);
00272   /* build the sentence HMMs */
00273   shmm = new_make_word_hmm(hmminfo, phones, phonenum
00274 #ifdef MULTIPATH_VERSION
00275                            , has_sp
00276 #endif
00277                            );
00278 
00279   /* call viterbi segmentation function */
00280   allscore = viterbi_segment(shmm, param, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen);
00281 
00282   /* print result */
00283   {
00284     int i,p,n;
00285     j_printf("id: from  to    n_score    applied HMMs (logical[physical] or {pseudo})\n");
00286     j_printf("------------------------------------------------------------\n");
00287     for (i=0;i<rlen;i++) {
00288       j_printf("%2d: %4d %4d  %f ", id_seq[i], (i == 0) ? 0 : end_frame[i-1]+1, end_frame[i], end_score[i]);
00289       switch(per_what) {
00290       case PER_WORD:
00291         for(p=0;p<winfo->wlen[words[id_seq[i]]];p++) {
00292           n = phloc[id_seq[i]] + p;
00293           if (phones[n]->is_pseudo) {
00294             j_printf(" %s{%s}", phones[n]->name, phones[n]->body.pseudo->name);
00295           } else if (strmatch(phones[n]->name, phones[n]->body.defined->name)) {
00296             j_printf(" %s", phones[n]->name);
00297           } else {
00298             j_printf(" %s[%s]", phones[n]->name, phones[n]->body.defined->name);
00299           }
00300         }
00301         break;
00302       case PER_PHONEME:
00303         n = id_seq[i];
00304         if (phones[n]->is_pseudo) {
00305           j_printf(" {%s}", phones[n]->name);
00306         } else if (strmatch(phones[n]->name, phones[n]->body.defined->name)) {
00307           j_printf(" %s", phones[n]->name);
00308         } else {
00309           j_printf(" %s[%s]", phones[n]->name, phones[n]->body.defined->name);
00310         }
00311         break;
00312       case PER_STATE:
00313         n = phloc[id_seq[i]];
00314         if (phones[n]->is_pseudo) {
00315           j_printf(" {%s}", phones[n]->name);
00316         } else if (strmatch(phones[n]->name, phones[n]->body.defined->name)) {
00317           j_printf(" %s", phones[n]->name);
00318         } else {
00319           j_printf(" %s[%s]", phones[n]->name, phones[n]->body.defined->name);
00320         }
00321 #ifdef MULTIPATH_VERSION
00322         if (enable_iwsp && stloc[id_seq[i]] > end_num) {
00323           j_printf(" #%d (sp)", stloc[id_seq[i]] - end_num);
00324         } else {
00325           j_printf(" #%d", stloc[id_seq[i]]);
00326         }
00327 #else
00328         j_printf(" #%d", stloc[id_seq[i]]);
00329 #endif
00330         break;
00331       }
00332       j_printf("\n");
00333     }
00334   }
00335   j_printf("re-computed AM score: %f\n", allscore);
00336 
00337   free_hmm(shmm);
00338   free(id_seq);
00339   free(phones);
00340 #ifdef MULTIPATH_VERSION
00341   free(has_sp);
00342 #endif
00343   free(end_score);
00344   free(end_frame);
00345   free(end_state);
00346 
00347   switch(per_what) {
00348   case PER_WORD:
00349     free(phloc);
00350     j_printf("=== word alignment end ===\n");
00351     break;
00352   case PER_PHONEME:
00353     j_printf("=== phoneme alignment end ===\n");
00354     break;
00355   case PER_STATE:
00356     free(phloc);
00357     free(stloc);
00358     j_printf("=== state alignment end ===\n");
00359   }
00360   
00361 }
00362 
00363 /* entry functions */
00380 void
00381 word_align(WORD_ID *words, short wnum, HTK_Param *param)
00382 {
00383   do_align(words, wnum, param, PER_WORD);
00384 }
00385 
00402 void
00403 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param)
00404 {
00405   WORD_ID *words;               /* word sequence (true order) */
00406   int w;
00407   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum);
00408   for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1];
00409   do_align(words, wnum, param, PER_WORD);
00410   free(words);
00411 }
00412 
00429 void
00430 phoneme_align(WORD_ID *words, short num, HTK_Param *param)
00431 {
00432   do_align(words, num, param, PER_PHONEME);
00433 }
00434 
00451 void
00452 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param)
00453 {
00454   WORD_ID *words;               /* word sequence (true order) */
00455   int p;
00456   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00457   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00458   do_align(words, num, param, PER_PHONEME);
00459   free(words);
00460 }
00461 
00478 void
00479 state_align(WORD_ID *words, short num, HTK_Param *param)
00480 {
00481   do_align(words, num, param, PER_STATE);
00482 }
00483 
00500 void
00501 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param)
00502 {
00503   WORD_ID *words;               /* word sequence (true order) */
00504   int p;
00505   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00506   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00507   do_align(words, num, param, PER_STATE);
00508   free(words);
00509 }

Generated on Tue Dec 26 16:16:33 2006 for Julius by  doxygen 1.5.0