00001 
00064 
00065 
00066 
00067 
00068 
00069 
00070 
00071 #include <julius/julius.h>
00072 
00073 
00097 boolean
00098 is_sil(WORD_ID w, RecogProcess *r)
00099 {
00100   WORD_INFO *winfo;
00101   HTK_HMM_INFO *hmm;
00102   int i;
00103 
00104   winfo = r->lm->winfo;
00105   hmm = r->am->hmminfo;
00106 
00107   
00108   if (winfo->wlen[w] > 1) return FALSE;
00109 
00110   if (r->pass1.pausemodel) {
00111     
00112     for(i=0;i<r->pass1.pausemodelnum;i++) {
00113       if (strmatch(winfo->wseq[w][0]->name, r->pass1.pausemodel[i])) {
00114         return TRUE;
00115       }
00116     }
00117   } else {
00118     
00119     if (winfo->wseq[w][0] == hmm->sp) return TRUE;
00120     
00121     if (r->lmtype == LM_PROB) {
00122       
00123       if (w == winfo->head_silwid || w == winfo->tail_silwid) return TRUE;
00124     }
00125   }
00126 
00127   return FALSE;
00128 }
00129 
00153 void
00154 mfcc_copy_to_rest_and_shrink(MFCCCalc *mfcc, int start, int end)
00155 {
00156   int t;
00157 
00158   
00159   mfcc->rest_param = new_param();
00160   memcpy(&(mfcc->rest_param->header), &(mfcc->param->header), sizeof(HTK_Param_Header));
00161   mfcc->rest_param->samplenum = mfcc->param->samplenum - start;
00162   mfcc->rest_param->header.samplenum = mfcc->rest_param->samplenum;
00163   mfcc->rest_param->veclen = mfcc->param->veclen;
00164   if (param_alloc(mfcc->rest_param, mfcc->rest_param->samplenum, mfcc->rest_param->veclen) == FALSE) {
00165     j_internal_error("ERROR: segmented: failed to allocate memory for rest param\n");
00166   }
00167   
00168   for(t=start;t<mfcc->param->samplenum;t++) {
00169     memcpy(mfcc->rest_param->parvec[t-start], mfcc->param->parvec[t], sizeof(VECT) * mfcc->rest_param->veclen);
00170   }
00171   
00172   
00173   
00174   mfcc->param->samplenum = end;
00175 }
00176 
00193 void
00194 mfcc_shrink(MFCCCalc *mfcc, int p)
00195 {
00196   int t;
00197   int len;
00198 
00199   if (p > 0) {
00200     
00201     for(t=p;t<mfcc->param->samplenum;t++) {
00202       memcpy(mfcc->param->parvec[t-p], mfcc->param->parvec[t], sizeof(VECT) * mfcc->param->veclen);
00203     }
00204     
00205     
00206     len = mfcc->param->samplenum - p;
00207     mfcc->param->samplenum = len;
00208     mfcc->param->header.samplenum = len;
00209   }
00210 }
00211 
00261 boolean
00262 detect_end_of_segment(RecogProcess *r, int time)
00263 {
00264   FSBeam *d;
00265   TRELLIS_ATOM *tre;
00266   LOGPROB maxscore = LOG_ZERO;
00267   TRELLIS_ATOM *tremax = NULL;
00268   int count = 0;
00269   boolean detected = FALSE;
00270 #ifdef SPSEGMENT_NAIST
00271   MFCCCalc *mfcc;
00272   WORD_ID wid;
00273   int j;
00274   TOKEN2 *tk;
00275   int startframe;
00276 #endif
00277 
00278   d = &(r->pass1);
00279 
00280 #ifdef SPSEGMENT_NAIST
00281 
00282   if (! d->after_trigger) {
00283     
00284 
00285     
00286     for (j = d->n_start; j <= d->n_end; j++) {
00287       tk = &(d->tlist[d->tn][d->tindex[d->tn][j]]);
00288       if (r->wchmm->stend[tk->node] != WORD_INVALID) {
00289         if (maxscore < tk->score) {
00290           maxscore = tk->score;
00291           wid = r->wchmm->stend[tk->node];
00292         }
00293       }
00294     }
00295     if (maxscore == LOG_ZERO) detected = TRUE;
00296     else if (is_sil(wid, r)) detected = TRUE;
00297  
00298     if (detected) {
00299       
00300       
00301       
00302 
00303       
00304       d->trigger_duration = 0;
00305       
00306       
00307 
00308       if (r->am->mfcc->f > SPSEGMENT_NAIST_AUTOSHRINK_LIMIT) {
00309         d->want_rewind = TRUE;
00310         d->rewind_frame = r->am->mfcc->f - r->config->successive.sp_margin;
00311         d->want_rewind_reprocess = FALSE;
00312         if (debug2_flag) {
00313           jlog("DEBUG: pause exceeded %d, rewind\n", SPSEGMENT_NAIST_AUTOSHRINK_LIMIT);
00314         }
00315         return FALSE;
00316       }
00317 
00318       
00319       d->want_rewind = FALSE;
00320 
00321     } else {
00322       
00323       
00324       
00325 
00326       
00327       d->trigger_duration++;
00328       
00329       
00330       if (d->trigger_duration < r->config->successive.sp_delay) {
00331         
00332         return FALSE;
00333       }
00334 
00335       
00336       
00337       
00338       
00339       if (r->am->mfcc->f < r->config->successive.sp_margin) {
00340         startframe = 0;
00341       } else {
00342         startframe = r->am->mfcc->f - r->config->successive.sp_margin;
00343       }
00344       if (debug2_flag) {
00345         jlog("DEBUG: speech triggered\n");
00346         jlog("DEBUG: word=[%s] dur=%d\n", r->lm->winfo->woutput[wid], d->trigger_duration);
00347         jlog("DEBUG: backstep behind %d (from %d to %d) frame and start process\n", r->config->successive.sp_margin, r->am->mfcc->f, startframe);
00348       }
00349 
00350       
00351 
00352       if (r->lmtype == LM_PROB) {
00353         if (startframe > 0) {
00354           r->sp_break_last_word = WORD_INVALID;
00355         }
00356       }
00357 
00358       
00359       d->sp_duration = 0;
00360 
00361       
00362 
00363       d->want_rewind = TRUE;
00364       d->rewind_frame = startframe;
00365       d->want_rewind_reprocess = TRUE;
00366       
00367       d->after_trigger = TRUE;
00368     }
00369     
00370     return FALSE;
00371   }
00372 
00373 #endif 
00374 
00375   
00376   for(tre = r->backtrellis->list; tre != NULL && tre->endtime == time; tre = tre->next) {
00377     if (maxscore < tre->backscore) {
00378       maxscore = tre->backscore;
00379       tremax = tre;
00380     }
00381     count++;
00382   }
00383   if (tremax == NULL) { 
00384     detected = TRUE;            
00385   } else if (count > 0) {       
00386     if (is_sil(tremax->wid, r)) {
00387       detected = TRUE;
00388     }
00389   }
00390 
00391 
00392 #ifdef SPSEGMENT_NAIST
00393   
00394   
00395 
00396   
00397 
00398   
00399   
00400   if (d->first_sparea) {
00401     
00402     if (d->in_sparea && detected) {
00403       
00404       d->sp_duration++;
00405       
00406 
00407 
00408       
00409       if (d->sp_duration > r->config->successive.sp_delay + r->config->successive.sp_margin + r->config->successive.sp_frame_duration) {
00410         d->in_sparea = FALSE;
00411         d->first_sparea = FALSE;
00412         if (debug2_flag) {
00413           jlog("DEBUG: no valid speech starts, force trigger at %d\n", r->am->mfcc->f);
00414         }
00415       }
00416     } else if (d->in_sparea && !detected) {
00417       
00418       d->in_sparea = FALSE;
00419       d->first_sparea = FALSE;
00420       if (debug2_flag) {
00421         jlog("DEBUG: speech segment start at %d\n", r->am->mfcc->f);
00422       }
00423     }
00424   } else {
00425     
00426     if (!d->in_sparea) {
00427       
00428       if (detected) {
00429         
00430         
00431         
00432         d->tmp_sparea_start = time;
00433 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00434         if (r->lmtype == LM_PROB) {
00435           
00436           
00437           d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
00438         }
00439 #endif
00440         d->in_sparea = TRUE;
00441         d->sp_duration = 1;
00442       } else {
00443         
00444         
00445       }
00446     } else {
00447       
00448       if (detected) {
00449         
00450         d->sp_duration++;
00451         
00452         if (r->lmtype == LM_PROB) {
00453 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00454           
00455 
00456 
00457           if (d->tmp_sp_break_last_word == WORD_INVALID) {
00458             if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
00459           }
00460 #else
00461           
00462           
00463           if (tremax != NULL) d->last_tre_word = tremax->wid;
00464 #endif
00465         }
00466 
00467         if (d->sp_duration >= r->config->successive.sp_frame_duration) {
00468           
00469           
00470           
00471           r->am->mfcc->sparea_start = time - r->config->successive.sp_frame_duration;
00472           if (r->lmtype == LM_PROB) {
00473 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00474             
00475             r->sp_break_last_word = d->tmp_sp_break_last_word;
00476 #else
00477             
00478             r->sp_break_last_word = d->last_tre_word;
00479 #endif
00480           }
00481 
00482           if (debug2_flag) {
00483             jlog("DEBUG: trailing silence end, end this segment at %d\n", r->am->mfcc->f);
00484           }
00485           
00486           d->after_trigger = FALSE;
00487           d->trigger_duration = 0;
00488           d->want_rewind = FALSE;
00489 
00490           
00491           return(TRUE);
00492         }
00493         
00494       } else {
00495         
00496         
00497         d->in_sparea = FALSE;
00498       }
00499     }
00500   }
00501 
00502   d->want_rewind = FALSE;
00503 
00504 
00505 #else  
00506   
00507   
00508 
00509   
00510   
00511   if (d->in_sparea && detected) {       
00512     d->sp_duration++;           
00513 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00514     
00515     
00516 
00517 
00518     if (d->tmp_sp_break_last_word == WORD_INVALID) {
00519       if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
00520     }
00521 #else
00522     
00523     
00524     if (tremax != NULL) d->last_tre_word = tremax->wid;
00525 #endif
00526   }
00527 
00528   
00529   
00530   else if (!d->in_sparea && detected) {
00531     
00532     
00533     d->tmp_sparea_start = time;
00534 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00535     
00536     
00537     d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
00538 #endif
00539     d->in_sparea = TRUE;                
00540     d->sp_duration = 1;         
00541 #ifdef SP_BREAK_DEBUG
00542     jlog("DEBUG: sp start %d\n", time);
00543 #endif 
00544   }
00545   
00546   
00547   
00548   else if (d->in_sparea && !detected) {
00549     
00550     d->in_sparea = FALSE;               
00551 #ifdef SP_BREAK_DEBUG
00552     jlog("DEBUG: sp end %d\n", time);
00553 #endif 
00554     
00555     
00556     if (d->sp_duration < r->config->successive.sp_frame_duration) {
00557       
00558       
00559 #ifdef SP_BREAK_DEBUG
00560       jlog("DEBUG: too short (%d<%d), ignored\n", d->sp_duration, r->config->successive.sp_frame_duration);
00561 #endif 
00562     } else if (d->first_sparea) {
00563       
00564       
00565       d->first_sparea = FALSE;
00566 #ifdef SP_BREAK_DEBUG
00567       jlog("DEBUG: first silence, ignored\n");
00568 #endif 
00569     } else {
00570       
00571       
00572 #ifdef SP_BREAK_DEBUG
00573       jlog("DEBUG: >> segment [%d..%d]\n", r->am->mfcc->sparea_start, time-1);
00574 #endif 
00575       
00576       r->am->mfcc->sparea_start = d->tmp_sparea_start;
00577 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00578       
00579       r->sp_break_last_word = d->tmp_sp_break_last_word;
00580 #else
00581       
00582       r->sp_break_last_word = d->last_tre_word;
00583 #endif
00584 
00585       
00586       return(TRUE);
00587     }
00588   }
00589 
00590 
00591 #endif  
00592 
00593     
00594 #ifdef SP_BREAK_EVAL
00595   jlog("DEBUG: [%d %d %d]\n", time, count, (detected) ? 50 : 0);
00596 #endif
00597   return (FALSE);
00598 }
00599 
00600 
00601 
00602 
00603 
00631 void
00632 finalize_segment(Recog *recog)
00633 {
00634   int t;
00635   int spstart;
00636   RecogProcess *r;
00637   MFCCCalc *mfcc;
00638   boolean ok_p;
00639 
00640   
00641   
00642 
00643   for(r=recog->process_list;r;r=r->next) {
00644     if (!r->live) continue;
00645     if (r->lmtype == LM_PROB) {
00646       set_terminal_words(r);
00647     }
00648   }
00649 
00650   
00651 
00652 
00653   
00654 
00655 
00656 
00657   
00658 
00659 
00660 
00661 
00662   ok_p = FALSE;
00663   for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00664     if (mfcc->segmented) {
00665       spstart = mfcc->sparea_start;
00666       ok_p = TRUE;
00667       break;
00668     }
00669   }
00670 
00671   if (ok_p) {
00672     
00673     
00674     
00675     if (verbose_flag) jlog("STAT: segmented: next decoding will restart from %d\n", spstart);
00676 
00677     for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00678       if (verbose_flag) jlog("STAT: MFCC%02d: segmented: processed length=%d\n", mfcc->id, mfcc->last_time);
00679 
00680       
00681       mfcc_copy_to_rest_and_shrink(mfcc, spstart, mfcc->last_time);
00682     }
00683 
00684     
00685     for(r=recog->process_list;r;r=r->next) {
00686       if (!r->live) continue;
00687       r->sp_break_last_nword_allow_override = TRUE;
00688     }
00689     
00690   } else {
00691     
00692     
00693     for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00694       mfcc->rest_param = NULL;
00695     }
00696     
00697     for(r=recog->process_list;r;r=r->next) {
00698       if (!r->live) continue;
00699       r->sp_break_2_begin_word = WORD_INVALID;
00700       r->sp_break_last_word = WORD_INVALID;
00701       r->sp_break_last_nword = WORD_INVALID;
00702       r->sp_break_last_nword_allow_override = FALSE;
00703     }
00704   }
00705 }
00706 
00707 #ifdef BACKEND_VAD
00708 
00724 void
00725 spsegment_init(Recog *recog)
00726 {
00727   RecogProcess *p;
00728   
00729 #ifdef SPSEGMENT_NAIST
00730   for(p=recog->process_list;p;p=p->next) {
00731     p->pass1.after_trigger = FALSE;
00732     p->pass1.trigger_duration = 0;
00733   }
00734 #endif
00735 #ifdef GMM_VAD
00736   if (recog->gmm) {
00737     recog->gc->after_trigger = FALSE;
00738     recog->gc->duration = 0;
00739   }
00740 #endif
00741   recog->triggered = FALSE;
00742 }
00743 
00768 boolean
00769 spsegment_trigger_sync(Recog *recog)
00770 {
00771   RecogProcess *p;
00772   boolean ok_p;
00773 
00774   ok_p = FALSE;
00775   if (recog->jconf->decodeopt.segment) {
00776 #ifdef SPSEGMENT_NAIST
00777     for(p = recog->process_list; p; p = p->next) {
00778       if (!p->live) continue;
00779       if (p->pass1.after_trigger) {
00780         ok_p = TRUE;
00781         break;
00782       }
00783     }
00784 #endif
00785 #ifdef GMM_VAD
00786     if (recog->gmm) {
00787       if (recog->gc->after_trigger) {
00788         ok_p = TRUE;
00789       }
00790     }
00791 #endif
00792   }
00793   if (ok_p) {
00794     
00795 #ifdef SPSEGMENT_NAIST
00796     for(p = recog->process_list; p; p = p->next) {
00797       if (!p->live) continue;
00798       p->pass1.after_trigger = TRUE;
00799     }
00800 #endif
00801 #ifdef GMM_VAD
00802     if (recog->gmm) {
00803       recog->gc->after_trigger = TRUE;
00804     }
00805 #endif
00806   }
00807   
00808   return ok_p;
00809 }
00810 
00811 #endif 
00812 
00837 boolean
00838 spsegment_need_restart(Recog *recog, int *rf_ret, boolean *repro_ret)
00839 {
00840   RecogProcess *p;
00841   boolean ok_p;
00842   int rewind_frame;
00843   boolean reprocess;
00844 
00845   ok_p = FALSE;
00846   if (recog->jconf->decodeopt.segment) {
00847 #ifdef SPSEGMENT_NAIST
00848     
00849     for(p = recog->process_list; p; p = p->next) {
00850       if (!p->live) continue;
00851       if (p->pass1.want_rewind) {
00852         p->pass1.want_rewind = FALSE;
00853         rewind_frame = p->pass1.rewind_frame;
00854         reprocess = p->pass1.want_rewind_reprocess;
00855         ok_p = TRUE;
00856         break;
00857       }
00858     }
00859 #endif 
00860 #ifdef GMM_VAD
00861     if (recog->gmm) {
00862       if (recog->gc->want_rewind) {
00863         recog->gc->want_rewind = FALSE;
00864 #ifdef SPSEGMENT_NAIST
00865         
00866         if (rewind_frame > recog->gc->rewind_frame) rewind_frame = recog->gc->rewind_frame;
00867 #else
00868         rewind_frame = recog->gc->rewind_frame;
00869 #endif
00870         reprocess = recog->gc->want_rewind_reprocess;
00871         ok_p = TRUE;
00872       }
00873     }
00874 #endif
00875     *rf_ret = rewind_frame;
00876     *repro_ret = reprocess;
00877   }
00878 
00879   return(ok_p);
00880 }
00881 
00908 void
00909 spsegment_restart_mfccs(Recog *recog, int rewind_frame, boolean reprocess)
00910 {
00911   MFCCCalc *mfcc;
00912 
00913   for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00914     if (!mfcc->valid) continue;
00915     
00916     mfcc->last_time = mfcc->f - 1;
00917     
00918     if (reprocess) {
00919       
00920       mfcc->f = -1;
00921     } else {
00922       
00923       mfcc->f -= rewind_frame;
00924     }
00925     
00926     mfcc_shrink(mfcc, rewind_frame);
00927   }
00928 }
00929 
00930