00001
00064
00065
00066
00067
00068
00069
00070
00071 #include <julius/julius.h>
00072
00073
00097 boolean
00098 is_sil(WORD_ID w, RecogProcess *r)
00099 {
00100 WORD_INFO *winfo;
00101 HTK_HMM_INFO *hmm;
00102 int i;
00103
00104 winfo = r->lm->winfo;
00105 hmm = r->am->hmminfo;
00106
00107
00108 if (winfo->wlen[w] > 1) return FALSE;
00109
00110 if (r->pass1.pausemodel) {
00111
00112 for(i=0;i<r->pass1.pausemodelnum;i++) {
00113 if (strmatch(winfo->wseq[w][0]->name, r->pass1.pausemodel[i])) {
00114 return TRUE;
00115 }
00116 }
00117 } else {
00118
00119 if (winfo->wseq[w][0] == hmm->sp) return TRUE;
00120
00121 if (r->lmtype == LM_PROB) {
00122
00123 if (w == winfo->head_silwid || w == winfo->tail_silwid) return TRUE;
00124 }
00125 }
00126
00127 return FALSE;
00128 }
00129
00153 void
00154 mfcc_copy_to_rest_and_shrink(MFCCCalc *mfcc, int start, int end)
00155 {
00156 int t;
00157
00158
00159 mfcc->rest_param = new_param();
00160 memcpy(&(mfcc->rest_param->header), &(mfcc->param->header), sizeof(HTK_Param_Header));
00161 mfcc->rest_param->samplenum = mfcc->param->samplenum - start;
00162 mfcc->rest_param->header.samplenum = mfcc->rest_param->samplenum;
00163 mfcc->rest_param->veclen = mfcc->param->veclen;
00164 if (param_alloc(mfcc->rest_param, mfcc->rest_param->samplenum, mfcc->rest_param->veclen) == FALSE) {
00165 j_internal_error("ERROR: segmented: failed to allocate memory for rest param\n");
00166 }
00167
00168 for(t=start;t<mfcc->param->samplenum;t++) {
00169 memcpy(mfcc->rest_param->parvec[t-start], mfcc->param->parvec[t], sizeof(VECT) * mfcc->rest_param->veclen);
00170 }
00171
00172
00173
00174 mfcc->param->samplenum = end;
00175 }
00176
00193 void
00194 mfcc_shrink(MFCCCalc *mfcc, int p)
00195 {
00196 int t;
00197 int len;
00198
00199 if (p > 0) {
00200
00201 for(t=p;t<mfcc->param->samplenum;t++) {
00202 memcpy(mfcc->param->parvec[t-p], mfcc->param->parvec[t], sizeof(VECT) * mfcc->param->veclen);
00203 }
00204
00205
00206 len = mfcc->param->samplenum - p;
00207 mfcc->param->samplenum = len;
00208 mfcc->param->header.samplenum = len;
00209 }
00210 }
00211
00261 boolean
00262 detect_end_of_segment(RecogProcess *r, int time)
00263 {
00264 FSBeam *d;
00265 TRELLIS_ATOM *tre;
00266 LOGPROB maxscore = LOG_ZERO;
00267 TRELLIS_ATOM *tremax = NULL;
00268 int count = 0;
00269 boolean detected = FALSE;
00270 #ifdef SPSEGMENT_NAIST
00271 MFCCCalc *mfcc;
00272 WORD_ID wid;
00273 int j;
00274 TOKEN2 *tk;
00275 int startframe;
00276 #endif
00277
00278 d = &(r->pass1);
00279
00280 #ifdef SPSEGMENT_NAIST
00281
00282 if (! d->after_trigger) {
00283
00284
00285
00286 for (j = d->n_start; j <= d->n_end; j++) {
00287 tk = &(d->tlist[d->tn][d->tindex[d->tn][j]]);
00288 if (r->wchmm->stend[tk->node] != WORD_INVALID) {
00289 if (maxscore < tk->score) {
00290 maxscore = tk->score;
00291 wid = r->wchmm->stend[tk->node];
00292 }
00293 }
00294 }
00295 if (maxscore == LOG_ZERO) detected = TRUE;
00296 else if (is_sil(wid, r)) detected = TRUE;
00297
00298 if (detected) {
00299
00300
00301
00302
00303
00304 d->trigger_duration = 0;
00305
00306
00307
00308 if (r->am->mfcc->f > SPSEGMENT_NAIST_AUTOSHRINK_LIMIT) {
00309 d->want_rewind = TRUE;
00310 d->rewind_frame = r->am->mfcc->f - r->config->successive.sp_margin;
00311 d->want_rewind_reprocess = FALSE;
00312 if (debug2_flag) {
00313 jlog("DEBUG: pause exceeded %d, rewind\n", SPSEGMENT_NAIST_AUTOSHRINK_LIMIT);
00314 }
00315 return FALSE;
00316 }
00317
00318
00319 d->want_rewind = FALSE;
00320
00321 } else {
00322
00323
00324
00325
00326
00327 d->trigger_duration++;
00328
00329
00330 if (d->trigger_duration < r->config->successive.sp_delay) {
00331
00332 return FALSE;
00333 }
00334
00335
00336
00337
00338
00339 if (r->am->mfcc->f < r->config->successive.sp_margin) {
00340 startframe = 0;
00341 } else {
00342 startframe = r->am->mfcc->f - r->config->successive.sp_margin;
00343 }
00344 if (debug2_flag) {
00345 jlog("DEBUG: speech triggered\n");
00346 jlog("DEBUG: word=[%s] dur=%d\n", r->lm->winfo->woutput[wid], d->trigger_duration);
00347 jlog("DEBUG: backstep behind %d (from %d to %d) frame and start process\n", r->config->successive.sp_margin, r->am->mfcc->f, startframe);
00348 }
00349
00350
00351
00352 if (r->lmtype == LM_PROB) {
00353 if (startframe > 0) {
00354 r->sp_break_last_word = WORD_INVALID;
00355 }
00356 }
00357
00358
00359 d->sp_duration = 0;
00360
00361
00362
00363 d->want_rewind = TRUE;
00364 d->rewind_frame = startframe;
00365 d->want_rewind_reprocess = TRUE;
00366
00367 d->after_trigger = TRUE;
00368 }
00369
00370 return FALSE;
00371 }
00372
00373 #endif
00374
00375
00376 for(tre = r->backtrellis->list; tre != NULL && tre->endtime == time; tre = tre->next) {
00377 if (maxscore < tre->backscore) {
00378 maxscore = tre->backscore;
00379 tremax = tre;
00380 }
00381 count++;
00382 }
00383 if (tremax == NULL) {
00384 detected = TRUE;
00385 } else if (count > 0) {
00386 if (is_sil(tremax->wid, r)) {
00387 detected = TRUE;
00388 }
00389 }
00390
00391
00392 #ifdef SPSEGMENT_NAIST
00393
00394
00395
00396
00397
00398
00399
00400 if (d->first_sparea) {
00401
00402 if (d->in_sparea && detected) {
00403
00404 d->sp_duration++;
00405
00406
00407
00408
00409 if (d->sp_duration > r->config->successive.sp_delay + r->config->successive.sp_margin + r->config->successive.sp_frame_duration) {
00410 d->in_sparea = FALSE;
00411 d->first_sparea = FALSE;
00412 if (debug2_flag) {
00413 jlog("DEBUG: no valid speech starts, force trigger at %d\n", r->am->mfcc->f);
00414 }
00415 }
00416 } else if (d->in_sparea && !detected) {
00417
00418 d->in_sparea = FALSE;
00419 d->first_sparea = FALSE;
00420 if (debug2_flag) {
00421 jlog("DEBUG: speech segment start at %d\n", r->am->mfcc->f);
00422 }
00423 }
00424 } else {
00425
00426 if (!d->in_sparea) {
00427
00428 if (detected) {
00429
00430
00431
00432 d->tmp_sparea_start = time;
00433 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00434 if (r->lmtype == LM_PROB) {
00435
00436
00437 d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
00438 }
00439 #endif
00440 d->in_sparea = TRUE;
00441 d->sp_duration = 1;
00442 } else {
00443
00444
00445 }
00446 } else {
00447
00448 if (detected) {
00449
00450 d->sp_duration++;
00451
00452 if (r->lmtype == LM_PROB) {
00453 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00454
00455
00456
00457 if (d->tmp_sp_break_last_word == WORD_INVALID) {
00458 if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
00459 }
00460 #else
00461
00462
00463 if (tremax != NULL) d->last_tre_word = tremax->wid;
00464 #endif
00465 }
00466
00467 if (d->sp_duration >= r->config->successive.sp_frame_duration) {
00468
00469
00470
00471 r->am->mfcc->sparea_start = time - r->config->successive.sp_frame_duration;
00472 if (r->lmtype == LM_PROB) {
00473 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00474
00475 r->sp_break_last_word = d->tmp_sp_break_last_word;
00476 #else
00477
00478 r->sp_break_last_word = d->last_tre_word;
00479 #endif
00480 }
00481
00482 if (debug2_flag) {
00483 jlog("DEBUG: trailing silence end, end this segment at %d\n", r->am->mfcc->f);
00484 }
00485
00486 d->after_trigger = FALSE;
00487 d->trigger_duration = 0;
00488 d->want_rewind = FALSE;
00489
00490
00491 return(TRUE);
00492 }
00493
00494 } else {
00495
00496
00497 d->in_sparea = FALSE;
00498 }
00499 }
00500 }
00501
00502 d->want_rewind = FALSE;
00503
00504
00505 #else
00506
00507
00508
00509
00510
00511 if (d->in_sparea && detected) {
00512 d->sp_duration++;
00513 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00514
00515
00516
00517
00518 if (d->tmp_sp_break_last_word == WORD_INVALID) {
00519 if (tremax != NULL) d->tmp_sp_break_last_word = tremax->wid;
00520 }
00521 #else
00522
00523
00524 if (tremax != NULL) d->last_tre_word = tremax->wid;
00525 #endif
00526 }
00527
00528
00529
00530 else if (!d->in_sparea && detected) {
00531
00532
00533 d->tmp_sparea_start = time;
00534 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00535
00536
00537 d->tmp_sp_break_last_word = tremax ? tremax->wid : WORD_INVALID;
00538 #endif
00539 d->in_sparea = TRUE;
00540 d->sp_duration = 1;
00541 #ifdef SP_BREAK_DEBUG
00542 jlog("DEBUG: sp start %d\n", time);
00543 #endif
00544 }
00545
00546
00547
00548 else if (d->in_sparea && !detected) {
00549
00550 d->in_sparea = FALSE;
00551 #ifdef SP_BREAK_DEBUG
00552 jlog("DEBUG: sp end %d\n", time);
00553 #endif
00554
00555
00556 if (d->sp_duration < r->config->successive.sp_frame_duration) {
00557
00558
00559 #ifdef SP_BREAK_DEBUG
00560 jlog("DEBUG: too short (%d<%d), ignored\n", d->sp_duration, r->config->successive.sp_frame_duration);
00561 #endif
00562 } else if (d->first_sparea) {
00563
00564
00565 d->first_sparea = FALSE;
00566 #ifdef SP_BREAK_DEBUG
00567 jlog("DEBUG: first silence, ignored\n");
00568 #endif
00569 } else {
00570
00571
00572 #ifdef SP_BREAK_DEBUG
00573 jlog("DEBUG: >> segment [%d..%d]\n", r->am->mfcc->sparea_start, time-1);
00574 #endif
00575
00576 r->am->mfcc->sparea_start = d->tmp_sparea_start;
00577 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00578
00579 r->sp_break_last_word = d->tmp_sp_break_last_word;
00580 #else
00581
00582 r->sp_break_last_word = d->last_tre_word;
00583 #endif
00584
00585
00586 return(TRUE);
00587 }
00588 }
00589
00590
00591 #endif
00592
00593
00594 #ifdef SP_BREAK_EVAL
00595 jlog("DEBUG: [%d %d %d]\n", time, count, (detected) ? 50 : 0);
00596 #endif
00597 return (FALSE);
00598 }
00599
00600
00601
00602
00603
00631 void
00632 finalize_segment(Recog *recog)
00633 {
00634 int t;
00635 int spstart;
00636 RecogProcess *r;
00637 MFCCCalc *mfcc;
00638 boolean ok_p;
00639
00640
00641
00642
00643 for(r=recog->process_list;r;r=r->next) {
00644 if (!r->live) continue;
00645 if (r->lmtype == LM_PROB) {
00646 set_terminal_words(r);
00647 }
00648 }
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662 ok_p = FALSE;
00663 for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00664 if (mfcc->segmented) {
00665 spstart = mfcc->sparea_start;
00666 ok_p = TRUE;
00667 break;
00668 }
00669 }
00670
00671 if (ok_p) {
00672
00673
00674
00675 if (verbose_flag) jlog("STAT: segmented: next decoding will restart from %d\n", spstart);
00676
00677 for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00678 if (verbose_flag) jlog("STAT: MFCC%02d: segmented: processed length=%d\n", mfcc->id, mfcc->last_time);
00679
00680
00681 mfcc_copy_to_rest_and_shrink(mfcc, spstart, mfcc->last_time);
00682 }
00683
00684
00685 for(r=recog->process_list;r;r=r->next) {
00686 if (!r->live) continue;
00687 r->sp_break_last_nword_allow_override = TRUE;
00688 }
00689
00690 } else {
00691
00692
00693 for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00694 mfcc->rest_param = NULL;
00695 }
00696
00697 for(r=recog->process_list;r;r=r->next) {
00698 if (!r->live) continue;
00699 r->sp_break_2_begin_word = WORD_INVALID;
00700 r->sp_break_last_word = WORD_INVALID;
00701 r->sp_break_last_nword = WORD_INVALID;
00702 r->sp_break_last_nword_allow_override = FALSE;
00703 }
00704 }
00705 }
00706
00707 #ifdef BACKEND_VAD
00708
00724 void
00725 spsegment_init(Recog *recog)
00726 {
00727 RecogProcess *p;
00728
00729 #ifdef SPSEGMENT_NAIST
00730 for(p=recog->process_list;p;p=p->next) {
00731 p->pass1.after_trigger = FALSE;
00732 p->pass1.trigger_duration = 0;
00733 }
00734 #endif
00735 #ifdef GMM_VAD
00736 if (recog->gmm) {
00737 recog->gc->after_trigger = FALSE;
00738 recog->gc->duration = 0;
00739 }
00740 #endif
00741 recog->triggered = FALSE;
00742 }
00743
00768 boolean
00769 spsegment_trigger_sync(Recog *recog)
00770 {
00771 RecogProcess *p;
00772 boolean ok_p;
00773
00774 ok_p = FALSE;
00775 if (recog->jconf->decodeopt.segment) {
00776 #ifdef SPSEGMENT_NAIST
00777 for(p = recog->process_list; p; p = p->next) {
00778 if (!p->live) continue;
00779 if (p->pass1.after_trigger) {
00780 ok_p = TRUE;
00781 break;
00782 }
00783 }
00784 #endif
00785 #ifdef GMM_VAD
00786 if (recog->gmm) {
00787 if (recog->gc->after_trigger) {
00788 ok_p = TRUE;
00789 }
00790 }
00791 #endif
00792 }
00793 if (ok_p) {
00794
00795 #ifdef SPSEGMENT_NAIST
00796 for(p = recog->process_list; p; p = p->next) {
00797 if (!p->live) continue;
00798 p->pass1.after_trigger = TRUE;
00799 }
00800 #endif
00801 #ifdef GMM_VAD
00802 if (recog->gmm) {
00803 recog->gc->after_trigger = TRUE;
00804 }
00805 #endif
00806 }
00807
00808 return ok_p;
00809 }
00810
00811 #endif
00812
00837 boolean
00838 spsegment_need_restart(Recog *recog, int *rf_ret, boolean *repro_ret)
00839 {
00840 RecogProcess *p;
00841 boolean ok_p;
00842 int rewind_frame;
00843 boolean reprocess;
00844
00845 ok_p = FALSE;
00846 if (recog->jconf->decodeopt.segment) {
00847 #ifdef SPSEGMENT_NAIST
00848
00849 for(p = recog->process_list; p; p = p->next) {
00850 if (!p->live) continue;
00851 if (p->pass1.want_rewind) {
00852 p->pass1.want_rewind = FALSE;
00853 rewind_frame = p->pass1.rewind_frame;
00854 reprocess = p->pass1.want_rewind_reprocess;
00855 ok_p = TRUE;
00856 break;
00857 }
00858 }
00859 #endif
00860 #ifdef GMM_VAD
00861 if (recog->gmm) {
00862 if (recog->gc->want_rewind) {
00863 recog->gc->want_rewind = FALSE;
00864 #ifdef SPSEGMENT_NAIST
00865
00866 if (rewind_frame > recog->gc->rewind_frame) rewind_frame = recog->gc->rewind_frame;
00867 #else
00868 rewind_frame = recog->gc->rewind_frame;
00869 #endif
00870 reprocess = recog->gc->want_rewind_reprocess;
00871 ok_p = TRUE;
00872 }
00873 }
00874 #endif
00875 *rf_ret = rewind_frame;
00876 *repro_ret = reprocess;
00877 }
00878
00879 return(ok_p);
00880 }
00881
00908 void
00909 spsegment_restart_mfccs(Recog *recog, int rewind_frame, boolean reprocess)
00910 {
00911 MFCCCalc *mfcc;
00912
00913 for (mfcc = recog->mfcclist; mfcc; mfcc = mfcc->next) {
00914 if (!mfcc->valid) continue;
00915
00916 mfcc->last_time = mfcc->f - 1;
00917
00918 if (reprocess) {
00919
00920 mfcc->f = -1;
00921 } else {
00922
00923 mfcc->f -= rewind_frame;
00924 }
00925
00926 mfcc_shrink(mfcc, rewind_frame);
00927 }
00928 }
00929
00930