00001
00022
00023
00024
00025
00026
00027
00028
00029 #include <julius/julius.h>
00030
00035 #undef CDEBUG
00036
00041 #undef CDEBUG2
00042
00052 #define PREFER_GRAPH_CM
00053
00061 #define BUNDLE_WORD_WITH_SAME_OUTPUT
00062
00063
00074 static boolean
00075 is_same_word(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo)
00076 {
00077 if (w1 == w2
00078 #ifdef BUNDLE_WORD_WITH_SAME_OUTPUT
00079 || strmatch(winfo->woutput[w1], winfo->woutput[w2])
00080 #endif
00081 ) return TRUE;
00082 return FALSE;
00083 }
00084
00085
00086
00091 #define m2i(A, B) (B) * r->order_matrix_count + (A)
00092
00101 static boolean
00102 graph_ordered(RecogProcess *r, int i, int j)
00103 {
00104 if (i != j && r->order_matrix[m2i(i,j)] == 0 && r->order_matrix[m2i(j,i)] == 0) {
00105 return FALSE;
00106 }
00107 return TRUE;
00108 }
00109
00115 static void
00116 graph_update_order(RecogProcess *r)
00117 {
00118 int i, j, k;
00119 boolean changed;
00120 int count;
00121
00122 count = r->order_matrix_count;
00123
00124 do {
00125 changed = FALSE;
00126 for(i=0;i<count;i++) {
00127 for(j=0;j<count;j++) {
00128 if (r->order_matrix[m2i(i, j)] == 1) {
00129 for(k=0;k<count;k++) {
00130 if (r->order_matrix[m2i(j, k)] == 1) {
00131 if (r->order_matrix[m2i(i, k)] == 0) {
00132 r->order_matrix[m2i(i, k)] = 1;
00133 changed = TRUE;
00134 }
00135 }
00136 }
00137 }
00138 }
00139 }
00140 } while (changed == TRUE);
00141 }
00142
00153 void
00154 graph_make_order(WordGraph *root, RecogProcess *r)
00155 {
00156 int count;
00157 WordGraph *wg, *right;
00158 int i;
00159
00160
00161 count = 0;
00162 for(wg=root;wg;wg=wg->next) count++;
00163 if (count == 0) {
00164 r->order_matrix = NULL;
00165 return;
00166 }
00167 if (count != r->graph_totalwordnum) {
00168 jlog("Error: graph_make_order: r->graph_totalwordnum differ from actual number?\n");
00169 r->order_matrix = NULL;
00170 return;
00171 }
00172 r->order_matrix_count = count;
00173 for(wg=root;wg;wg=wg->next) {
00174 if (wg->id >= count) {
00175 jlog("Error: graph_make_order: wordgraph id >= count (%d >= %d)\n", wg->id, count);
00176 r->order_matrix = NULL;
00177 return;
00178 }
00179 }
00180
00181
00182 r->order_matrix = (char *)mymalloc(count * count);
00183 for(i=0;i<count*count;i++) r->order_matrix[i] = 0;
00184
00185
00186 for(wg=root;wg;wg=wg->next) {
00187 for(i=0;i<wg->rightwordnum;i++) {
00188 right = wg->rightword[i];
00189 r->order_matrix[m2i(wg->id, right->id)] = 1;
00190 }
00191 }
00192
00193
00194 graph_update_order(r);
00195 }
00196
00203 void
00204 graph_free_order(RecogProcess *r)
00205 {
00206 if (r->order_matrix) {
00207 free(r->order_matrix);
00208 r->order_matrix = NULL;
00209 }
00210 }
00211
00212
00213
00219 static CN_CLUSTER *
00220 cn_new()
00221 {
00222 CN_CLUSTER *new;
00223 new = (CN_CLUSTER *)mymalloc(sizeof(CN_CLUSTER));
00224 new->wg = (WordGraph **)mymalloc(sizeof(WordGraph *) * CN_CLUSTER_WG_STEP);
00225 new->wgnum_alloc = CN_CLUSTER_WG_STEP;
00226 new->wgnum = 0;
00227 new->words = NULL;
00228 new->pp = NULL;
00229 new->next = NULL;
00230 return new;
00231 }
00232
00239 static void
00240 cn_free(CN_CLUSTER *c)
00241 {
00242 free(c->wg);
00243 if (c->words) free(c->words);
00244 if (c->pp) free(c->pp);
00245 free(c);
00246 }
00247
00257 void
00258 cn_free_all(CN_CLUSTER **croot)
00259 {
00260 CN_CLUSTER *c, *ctmp;
00261 c = *croot;
00262 while(c) {
00263 ctmp = c->next;
00264 cn_free(c);
00265 c = ctmp;
00266 }
00267 *croot = NULL;
00268 }
00269
00276 static void
00277 cn_add_wg(CN_CLUSTER *c, WordGraph *wg)
00278 {
00279 if (c->wgnum >= c->wgnum_alloc) {
00280 c->wgnum_alloc += CN_CLUSTER_WG_STEP;
00281 c->wg = (WordGraph **)myrealloc(c->wg, sizeof(WordGraph *) * c->wgnum_alloc);
00282 }
00283 c->wg[c->wgnum] = wg;
00284 c->wgnum++;
00285 }
00286
00293 static void
00294 cn_merge(RecogProcess *r, CN_CLUSTER *dst, CN_CLUSTER *src)
00295 {
00296 WordGraph *wg;
00297 int i, j, n;
00298
00299
00300 for(i=0;i<src->wgnum;i++) {
00301 wg = src->wg[i];
00302 for(j=0;j<dst->wgnum;j++) {
00303 for(n=0;n<wg->leftwordnum;n++) {
00304 r->order_matrix[m2i(wg->leftword[n]->id, dst->wg[j]->id)] = 1;
00305 }
00306 for(n=0;n<wg->rightwordnum;n++) {
00307 r->order_matrix[m2i(dst->wg[j]->id, wg->rightword[n]->id)] = 1;
00308 }
00309 }
00310 }
00311 graph_update_order(r);
00312
00313 for(i=0;i<src->wgnum;i++) {
00314 cn_add_wg(dst, src->wg[i]);
00315 }
00316 }
00317
00324 static void
00325 cn_destroy(CN_CLUSTER *target, CN_CLUSTER **root)
00326 {
00327 CN_CLUSTER *c, *cprev;
00328
00329 cprev = NULL;
00330 for(c = *root; c; c = c->next) {
00331 if (c == target) {
00332 if (cprev) {
00333 cprev->next = c->next;
00334 } else {
00335 *root = c->next;
00336 }
00337 cn_free(c);
00338 break;
00339 }
00340 cprev = c;
00341 }
00342 }
00343
00350 static void
00351 cn_build_wordlist(CN_CLUSTER *c, WORD_INFO *winfo)
00352 {
00353 int i, j;
00354
00355 if (c->words) {
00356 free(c->words);
00357 }
00358 c->words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * c->wgnum + 1);
00359 c->wordsnum = 0;
00360 for(i=0;i<c->wgnum;i++) {
00361 for(j=0;j<c->wordsnum;j++) {
00362 if (is_same_word(c->words[j], c->wg[i]->wid, winfo)) break;
00363 }
00364 if (j>=c->wordsnum) {
00365 c->words[c->wordsnum] = c->wg[i]->wid;
00366 c->wordsnum++;
00367 }
00368 }
00369 }
00370
00380 static int
00381 compare_cluster(CN_CLUSTER **x, CN_CLUSTER **y, RecogProcess *r)
00382 {
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396 int i, j;
00397
00398 if (x == y) return 0;
00399 for(i=0;i<(*x)->wgnum;i++) {
00400 for(j=0;j<(*y)->wgnum;j++) {
00401
00402 if (r->order_matrix[m2i((*x)->wg[i]->id, (*y)->wg[j]->id)] == 1) {
00403 return -1;
00404 }
00405 }
00406 }
00407 return 1;
00408 }
00409
00410
00420 static PROB
00421 get_intraword_similarity(WordGraph *w1, WordGraph *w2)
00422 {
00423 PROB overlap;
00424 int overlap_frame, sum_len;
00425 PROB sim;
00426
00427
00428 if (w1->lefttime < w2->lefttime) {
00429 if (w1->righttime < w2->lefttime) {
00430 overlap_frame = 0;
00431 } else if (w1->righttime > w2->righttime) {
00432 overlap_frame = w2->righttime - w2->lefttime + 1;
00433 } else {
00434 overlap_frame = w1->righttime - w2->lefttime + 1;
00435 }
00436 } else if (w1->lefttime > w2->righttime) {
00437 overlap_frame = 0;
00438 } else {
00439 if (w1->righttime > w2->righttime) {
00440 overlap_frame = w2->righttime - w1->lefttime + 1;
00441 } else {
00442 overlap_frame = w1->righttime - w1->lefttime + 1;
00443 }
00444 }
00445 sum_len = (w1->righttime - w1->lefttime + 1) + (w2->righttime - w2->lefttime + 1);
00446 overlap = (PROB)overlap_frame / (PROB)sum_len;
00447 #ifdef CDEBUG2
00448 printf("[%d..%d] [%d..%d] overlap = %d / %d = %f",
00449 w1->lefttime, w1->righttime, w2->lefttime, w2->righttime,
00450 overlap_frame, sum_len, overlap);
00451 #endif
00452
00453 #ifdef PREFER_GRAPH_CM
00454 #ifdef CDEBUG2
00455 printf(" cm=%f, %f", w1->graph_cm, w2->graph_cm);
00456 #endif
00457 sim = overlap * w1->graph_cm * w2->graph_cm;
00458 #else
00459 #ifdef CDEBUG2
00460 printf(" cm=%f, %f", w1->cmscore, w2->cmscore);
00461 #endif
00462 sim = overlap * w1->cmscore * w2->cmscore;
00463 #endif
00464
00465 #ifdef CDEBUG2
00466 printf(" similarity=%f\n", sim);
00467 #endif
00468
00469 return sim;
00470 }
00471
00481 static PROB
00482 get_cluster_intraword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo)
00483 {
00484 int i1, i2;
00485 PROB simmax, sim;
00486
00487 simmax = 0.0;
00488 for(i1 = 0; i1 < c1->wgnum; i1++) {
00489 for(i2 = 0; i2 < c2->wgnum; i2++) {
00490 if (is_same_word(c1->wg[i1]->wid, c2->wg[i2]->wid, winfo)) {
00491
00492 sim = get_intraword_similarity(c1->wg[i1], c2->wg[i2]);
00493 if (simmax < sim) simmax = sim;
00494 }
00495 }
00496 }
00497 return(simmax);
00498 }
00499
00500 #ifdef CDEBUG
00501
00508 static void
00509 put_cluster(FILE *fp, CN_CLUSTER *c, WORD_INFO *winfo)
00510 {
00511 int i;
00512
00513 for(i=0;i<c->wgnum;i++) {
00514 fprintf(fp, "[%d:%s:%d..%d]", c->wg[i]->id, winfo->woutput[c->wg[i]->wid], c->wg[i]->lefttime, c->wg[i]->righttime);
00515 }
00516 printf("\n");
00517 }
00518 #endif
00519
00529 static int
00530 minimum(int a, int b, int c)
00531 {
00532 int min;
00533
00534 min = a;
00535 if (b < min)
00536 min = b;
00537 if (c < min)
00538 min = c;
00539 return min;
00540 }
00541
00551 static int
00552 edit_distance(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo, char *b1, char *b2)
00553 {
00554 int i1, i2;
00555 int *d;
00556 int len1, len2;
00557 int j;
00558 int cost;
00559 int distance;
00560
00561 len1 = winfo->wlen[w1] + 1;
00562 len2 = winfo->wlen[w2] + 1;
00563 d = (int *)mymalloc(sizeof(int) * len1 * len2);
00564 for(j=0;j<len1;j++) d[j] = j;
00565 for(j=0;j<len2;j++) d[j*len1] = j;
00566
00567 for(i1=1;i1<len1;i1++) {
00568 center_name(winfo->wseq[w1][i1-1]->name, b1);
00569 for(i2=1;i2<len2;i2++) {
00570 center_name(winfo->wseq[w2][i2-1]->name, b2);
00571 if (strmatch(b1, b2)) {
00572 cost = 0;
00573 } else {
00574 cost = 1;
00575 }
00576 d[i2 * len1 + i1] = minimum(d[(i2-1) * len1 + i1] + 1, d[i2 * len1 + (i1-1)] + 1, d[(i2-1) * len1 + (i1-1)] + cost);
00577 }
00578 }
00579
00580 distance = d[len1 * len2 - 1];
00581
00582 free(d);
00583 return(distance);
00584 }
00585
00595 static PROB
00596 get_cluster_interword_similarity(RecogProcess *r, CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo, char *buf1, char *buf2)
00597 {
00598 int i1, i2, j;
00599 WORD_ID w1, w2;
00600 PROB p1, p2;
00601 PROB sim, simsum;
00602 int simsum_count;
00603 int dist;
00604
00605
00606 for(i1 = 0; i1 < c1->wgnum; i1++) {
00607 for(i2 = 0; i2 < c2->wgnum; i2++) {
00608 if (graph_ordered(r, c1->wg[i1]->id, c2->wg[i2]->id)) {
00609
00610
00611
00612
00613 return 0.0;
00614 }
00615 }
00616 }
00617
00618 #ifdef CDEBUG2
00619 printf("-----\n");
00620 printf("c1:\n"); put_cluster(stdout, c1, winfo);
00621 printf("c2:\n"); put_cluster(stdout, c2, winfo);
00622 #endif
00623
00624
00625 simsum = 0.0;
00626 simsum_count = 0;
00627 for(i1 = 0; i1 < c1->wordsnum; i1++) {
00628 w1 = c1->words[i1];
00629 p1 = 0.0;
00630 for(j = 0; j < c1->wgnum; j++) {
00631 if (is_same_word(c1->wg[j]->wid, w1, winfo)) {
00632 #ifdef PREFER_GRAPH_CM
00633 p1 += c1->wg[j]->graph_cm;
00634 #else
00635 p1 += c1->wg[j]->cmscore;
00636 #endif
00637 }
00638 }
00639 for(i2 = 0; i2 < c2->wordsnum; i2++) {
00640 w2 = c2->words[i2];
00641 p2 = 0.0;
00642 for(j = 0; j < c2->wgnum; j++) {
00643 if (is_same_word(c2->wg[j]->wid, w2, winfo)) {
00644 #ifdef PREFER_GRAPH_CM
00645 p2 += c2->wg[j]->graph_cm;
00646 #else
00647 p2 += c2->wg[j]->cmscore;
00648 #endif
00649 }
00650 }
00651 dist = edit_distance(w1, w2, winfo, buf1, buf2);
00652 #ifdef CDEBUG2
00653 for(j=0;j<winfo->wlen[w1];j++) {
00654 printf("%s ", winfo->wseq[w1][j]->name);
00655 }
00656 printf("\n");
00657 for(j=0;j<winfo->wlen[w2];j++) {
00658 printf("%s ", winfo->wseq[w2][j]->name);
00659 }
00660 printf("\n");
00661 printf("distance=%d\n", dist);
00662 #endif
00663
00664 sim = 1.0 - (float)dist / (float)(winfo->wlen[w1] + winfo->wlen[w2]);
00665
00666 #ifdef CDEBUG2
00667 printf("(%s) - (%s): sim = %f, p1 = %f, p2 = %f\n", winfo->woutput[w1], winfo->woutput[w2], sim, p1, p2);
00668 #endif
00669
00670 simsum += sim * p1 * p2;
00671 simsum_count++;
00672 }
00673 }
00674
00675 #ifdef CDEBUG2
00676 printf("SIM=%f\n", simsum / simsum_count);
00677 printf("-----\n");
00678 #endif
00679
00680 return(simsum / simsum_count);
00681 }
00682
00683
00696 CN_CLUSTER *
00697 confnet_create(WordGraph *root, RecogProcess *r)
00698 {
00699 CN_CLUSTER *croot;
00700 CN_CLUSTER *c, *cc, *cmax1, *cmax2;
00701 WordGraph *wg;
00702 PROB sim, max_sim;
00703 int wg_totalnum, n, i;
00704 char *buf1, *buf2;
00705
00706 buf1 = (char *)mymalloc(MAX_HMMNAME_LEN);
00707 buf2 = (char *)mymalloc(MAX_HMMNAME_LEN);
00708
00709
00710 croot = NULL;
00711 wg_totalnum = 0;
00712 for(wg=root;wg;wg=wg->next) {
00713 c = cn_new();
00714 cn_add_wg(c, wg);
00715 c->next = croot;
00716 croot = c;
00717 wg_totalnum++;
00718 }
00719
00720
00721 do {
00722
00723 max_sim = 0.0;
00724 for(c=croot;c;c=c->next) {
00725 for(cc=c->next;cc;cc=cc->next) {
00726 sim = get_cluster_intraword_similarity(c, cc, r->lm->winfo);
00727 if (max_sim < sim) {
00728 max_sim = sim;
00729 cmax1 = c;
00730 cmax2 = cc;
00731 }
00732 }
00733 }
00734
00735 if (max_sim != 0.0) {
00736 #ifdef CDEBUG
00737 printf(">>> max_sim = %f\n", max_sim);
00738 put_cluster(stdout, cmax1, r->lm->winfo);
00739 put_cluster(stdout, cmax2, r->lm->winfo);
00740 #endif
00741 cn_merge(r, cmax1, cmax2);
00742 cn_destroy(cmax2, &croot);
00743 }
00744 } while (max_sim != 0.0);
00745
00746 n = 0;
00747 for(c=croot;c;c=c->next) n++;
00748 if (verbose_flag) jlog("STAT: confnet: %d words -> %d clusters by intra-word clustering\n", wg_totalnum, n);
00749
00750 #ifdef CDEBUG
00751 printf("---- result of intra-word clustering ---\n");
00752 i = 0;
00753 for(c=croot;c;c=c->next) {
00754 printf("%d :", i);
00755 put_cluster(stdout, c, r->lm->winfo);
00756 #ifdef CDEBUG2
00757 for(i=0;i<c->wgnum;i++) {
00758 printf(" ");
00759 put_wordgraph(stdout, c->wg[i], r->lm->winfo);
00760 }
00761 #endif
00762 i++;
00763 }
00764 printf("----------------------------\n");
00765 #endif
00766
00767
00768 do {
00769
00770 for(c=croot;c;c=c->next) cn_build_wordlist(c, r->lm->winfo);
00771
00772 max_sim = 0.0;
00773 for(c=croot;c;c=c->next) {
00774 for(cc=c->next;cc;cc=cc->next) {
00775 sim = get_cluster_interword_similarity(r, c, cc, r->lm->winfo, buf1, buf2);
00776 if (max_sim < sim) {
00777 max_sim = sim;
00778 cmax1 = c;
00779 cmax2 = cc;
00780 }
00781 }
00782 }
00783
00784 if (max_sim != 0.0) {
00785 #ifdef CDEBUG
00786 printf(">>> max_sim = %f\n", max_sim);
00787 put_cluster(stdout, cmax1, r->lm->winfo);
00788 put_cluster(stdout, cmax2, r->lm->winfo);
00789 #endif
00790 cn_merge(r, cmax1, cmax2);
00791 cn_destroy(cmax2, &croot);
00792 }
00793 } while (max_sim != 0.0);
00794
00795 n = 0;
00796 for(c=croot;c;c=c->next) n++;
00797 if (verbose_flag) jlog("STAT: confnet: -> %d clusters by inter-word clustering\n", n);
00798
00799
00800 {
00801 PROB p, psum;
00802 int j;
00803
00804 for(c=croot;c;c=c->next) {
00805 psum = 0.0;
00806 c->pp = (LOGPROB *)mymalloc(sizeof(LOGPROB) * (c->wordsnum + 1));
00807 for(i=0;i<c->wordsnum;i++) {
00808 p = 0.0;
00809 for(j = 0; j < c->wgnum; j++) {
00810 if (is_same_word(c->wg[j]->wid, c->words[i], r->lm->winfo)) {
00811 #ifdef PREFER_GRAPH_CM
00812 p += c->wg[j]->graph_cm;
00813 #else
00814 p += c->wg[j]->cmscore;
00815 #endif
00816 }
00817 }
00818 c->pp[i] = p;
00819 psum += p;
00820 }
00821 if (psum < 1.0) {
00822 c->words[c->wordsnum] = WORD_INVALID;
00823 c->pp[c->wordsnum] = 1.0 - psum;
00824 c->wordsnum++;
00825 }
00826 }
00827 }
00828
00829
00830 {
00831 int j;
00832 WORD_ID wtmp;
00833 LOGPROB ltmp;
00834 for(c=croot;c;c=c->next) {
00835 for(i=0;i<c->wordsnum;i++) {
00836 for(j=c->wordsnum - 1;j>i;j--) {
00837 if (c->pp[j-1] < c->pp[j]) {
00838 ltmp = c->pp[j-1];
00839 c->pp[j-1] = c->pp[j];
00840 c->pp[j] = ltmp;
00841 wtmp = c->words[j-1];
00842 c->words[j-1] = c->words[j];
00843 c->words[j] = wtmp;
00844 }
00845 }
00846 }
00847 }
00848 }
00849
00850
00851 {
00852 CN_CLUSTER **clist;
00853 int k;
00854
00855
00856 clist = (CN_CLUSTER **)mymalloc(sizeof(CN_CLUSTER *) * n);
00857 for(i=0,c=croot;c;c=c->next) {
00858 clist[i++] = c;
00859 }
00860 qsort_reentrant(clist, n, sizeof(CN_CLUSTER *), (int (*)(const void *, const void *, void *))compare_cluster, r);
00861 croot = NULL;
00862 for(k=0;k<n;k++) {
00863 if (k == 0) croot = clist[k];
00864 if (k == n - 1) clist[k]->next = NULL;
00865 else clist[k]->next = clist[k+1];
00866 }
00867 free(clist);
00868 }
00869
00870 #if 0
00871
00872 printf("---- begin confusion network ---\n");
00873 for(c=croot;c;c=c->next) {
00874 for(i=0;i<c->wordsnum;i++) {
00875 printf("(%s:%.3f)", (c->words[i] == WORD_INVALID) ? "-" : r->lm->winfo->woutput[c->words[i]], c->pp[i]);
00876 if (i == 0) printf(" ");
00877 }
00878 printf("\n");
00879 }
00880 printf("---- end confusion network ---\n");
00881 #endif
00882
00883 free(buf2);
00884 free(buf1);
00885
00886 return(croot);
00887 }
00888
00889