00001
00022
00023
00024
00025
00026
00027
00028
00029 #include <julius/julius.h>
00030
00035 #undef CDEBUG
00036
00041 #undef CDEBUG2
00042
00052 #define PREFER_GRAPH_CM
00053
00061 #define BUNDLE_WORD_WITH_SAME_OUTPUT
00062
00063
00074 static boolean
00075 is_same_word(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo)
00076 {
00077 if (w1 == w2
00078 #ifdef BUNDLE_WORD_WITH_SAME_OUTPUT
00079 || strmatch(winfo->woutput[w1], winfo->woutput[w2])
00080 #endif
00081 ) return TRUE;
00082 return FALSE;
00083 }
00084
00085
00086
00091 static char *order_matrix = NULL;
00092
00097 static int order_matrix_count;
00098
00103 #define m2i(A, B) (B) * order_matrix_count + (A)
00104
00113 static boolean
00114 graph_ordered(int i, int j)
00115 {
00116 if (i != j && order_matrix[m2i(i,j)] == 0 && order_matrix[m2i(j,i)] == 0) {
00117 return FALSE;
00118 }
00119 return TRUE;
00120 }
00121
00127 static void
00128 graph_update_order()
00129 {
00130 int i, j, k;
00131 boolean changed;
00132 int count;
00133
00134 count = order_matrix_count;
00135
00136 do {
00137 changed = FALSE;
00138 for(i=0;i<count;i++) {
00139 for(j=0;j<count;j++) {
00140 if (order_matrix[m2i(i, j)] == 1) {
00141 for(k=0;k<count;k++) {
00142 if (order_matrix[m2i(j, k)] == 1) {
00143 if (order_matrix[m2i(i, k)] == 0) {
00144 order_matrix[m2i(i, k)] = 1;
00145 changed = TRUE;
00146 }
00147 }
00148 }
00149 }
00150 }
00151 }
00152 } while (changed == TRUE);
00153 }
00154
00165 void
00166 graph_make_order(WordGraph *root, RecogProcess *r)
00167 {
00168 int count;
00169 WordGraph *wg, *right;
00170 int i;
00171
00172
00173 count = 0;
00174 for(wg=root;wg;wg=wg->next) count++;
00175 if (count == 0) {
00176 order_matrix = NULL;
00177 return;
00178 }
00179 if (count != r->graph_totalwordnum) {
00180 jlog("Error: graph_make_order: r->graph_totalwordnum differ from actual number?\n");
00181 order_matrix = NULL;
00182 return;
00183 }
00184 order_matrix_count = count;
00185 for(wg=root;wg;wg=wg->next) {
00186 if (wg->id >= count) {
00187 jlog("Error: graph_make_order: wordgraph id >= count (%d >= %d)\n", wg->id, count);
00188 order_matrix = NULL;
00189 return;
00190 }
00191 }
00192
00193
00194 order_matrix = (char *)mymalloc(count * count);
00195 for(i=0;i<count*count;i++) order_matrix[i] = 0;
00196
00197
00198 for(wg=root;wg;wg=wg->next) {
00199 for(i=0;i<wg->rightwordnum;i++) {
00200 right = wg->rightword[i];
00201 order_matrix[m2i(wg->id, right->id)] = 1;
00202 }
00203 }
00204
00205
00206 graph_update_order();
00207 }
00208
00215 void
00216 graph_free_order()
00217 {
00218 if (order_matrix) free(order_matrix);
00219 }
00220
00221
00222
00228 static CN_CLUSTER *
00229 cn_new()
00230 {
00231 CN_CLUSTER *new;
00232 new = (CN_CLUSTER *)mymalloc(sizeof(CN_CLUSTER));
00233 new->wg = (WordGraph **)mymalloc(sizeof(WordGraph *) * CN_CLUSTER_WG_STEP);
00234 new->wgnum_alloc = CN_CLUSTER_WG_STEP;
00235 new->wgnum = 0;
00236 new->words = NULL;
00237 new->pp = NULL;
00238 new->next = NULL;
00239 return new;
00240 }
00241
00248 static void
00249 cn_free(CN_CLUSTER *c)
00250 {
00251 free(c->wg);
00252 if (c->words) free(c->words);
00253 if (c->pp) free(c->pp);
00254 free(c);
00255 }
00256
00266 void
00267 cn_free_all(CN_CLUSTER **croot)
00268 {
00269 CN_CLUSTER *c, *ctmp;
00270 c = *croot;
00271 while(c) {
00272 ctmp = c->next;
00273 cn_free(c);
00274 c = ctmp;
00275 }
00276 *croot = NULL;
00277 }
00278
00285 static void
00286 cn_add_wg(CN_CLUSTER *c, WordGraph *wg)
00287 {
00288 if (c->wgnum >= c->wgnum_alloc) {
00289 c->wgnum_alloc += CN_CLUSTER_WG_STEP;
00290 c->wg = (WordGraph **)myrealloc(c->wg, sizeof(WordGraph *) * c->wgnum_alloc);
00291 }
00292 c->wg[c->wgnum] = wg;
00293 c->wgnum++;
00294 }
00295
00302 static void
00303 cn_merge(CN_CLUSTER *dst, CN_CLUSTER *src)
00304 {
00305 WordGraph *wg;
00306 int i, j, n;
00307
00308
00309 for(i=0;i<src->wgnum;i++) {
00310 wg = src->wg[i];
00311 for(j=0;j<dst->wgnum;j++) {
00312 for(n=0;n<wg->leftwordnum;n++) {
00313 order_matrix[m2i(wg->leftword[n]->id, dst->wg[j]->id)] = 1;
00314 }
00315 for(n=0;n<wg->rightwordnum;n++) {
00316 order_matrix[m2i(dst->wg[j]->id, wg->rightword[n]->id)] = 1;
00317 }
00318 }
00319 }
00320 graph_update_order();
00321
00322 for(i=0;i<src->wgnum;i++) {
00323 cn_add_wg(dst, src->wg[i]);
00324 }
00325 }
00326
00333 static void
00334 cn_destroy(CN_CLUSTER *target, CN_CLUSTER **root)
00335 {
00336 CN_CLUSTER *c, *cprev;
00337
00338 cprev = NULL;
00339 for(c = *root; c; c = c->next) {
00340 if (c == target) {
00341 if (cprev) {
00342 cprev->next = c->next;
00343 } else {
00344 *root = c->next;
00345 }
00346 cn_free(c);
00347 break;
00348 }
00349 cprev = c;
00350 }
00351 }
00352
00359 static void
00360 cn_build_wordlist(CN_CLUSTER *c, WORD_INFO *winfo)
00361 {
00362 int i, j;
00363
00364 if (c->words) {
00365 free(c->words);
00366 }
00367 c->words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * c->wgnum + 1);
00368 c->wordsnum = 0;
00369 for(i=0;i<c->wgnum;i++) {
00370 for(j=0;j<c->wordsnum;j++) {
00371 if (is_same_word(c->words[j], c->wg[i]->wid, winfo)) break;
00372 }
00373 if (j>=c->wordsnum) {
00374 c->words[c->wordsnum] = c->wg[i]->wid;
00375 c->wordsnum++;
00376 }
00377 }
00378 }
00379
00388 static int
00389 compare_cluster(CN_CLUSTER **x, CN_CLUSTER **y)
00390 {
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404 int i, j;
00405 int dir;
00406 dir = 0;
00407 for(i=0;i<(*x)->wgnum;i++) {
00408 for(j=0;j<(*y)->wgnum;j++) {
00409
00410 if (order_matrix[m2i((*x)->wg[i]->id, (*y)->wg[j]->id)] == 1) dir = -1;
00411 }
00412 }
00413 if (dir == 0) dir = 1;
00414 return dir;
00415 }
00416
00417
00427 static PROB
00428 get_intraword_similarity(WordGraph *w1, WordGraph *w2)
00429 {
00430 PROB overlap;
00431 int overlap_frame, sum_len;
00432 PROB sim;
00433
00434
00435 if (w1->lefttime < w2->lefttime) {
00436 if (w1->righttime < w2->lefttime) {
00437 overlap_frame = 0;
00438 } else if (w1->righttime > w2->righttime) {
00439 overlap_frame = w2->righttime - w2->lefttime + 1;
00440 } else {
00441 overlap_frame = w1->righttime - w2->lefttime + 1;
00442 }
00443 } else if (w1->lefttime > w2->righttime) {
00444 overlap_frame = 0;
00445 } else {
00446 if (w1->righttime > w2->righttime) {
00447 overlap_frame = w2->righttime - w1->lefttime + 1;
00448 } else {
00449 overlap_frame = w1->righttime - w1->lefttime + 1;
00450 }
00451 }
00452 sum_len = (w1->righttime - w1->lefttime + 1) + (w2->righttime - w2->lefttime + 1);
00453 overlap = (PROB)overlap_frame / (PROB)sum_len;
00454 #ifdef CDEBUG2
00455 printf("[%d..%d] [%d..%d] overlap = %d / %d = %f",
00456 w1->lefttime, w1->righttime, w2->lefttime, w2->righttime,
00457 overlap_frame, sum_len, overlap);
00458 #endif
00459
00460 #ifdef PREFER_GRAPH_CM
00461 #ifdef CDEBUG2
00462 printf(" cm=%f, %f", w1->graph_cm, w2->graph_cm);
00463 #endif
00464 sim = overlap * w1->graph_cm * w2->graph_cm;
00465 #else
00466 #ifdef CDEBUG2
00467 printf(" cm=%f, %f", w1->cmscore, w2->cmscore);
00468 #endif
00469 sim = overlap * w1->cmscore * w2->cmscore;
00470 #endif
00471
00472 #ifdef CDEBUG2
00473 printf(" similarity=%f\n", sim);
00474 #endif
00475
00476 return sim;
00477 }
00478
00488 static PROB
00489 get_cluster_intraword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo)
00490 {
00491 int i1, i2;
00492 PROB simmax, sim;
00493
00494 simmax = 0.0;
00495 for(i1 = 0; i1 < c1->wgnum; i1++) {
00496 for(i2 = 0; i2 < c2->wgnum; i2++) {
00497 if (is_same_word(c1->wg[i1]->wid, c2->wg[i2]->wid, winfo)) {
00498
00499 sim = get_intraword_similarity(c1->wg[i1], c2->wg[i2]);
00500 if (simmax < sim) simmax = sim;
00501 }
00502 }
00503 }
00504 return(simmax);
00505 }
00506
00507 #ifdef CDEBUG
00508
00515 static void
00516 put_cluster(FILE *fp, CN_CLUSTER *c, WORD_INFO *winfo)
00517 {
00518 int i;
00519
00520 for(i=0;i<c->wgnum;i++) {
00521 fprintf(fp, "[%d:%s:%d..%d]", c->wg[i]->id, winfo->woutput[c->wg[i]->wid], c->wg[i]->lefttime, c->wg[i]->righttime);
00522 }
00523 printf("\n");
00524 }
00525 #endif
00526
00536 static int
00537 minimum(int a, int b, int c)
00538 {
00539 int min;
00540
00541 min = a;
00542 if (b < min)
00543 min = b;
00544 if (c < min)
00545 min = c;
00546 return min;
00547 }
00548
00558 static int
00559 edit_distance(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo)
00560 {
00561 static char b1[MAX_HMMNAME_LEN];
00562 static char b2[MAX_HMMNAME_LEN];
00563 int i1, i2;
00564 int *d;
00565 int len1, len2;
00566 int j;
00567 int cost;
00568 int distance;
00569
00570 len1 = winfo->wlen[w1] + 1;
00571 len2 = winfo->wlen[w2] + 1;
00572 d = (int *)mymalloc(sizeof(int) * len1 * len2);
00573 for(j=0;j<len1;j++) d[j] = j;
00574 for(j=0;j<len2;j++) d[j*len1] = j;
00575
00576 for(i1=1;i1<len1;i1++) {
00577 center_name(winfo->wseq[w1][i1-1]->name, b1);
00578 for(i2=1;i2<len2;i2++) {
00579 center_name(winfo->wseq[w2][i2-1]->name, b2);
00580 if (strmatch(b1, b2)) {
00581 cost = 0;
00582 } else {
00583 cost = 1;
00584 }
00585 d[i2 * len1 + i1] = minimum(d[(i2-1) * len1 + i1] + 1, d[i2 * len1 + (i1-1)] + 1, d[(i2-1) * len1 + (i1-1)] + cost);
00586 }
00587 }
00588
00589 distance = d[len1 * len2 - 1];
00590
00591 free(d);
00592 return(distance);
00593 }
00594
00604 static PROB
00605 get_cluster_interword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo)
00606 {
00607 int i1, i2, j;
00608 WORD_ID w1, w2;
00609 PROB p1, p2;
00610 PROB sim, simsum;
00611 int simsum_count;
00612 int dist;
00613
00614
00615 for(i1 = 0; i1 < c1->wgnum; i1++) {
00616 for(i2 = 0; i2 < c2->wgnum; i2++) {
00617 if (graph_ordered(c1->wg[i1]->id, c2->wg[i2]->id)) {
00618
00619
00620
00621
00622 return 0.0;
00623 }
00624 }
00625 }
00626
00627 #ifdef CDEBUG2
00628 printf("-----\n");
00629 printf("c1:\n"); put_cluster(stdout, c1, winfo);
00630 printf("c2:\n"); put_cluster(stdout, c2, winfo);
00631 #endif
00632
00633
00634 simsum = 0.0;
00635 simsum_count = 0;
00636 for(i1 = 0; i1 < c1->wordsnum; i1++) {
00637 w1 = c1->words[i1];
00638 p1 = 0.0;
00639 for(j = 0; j < c1->wgnum; j++) {
00640 if (is_same_word(c1->wg[j]->wid, w1, winfo)) {
00641 #ifdef PREFER_GRAPH_CM
00642 p1 += c1->wg[j]->graph_cm;
00643 #else
00644 p1 += c1->wg[j]->cmscore;
00645 #endif
00646 }
00647 }
00648 for(i2 = 0; i2 < c2->wordsnum; i2++) {
00649 w2 = c2->words[i2];
00650 p2 = 0.0;
00651 for(j = 0; j < c2->wgnum; j++) {
00652 if (is_same_word(c2->wg[j]->wid, w2, winfo)) {
00653 #ifdef PREFER_GRAPH_CM
00654 p2 += c2->wg[j]->graph_cm;
00655 #else
00656 p2 += c2->wg[j]->cmscore;
00657 #endif
00658 }
00659 }
00660 dist = edit_distance(w1, w2, winfo);
00661 #ifdef CDEBUG2
00662 for(j=0;j<winfo->wlen[w1];j++) {
00663 printf("%s ", winfo->wseq[w1][j]->name);
00664 }
00665 printf("\n");
00666 for(j=0;j<winfo->wlen[w2];j++) {
00667 printf("%s ", winfo->wseq[w2][j]->name);
00668 }
00669 printf("\n");
00670 printf("distance=%d\n", dist);
00671 #endif
00672
00673 sim = 1.0 - (float)dist / (float)(winfo->wlen[w1] + winfo->wlen[w2]);
00674
00675 #ifdef CDEBUG2
00676 printf("(%s) - (%s): sim = %f, p1 = %f, p2 = %f\n", winfo->woutput[w1], winfo->woutput[w2], sim, p1, p2);
00677 #endif
00678
00679 simsum += sim * p1 * p2;
00680 simsum_count++;
00681 }
00682 }
00683
00684 #ifdef CDEBUG2
00685 printf("SIM=%f\n", simsum / simsum_count);
00686 printf("-----\n");
00687 #endif
00688
00689 return(simsum / simsum_count);
00690 }
00691
00692
00705 CN_CLUSTER *
00706 confnet_create(WordGraph *root, RecogProcess *r)
00707 {
00708 CN_CLUSTER *croot;
00709 CN_CLUSTER *c, *cc, *cmax1, *cmax2;
00710 WordGraph *wg;
00711 PROB sim, max_sim;
00712 int wg_totalnum, n, i;
00713
00714
00715 croot = NULL;
00716 wg_totalnum = 0;
00717 for(wg=root;wg;wg=wg->next) {
00718 c = cn_new();
00719 cn_add_wg(c, wg);
00720 c->next = croot;
00721 croot = c;
00722 wg_totalnum++;
00723 }
00724
00725
00726 do {
00727
00728 max_sim = 0.0;
00729 for(c=croot;c;c=c->next) {
00730 for(cc=c->next;cc;cc=cc->next) {
00731 sim = get_cluster_intraword_similarity(c, cc, r->lm->winfo);
00732 if (max_sim < sim) {
00733 max_sim = sim;
00734 cmax1 = c;
00735 cmax2 = cc;
00736 }
00737 }
00738 }
00739
00740 if (max_sim != 0.0) {
00741 #ifdef CDEBUG
00742 printf(">>> max_sim = %f\n", max_sim);
00743 put_cluster(stdout, cmax1, r->lm->winfo);
00744 put_cluster(stdout, cmax2, r->lm->winfo);
00745 #endif
00746 cn_merge(cmax1, cmax2);
00747 cn_destroy(cmax2, &croot);
00748 }
00749 } while (max_sim != 0.0);
00750
00751 n = 0;
00752 for(c=croot;c;c=c->next) n++;
00753 if (verbose_flag) jlog("STAT: confnet: %d words -> %d clusters by intra-word clustering\n", wg_totalnum, n);
00754
00755 #ifdef CDEBUG
00756 printf("---- result of intra-word clustering ---\n");
00757 i = 0;
00758 for(c=croot;c;c=c->next) {
00759 printf("%d :", i);
00760 put_cluster(stdout, c, r->lm->winfo);
00761 #ifdef CDEBUG2
00762 for(i=0;i<c->wgnum;i++) {
00763 printf(" ");
00764 put_wordgraph(stdout, c->wg[i], r->lm->winfo);
00765 }
00766 #endif
00767 i++;
00768 }
00769 printf("----------------------------\n");
00770 #endif
00771
00772
00773 do {
00774
00775 for(c=croot;c;c=c->next) cn_build_wordlist(c, r->lm->winfo);
00776
00777 max_sim = 0.0;
00778 for(c=croot;c;c=c->next) {
00779 for(cc=c->next;cc;cc=cc->next) {
00780 sim = get_cluster_interword_similarity(c, cc, r->lm->winfo);
00781 if (max_sim < sim) {
00782 max_sim = sim;
00783 cmax1 = c;
00784 cmax2 = cc;
00785 }
00786 }
00787 }
00788
00789 if (max_sim != 0.0) {
00790 #ifdef CDEBUG
00791 printf(">>> max_sim = %f\n", max_sim);
00792 put_cluster(stdout, cmax1, r->lm->winfo);
00793 put_cluster(stdout, cmax2, r->lm->winfo);
00794 #endif
00795 cn_merge(cmax1, cmax2);
00796 cn_destroy(cmax2, &croot);
00797 }
00798 } while (max_sim != 0.0);
00799
00800 n = 0;
00801 for(c=croot;c;c=c->next) n++;
00802 if (verbose_flag) jlog("STAT: confnet: -> %d clusters by inter-word clustering\n", n);
00803
00804
00805 {
00806 PROB p, psum;
00807 int j;
00808
00809 for(c=croot;c;c=c->next) {
00810 psum = 0.0;
00811 c->pp = (LOGPROB *)mymalloc(sizeof(LOGPROB) * (c->wordsnum + 1));
00812 for(i=0;i<c->wordsnum;i++) {
00813 p = 0.0;
00814 for(j = 0; j < c->wgnum; j++) {
00815 if (is_same_word(c->wg[j]->wid, c->words[i], r->lm->winfo)) {
00816 #ifdef PREFER_GRAPH_CM
00817 p += c->wg[j]->graph_cm;
00818 #else
00819 p += c->wg[j]->cmscore;
00820 #endif
00821 }
00822 }
00823 c->pp[i] = p;
00824 psum += p;
00825 }
00826 if (psum < 1.0) {
00827 c->words[c->wordsnum] = WORD_INVALID;
00828 c->pp[c->wordsnum] = 1.0 - psum;
00829 c->wordsnum++;
00830 }
00831 }
00832 }
00833
00834
00835 {
00836 int j;
00837 WORD_ID wtmp;
00838 LOGPROB ltmp;
00839 for(c=croot;c;c=c->next) {
00840 for(i=0;i<c->wordsnum;i++) {
00841 for(j=c->wordsnum - 1;j>i;j--) {
00842 if (c->pp[j-1] < c->pp[j]) {
00843 ltmp = c->pp[j-1];
00844 c->pp[j-1] = c->pp[j];
00845 c->pp[j] = ltmp;
00846 wtmp = c->words[j-1];
00847 c->words[j-1] = c->words[j];
00848 c->words[j] = wtmp;
00849 }
00850 }
00851 }
00852 }
00853 }
00854
00855
00856 {
00857 CN_CLUSTER **clist;
00858 int k;
00859
00860
00861 clist = (CN_CLUSTER **)mymalloc(sizeof(CN_CLUSTER *) * n);
00862 for(i=0,c=croot;c;c=c->next) clist[i++] = c;
00863 qsort(clist, n, sizeof(CN_CLUSTER *), (int (*)(const void *, const void *))compare_cluster);
00864 croot = NULL;
00865 for(k=0;k<n;k++) {
00866 if (k == 0) croot = clist[k];
00867 if (k == n - 1) clist[k]->next = NULL;
00868 else clist[k]->next = clist[k+1];
00869 }
00870 }
00871
00872
00873 #if 0
00874
00875 printf("---- begin confusion network ---\n");
00876 for(c=croot;c;c=c->next) {
00877 for(i=0;i<c->wordsnum;i++) {
00878 printf("(%s:%.3f)", (c->words[i] == WORD_INVALID) ? "-" : r->lm->winfo->woutput[c->words[i]], c->pp[i]);
00879 if (i == 0) printf(" ");
00880 }
00881 printf("\n");
00882 }
00883 printf("---- end confusion network ---\n");
00884 #endif
00885
00886 return(croot);
00887 }
00888
00889