Julius: libjulius/src/confnet.c Source File

00001 
00022 /*
00023  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00024  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00025  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00026  * All rights reserved
00027  */
00028 
00029 #include <julius/julius.h>
00030 
00035 #undef CDEBUG
00036 
00041 #undef CDEBUG2
00042 
00052 #define PREFER_GRAPH_CM
00053 
00061 #define BUNDLE_WORD_WITH_SAME_OUTPUT
00062 
00063 
00074 static boolean
00075 is_same_word(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo)
00076 {
00077   if (w1 == w2
00078 #ifdef BUNDLE_WORD_WITH_SAME_OUTPUT
00079       || strmatch(winfo->woutput[w1], winfo->woutput[w2])
00080 #endif
00081       ) return TRUE;
00082   return FALSE;
00083 }
00084 
00085 /**************************************************************/
00086 
00091 static char *order_matrix = NULL;
00092 
00097 static int order_matrix_count;
00098 
00103 #define m2i(A, B) (B) * order_matrix_count + (A)
00104 
00113 static boolean
00114 graph_ordered(int i, int j) 
00115 {
00116   if (i != j  && order_matrix[m2i(i,j)] == 0 && order_matrix[m2i(j,i)] == 0) {
00117     return FALSE;
00118   }
00119   return TRUE;
00120 }  
00121 
00127 static void
00128 graph_update_order()
00129 {
00130   int i, j, k;
00131   boolean changed;
00132   int count;
00133 
00134   count = order_matrix_count;
00135   
00136   do {
00137     changed = FALSE;
00138     for(i=0;i<count;i++) {
00139       for(j=0;j<count;j++) {
00140         if (order_matrix[m2i(i, j)] == 1) {
00141           for(k=0;k<count;k++) {
00142             if (order_matrix[m2i(j, k)] == 1) {
00143               if (order_matrix[m2i(i, k)] == 0) {
00144                 order_matrix[m2i(i, k)] = 1;
00145                 changed = TRUE;
00146               }
00147             }
00148           }
00149         }
00150       }
00151     }
00152   } while (changed == TRUE);
00153 }
00154 
00165 void
00166 graph_make_order(WordGraph *root, RecogProcess *r)
00167 {
00168   int count;
00169   WordGraph *wg, *right;
00170   int i;
00171   
00172   /* make sure total num and id are valid */
00173   count = 0;
00174   for(wg=root;wg;wg=wg->next) count++;
00175   if (count == 0) {
00176     order_matrix = NULL;
00177     return;
00178   }
00179   if (count != r->graph_totalwordnum) {
00180     jlog("Error: graph_make_order: r->graph_totalwordnum differ from actual number?\n");
00181     order_matrix = NULL;
00182     return;
00183   }
00184   order_matrix_count = count;
00185   for(wg=root;wg;wg=wg->next) {
00186     if (wg->id >= count) {
00187       jlog("Error: graph_make_order: wordgraph id >= count (%d >= %d)\n", wg->id, count);
00188       order_matrix = NULL;
00189       return;
00190     }
00191   }
00192 
00193   /* allocate and clear matrix */
00194   order_matrix = (char *)mymalloc(count * count);
00195   for(i=0;i<count*count;i++) order_matrix[i] = 0;
00196   
00197   /* set initial order info */
00198   for(wg=root;wg;wg=wg->next) {
00199     for(i=0;i<wg->rightwordnum;i++) {
00200       right = wg->rightword[i];
00201       order_matrix[m2i(wg->id, right->id)] = 1;
00202     }
00203   }
00204 
00205   /* right propagate loop */
00206   graph_update_order();
00207 }
00208 
00215 void
00216 graph_free_order()
00217 {
00218   if (order_matrix) free(order_matrix);
00219 }
00220 
00221 /**************************************************************/
00222 
00228 static CN_CLUSTER *
00229 cn_new()
00230 {
00231   CN_CLUSTER *new;
00232   new = (CN_CLUSTER *)mymalloc(sizeof(CN_CLUSTER));
00233   new->wg = (WordGraph **)mymalloc(sizeof(WordGraph *) * CN_CLUSTER_WG_STEP);
00234   new->wgnum_alloc = CN_CLUSTER_WG_STEP;
00235   new->wgnum = 0;
00236   new->words = NULL;
00237   new->pp = NULL;
00238   new->next = NULL;
00239   return new;
00240 }
00241 
00248 static void
00249 cn_free(CN_CLUSTER *c)
00250 {
00251   free(c->wg);
00252   if (c->words) free(c->words);
00253   if (c->pp) free(c->pp);
00254   free(c);
00255 }
00256 
00266 void
00267 cn_free_all(CN_CLUSTER **croot)
00268 {
00269   CN_CLUSTER *c, *ctmp;
00270   c = *croot;
00271   while(c) {
00272     ctmp = c->next;
00273     cn_free(c);
00274     c = ctmp;
00275   }
00276   *croot = NULL;
00277 }
00278 
00285 static void
00286 cn_add_wg(CN_CLUSTER *c, WordGraph *wg)
00287 {
00288   if (c->wgnum >= c->wgnum_alloc) {
00289     c->wgnum_alloc += CN_CLUSTER_WG_STEP;
00290     c->wg = (WordGraph **)myrealloc(c->wg, sizeof(WordGraph *) * c->wgnum_alloc);
00291   }
00292   c->wg[c->wgnum] = wg;
00293   c->wgnum++;
00294 }
00295 
00302 static void
00303 cn_merge(CN_CLUSTER *dst, CN_CLUSTER *src)
00304 {
00305   WordGraph *wg;
00306   int i, j, n;
00307 
00308   /* update order matrix */
00309   for(i=0;i<src->wgnum;i++) {
00310     wg = src->wg[i];
00311     for(j=0;j<dst->wgnum;j++) {
00312       for(n=0;n<wg->leftwordnum;n++) {
00313         order_matrix[m2i(wg->leftword[n]->id, dst->wg[j]->id)] = 1;
00314       }
00315       for(n=0;n<wg->rightwordnum;n++) {
00316         order_matrix[m2i(dst->wg[j]->id, wg->rightword[n]->id)] = 1;
00317       }
00318     }
00319   }
00320   graph_update_order();
00321   /* add words in the source cluster to target cluster */
00322   for(i=0;i<src->wgnum;i++) {
00323     cn_add_wg(dst, src->wg[i]);
00324   }
00325 }
00326 
00333 static void
00334 cn_destroy(CN_CLUSTER *target, CN_CLUSTER **root)
00335 {
00336   CN_CLUSTER *c, *cprev;
00337 
00338   cprev = NULL;
00339   for(c = *root; c; c = c->next) {
00340     if (c == target) {
00341       if (cprev) {
00342         cprev->next = c->next;
00343       } else {
00344         *root = c->next;
00345       }
00346       cn_free(c);
00347       break;
00348     }
00349     cprev = c;
00350   }
00351 }
00352 
00359 static void
00360 cn_build_wordlist(CN_CLUSTER *c, WORD_INFO *winfo)
00361 {
00362   int i, j;
00363 
00364   if (c->words) {
00365     free(c->words);
00366   }
00367   c->words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * c->wgnum + 1);
00368   c->wordsnum = 0;
00369   for(i=0;i<c->wgnum;i++) {
00370     for(j=0;j<c->wordsnum;j++) {
00371       if (is_same_word(c->words[j], c->wg[i]->wid, winfo)) break;
00372     }
00373     if (j>=c->wordsnum) {
00374       c->words[c->wordsnum] = c->wg[i]->wid;
00375       c->wordsnum++;
00376     }
00377   }
00378 }
00379 
00388 static int
00389 compare_cluster(CN_CLUSTER **x, CN_CLUSTER **y)
00390 {
00391   //int i, min1, min2;
00392 /* 
00393  * 
00394  *   for(i=0;i<(*x)->wgnum;i++) {
00395  *     if (i == 0 || min1 > (*x)->wg[i]->lefttime) min1 = (*x)->wg[i]->lefttime;
00396  *   }
00397  *   for(i=0;i<(*y)->wgnum;i++) {
00398  *     if (i == 0 || min2 > (*y)->wg[i]->lefttime) min2 = (*y)->wg[i]->lefttime;
00399  *   }
00400  *   if (min1 < min2) return -1;
00401  *   else if (min1 > min2) return 1;
00402  *   else return 0;
00403  */
00404   int i, j;
00405   int dir;
00406   dir = 0;
00407   for(i=0;i<(*x)->wgnum;i++) {
00408     for(j=0;j<(*y)->wgnum;j++) {
00409       //if (graph_ordered((*x)->wg[i]->id, (*y)->wg[j]->id)) dir = 1;
00410       if (order_matrix[m2i((*x)->wg[i]->id, (*y)->wg[j]->id)] == 1) dir = -1;
00411     }
00412   }
00413   if (dir == 0) dir = 1;
00414   return dir;
00415 }
00416 
00417 
00427 static PROB
00428 get_intraword_similarity(WordGraph *w1, WordGraph *w2)
00429 {
00430   PROB overlap;
00431   int overlap_frame, sum_len;
00432   PROB sim;
00433 
00434   /* compute overlap_frame */
00435   if (w1->lefttime < w2->lefttime) {
00436     if (w1->righttime < w2->lefttime) {
00437       overlap_frame = 0;
00438     } else if (w1->righttime > w2->righttime) {
00439       overlap_frame = w2->righttime - w2->lefttime + 1;
00440     } else {
00441       overlap_frame = w1->righttime - w2->lefttime + 1;
00442     }
00443   } else if (w1->lefttime > w2->righttime) {
00444     overlap_frame = 0;
00445   } else {
00446     if (w1->righttime > w2->righttime) {
00447       overlap_frame = w2->righttime - w1->lefttime + 1;
00448     } else {
00449       overlap_frame = w1->righttime - w1->lefttime + 1;
00450     }
00451   }
00452   sum_len = (w1->righttime - w1->lefttime + 1) + (w2->righttime - w2->lefttime + 1);
00453   overlap = (PROB)overlap_frame / (PROB)sum_len;
00454 #ifdef CDEBUG2
00455   printf("[%d..%d] [%d..%d]  overlap = %d / %d = %f",
00456          w1->lefttime, w1->righttime, w2->lefttime, w2->righttime,
00457          overlap_frame, sum_len, overlap);
00458 #endif
00459 
00460 #ifdef PREFER_GRAPH_CM
00461 #ifdef CDEBUG2
00462   printf("  cm=%f, %f", w1->graph_cm, w2->graph_cm);
00463 #endif
00464   sim = overlap * w1->graph_cm * w2->graph_cm;
00465 #else 
00466 #ifdef CDEBUG2
00467   printf("  cm=%f, %f", w1->cmscore, w2->cmscore);
00468 #endif
00469   sim = overlap * w1->cmscore * w2->cmscore;
00470 #endif
00471 
00472 #ifdef CDEBUG2
00473   printf("  similarity=%f\n", sim);
00474 #endif
00475 
00476   return sim;
00477 }
00478 
00488 static PROB
00489 get_cluster_intraword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo)
00490 {
00491   int i1, i2;
00492   PROB simmax, sim;
00493 
00494   simmax = 0.0;
00495   for(i1 = 0; i1 < c1->wgnum; i1++) {
00496     for(i2 = 0; i2 < c2->wgnum; i2++) {
00497       if (is_same_word(c1->wg[i1]->wid, c2->wg[i2]->wid, winfo)) {
00498         //if (graph_ordered(c1->wg[i1]->id, c2->wg[i2]->id)) continue;
00499         sim = get_intraword_similarity(c1->wg[i1], c2->wg[i2]);
00500         if (simmax < sim) simmax = sim;
00501       }
00502     }
00503   }
00504   return(simmax);
00505 }
00506 
00507 #ifdef CDEBUG
00508 
00515 static void
00516 put_cluster(FILE *fp, CN_CLUSTER *c, WORD_INFO *winfo)
00517 {
00518   int i;
00519 
00520   for(i=0;i<c->wgnum;i++) {
00521     fprintf(fp, "[%d:%s:%d..%d]", c->wg[i]->id, winfo->woutput[c->wg[i]->wid], c->wg[i]->lefttime, c->wg[i]->righttime);
00522   }
00523   printf("\n");
00524 }
00525 #endif
00526 
00536 static int
00537 minimum(int a, int b, int c)
00538 {
00539   int min;
00540 
00541   min = a;
00542   if (b < min)
00543     min = b;
00544   if (c < min)
00545     min = c;
00546   return min;
00547 }
00548 
00558 static int
00559 edit_distance(WORD_ID w1, WORD_ID w2, WORD_INFO *winfo)
00560 {
00561   static char b1[MAX_HMMNAME_LEN];
00562   static char b2[MAX_HMMNAME_LEN];
00563   int i1, i2;
00564   int *d;
00565   int len1, len2;
00566   int j;
00567   int cost;
00568   int distance;
00569 
00570   len1 = winfo->wlen[w1] + 1;
00571   len2 = winfo->wlen[w2] + 1;
00572   d = (int *)mymalloc(sizeof(int) * len1 * len2);
00573   for(j=0;j<len1;j++) d[j] = j;
00574   for(j=0;j<len2;j++) d[j*len1] = j;
00575 
00576   for(i1=1;i1<len1;i1++) {
00577     center_name(winfo->wseq[w1][i1-1]->name, b1);
00578     for(i2=1;i2<len2;i2++) {
00579       center_name(winfo->wseq[w2][i2-1]->name, b2);
00580       if (strmatch(b1, b2)) {
00581         cost = 0;
00582       } else {
00583         cost = 1;
00584       }
00585       d[i2 * len1 + i1] = minimum(d[(i2-1) * len1 + i1] + 1, d[i2 * len1 + (i1-1)] + 1, d[(i2-1) * len1 + (i1-1)] + cost);
00586     }
00587   }
00588 
00589   distance = d[len1 * len2 - 1];
00590 
00591   free(d);
00592   return(distance);
00593 }
00594 
00604 static PROB
00605 get_cluster_interword_similarity(CN_CLUSTER *c1, CN_CLUSTER *c2, WORD_INFO *winfo)
00606 {
00607   int i1, i2, j;
00608   WORD_ID w1, w2;
00609   PROB p1, p2;
00610   PROB sim, simsum;
00611   int simsum_count;
00612   int dist;
00613 
00614   /* order check */
00615   for(i1 = 0; i1 < c1->wgnum; i1++) {
00616     for(i2 = 0; i2 < c2->wgnum; i2++) {
00617       if (graph_ordered(c1->wg[i1]->id, c2->wg[i2]->id)) {
00618         /* ordered clusters should not be merged */
00619         //printf("Ordered:\n");
00620         //printf("c1:\n"); put_cluster(stdout, c1, winfo);
00621         //printf("c2:\n"); put_cluster(stdout, c2, winfo);
00622         return 0.0;
00623       }
00624     }
00625   }
00626 
00627 #ifdef CDEBUG2
00628   printf("-----\n");
00629   printf("c1:\n"); put_cluster(stdout, c1, winfo);
00630   printf("c2:\n"); put_cluster(stdout, c2, winfo);
00631 #endif
00632 
00633   /* compute similarity */
00634   simsum = 0.0;
00635   simsum_count = 0;
00636   for(i1 = 0; i1 < c1->wordsnum; i1++) {
00637     w1 = c1->words[i1];
00638     p1 = 0.0;
00639     for(j = 0; j < c1->wgnum; j++) {
00640       if (is_same_word(c1->wg[j]->wid, w1, winfo)) {
00641 #ifdef PREFER_GRAPH_CM
00642         p1 += c1->wg[j]->graph_cm;
00643 #else
00644         p1 += c1->wg[j]->cmscore;
00645 #endif
00646       }
00647     }
00648     for(i2 = 0; i2 < c2->wordsnum; i2++) {
00649       w2 = c2->words[i2];
00650       p2 = 0.0;
00651       for(j = 0; j < c2->wgnum; j++) {
00652         if (is_same_word(c2->wg[j]->wid, w2, winfo)) {
00653 #ifdef PREFER_GRAPH_CM
00654           p2 += c2->wg[j]->graph_cm;
00655 #else
00656           p2 += c2->wg[j]->cmscore;
00657 #endif
00658         }
00659       }
00660       dist = edit_distance(w1, w2, winfo);
00661 #ifdef CDEBUG2
00662       for(j=0;j<winfo->wlen[w1];j++) {
00663         printf("%s ", winfo->wseq[w1][j]->name);
00664       }
00665       printf("\n");
00666       for(j=0;j<winfo->wlen[w2];j++) {
00667         printf("%s ", winfo->wseq[w2][j]->name);
00668       }
00669       printf("\n");
00670       printf("distance=%d\n", dist);
00671 #endif
00672 
00673       sim = 1.0 - (float)dist / (float)(winfo->wlen[w1] + winfo->wlen[w2]);
00674 
00675 #ifdef CDEBUG2
00676       printf("(%s) - (%s): sim = %f, p1 = %f, p2 = %f\n", winfo->woutput[w1], winfo->woutput[w2], sim, p1, p2);
00677 #endif
00678 
00679       simsum += sim * p1 * p2;
00680       simsum_count++;
00681     }
00682   }
00683 
00684 #ifdef CDEBUG2
00685   printf("SIM=%f\n", simsum / simsum_count);
00686   printf("-----\n");
00687 #endif
00688 
00689   return(simsum / simsum_count);
00690 }
00691 
00692 
00705 CN_CLUSTER *
00706 confnet_create(WordGraph *root, RecogProcess *r)
00707 {
00708   CN_CLUSTER *croot;
00709   CN_CLUSTER *c, *cc, *cmax1, *cmax2;
00710   WordGraph *wg;
00711   PROB sim, max_sim;
00712   int wg_totalnum, n, i;
00713 
00714   /* make initial confnet instances from word graph */
00715   croot = NULL;
00716   wg_totalnum = 0;
00717   for(wg=root;wg;wg=wg->next) {
00718     c = cn_new();
00719     cn_add_wg(c, wg);
00720     c->next = croot;
00721     croot = c;
00722     wg_totalnum++;
00723   }
00724 
00725   /* intraword clustering iteration */
00726   do {
00727     /* find most similar pair */
00728     max_sim = 0.0;
00729     for(c=croot;c;c=c->next) {
00730       for(cc=c->next;cc;cc=cc->next) {
00731         sim = get_cluster_intraword_similarity(c, cc, r->lm->winfo);
00732         if (max_sim < sim) {
00733           max_sim = sim;
00734           cmax1 = c;
00735           cmax2 = cc;
00736         }
00737       }
00738     }
00739     /* merge the maximum one if exist */
00740     if (max_sim != 0.0) {
00741 #ifdef CDEBUG
00742       printf(">>> max_sim = %f\n", max_sim);
00743       put_cluster(stdout, cmax1, r->lm->winfo);
00744       put_cluster(stdout, cmax2, r->lm->winfo);
00745 #endif
00746       cn_merge(cmax1, cmax2);
00747       cn_destroy(cmax2, &croot);
00748     }
00749   } while (max_sim != 0.0); /* loop until no more similar pair exists */
00750 
00751   n = 0;
00752   for(c=croot;c;c=c->next) n++;
00753   if (verbose_flag) jlog("STAT: confnet: %d words -> %d clusters by intra-word clustering\n", wg_totalnum, n);
00754 
00755 #ifdef CDEBUG
00756   printf("---- result of intra-word clustering ---\n");
00757   i = 0;
00758   for(c=croot;c;c=c->next) {
00759     printf("%d :", i);
00760     put_cluster(stdout, c, r->lm->winfo);
00761 #ifdef CDEBUG2
00762     for(i=0;i<c->wgnum;i++) {
00763       printf("    ");
00764       put_wordgraph(stdout, c->wg[i], r->lm->winfo);
00765     }
00766 #endif
00767     i++;
00768   }
00769   printf("----------------------------\n");
00770 #endif
00771 
00772   /* inter-word clustering */
00773   do {
00774     /* build word list for each cluster */
00775     for(c=croot;c;c=c->next) cn_build_wordlist(c, r->lm->winfo);
00776     /* find most similar pair */
00777     max_sim = 0.0;
00778     for(c=croot;c;c=c->next) {
00779       for(cc=c->next;cc;cc=cc->next) {
00780         sim = get_cluster_interword_similarity(c, cc, r->lm->winfo);
00781         if (max_sim < sim) {
00782           max_sim = sim;
00783           cmax1 = c;
00784           cmax2 = cc;
00785         }
00786       }
00787     }
00788     /* merge the maximum one if exist */
00789     if (max_sim != 0.0) {
00790 #ifdef CDEBUG
00791       printf(">>> max_sim = %f\n", max_sim);
00792       put_cluster(stdout, cmax1, r->lm->winfo);
00793       put_cluster(stdout, cmax2, r->lm->winfo);
00794 #endif
00795       cn_merge(cmax1, cmax2);
00796       cn_destroy(cmax2, &croot);
00797     }
00798   } while (max_sim != 0.0); /* loop until no more similar pair exists */
00799 
00800   n = 0;
00801   for(c=croot;c;c=c->next) n++;
00802   if (verbose_flag) jlog("STAT: confnet: -> %d clusters by inter-word clustering\n", n);
00803 
00804   /* compute posterior probabilities and insert NULL entry */
00805   {
00806     PROB p, psum;
00807     int j;
00808 
00809     for(c=croot;c;c=c->next) {
00810       psum = 0.0;
00811       c->pp = (LOGPROB *)mymalloc(sizeof(LOGPROB) * (c->wordsnum + 1));
00812       for(i=0;i<c->wordsnum;i++) {
00813         p = 0.0;
00814         for(j = 0; j < c->wgnum; j++) {
00815           if (is_same_word(c->wg[j]->wid, c->words[i], r->lm->winfo)) {
00816 #ifdef PREFER_GRAPH_CM
00817             p += c->wg[j]->graph_cm;
00818 #else
00819             p += c->wg[j]->cmscore;
00820 #endif
00821           }
00822         }
00823         c->pp[i] = p;
00824         psum += p;
00825       }
00826       if (psum < 1.0) {
00827         c->words[c->wordsnum] = WORD_INVALID;
00828         c->pp[c->wordsnum] = 1.0 - psum;
00829         c->wordsnum++;
00830       }
00831     }
00832   }
00833 
00834   /* sort the words in each cluster by their posterior probabilities */
00835   {
00836     int j;
00837     WORD_ID wtmp;
00838     LOGPROB ltmp;
00839     for(c=croot;c;c=c->next) {
00840       for(i=0;i<c->wordsnum;i++) {
00841         for(j=c->wordsnum - 1;j>i;j--) {
00842           if (c->pp[j-1] < c->pp[j]) {
00843             ltmp = c->pp[j-1];
00844             c->pp[j-1] = c->pp[j];
00845             c->pp[j] = ltmp;
00846             wtmp = c->words[j-1];
00847             c->words[j-1] = c->words[j];
00848             c->words[j] = wtmp;
00849           }
00850         }
00851       }
00852     }
00853   }
00854 
00855   /* re-order clusters by their beginning frames */
00856   {
00857     CN_CLUSTER **clist;
00858     int k;
00859 
00860     /* sort cluster list by the left frame*/
00861     clist = (CN_CLUSTER **)mymalloc(sizeof(CN_CLUSTER *) * n);
00862     for(i=0,c=croot;c;c=c->next) clist[i++] = c;
00863     qsort(clist, n, sizeof(CN_CLUSTER *), (int (*)(const void *, const void *))compare_cluster);
00864     croot = NULL;
00865     for(k=0;k<n;k++) {
00866       if (k == 0) croot = clist[k];
00867       if (k == n - 1) clist[k]->next = NULL;
00868       else clist[k]->next = clist[k+1];
00869     }
00870   }
00871 
00872 
00873 #if 0
00874   /* output */
00875   printf("---- begin confusion network ---\n");
00876   for(c=croot;c;c=c->next) {
00877     for(i=0;i<c->wordsnum;i++) {
00878       printf("(%s:%.3f)", (c->words[i] == WORD_INVALID) ? "-" : r->lm->winfo->woutput[c->words[i]], c->pp[i]);
00879       if (i == 0) printf("  ");
00880     }
00881     printf("\n");
00882   }
00883   printf("---- end confusion network ---\n");
00884 #endif
00885 
00886   return(croot);
00887 }
00888 
00889 /* end of file */