libsent/src/ngram/ngram_access.c

Go to the documentation of this file.
00001 
00018 /*
00019  * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
00020  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00021  * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
00022  * All rights reserved
00023  */
00024 
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027 
00038 NNID
00039 search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)
00040 {
00041   /* do binary search */
00042   /* assume that data in (bigrams) are ordered by wid */
00043   NNID left,right,mid;          /* n2 */
00044 
00045   if ((left = ndata->n2_bgn[w_l]) == NNID_INVALID) /* has no bigram */
00046     return (NNID_INVALID);
00047   right = left + ndata->n2_num[w_l] - 1;
00048   while(left < right) {
00049     mid = (left + right) / 2;
00050     if (ndata->n2tonid[mid] < w_r) {
00051       left = mid + 1;
00052     } else {
00053       right = mid;
00054     }
00055   }
00056   if (ndata->n2tonid[left] == w_r) {
00057     return (left);
00058   } else {
00059     return (NNID_INVALID);
00060   }
00061 }
00062 
00076 static NNID
00077 search_trigram_v3(NGRAM_INFO *ndata, NNID n2, WORD_ID wkey)
00078 {
00079   /* do binary search */
00080   /* assume that data in (trigrams) are ordered by wid */
00081   int left,right,mid;
00082 
00083   if ((left = ndata->n3_bgn[n2]) == NNID_INVALID)       /* has no bigram */
00084     return (NNID_INVALID);
00085   right = left + ndata->n3_num[n2] - 1;
00086   while(left < right) {
00087     mid = (left + right) / 2;
00088     if (ndata->n3tonid[mid] < wkey) {
00089       left = mid + 1;
00090     } else {
00091       right = mid;
00092     }
00093   }
00094   if (ndata->n3tonid[left] == wkey) {
00095     return (left);
00096   } else {
00097     return (NNID_INVALID);
00098   }
00099 }
00100 
00114 static NNID
00115 search_trigram_v4(NGRAM_INFO *ndata, NNID n2, WORD_ID wkey)
00116 {
00117   /* do binary search */
00118   /* assume that data in (trigrams) are ordered by wid */
00119   NNID left,right,mid,boid;
00120 
00121   if ((boid = ndata->n2bo_upper[n2]) == NNID_INVALID_UPPER)     /* has no bigram */
00122      return (NNID_INVALID);
00123   boid = (boid << 16) + (NNID)(ndata->n2bo_lower[n2]);
00124   left = ((NNID)(ndata->n3_bgn_upper[boid]) << 16) + (NNID)(ndata->n3_bgn_lower[boid]);
00125   right = left + ndata->n3_num[boid] - 1;
00126   while(left < right) {
00127     mid = (left + right) / 2;
00128     if (ndata->n3tonid[mid] < wkey) {
00129       left = mid + 1;
00130     } else {
00131       right = mid;
00132     }
00133   }
00134   if (ndata->n3tonid[left] == wkey) {
00135     return (left);
00136   } else {
00137     return (NNID_INVALID);
00138   }
00139 }
00140 
00141 
00142 /* ---------------------------------------------------------------------- */
00143 /* for 1-gram */
00144 
00153 LOGPROB
00154 uni_prob(NGRAM_INFO *ndata, WORD_ID w)
00155 {
00156   if (w != ndata->unk_id) {
00157     return(ndata->p[w]);
00158   } else {
00159     return(ndata->p[w] - ndata->unk_num_log);
00160   }
00161 }
00162 
00163 /* for 2-gram */
00174 LOGPROB
00175 bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
00176 {
00177   NNID n2;
00178   LOGPROB prob;
00179 
00180   if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) {
00181     /* bigram exist */
00182     prob = ndata->p_lr[n2];
00183   } else {
00184     /* bigram not exist, return back-off prob */
00185     /* bo_wt_lr(w1) * p(w2) */
00186     prob = ndata->bo_wt_lr[w1] + ndata->p[w2];
00187   }
00188   if (w2 != ndata->unk_id) {
00189     return(prob);
00190   } else {
00191     return(prob - ndata->unk_num_log);
00192   }
00193 }
00194 
00205 LOGPROB
00206 bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
00207 {
00208   NNID n2;
00209   LOGPROB prob;
00210 
00211   if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) {
00212     /* bigram exist */
00213     prob = ndata->p_rl[n2];
00214   } else {
00215     /* bigram not exist, return back-off prob */
00216     /* bo_wt_rl(w2) * p(w1) */
00217     prob = ndata->bo_wt_rl[w2] + ndata->p[w1];
00218   }
00219   if (w1 != ndata->unk_id) {
00220     return(prob);
00221   } else {
00222     return(prob - ndata->unk_num_log);
00223   }
00224 }
00225 
00226 /* for 3-gram */
00238 LOGPROB
00239 tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)
00240 {
00241   NNID n2, n3;
00242   int boid;
00243   
00244   if ((n2 = search_bigram(ndata, w2, w3)) != NNID_INVALID) {
00245     switch(ndata->version) {
00246     case 4:
00247       n3 = search_trigram_v4(ndata, n2, w1);
00248       break;
00249     case 3:
00250       n3 = search_trigram_v3(ndata, n2, w1);
00251       break;
00252     }
00253     if (n3 != NNID_INVALID) {
00254       /* trigram exist */
00255       if (w1 != ndata->unk_id) {
00256         return(ndata->p_rrl[n3]);
00257       } else {
00258         return(ndata->p_rrl[n3] - ndata->unk_num_log);
00259       }
00260     } else {
00261       /* return back-off prob */
00262       /* bo_wt_rl(w2,w3) * p(w1|w2) */
00263       /* unk will be discounted at bi-gram */
00264       switch(ndata->version) {
00265       case 4:
00266         if ((boid = ndata->n2bo_upper[n2]) == NNID_INVALID_UPPER) {     /* has no bigram */
00267           return(bi_prob_rl(ndata, w1, w2));
00268         } else {
00269           boid = (boid << 16) + (NNID)(ndata->n2bo_lower[n2]);
00270           return(ndata->bo_wt_rrl[boid] + bi_prob_rl(ndata, w1, w2)); 
00271         }
00272         break;
00273       case 3:
00274         return(ndata->bo_wt_rrl[n2] + bi_prob_rl(ndata, w1, w2));
00275         break;
00276       }
00277     }
00278   }
00279   /* context not exist, so return bigram prob */
00280   return(bi_prob_rl(ndata, w1, w2));
00281 }

Generated on Tue Dec 26 16:16:33 2006 for Julius by  doxygen 1.5.0