00001
00019
00020
00021
00022
00023
00024
00025
00026 #include <sent/stddefs.h>
00027 #include <sent/vocabulary.h>
00028 #include <sent/htk_hmm.h>
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
/* Initial capacity, and growth increment, (in entries) of the scratch
   phone-sequence buffer used while parsing one word-list line. */
00045 #define PHONEMELEN_STEP 30
/* Line buffer that voca_load_wordlist() reads each input line into. */
00046 static char buf[MAXLINELEN];
/* Copy of the line currently being parsed, taken before tokenizing and
   printed in error messages (presumably because the tokenizer modifies the
   original buffer -- TODO confirm). */
00047 static char bufbak[MAXLINELEN];
00048
00049
00055 static void
00056 add_to_error(WORD_INFO *winfo, char *name)
00057 {
00058 char *buf;
00059 char *match;
00060
00061 buf = (char *)mymalloc(strlen(name) + 1);
00062 strcpy(buf, name);
00063 if (winfo->errph_root == NULL) {
00064 winfo->errph_root = aptree_make_root_node(buf);
00065 } else {
00066 match = aptree_search_data(buf, winfo->errph_root);
00067 if (match == NULL || !strmatch(match, buf)) {
00068 aptree_add_entry(buf, buf, match, &(winfo->errph_root));
00069 }
00070 }
00071 }
00072
/**
 * Traversal callback: log one stored missing-phone name.
 *
 * @param x  tree entry data, treated as a NUL-terminated phone name
 */
static void
callback_list_error(void *x)
{
  const char *phone = (const char *)x;

  jlog("Error: voca_load_wordlist: %s\n", phone);
}
00090 static void
00091 list_error(WORD_INFO *winfo)
00092 {
00093 jlog("Error: voca_load_wordlist: begin missing phones\n");
00094 aptree_traverse_and_do(winfo->errph_root, callback_list_error);
00095 jlog("Error: voca_load_wordlist: end missing phones\n");
00096 }
00097
00113 boolean
00114 voca_load_word_line(char *buf, WORD_INFO *winfo, HTK_HMM_INFO *hmminfo, char *headphone, char *tailphone, char *contextphone)
00115 {
00116 WORD_ID vnum;
00117
00118 winfo->linenum++;
00119 vnum = winfo->num;
00120 if (vnum >= winfo->maxnum) {
00121 if (winfo_expand(winfo) == FALSE) return FALSE;
00122 }
00123 if (voca_load_wordlist_line(buf, &vnum, winfo->linenum, winfo, hmminfo, winfo->do_conv, &(winfo->ok_flag), headphone, tailphone, contextphone) == FALSE) {
00124 return FALSE;
00125 }
00126 winfo->num = vnum;
00127 return TRUE;
00128 }
00141 boolean
00142 voca_load_wordlist(FILE *fp, WORD_INFO *winfo, HTK_HMM_INFO *hmminfo, char *headphone, char *tailphone, char *contextphone)
00143 {
00144 boolean ret;
00145
00146 voca_load_start(winfo, hmminfo, FALSE);
00147 while (getl(buf, sizeof(buf), fp) != NULL) {
00148 if (voca_load_word_line(buf, winfo, hmminfo, headphone, tailphone, contextphone) == FALSE) break;
00149 }
00150 ret = voca_load_end(winfo);
00151
00152 return(ret);
00153 }
00154
00171 boolean
00172 voca_load_wordlist_line(char *buf, WORD_ID *vnum_p, int linenum, WORD_INFO *winfo, HTK_HMM_INFO *hmminfo, boolean do_conv, boolean *ok_flag, char *headphone, char *tailphone, char *contextphone)
00173 {
00174 char *ptmp, *lp = NULL, *p;
00175 static char cbuf[MAX_HMMNAME_LEN];
00176 static HMM_Logical **tmpwseq = NULL;
00177 static int tmpmaxlen;
00178 int len;
00179 HMM_Logical *tmplg;
00180 boolean pok, first;
00181 int vnum;
00182
00183 vnum = *vnum_p;
00184
00185 if (strmatch(buf, "DICEND")) return FALSE;
00186
00187
00188 if (tmpwseq == NULL) {
00189 tmpmaxlen = PHONEMELEN_STEP;
00190 tmpwseq = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * tmpmaxlen);
00191 }
00192
00193
00194 strcpy(bufbak, buf);
00195
00196
00197 if ((ptmp = mystrtok_quote(buf, " \t\n")) == NULL) {
00198 jlog("Error: voca_load_wordlist: line %d: corrupted data:\n> %s\n", linenum, bufbak);
00199 winfo->errnum++;
00200 *ok_flag = FALSE;
00201 return TRUE;
00202 }
00203 winfo->wname[vnum] = strcpy((char *)mybmalloc2(strlen(ptmp)+1, &(winfo->mroot)), ptmp);
00204
00205
00206 winfo->is_transparent[vnum] = FALSE;
00207
00208
00209 if ((ptmp = mystrtok_movetonext(NULL, " \t\n")) == NULL) {
00210 jlog("Error: voca_load_wordlist: line %d: corrupted data:\n> %s\n", linenum, bufbak);
00211 winfo->errnum++;
00212 *ok_flag = FALSE;
00213 return TRUE;
00214 }
00215 #ifdef CLASS_NGRAM
00216 winfo->cprob[vnum] = 0.0;
00217 #endif
00218
00219 if (ptmp[0] == '@') {
00220 #ifdef CLASS_NGRAM
00221
00222
00223
00224
00225
00226 if ((ptmp = mystrtok(NULL, " \t\n")) == NULL) {
00227 jlog("Error: voca_load_wordlist: line %d: corrupted data:\n> %s\n", linenum, bufbak);
00228 winfo->errnum++;
00229 *ok_flag = FALSE;
00230 return TRUE;
00231 }
00232 if (ptmp[1] == '\0') {
00233 jlog("Error: voca_load_wordlist: line %d: value after '@' missing, maybe wrong space?\n> %s\n", linenum, bufbak);
00234 winfo->errnum++;
00235 *ok_flag = FALSE;
00236 return TRUE;
00237 }
00238 winfo->cprob[vnum] = atof(&(ptmp[1]));
00239 if (winfo->cprob[vnum] != 0.0) winfo->cwnum++;
00240
00241 if ((ptmp = mystrtok(NULL, " \t\n")) == NULL) {
00242 jlog("Error: voca_load_wordlist: line %d: corrupted data:\n> %s\n", linenum,bufbak);
00243 winfo->errnum++;
00244 *ok_flag = FALSE;
00245 return TRUE;
00246 }
00247
00248 if ((ptmp = mystrtok_movetonext(NULL, " \t\n")) == NULL) {
00249 jlog("Error: voca_load_wordlist: line %d: corrupted data:\n> %s\n", linenum, bufbak);
00250 winfo->errnum++;
00251 *ok_flag = FALSE;
00252 return TRUE;
00253 }
00254 #else
00255 jlog("Error: voca_load_wordlist: line %d: cannot handle in-class word probability\n> %s\n", linenum, ptmp, bufbak);
00256 winfo->errnum++;
00257 *ok_flag = FALSE;
00258 return TRUE;
00259 #endif
00260 }
00261
00262
00263 switch(ptmp[0]) {
00264 case '[':
00265 ptmp = mystrtok_quotation(NULL, " \t\n", '[', ']', 0);
00266 break;
00267 case '{':
00268 ptmp = mystrtok_quotation(NULL, " \t\n", '{', '}', 0);
00269 break;
00270 default:
00271
00272
00273 ptmp = winfo->wname[vnum];
00274 }
00275 if (ptmp == NULL) {
00276 jlog("Error: voca_load_htkdict: line %d: corrupted data:\n> %s\n", linenum, bufbak);
00277 winfo->errnum++;
00278 *ok_flag = FALSE;
00279 return TRUE;
00280 }
00281 winfo->woutput[vnum] = strcpy((char *)mybmalloc2(strlen(ptmp)+1, &(winfo->mroot)), ptmp);
00282
00283
00284 if (hmminfo == NULL) {
00285
00286 winfo->wseq[vnum] = NULL;
00287 winfo->wlen[vnum] = 0;
00288 } else {
00289
00290 len = 0;
00291 first = TRUE;
00292 pok = TRUE;
00293
00294 for (;;) {
00295 if (do_conv) {
00296 if (first) {
00297
00298 cycle_triphone(NULL);
00299
00300 if (contextphone) {
00301 cycle_triphone(contextphone);
00302 } else {
00303 cycle_triphone("NULL_C");
00304 }
00305 if ((lp = mystrtok(NULL, " \t\n")) == NULL) {
00306 jlog("Error: voca_load_wordlist: line %d: word %s has no phoneme:\n> %s\n", linenum, winfo->wname[vnum], bufbak);
00307 winfo->errnum++;
00308 *ok_flag = FALSE;
00309 return TRUE;
00310 }
00311 p = cycle_triphone(lp);
00312 first = FALSE;
00313 } else {
00314 if (lp != NULL) {
00315 lp = mystrtok(NULL, " \t\n");
00316 if (lp != NULL) {
00317
00318 p = cycle_triphone(lp);
00319 } else {
00320
00321 if (contextphone) {
00322 p = cycle_triphone(contextphone);
00323 } else {
00324 p = cycle_triphone("NULL_C");
00325 }
00326 }
00327 } else {
00328
00329 p = cycle_triphone_flush();
00330 }
00331 }
00332 } else {
00333 if (first) {
00334 p = lp = headphone;
00335 first = FALSE;
00336 } else {
00337 if (lp != NULL) {
00338 p = lp = mystrtok(NULL, " \t\n");
00339
00340 if (lp == NULL) p = tailphone;
00341 } else {
00342
00343 p = NULL;
00344 }
00345 }
00346 }
00347 if (p == NULL) break;
00348
00349
00350 if (do_conv) {
00351 center_name(p, cbuf);
00352 if (contextphone) {
00353 if (strmatch(cbuf, contextphone)) {
00354 if (len == 0) {
00355 p = headphone;
00356 } else if (lp == NULL) {
00357 p = tailphone;
00358 }
00359 }
00360 } else {
00361 if (strmatch(cbuf, "NULL_C")) {
00362 if (len == 0) {
00363 p = headphone;
00364 } else if (lp == NULL) {
00365 p = tailphone;
00366 }
00367 } else {
00368 if (strnmatch(p, "NULL_C", 6)) {
00369 if (strnmatch(&(p[strlen(p)-6]), "NULL_C", 6)) {
00370 p = cbuf;
00371 } else {
00372 p = rightcenter_name(p, cbuf);
00373 }
00374 } else if (strnmatch(&(p[strlen(p)-6]), "NULL_C", 6)) {
00375 p = leftcenter_name(p, cbuf);
00376 }
00377 }
00378 }
00379 }
00380
00381
00382
00383 tmplg = htk_hmmdata_lookup_logical(hmminfo, p);
00384 if (tmplg == NULL) {
00385
00386 if (do_conv) {
00387
00388 jlog("Error: voca_load_wordlist: line %d: logical phone \"%s\" not found\n", linenum, p);
00389 snprintf(cbuf,MAX_HMMNAME_LEN,"%s", p);
00390 } else {
00391 jlog("Error: voca_load_wordlist: line %d: phone \"%s\" not found\n", linenum, p);
00392 snprintf(cbuf, MAX_HMMNAME_LEN, "%s", p);
00393 }
00394 add_to_error(winfo, cbuf);
00395 pok = FALSE;
00396 } else {
00397
00398 if (len >= tmpmaxlen) {
00399
00400 tmpmaxlen += PHONEMELEN_STEP;
00401 tmpwseq = (HMM_Logical **)myrealloc(tmpwseq, sizeof(HMM_Logical *) * tmpmaxlen);
00402 }
00403
00404 tmpwseq[len] = tmplg;
00405 }
00406 len++;
00407 }
00408 if (!pok) {
00409 jlog("Error: voca_load_wordlist: the line content was: %s\n", bufbak);
00410 winfo->errnum++;
00411 *ok_flag = FALSE;
00412 return TRUE;
00413 }
00414 if (len == 0) {
00415 jlog("Error: voca_load_wordlist: line %d: no phone specified:\n> %s\n", linenum, bufbak);
00416 winfo->errnum++;
00417 *ok_flag = FALSE;
00418 return TRUE;
00419 }
00420
00421 winfo->wseq[vnum] = (HMM_Logical **)mybmalloc2(sizeof(HMM_Logical *) * len, &(winfo->mroot));
00422 memcpy(winfo->wseq[vnum], tmpwseq, sizeof(HMM_Logical *) * len);
00423 winfo->wlen[vnum] = len;
00424 }
00425
00426 vnum++;
00427 *vnum_p = vnum;
00428
00429 return(TRUE);
00430 }
00431