/*
 * Copyright 2007 LORIA, France.
 * All Rights Reserved. Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package edu.cmu.sphinx.linguist.acoustic.tiedstate.HTK;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * HTK is case-sensitive, S4 is not.
 *
 * One must then first convert the HMM names to upper-case, resolve conflicts,
 * and use the same conversion to convert the lexicons and grammar. This tool
 * does it: for each input file F, a converted copy is written to F + ".conv".
 *
 * @author Christophe Cerisara
 */
public class NamesConversion {

    /** Maps original (case-sensitive) phone names to their unique upper-case replacement. */
    final HashMap<String, String> phoneConv = new HashMap<String, String>();

    /** Maps original (case-sensitive) word names to their unique upper-case replacement. */
    final HashMap<String, String> wordConv = new HashMap<String, String>();

    // Result slots filled by split3ph(): left/right triphone context
    // (null when absent) and the base phone.
    String left, base, right;

    // Outcome of scanning one n-gram section in convertWordGrammar().
    private static final int SECTION_EOF = 0;   // input ended inside the section
    private static final int SECTION_NEXT = 1;  // next-section marker reached
    private static final int SECTION_END = 2;   // "\end\" marker reached

    public NamesConversion() {
    }

    /**
     * Registers a conversion for {@code item} if none exists yet: the
     * upper-case form of the name, suffixed with "_X" as many times as
     * needed to avoid colliding with a replacement already assigned to a
     * different original name.
     *
     * @param item original (case-sensitive) name
     * @param conv conversion table to update (phone or word table)
     */
    void addInConv(String item, HashMap<String, String> conv) {
        if (!conv.containsKey(item)) {
            // Locale.ROOT keeps the mapping deterministic regardless of the
            // JVM default locale (e.g. the Turkish dotted/dotless 'i').
            String cand = item.toUpperCase(Locale.ROOT);
            while (conv.containsValue(cand)) {
                // Conflict with an existing replacement: disambiguate.
                cand = cand + "_X";
            }
            conv.put(item, cand);
        }
    }

    /**
     * Scans an HTK MMF (master macro file) and registers a phone conversion
     * for every model name found in a ~h definition, including the left and
     * right contexts of triphone names.
     *
     * @param MMFfile path of the MMF file to scan
     */
    void buildPhoneConversion(String MMFfile) {
        try {
            BufferedReader bf = new BufferedReader(new FileReader(MMFfile));
            try {
                String s;
                while ((s = bf.readLine()) != null) {
                    if (s.indexOf("~h") >= 0) {
                        // The model name is quoted: ~h "l-b+r"
                        int i = s.indexOf('"');
                        int j = s.lastIndexOf('"');
                        String nom = s.substring(i + 1, j);
                        split3ph(nom);
                        if (left != null) addInConv(left, phoneConv);
                        if (base != null) addInConv(base, phoneConv);
                        if (right != null) addInConv(right, phoneConv);
                    }
                }
            } finally {
                bf.close(); // was leaked on I/O error in the original
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Registers a word conversion for the first token of every line of an
     * HTK lexicon file.
     *
     * @param lexFile path of the lexicon file to scan
     */
    void buildWordConversion(String lexFile) {
        try {
            BufferedReader bf = new BufferedReader(new FileReader(lexFile));
            try {
                String s;
                while ((s = bf.readLine()) != null) {
                    StringTokenizer st = new StringTokenizer(s);
                    if (st.hasMoreTokens()) {
                        addInConv(st.nextToken(), wordConv);
                    }
                }
            } finally {
                bf.close(); // was leaked on I/O error in the original
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits an HTK phone name of the form "l-b+r" into its parts, storing
     * them in {@link #left}, {@link #base} and {@link #right}. The left and
     * right fields are null when the corresponding context is absent.
     *
     * @param nom phone name (monophone, or triphone with - and + contexts)
     */
    void split3ph(String nom) {
        int i = nom.indexOf('-');
        left = (i >= 0) ? nom.substring(0, i) : null;
        // When there is no '-', i is -1 and substring(0) is the whole name.
        String s = nom.substring(i + 1);
        int j = s.indexOf('+');
        right = (j >= 0) ? s.substring(j + 1) : null;
        base = (j >= 0) ? s.substring(0, j) : s;
    }

    /**
     * Converts the triphone currently held in left/base/right (as set by
     * {@link #split3ph}) using the phone conversion table.
     *
     * Exits the process when the base phone has no conversion: writing the
     * literal string "null" would silently corrupt the output file. (The
     * original check, rep.equals("null"), only caught context-free phones.)
     *
     * @return converted phone name in "L-B+R" form
     */
    String conv3ph() {
        String convBase = conv1ph(base);
        if (convBase == null) {
            System.err.println("detson error " + left + ' ' + base + ' ' + right);
            System.exit(1);
        }
        StringBuilder rep = new StringBuilder();
        if (left != null) {
            rep.append(conv1ph(left)).append('-');
        }
        rep.append(convBase);
        if (right != null) {
            rep.append('+').append(conv1ph(right));
        }
        return rep.toString();
    }

    /**
     * @param p original phone name
     * @return its converted form, or null when the phone is unknown
     */
    String conv1ph(String p) {
        return phoneConv.get(p);
    }

    /**
     * Rewrites an MMF file with converted model names; all other lines are
     * copied verbatim. Output goes to MMFfile + ".conv".
     *
     * @param MMFfile path of the MMF file to convert
     */
    void convertMMF(String MMFfile) {
        try {
            BufferedReader bf = new BufferedReader(new FileReader(MMFfile));
            try {
                PrintWriter pf = new PrintWriter(new FileWriter(MMFfile + ".conv"));
                try {
                    String s;
                    while ((s = bf.readLine()) != null) {
                        if (s.indexOf("~h") >= 0) {
                            int i = s.indexOf('"');
                            int j = s.lastIndexOf('"');
                            split3ph(s.substring(i + 1, j));
                            pf.println("~h \"" + conv3ph() + '"');
                        } else {
                            pf.println(s);
                        }
                    }
                } finally {
                    pf.close(); // was leaked (and unflushed) on I/O error
                }
            } finally {
                bf.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies lines verbatim up to and including the first line that starts
     * with {@code marker}.
     *
     * @return true when the marker was found, false on end of input
     */
    private boolean copyUntil(BufferedReader bf, PrintWriter pf, String marker)
            throws IOException {
        String s;
        while ((s = bf.readLine()) != null) {
            pf.println(s);
            if (s.indexOf(marker) == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts one n-gram section of an ARPA language model: each line is
     * "logprob word1 .. wordN [backoff]". The first token (probability) and
     * any trailing tokens are copied as-is; up to {@code nWords} word tokens
     * are replaced through the word conversion table. Blank lines are
     * dropped, as in the original implementation.
     *
     * @param nWords     number of word tokens to convert per line
     * @param nextMarker header of the following section, or null for the last one
     * @param addMissing when true, words absent from the lexicon get a new
     *                   conversion (with a warning); otherwise they are kept as-is
     * @return SECTION_NEXT, SECTION_END or SECTION_EOF
     */
    private int convertNgramSection(BufferedReader bf, PrintWriter pf, int nWords,
            String nextMarker, boolean addMissing) throws IOException {
        String s;
        while ((s = bf.readLine()) != null) {
            if (nextMarker != null && s.indexOf(nextMarker) == 0) {
                pf.println(s);
                return SECTION_NEXT;
            }
            if (s.indexOf("\\end\\") == 0) {
                pf.println(s);
                return SECTION_END;
            }
            StringTokenizer st = new StringTokenizer(s);
            if (!st.hasMoreTokens()) {
                continue;
            }
            pf.print(st.nextToken() + ' '); // log-probability, copied verbatim
            for (int w = 0; w < nWords && st.hasMoreTokens(); w++) {
                String mot = st.nextToken();
                String newmot = wordConv.get(mot);
                if (newmot == null) {
                    if (addMissing) {
                        // The word is not in the lexicon: build a new
                        // converted item so the grammar stays consistent.
                        System.err.println("WARNING word " + mot + " not in lexicon !");
                        addInConv(mot, wordConv);
                        newmot = wordConv.get(mot);
                    } else {
                        newmot = mot;
                    }
                }
                pf.print(newmot + ' ');
            }
            while (st.hasMoreTokens()) {
                pf.print(st.nextToken() + ' '); // back-off weight, etc.
            }
            pf.println();
        }
        return SECTION_EOF;
    }

    /**
     * Rewrites an ARPA word grammar with converted word names. The header up
     * to "\1-grams:" is copied verbatim; the 1-, 2- and 3-gram sections are
     * converted; everything after "\end\" is dropped (as before). Output
     * goes to gramFile + ".conv".
     *
     * @param gramFile path of the ARPA grammar file to convert
     */
    void convertWordGrammar(String gramFile) {
        try {
            BufferedReader bf = new BufferedReader(new FileReader(gramFile));
            try {
                PrintWriter pf = new PrintWriter(new FileWriter(gramFile + ".conv"));
                try {
                    if (!copyUntil(bf, pf, "\\data\\")) return;
                    if (!copyUntil(bf, pf, "\\1-grams:")) return;
                    // Only the 1-gram section registers missing words; the
                    // 2/3-gram sections keep unknown words unchanged.
                    int state = convertNgramSection(bf, pf, 1, "\\2-grams:", true);
                    if (state == SECTION_NEXT) {
                        state = convertNgramSection(bf, pf, 2, "\\3-grams:", false);
                    }
                    if (state == SECTION_NEXT) {
                        convertNgramSection(bf, pf, 3, null, false);
                    }
                } finally {
                    pf.close(); // closes on every path, including I/O errors
                }
            } finally {
                bf.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Rewrites a lexicon (or filler) file, converting the head word through
     * the word table and every phone through the phone table. A Julius-style
     * output string enclosed in [...] after the word is skipped. Output goes
     * to lexFile + ".conv".
     *
     * @param lexFile path of the lexicon file to convert
     */
    void convertLexicon(String lexFile) {
        try {
            BufferedReader bf = new BufferedReader(new FileReader(lexFile));
            try {
                PrintWriter pf = new PrintWriter(new FileWriter(lexFile + ".conv"));
                try {
                    String s;
                    while ((s = bf.readLine()) != null) {
                        StringTokenizer st = new StringTokenizer(s);
                        if (!st.hasMoreTokens()) {
                            continue;
                        }
                        String mot = st.nextToken();
                        String newmot = wordConv.get(mot);
                        if (newmot != null) {
                            mot = newmot;
                        }
                        pf.print(mot + ' ');
                        while (st.hasMoreTokens()) {
                            String ph = st.nextToken();
                            // Julius format: skip the output string between [..]
                            // NOTE(review): assumes a closing ']' and at least
                            // one phone after it, as the original did.
                            if (ph.charAt(0) == '[') {
                                while (!ph.endsWith("]")) {
                                    ph = st.nextToken();
                                }
                                ph = st.nextToken();
                            }
                            split3ph(ph);
                            pf.print(conv3ph() + ' ');
                        }
                        pf.println();
                    }
                } finally {
                    pf.close(); // closes on every path, including I/O errors
                }
            } finally {
                bf.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // TODO: support without filler, which shall be loaded from the HTK lexicon in the future

    /**
     * Command-line entry point.
     *
     * Options: -mmf file (required), -lex file, -filler file, -gram file.
     * Each converted file is written next to its input with a ".conv" suffix.
     *
     * @param args command-line options as above
     */
    public static void main(String[] args) {
        String MMFfile = null;
        String lexFile = null;
        String fillerFile = null;
        String gramFile = null;
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-lex")) {
                lexFile = args[++i];
            } else if (args[i].equals("-gram")) {
                gramFile = args[++i];
            } else if (args[i].equals("-mmf")) {
                MMFfile = args[++i];
            } else if (args[i].equals("-filler")) {
                fillerFile = args[++i];
            }
        }
        // output = same files + extension ".conv"
        if (MMFfile != null) {
            // Conversion of the phones and the words.
            NamesConversion nc = new NamesConversion();
            nc.buildPhoneConversion(MMFfile);
            if (lexFile != null) {
                // The original called this unconditionally and crashed with
                // a NullPointerException when -lex was not given.
                nc.buildWordConversion(lexFile);
            }
            System.out.println("converting phones in MMF to " + MMFfile + ".conv");
            nc.convertMMF(MMFfile);
            if (lexFile != null) {
                System.out.println("converting phones and words in lexicon to " + lexFile + ".conv");
                nc.convertLexicon(lexFile);
            }
            if (fillerFile != null) {
                System.out.println("converting phones in filler to " + fillerFile + ".conv");
                nc.convertLexicon(fillerFile);
            }
            if (gramFile != null) {
                System.out.println("converting words in gram to " + gramFile + ".conv");
                nc.convertWordGrammar(gramFile);
            }
        }
    }
}