/*
 * Copyright 1999-2002 Carnegie Mellon University.
 * Portions Copyright 2002 Sun Microsystems, Inc.
 * Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 *
 */

package edu.cmu.sphinx.linguist.dictionary;

import edu.cmu.sphinx.linguist.acoustic.Context;
import edu.cmu.sphinx.linguist.acoustic.Unit;
import edu.cmu.sphinx.linguist.acoustic.UnitManager;
import edu.cmu.sphinx.linguist.g2p.G2PConverter;
import edu.cmu.sphinx.linguist.g2p.Path;
import edu.cmu.sphinx.util.Timer;
import edu.cmu.sphinx.util.TimerPool;
import edu.cmu.sphinx.util.props.ConfigurationManagerUtils;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.PropertySheet;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.logging.Logger;

/**
 * Creates a dictionary by quickly reading in an ASCII-based Sphinx-3 format
 * dictionary. When loaded, the dictionary just stores each line of the
 * dictionary file in a hash table, assuming that most words are not going to
 * be used. Only when a word is actually used are its pronunciations massaged
 * into an array of pronunciations.
 * <p>
 * The format of the ASCII dictionary is the word, followed by spaces or tab,
 * followed by the pronunciation(s). For example, a digits dictionary will look
 * like:
 * <pre>
 * ONE                  HH W AH N
 * ONE(2)               W AH N
 * TWO                  T UW
 * THREE                TH R IY
 * FOUR                 F AO R
 * FIVE                 F AY V
 * SIX                  S IH K S
 * SEVEN                S EH V AH N
 * EIGHT                EY T
 * NINE                 N AY N
 * ZERO                 Z IH R OW
 * ZERO(2)              Z IY R OW
 * OH                   OW
 * </pre>
 * In the above example, the words "one" and "zero" have two pronunciations
 * each.
 */
public class TextDictionary implements Dictionary {

    // -------------------------------
    // Configuration data
    // --------------------------------
    protected Logger logger;

    protected URL wordDictionaryFile;
    protected URL fillerDictionaryFile;
    protected List<URL> addendaUrlList;

    // Replacement to use if word is missing
    private String wordReplacement;

    // G2P model to use if word replacement is not specified and word is missing
    protected URL g2pModelFile;
    protected int g2pMaxPron = 0;

    protected UnitManager unitManager;

    // -------------------------------
    // working data
    // -------------------------------
    /** Raw dictionary lines keyed by word spelling (including any "(n)" suffix). */
    protected Map<String, String> dictionary;
    /** Cache of already-built {@link Word} objects keyed by spelling. */
    protected Map<String, Word> wordDictionary;
    protected G2PConverter g2pDecoder;

    /** Prefix prepended to raw lines that were read from the filler dictionary. */
    protected final static String FILLER_TAG = "-F-";
    protected Set<String> fillerWords;
    protected boolean allocated;

    /**
     * Creates a dictionary from resource locations given as strings.
     *
     * @param wordDictionaryFile
     *            resource location of the word dictionary
     * @param fillerDictionaryFile
     *            resource location of the filler dictionary
     * @param addendaUrlList
     *            additional dictionaries to load after the word dictionary
     *            (may be <code>null</code>)
     * @param addSilEndingPronunciation
     *            accepted for signature compatibility; not used by this class
     * @param wordReplacement
     *            spelling to look up instead when a word is missing (may be
     *            <code>null</code>)
     * @param unitManager
     *            the unit manager used to obtain units
     * @throws MalformedURLException
     *             declared by the resource-to-URL conversion
     * @throws ClassNotFoundException
     *             declared by the resource-to-URL conversion
     */
    public TextDictionary(String wordDictionaryFile, String fillerDictionaryFile, List<URL> addendaUrlList,
            boolean addSilEndingPronunciation, String wordReplacement, UnitManager unitManager)
            throws MalformedURLException, ClassNotFoundException {
        this(ConfigurationManagerUtils.resourceToURL(wordDictionaryFile),
                ConfigurationManagerUtils.resourceToURL(fillerDictionaryFile),
                addendaUrlList, wordReplacement, unitManager);
    }

    /**
     * Creates a dictionary from URLs, without a G2P model.
     *
     * @param wordDictionaryFile
     *            URL of the word dictionary
     * @param fillerDictionaryFile
     *            URL of the filler dictionary
     * @param addendaUrlList
     *            additional dictionaries to load after the word dictionary
     *            (may be <code>null</code>)
     * @param wordReplacement
     *            spelling to look up instead when a word is missing (may be
     *            <code>null</code>)
     * @param unitManager
     *            the unit manager used to obtain units
     */
    public TextDictionary(URL wordDictionaryFile, URL fillerDictionaryFile, List<URL> addendaUrlList,
            String wordReplacement, UnitManager unitManager) {
        this.logger = Logger.getLogger(getClass().getName());

        this.wordDictionaryFile = wordDictionaryFile;
        this.fillerDictionaryFile = fillerDictionaryFile;
        this.addendaUrlList = addendaUrlList;
        this.wordReplacement = wordReplacement;
        this.unitManager = unitManager;
    }

    /**
     * Creates a dictionary from URLs with an optional G2P model.
     *
     * @param wordDictionaryFile
     *            URL of the word dictionary
     * @param fillerDictionaryFile
     *            URL of the filler dictionary
     * @param addendaUrlList
     *            additional dictionaries to load after the word dictionary
     *            (may be <code>null</code>)
     * @param addSilEndingPronunciation
     *            accepted for signature compatibility; not used by this class
     * @param wordReplacement
     *            spelling to look up instead when a word is missing (may be
     *            <code>null</code>)
     * @param unitManager
     *            the unit manager used to obtain units
     * @param g2pModelFile
     *            URL of the G2P model (may be <code>null</code> or have an
     *            empty path to disable G2P)
     * @param g2pMaxPron
     *            value passed to the G2P converter when phoneticizing a word
     */
    public TextDictionary(URL wordDictionaryFile, URL fillerDictionaryFile, List<URL> addendaUrlList,
            boolean addSilEndingPronunciation, String wordReplacement, UnitManager unitManager,
            URL g2pModelFile, int g2pMaxPron) {
        this(wordDictionaryFile, fillerDictionaryFile, addendaUrlList, wordReplacement, unitManager);
        this.g2pModelFile = g2pModelFile;
        this.g2pMaxPron = g2pMaxPron;
    }

    /** Creates an unconfigured dictionary; fields are set by {@link #newProperties(PropertySheet)}. */
    public TextDictionary() {
    }

    /*
     * (non-Javadoc)
     *
     * @see
     * edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util
     * .props.PropertySheet)
     */
    public void newProperties(PropertySheet ps) throws PropertyException {
        logger = ps.getLogger();

        wordDictionaryFile = ConfigurationManagerUtils.getResource(PROP_DICTIONARY, ps);
        fillerDictionaryFile = ConfigurationManagerUtils.getResource(PROP_FILLER_DICTIONARY, ps);
        addendaUrlList = ps.getResourceList(PROP_ADDENDA);
        wordReplacement = ps.getString(Dictionary.PROP_WORD_REPLACEMENT);
        unitManager = (UnitManager) ps.getComponent(PROP_UNIT_MANAGER);
        g2pModelFile = ConfigurationManagerUtils.getResource(PROP_G2P_MODEL_PATH, ps);
        g2pMaxPron = ps.getInt(PROP_G2P_MAX_PRONUNCIATIONS);
    }

    /**
     * Get the word dictionary file
     *
     * @return the URL of the word dictionary file
     */
    public URL getWordDictionaryFile() {
        return wordDictionaryFile;
    }

    /**
     * Get the filler dictionary file
     *
     * @return the URL of the filler dictionary file
     */
    public URL getFillerDictionaryFile() {
        return fillerDictionaryFile;
    }

    /**
     * Loads the word dictionary, the addenda and the filler dictionary, and
     * creates the G2P converter when a G2P model is configured. Calling this
     * method again after a successful load does nothing until
     * {@link #deallocate()} is called.
     *
     * @throws IOException
     *             if one of the dictionaries cannot be opened or read
     * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#allocate()
     */
    public void allocate() throws IOException {
        if (allocated) {
            return;
        }

        dictionary = new HashMap<String, String>();
        wordDictionary = new HashMap<String, Word>();
        fillerWords = new HashSet<String>();

        Timer loadTimer = TimerPool.getTimer(this, "Load Dictionary");
        loadTimer.start();
        try {
            logger.info("Loading dictionary from: " + wordDictionaryFile);
            loadDictionary(wordDictionaryFile.openStream(), false);

            loadCustomDictionaries(addendaUrlList);

            logger.info("Loading filler dictionary from: " + fillerDictionaryFile);
            loadDictionary(fillerDictionaryFile.openStream(), true);

            if (isG2pConfigured()) {
                g2pDecoder = new G2PConverter(g2pModelFile);
            }

            // Mark as loaded only after everything succeeded, so that a failed
            // load can be retried and deallocate() has something to undo.
            allocated = true;
        } finally {
            // Stop the timer even when loading fails.
            loadTimer.stop();
        }
    }

    /**
     * Releases the raw dictionary lines and the G2P converter if the
     * dictionary is currently allocated.
     *
     * @see edu.cmu.sphinx.linguist.dictionary.Dictionary#deallocate()
     */
    public void deallocate() {
        if (allocated) {
            dictionary = null;
            g2pDecoder = null;
            allocated = false;
        }
    }

    /**
     * Loads the given simple dictionary from the given InputStream. The
     * InputStream is assumed to contain ASCII data. The stream is always
     * closed before this method returns, whether or not loading succeeded.
     *
     * @param inputStream
     *            the InputStream of the dictionary
     * @param isFillerDict
     *            true if this is a filler dictionary, false otherwise
     * @throws java.io.IOException
     *             if there is an error reading the dictionary
     */
    protected void loadDictionary(InputStream inputStream, boolean isFillerDict) throws IOException {
        // Decodes with the platform default charset.
        BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }

                int spaceIndex = getSpaceIndex(line);
                if (spaceIndex < 0) {
                    throw new Error("Error loading word: " + line);
                }
                String word = line.substring(0, spaceIndex);

                // Add numeric index if the word is repeating. The key is built by
                // plain concatenation so that it has exactly the same form as the
                // variant keys looked up in processEntry().
                if (dictionary.containsKey(word)) {
                    int index = 2;
                    String wordWithIdx;
                    do {
                        wordWithIdx = word + '(' + index++ + ')';
                    } while (dictionary.containsKey(wordWithIdx));
                    word = wordWithIdx;
                }

                if (isFillerDict) {
                    dictionary.put(word, (FILLER_TAG + line));
                    fillerWords.add(word);
                } else {
                    dictionary.put(word, line);
                }
            }
        } finally {
            // Closing the reader also closes the wrapped stream.
            br.close();
        }
    }

    /**
     * Finds the first space or tab character in a line.
     *
     * @param line
     *            the line to scan
     * @return the index of the first space or tab, or -1 if there is none
     */
    private int getSpaceIndex(String line) {
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == ' ' || c == '\t') {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether a G2P model with a non-empty path has been configured.
     *
     * @return true if a G2P model is configured
     */
    private boolean isG2pConfigured() {
        return g2pModelFile != null && !g2pModelFile.getPath().equals("");
    }

    /**
     * Gets a context independent unit. There should only be one instance of any
     * CI unit
     *
     * @param name
     *            the name of the unit
     * @param isFiller
     *            if true, the unit is a filler unit
     * @return the unit
     */
    protected Unit getCIUnit(String name, boolean isFiller) {
        return unitManager.getUnit(name, isFiller, Context.EMPTY_CONTEXT);
    }

    /**
     * Returns the sentence start word.
     *
     * @return the sentence start word
     */
    public Word getSentenceStartWord() {
        return getWord(SENTENCE_START_SPELLING);
    }

    /**
     * Returns the sentence end word.
     *
     * @return the sentence end word
     */
    public Word getSentenceEndWord() {
        return getWord(SENTENCE_END_SPELLING);
    }

    /**
     * Returns the silence word.
     *
     * @return the silence word
     */
    public Word getSilenceWord() {
        return getWord(SILENCE_SPELLING);
    }

    /**
     * Returns a Word object based on the spelling and its classification. The
     * behavior of this method is also affected by the properties
     * wordReplacement and g2pModel: when the word is missing, the replacement
     * word is looked up if one is set; otherwise pronunciations are generated
     * with the G2P model if one is configured.
     *
     * @param text
     *            the spelling of the word of interest.
     * @return a Word object, or <code>null</code> if the word is missing and
     *         neither the replacement word nor a G2P model can supply it
     * @see edu.cmu.sphinx.linguist.dictionary.Word
     */
    public Word getWord(String text) {
        Word wordObject = wordDictionary.get(text);
        if (wordObject != null) {
            return wordObject;
        }

        String word = dictionary.get(text);
        if (word == null) { // deal with 'not found' case
            logger.info("The dictionary is missing a phonetic transcription for the word '" + text + "'");
            if (wordReplacement != null) {
                // If the replacement word itself is missing, looking it up again
                // would recurse forever; give up and return null instead.
                if (!wordReplacement.equals(text)) {
                    wordObject = getWord(wordReplacement);
                }
            } else if (isG2pConfigured()) {
                logger.info("Generating phonetic transcription(s) for the word '" + text + "' using g2p model");
                wordObject = extractPronunciation(text);
                wordDictionary.put(text, wordObject);
            }
        } else { // first lookup for this string
            wordObject = processEntry(text);
        }

        return wordObject;
    }

    /**
     * Builds a Word for a spelling that is not in the dictionary by asking the
     * G2P converter for pronunciations.
     *
     * @param text
     *            the spelling of the word
     * @return the newly created word
     */
    private Word extractPronunciation(String text) {
        ArrayList<Path> paths = g2pDecoder.phoneticize(text, g2pMaxPron);
        List<Pronunciation> pronunciations = new ArrayList<Pronunciation>(paths.size());
        for (Path p : paths) {
            int unitCount = p.getPath().size();
            ArrayList<Unit> units = new ArrayList<Unit>(unitCount);
            for (String token : p.getPath()) {
                units.add(getCIUnit(token, false));
            }
            // An empty path is represented by a single silence unit.
            if (units.size() == 0) {
                units.add(UnitManager.SILENCE);
            }
            pronunciations.add(new Pronunciation(units));
        }
        Pronunciation[] pronunciationsArray = pronunciations.toArray(new Pronunciation[pronunciations.size()]);
        Word wordObject = createWord(text, pronunciationsArray, false);
        for (Pronunciation pronunciation : pronunciationsArray) {
            pronunciation.setWord(wordObject);
        }
        return wordObject;
    }

    /**
     * Create a Word object with the given spelling and pronunciations, and
     * insert it into the dictionary.
     *
     * @param text
     *            the spelling of the word
     * @param pronunciation
     *            the pronunciation of the word
     * @param isFiller
     *            if <code>true</code> this is a filler word
     * @return the word
     */
    private Word createWord(String text, Pronunciation[] pronunciation, boolean isFiller) {
        Word word = new Word(text, pronunciation, isFiller);
        dictionary.put(text, word.toString());
        return word;
    }

    /**
     * Processes a dictionary entry. When loaded the dictionary just loads each
     * line of the dictionary into the hash table, assuming that most words are
     * not going to be used. Only when a word is actually used is its
     * pronunciations massaged into an array of pronunciations.
     *
     * @param word
     *            the spelling whose raw line(s) should be converted
     * @return the newly created word, which is also stored in the word cache
     */
    private Word processEntry(String word) {
        List<Pronunciation> pronunciations = new ArrayList<Pronunciation>();
        String line;
        int count = 0;
        boolean isFiller = false;

        // Collect "word", "word(2)", "word(3)", ... until a variant is missing.
        do {
            count++;
            String lookupWord = word;
            if (count > 1) {
                lookupWord = lookupWord + '(' + count + ')';
            }
            line = dictionary.get(lookupWord);
            if (line != null) {
                StringTokenizer st = new StringTokenizer(line);

                // The first token is the spelling, prefixed with FILLER_TAG for
                // lines that came from the filler dictionary.
                String tag = st.nextToken();
                isFiller = tag.startsWith(FILLER_TAG);
                int unitCount = st.countTokens();

                ArrayList<Unit> units = new ArrayList<Unit>(unitCount);
                for (int i = 0; i < unitCount; i++) {
                    String unitName = st.nextToken();
                    units.add(getCIUnit(unitName, isFiller));
                }
                pronunciations.add(new Pronunciation(units));
            }
        } while (line != null);

        Pronunciation[] pronunciationsArray = pronunciations.toArray(new Pronunciation[pronunciations.size()]);
        Word wordObject = createWord(word, pronunciationsArray, isFiller);

        for (Pronunciation pronunciation : pronunciationsArray) {
            pronunciation.setWord(wordObject);
        }
        wordDictionary.put(word, wordObject);

        return wordObject;
    }

    /**
     * Returns a string representation of this TextDictionary in alphabetical
     * order.
     *
     * @return a string representation of this dictionary
     */
    @Override
    public String toString() {
        SortedMap<String, String> sorted = new TreeMap<String, String>(dictionary);

        StringBuilder result = new StringBuilder();
        for (Map.Entry<String, String> entry : sorted.entrySet()) {
            result.append(entry.getKey());
            result.append("   ").append(entry.getValue()).append('\n');
        }

        return result.toString();
    }

    /**
     * Gets the set of all filler words in the dictionary
     *
     * @return an array (possibly empty) of all filler words
     */
    public Word[] getFillerWords() {
        Word[] fillerWordArray = new Word[fillerWords.size()];
        int index = 0;
        for (String spelling : fillerWords) {
            fillerWordArray[index++] = getWord(spelling);
        }
        return fillerWordArray;
    }

    /**
     * Dumps this TextDictionary to System.out.
     */
    public void dump() {
        System.out.print(toString());
    }

    /**
     * Loads the dictionary with a list of URLs to custom dictionary resources
     *
     * @param addenda
     *            the list of custom dictionary URLs to be loaded (may be
     *            <code>null</code>, in which case nothing is loaded)
     * @throws IOException
     *             if there is an error reading the resource URL
     */
    private void loadCustomDictionaries(List<URL> addenda) throws IOException {
        if (addenda != null) {
            for (URL addendumUrl : addenda) {
                logger.info("Loading addendum dictionary from: " + addendumUrl);
                loadDictionary(addendumUrl.openStream(), false);
            }
        }
    }
}