/*
* Copyright 1999-2002 Carnegie Mellon University.
* Portions Copyright 2002 Sun Microsystems, Inc.
* Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*
*/
package edu.cmu.sphinx.linguist.language.ngram;
import static java.lang.Math.max;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;

import edu.cmu.sphinx.linguist.WordSequence;
import edu.cmu.sphinx.linguist.dictionary.Dictionary;
import edu.cmu.sphinx.linguist.dictionary.Word;
import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.props.ConfigurationManagerUtils;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.PropertySheet;
/**
* An ASCII ARPA language model loader. This loader makes no attempt to
* optimize storage, so it can only load very small language models
* <p>
* Note that all probabilities in the grammar are stored in LogMath log base
* format. Language Probabilities in the language model file are stored in log
* 10 base.
*/
public class SimpleNGramModel implements LanguageModel {

    // ----------------------------
    // Configuration data
    // ----------------------------
    private String name;
    private LogMath logMath;
    private URL urlLocation;
    private float unigramWeight;
    private Dictionary dictionary;
    private int desiredMaxDepth;

    // Highest N-Gram order declared in the model header ("ngram N=count"),
    // clipped to desiredMaxDepth during allocate()
    private int maxNGram;

    // N-Gram word sequence -> (log probability, log backoff weight)
    private Map<WordSequence, Probability> map;
    private Set<String> vocabulary;

    // Parser state; protected so subclasses can share the reader
    protected int lineNumber;
    protected BufferedReader reader;
    protected String fileName;

    private boolean allocated;

    // Every N-Gram word sequence, in file order (see getNGrams)
    private LinkedList<WordSequence> tokens;

    /**
     * Creates a simple ARPA N-Gram model.
     *
     * @param location the location (resource path or URL string) of the model file
     * @param dictionary the dictionary used to map word strings to {@link Word}s
     * @param unigramWeight the weight used to interpolate unigram probabilities
     *        with the uniform distribution
     * @param desiredMaxDepth the maximum N-Gram order to use, or 0 for no limit
     * @throws MalformedURLException if the location is not a valid URL
     * @throws ClassNotFoundException if the resource lookup fails
     */
    public SimpleNGramModel(String location, Dictionary dictionary,
            float unigramWeight, int desiredMaxDepth)
            throws MalformedURLException, ClassNotFoundException {
        this(ConfigurationManagerUtils.resourceToURL(location), dictionary,
                unigramWeight, desiredMaxDepth);
    }

    /**
     * Creates a simple ARPA N-Gram model.
     *
     * @param urlLocation the URL of the model file
     * @param dictionary the dictionary used to map word strings to {@link Word}s
     * @param unigramWeight the weight used to interpolate unigram probabilities
     *        with the uniform distribution
     * @param desiredMaxDepth the maximum N-Gram order to use, or 0 for no limit
     */
    public SimpleNGramModel(URL urlLocation, Dictionary dictionary,
            float unigramWeight, int desiredMaxDepth) {
        this.urlLocation = urlLocation;
        this.unigramWeight = unigramWeight;
        this.logMath = LogMath.getLogMath();
        this.desiredMaxDepth = desiredMaxDepth;
        this.dictionary = dictionary;
        this.map = new HashMap<WordSequence, Probability>();
        this.vocabulary = new HashSet<String>();
        this.tokens = new LinkedList<WordSequence>();
    }

    /** No-argument constructor used by the configuration manager. */
    public SimpleNGramModel() {
    }

    /*
     * (non-Javadoc)
     * @see
     * edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.
     * util.props.PropertySheet)
     */
    public void newProperties(PropertySheet ps) throws PropertyException {
        logMath = LogMath.getLogMath();
        if (allocated) {
            throw new RuntimeException("Can't change properties after allocation");
        }
        urlLocation = ConfigurationManagerUtils.getResource(PROP_LOCATION, ps);
        unigramWeight = ps.getFloat(PROP_UNIGRAM_WEIGHT);
        desiredMaxDepth = ps.getInt(PROP_MAX_DEPTH);
        dictionary = (Dictionary) ps.getComponent(PROP_DICTIONARY);
        map = new HashMap<WordSequence, Probability>();
        vocabulary = new HashSet<String>();
        tokens = new LinkedList<WordSequence>();
    }

    /*
     * (non-Javadoc)
     * @see edu.cmu.sphinx.linguist.language.ngram.LanguageModel#allocate()
     */
    public void allocate() throws IOException {
        load(urlLocation, unigramWeight, dictionary);
        if (desiredMaxDepth > 0 && desiredMaxDepth < maxNGram) {
            maxNGram = desiredMaxDepth;
        }
        // Mark allocated only after a successful load so that a failed
        // allocate() does not permanently block newProperties().
        allocated = true;
    }

    /*
     * (non-Javadoc)
     * @see edu.cmu.sphinx.linguist.language.ngram.LanguageModel#deallocate()
     */
    public void deallocate() {
        allocated = false;
    }

    /*
     * (non-Javadoc)
     * @see edu.cmu.sphinx.util.props.Configurable#getName()
     */
    public String getName() {
        return name;
    }

    /**
     * Gets the ngram probability of the word sequence represented by the word
     * list. If the full sequence is not in the model, recursively backs off:
     * P(w | h) = backoff(h) * P(w | shorter history).
     *
     * @param wordSequence the word sequence
     * @return the probability of the word sequence. Probability is in logMath
     *         log base
     */
    public float getProbability(WordSequence wordSequence) {
        Probability prob = getProb(wordSequence);
        if (prob != null) {
            return prob.logProbability;
        }
        if (wordSequence.size() > 1) {
            // In log space the backoff product becomes a sum.
            return getBackoff(wordSequence.getOldest())
                    + getProbability(wordSequence.getNewest());
        }
        // A single word that is not in the model at all: zero likelihood.
        return LogMath.LOG_ZERO;
    }

    /**
     * Gets the smear term for the given wordSequence.
     *
     * @param wordSequence the word sequence
     * @return the smear term associated with this word sequence
     */
    public float getSmear(WordSequence wordSequence) {
        return 0.0f; // TODO not implemented
    }

    /**
     * Returns the backoff probability for the give sequence of words.
     *
     * @param wordSequence the sequence of words
     * @return the backoff probability in LogMath log base; log of 1.0 when the
     *         sequence has no explicit backoff entry
     */
    public float getBackoff(WordSequence wordSequence) {
        float logBackoff = 0.0f; // log of 1.0
        Probability prob = getProb(wordSequence);
        if (prob != null) {
            logBackoff = prob.logBackoff;
        }
        return logBackoff;
    }

    /**
     * Returns the maximum depth of the language model.
     *
     * @return the maximum depth of the language model
     */
    public int getMaxDepth() {
        return maxNGram;
    }

    /**
     * Returns the set of words in the language model. The set is unmodifiable.
     *
     * @return the unmodifiable set of words
     */
    public Set<String> getVocabulary() {
        return Collections.unmodifiableSet(vocabulary);
    }

    @Override
    public void onUtteranceEnd() {
        // TODO not implemented
    }

    /**
     * Gets the probability entry for the given word sequence or null if there
     * is no entry.
     *
     * @param wordSequence a word sequence
     * @return the probability entry for the wordlist or null
     */
    private Probability getProb(WordSequence wordSequence) {
        return map.get(wordSequence);
    }

    /**
     * Converts a wordList to a string.
     *
     * @param wordList the wordList
     * @return the string
     */
    @SuppressWarnings("unused")
    private String listToString(List<Word> wordList) {
        StringBuilder sb = new StringBuilder();
        for (Word word : wordList)
            sb.append(word).append(' ');
        return sb.toString();
    }

    /** Dumps the language model to System.out, one entry per line. */
    public void dump() {
        for (Map.Entry<WordSequence, Probability> entry : map.entrySet())
            System.out.println(entry.getKey() + " " + entry.getValue());
    }

    /**
     * Retrieves a string representation of the wordlist, suitable for map
     * access.
     *
     * @param wordList the list of words
     * @return a string representation of the word list
     */
    @SuppressWarnings("unused")
    private String getRepresentation(List<String> wordList) {
        if (wordList.isEmpty())
            return "";
        StringBuilder sb = new StringBuilder();
        for (String word : wordList)
            sb.append(word).append('+');
        sb.setLength(sb.length() - 1);
        return sb.toString();
    }

    /**
     * Loads the language model from the given location. The reader is always
     * closed, even when the file turns out to be corrupt.
     *
     * @param location the URL location of the model
     * @param unigramWeight the unigram weight
     * @param dictionary the dictionary used to look up words
     * @throws IOException if an error occurs while loading or the file is
     *         corrupt
     */
    private void load(URL location, float unigramWeight,
            Dictionary dictionary) throws IOException {
        String line;
        float logUnigramWeight = logMath.linearToLog(unigramWeight);
        float inverseLogUnigramWeight = logMath
                .linearToLog(1.0 - unigramWeight);
        open(location);
        try {
            // look for beginning of data
            readUntil("\\data\\");
            // collect the "ngram N=count" statements
            List<Integer> ngramList = new ArrayList<Integer>();
            while ((line = readLine()) != null) {
                if (line.startsWith("ngram")) {
                    StringTokenizer st = new StringTokenizer(line, " \t\n\r\f=");
                    if (st.countTokens() != 3) {
                        corrupt("corrupt ngram field " + line + ' '
                                + st.countTokens());
                    }
                    st.nextToken(); // skip the "ngram" keyword
                    int index = Integer.parseInt(st.nextToken());
                    int count = Integer.parseInt(st.nextToken());
                    ngramList.add(index - 1, count);
                    maxNGram = max(index, maxNGram);
                } else if (line.equals("\\1-grams:")) {
                    break;
                }
            }
            if (ngramList.isEmpty()) {
                // Previously this fell through to an IndexOutOfBoundsException.
                corrupt("no ngram count statements found");
            }
            int numUnigrams = ngramList.get(0) - 1;
            // -log(x) = log(1/x)
            float logUniformProbability = -logMath.linearToLog(numUnigrams);
            for (int index = 0; index < ngramList.size(); index++) {
                int ngram = index + 1;
                int ngramCount = ngramList.get(index);
                for (int i = 0; i < ngramCount; i++) {
                    StringTokenizer tok = new StringTokenizer(readLine());
                    int tokenCount = tok.countTokens();
                    // prob + N words, optionally followed by a backoff weight
                    if (tokenCount != ngram + 1 && tokenCount != ngram + 2) {
                        corrupt("Bad format");
                    }
                    float log10Prob = Float.parseFloat(tok.nextToken());
                    float log10Backoff = 0.0f;
                    // construct the WordSequence for this N-Gram
                    List<Word> wordList = new ArrayList<Word>(maxNGram);
                    for (int j = 0; j < ngram; j++) {
                        String word = tok.nextToken();
                        vocabulary.add(word);
                        Word wordObject = dictionary.getWord(word);
                        if (wordObject == null) {
                            wordObject = Word.UNKNOWN;
                        }
                        wordList.add(wordObject);
                    }
                    WordSequence wordSequence = new WordSequence(wordList);
                    if (tok.hasMoreTokens()) {
                        log10Backoff = Float.parseFloat(tok.nextToken());
                    }
                    float logProb = logMath.log10ToLog(log10Prob);
                    float logBackoff = logMath.log10ToLog(log10Backoff);
                    // Interpolate unigrams with the uniform distribution:
                    // p = uw * p1 + (1 - uw) * uniform, computed in log space
                    if (ngram == 1) {
                        float p1 = logProb + logUnigramWeight;
                        float p2 = logUniformProbability + inverseLogUnigramWeight;
                        logProb = logMath.addAsLinear(p1, p2);
                    }
                    put(wordSequence, logProb, logBackoff);
                }
                if (index < ngramList.size() - 1) {
                    String next = "\\" + (ngram + 1) + "-grams:";
                    readUntil(next);
                }
            }
            readUntil("\\end\\");
        } finally {
            // Close even on corrupt input so the stream does not leak.
            close();
        }
    }

    /**
     * Puts the probability into the map.
     *
     * @param wordSequence the tag for the prob.
     * @param logProb the probability in log math base
     * @param logBackoff the backoff probability in log math base
     */
    private void put(WordSequence wordSequence, float logProb, float logBackoff) {
        map.put(wordSequence, new Probability(logProb, logBackoff));
        tokens.add(wordSequence);
    }

    /**
     * Returns a list of all the word sequences in the language model. This
     * method is used to create Finite State Transducers of the language model.
     *
     * @return List containing all the word sequences
     */
    public LinkedList<WordSequence> getNGrams() {
        return tokens;
    }

    /**
     * Reads the next line from the LM file. Keeps track of line number.
     * Never returns null: a premature EOF raises an IOException instead.
     *
     * @return the trimmed line
     * @throws IOException if an error occurs while reading the input or an EOF
     *         is encountered.
     */
    private String readLine() throws IOException {
        String line;
        lineNumber++;
        line = reader.readLine();
        if (line == null) {
            corrupt("Premature EOF");
        }
        return line.trim();
    }

    /**
     * Opens the language model at the given location.
     *
     * @param location the path to the language model
     * @throws IOException if an error occurs while opening the file
     */
    private void open(URL location) throws IOException {
        lineNumber = 0;
        fileName = location.toString();
        // Pin the charset so loading does not depend on the platform default.
        reader = new BufferedReader(new InputStreamReader(
                location.openStream(), StandardCharsets.UTF_8));
    }

    /**
     * Reads from the input stream until the input matches the given string.
     *
     * @param match the string to match on
     * @throws IOException if an error occurs while reading the input or an EOF
     *         is encountered before finding the match
     */
    private void readUntil(String match) throws IOException {
        try {
            while (!readLine().equals(match)) {
            }
        } catch (IOException ioe) {
            corrupt("Premature EOF while waiting for " + match);
        }
    }

    /**
     * Closes the language model file. Safe to call more than once.
     *
     * @throws IOException if an error occurs
     */
    private void close() throws IOException {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    }

    /**
     * Generates a 'corrupt' IO exception.
     *
     * @param why the reason the model is considered corrupt
     * @throws IOException with the given string
     */
    private void corrupt(String why) throws IOException {
        throw new IOException("Corrupt Language Model " + fileName
                + " at line " + lineNumber + ':' + why);
    }
}
/** Represents a probability and a backoff probability */
/**
 * Immutable pair holding an N-Gram's log probability together with its
 * log backoff weight.
 */
class Probability {

    final float logProbability;
    final float logBackoff;

    /**
     * Creates a probability/backoff pair.
     *
     * @param logProbability the probability, in log math base
     * @param logBackoff the backoff weight, in log math base
     */
    Probability(float logProbability, float logBackoff) {
        this.logProbability = logProbability;
        this.logBackoff = logBackoff;
    }

    /**
     * Returns a string representation of this object.
     *
     * @return the string form of this object
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Prob: ");
        text.append(logProbability).append(' ').append(logBackoff);
        return text.toString();
    }
}