/*
Copyright (c) 2009-2011
Speech Group at Informatik 5, Univ. Erlangen-Nuremberg, GERMANY
Korbinian Riedhammer
Tobias Bocklet
Stephan Steidl
Florian Hoenig
This file is part of the Java Speech Toolkit (JSTK).
The JSTK is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
The JSTK is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with the JSTK. If not, see <http://www.gnu.org/licenses/>.
*/
package de.fau.cs.jstk.lm;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import de.fau.cs.jstk.arch.TokenHierarchy;
import de.fau.cs.jstk.arch.TokenTree;
import de.fau.cs.jstk.arch.Tokenization;
import de.fau.cs.jstk.arch.Tokenizer;
import de.fau.cs.jstk.arch.TreeNode;
import de.fau.cs.jstk.exceptions.OutOfVocabularyException;
/**
 * The uni-gram allows (in contrast to the zero-gram) specifying language
 * model weights for each word individually.
 *
 * @author sikoried
 *
 */
public class Unigram implements LanguageModel {
	/** The default language model probability for OOV words */
	public static final float DEFAULT_OOV = 0.001f;

	/** out-of-vocabulary probability, assigned to words without an explicit uni-gram weight */
	private float oovProb = DEFAULT_OOV;

	/** tokenizer providing the vocabulary and word tokenizations */
	private Tokenizer tok;

	/** token hierarchy used to expand word tokenizations into token sequences */
	private TokenHierarchy th;

	/** silence words and their (fixed) probabilities, excluded from re-normalization */
	private HashMap<Tokenization, Float> sils = new HashMap<Tokenization, Float>();

	/** per-word uni-gram probabilities (linear domain, not log) */
	private HashMap<Tokenization, Float> probs = new HashMap<Tokenization, Float>();

	/**
	 * Generate a new Unigram for all words in the given Tokenizer. Words
	 * without an explicitly set probability receive the OOV probability;
	 * the non-silence probabilities are re-normalized to the mass remaining
	 * after subtraction of the silence probabilities when the network is
	 * generated.
	 * 
	 * @param tokenizer vocabulary provider
	 * @param hierarchy token hierarchy for expanding words to token sequences
	 * @param sils silence words with their (fixed) probabilities
	 */
	public Unigram(Tokenizer tokenizer, TokenHierarchy hierarchy, HashMap<Tokenization, Float> sils) {
		this.tok = tokenizer;
		this.th = hierarchy;
		this.sils = sils;
	}

	/**
	 * Set the uni-gram probability (linear domain) for the given word.
	 * @param t word
	 * @param p probability
	 */
	public void setProb(Tokenization t, float p) {
		probs.put(t, p);
	}

	/**
	 * Set the probability assigned to out-of-vocabulary words.
	 * @param p probability
	 */
	public void setOovProb(float p) {
		oovProb = p;
	}

	/** @return the current out-of-vocabulary probability */
	public double getOovProb() {
		return oovProb;
	}

	/**
	 * Load uni-gram probabilities from an SRILM-style (ARPA format) LM file.
	 * Only the \1-grams: section is read; words unknown to the Tokenizer are
	 * silently ignored. Probabilities in the file are base-10 logarithms and
	 * are exponentiated on load.
	 * 
	 * @param file SRILM/ARPA format LM file
	 * @throws IOException on read error
	 * @throws OutOfVocabularyException propagated from the Tokenizer lookup
	 */
	public void loadSrilm(File file) throws IOException, OutOfVocabularyException {
		BufferedReader br = new BufferedReader(new FileReader(file));
		// NOTE(review): FileReader uses the platform default charset; ARPA
		// files are typically ASCII, so this is usually harmless — confirm.
		try {
			String lin;
			
			// skip everything till the \1-grams: section marker
			while ((lin = br.readLine()) != null) {
				if (lin.equals("\\1-grams:"))
					break;
			}
			
			// now read everything till the next section starts with a backslash
			while ((lin = br.readLine()) != null) {
				if (lin.startsWith("\\"))
					break;
				
				if (lin.trim().length() < 3)
					continue;
				
				// expected fields: log10(prob) word [backoff]
				String [] sp = lin.trim().split("\\s+");
				
				// ignore words not in the tokenizer
				if (!tok.validate(sp[1]))
					continue;
				
				// set the prob, mind the exponentiation (file stores log10)!
				probs.put(tok.getWordTokenization(sp[1]), (float) Math.pow(10, Float.parseFloat(sp[0])));
			}
		} finally {
			// fix: the reader was previously leaked; close it even on exception
			br.close();
		}
	}

	/**
	 * Build the lexical tree network: silences keep their fixed probabilities,
	 * all other words share the remaining probability mass proportionally to
	 * their uni-gram weights (OOV words receive oovProb). The tree is factored
	 * and each leaf is looped back to the root.
	 * 
	 * @return root node of the factored, looped token tree
	 */
	public TreeNode generateNetwork() {
		// re-distribute the probability masses to compensate for the silences
		float pmass = 0.f;
		for (Map.Entry<Tokenization, Float> e : sils.entrySet())
			pmass += e.getValue();
		
		// sum the uni-gram mass of all non-silence words, filling in oovProb
		// for words that never received an explicit probability
		float umass = 0.f;
		for (Tokenization t : tok.tokenizations) {
			if (sils.containsKey(t))
				continue;
			
			Float p = probs.get(t);
			if (p == null)
				probs.put(t, p = oovProb);
			
			umass += p;
		}
		
		// scale the word probabilities so that they sum to (1 - silence mass)
		float skew = (1.f - pmass) / umass;
		for (Tokenization t : probs.keySet())
			probs.put(t, probs.get(t) * skew);
		
		// build lexical tree
		TokenTree tree = new TokenTree(0);
		for (Tokenization t : tok.tokenizations) {
			if (sils.containsKey(t))
				tree.addToTree(t, th.tokenizeWord(t.sequence), sils.get(t));
			else
				tree.addToTree(t, th.tokenizeWord(t.sequence), probs.get(t));
		}
		
		// factor the tree (push probabilities toward the root)
		tree.factor();
		
		// loop: link every leaf back to the root
		for (TreeNode n : tree.leaves())
			n.setLst(tree.root);
		
		return tree.root;
	}
}