/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder.ff.lm;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;

import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.lm.buildin_lm.TrieLM;

import org.testng.Assert;
import org.testng.annotations.Test;

/**
 * Unit tests for the ARPA language model class.
 *
 * @author Lane Schwartz
 */
public class ArpaFileTest {

  String arpaFileName;
  SymbolTable vocab;

  @Test
  public void setup() {

    vocab = new Vocabulary();
    vocab.addTerminal("a");
    vocab.addTerminal("because");
    vocab.addTerminal("boycott");
    vocab.addTerminal("of");
    vocab.addTerminal("parliament");
    vocab.addTerminal("potato");
    vocab.addTerminal("resumption");
    vocab.addTerminal("the");

    try {
      // Write a small trigram ARPA language model to a temporary file.
      File file = File.createTempFile("testLM", "arpa");
      PrintStream out = new PrintStream(file, "UTF-8");

      out.println();
      out.println("\\data\\");
      out.println("ngram 1=8");
      out.println("ngram 2=4");
      out.println("ngram 3=1");
      out.println();
      out.println("\\1-grams:");
      out.println("-1.992672 a -0.1195484");
      out.println("-2.713723 because -0.4665429");
      out.println("-4.678545 boycott -0.0902521");
      out.println("-1.609573 of -0.1991907");
      out.println("-3.875917 parliament -0.1274891");
      out.println("-9.753210 potato");
      out.println("-4.678545 resumption -0.07945678");
      out.println("-1.712444 the -0.1606644");
      out.println();
      out.println("\\2-grams:");
      out.println("-0.3552987 because of -0.03083654");
      out.println("-1.403534 of a");
      out.println("-0.7507797 of the -0.05237135");
      out.println("-0.7266324 resumption of");
      out.println("-3.936147 the resumption");
      out.println();
      out.println("\\3-grams:");
      out.println("-0.6309999 because of the");
      out.println();
      out.println("\\end\\");
      out.close();

      this.arpaFileName = file.getAbsolutePath();

    } catch (IOException e) {
      Assert.fail("Unable to create temporary file: " + e.toString());
    }

  }

  @Test(dependsOnMethods = {"setup"})
  public void testOrder() {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
    try {
      Assert.assertEquals(arpaFile.getOrder(), 3);
    } catch (FileNotFoundException e) {
      Assert.fail(e.toString());
    }
  }

  @Test(dependsOnMethods = {"setup"})
  public void testIteration() {

    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);

    // Count how many n-grams of each order the iterator returns.
    Map<Integer, Integer> counts = new HashMap<Integer, Integer>();

    boolean iterationOccurred = false;

    for (ArpaNgram ngram : arpaFile) {

      iterationOccurred = true;

      int order = ngram.order();
      // System.err.println("Order = " + order);

      int count;
      if (counts.containsKey(order)) {
        count = counts.get(order) + 1;
      } else {
        count = 1;
      }

      counts.put(order, count);

    }

    Assert.assertTrue(iterationOccurred);
    Assert.assertTrue(counts.containsKey(1));
    Assert.assertTrue(counts.containsKey(2));
    Assert.assertTrue(counts.containsKey(3));

    Assert.assertEquals((int) counts.get(1), 8);
    Assert.assertEquals((int) counts.get(2), 5);
    Assert.assertEquals((int) counts.get(3), 1);

  }

  @Test(dependsOnMethods = {"setup"})
  public void testSize() {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);

    Assert.assertEquals(arpaFile.size(), 14);
  }

  @Test(dependsOnMethods = {"setup", "testIteration"})
  public void testChildren() throws FileNotFoundException {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);

    TrieLM lm = new TrieLM(arpaFile);
    // System.err.println(lm.getChildren().size());
    Assert.assertNotSame(lm.getChildren().size(), 0);
  }

  @Test(dependsOnMethods = {"setup", "testIteration", "testChildren"})
  public void testTrie() throws FileNotFoundException {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);

    TrieLM lm = new TrieLM(arpaFile);

    // Test unigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a")), -1.992672, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because")), -2.713723, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("boycott")), -4.678545, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of")), -1.609573, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("parliament")), -3.875917, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato")), -9.753210, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption")), -4.678545, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the")), -1.712444, 0.000001f);

    // Test unigrams known to NOT be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("banana")), -JoshuaConfiguration.lm_ceiling_cost, 0.000001f);

    // Test bigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of")), -0.3552987, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the")), -0.7507797, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption of")), -0.7266324, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the resumption")), -3.936147, 0.000001f);

    // Test trigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of the")), -0.6309999f, 0.000001f);

    // Test bigrams known to NOT be in the language model (but the unigrams are).
    // Expected value: unigram probability of the second word plus the backoff weight of the first word.
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a boycott")), -4.678545f + -0.1195484f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of parliament")), -3.875917f + -0.1991907f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the potato")), -9.753210f + -0.1606644f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato parliament")), -3.875917f + -0.0f, 0.000001f);

    // Test trigrams known to NOT be in the language model (but the bigrams are).
    // Expected value: "of a" bigram probability plus the "because of" backoff weight.
    int[] words = vocab.getIDs("because of a");
    double f = lm.ngramLogProbability(words);
    Assert.assertEquals(f, -1.403534f + -0.03083654f, 0.000001f);

    //Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the parliament")), -3.875917f + -0.05237135f, 0.000001f);

  }
}