/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package joshua.corpus.vocab;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.HashSet;

import joshua.corpus.vocab.Vocabulary;

import org.testng.Assert;
import org.testng.annotations.Test;

/**
 * Unit tests for {@link Vocabulary}: the symbols present in a freshly
 * constructed vocabulary, word/ID round-tripping after loading a corpus,
 * and equality of vocabularies loaded from the same file.
 *
 * @author Lane Schwartz
 */
public class VocabularyTest {

	/**
	 * Number of symbols expected in a freshly constructed vocabulary:
	 * {@code [X]}, {@code [X,1]}, {@code [X,2]}, {@code [S]}, {@code [S,1]},
	 * {@code <unk>}, {@code <s>}, {@code </s>}, {@code -pau-}.
	 */
	int numBuiltInSymbols = 9;

	/**
	 * Number of terminal symbols expected in a freshly constructed vocabulary:
	 * {@code <unk>}, {@code <s>}, {@code </s>}, {@code -pau-}.
	 */
	int numBuiltInTerminals = 4;

	/**
	 * Checks that a vocabulary built with the no-argument constructor and one
	 * built from an empty word set are equal, and that each contains exactly
	 * the built-in symbols.
	 */
	@Test
	public void basicVocabTest() {

		Vocabulary vocab1 = new Vocabulary();
		Vocabulary vocab2 = new Vocabulary(new HashSet<String>());

		Assert.assertEquals(vocab1, vocab2);

		assertContainsOnlyBuiltInSymbols(vocab1);

		// Strings are compared by value here; the original used ==, which
		// only tests reference identity.
		Assert.assertEquals(vocab1.getWord(0), Vocabulary.UNKNOWN_WORD_STRING);
//		Assert.assertTrue(vocab1.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);

		//Assert.assertEquals(vocab1.getID("sample"), Vocabulary.UNKNOWN_WORD);
		//Assert.assertEquals(vocab1.getID(null), Vocabulary.UNKNOWN_WORD);

//		Assert.assertFalse(vocab1.isFixed);
//
//		vocab1.fixVocabulary();
//		Assert.assertTrue(vocab1.isFixed);

		// The X nonterminals are expected to map to the IDs -1, -2 and -3.
		Assert.assertEquals(vocab1.getID(SymbolTable.X_STRING), -1);
		Assert.assertEquals(vocab1.getID(SymbolTable.X1_STRING), -2);
		Assert.assertEquals(vocab1.getID(SymbolTable.X2_STRING), -3);

		Assert.assertEquals(vocab1.getWord(-1), SymbolTable.X_STRING);
		Assert.assertEquals(vocab1.getWord(-2), SymbolTable.X1_STRING);
		Assert.assertEquals(vocab1.getWord(-3), SymbolTable.X2_STRING);

		assertContainsOnlyBuiltInSymbols(vocab2);

//		Assert.assertTrue(vocab2.intToString.get(0)==Vocabulary.UNKNOWN_WORD_STRING);
//		Assert.assertTrue(vocab2.getWord(0)==Vocabulary.UNKNOWN_WORD_STRING);
//		Assert.assertEquals(vocab2.getID("sample"), Vocabulary.UNKNOWN_WORD);
//		Assert.assertEquals(vocab2.getID(null), Vocabulary.UNKNOWN_WORD);
//		Assert.assertTrue(vocab2.isFixed);
	}

	/**
	 * Writes a one-sentence corpus to a temporary file, loads it into a
	 * vocabulary, and checks that the checked words map to an ID that maps
	 * back to the same word.
	 *
	 * @throws IOException if the temporary corpus file cannot be created,
	 *         written, or read
	 */
	@Test
	public void verifyWordIDs() throws IOException {

		// Adam Lopez's example...
		String corpusString = "it makes him and it mars him , it sets him on and it takes him off .";
//		String queryString = "it persuades him and it disheartens him";

		// A fixed suffix keeps the spaces and colons of Date.toString()
		// out of the temporary file name.
		File sourceFile = File.createTempFile("source", ".txt");
		sourceFile.deleteOnExit();

		PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8");
		try {
			sourcePrintStream.println(corpusString);
		} finally {
			// Close the stream even if writing fails.
			sourcePrintStream.close();
		}

		String sourceFileName = sourceFile.getAbsolutePath();

		Vocabulary vocab = new Vocabulary();
		Vocabulary.initializeVocabulary(sourceFileName, vocab, true);

		String[] corpusWords = {
				"it", "makes", "him", "and", "mars", ",", "sets", "on", "takes", "off"
		};
		for (String word : corpusWords) {
			Assert.assertEquals(vocab.getWord(vocab.getID(word)), word);
		}

//		Assert.assertEquals(vocab.getWord(vocab.getID("persuades")), Vocabulary.UNKNOWN_WORD_STRING);
//		Assert.assertEquals(vocab.getWord(vocab.getID("disheartens")), Vocabulary.UNKNOWN_WORD_STRING);
	}

	/**
	 * Loads {@code data/tiny.en} into two initially equal vocabularies, one
	 * after the other, checking the reported word and sentence counts and
	 * that the vocabularies are equal again once both have been loaded.
	 */
	@Test
	public void loadVocabFromFile() {

		String filename = "data/tiny.en";
		int numSentences = 5;  // Should be 5 sentences in tiny.en
		int numWords = 89;     // Should be 89 words in tiny.en
		int numUniqWords = 60; // Should be 60 unique words in tiny.en

		Vocabulary vocab = new Vocabulary();
		Vocabulary vocab2 = new Vocabulary();

		Assert.assertTrue(vocab.equals(vocab2));
		Assert.assertTrue(vocab2.equals(vocab));
		Assert.assertEquals(vocab, vocab2);

		assertLoadsCorpus(filename, vocab, numWords, numSentences, numUniqWords);

		// Only the first vocabulary has been loaded, so the two must now differ.
		Assert.assertFalse(vocab.equals(vocab2));

		assertLoadsCorpus(filename, vocab2, numWords, numSentences, numUniqWords);

		Assert.assertEquals(vocab, vocab2);
	}

	/**
	 * Asserts the properties shared by every vocabulary that holds only the
	 * built-in symbols.
	 *
	 * @param vocab vocabulary to check
	 */
	private void assertContainsOnlyBuiltInSymbols(Vocabulary vocab) {
		Assert.assertFalse(vocab.intToString.isEmpty());
		Assert.assertFalse(vocab.getWords().isEmpty());
		Assert.assertEquals(vocab.getWords(), vocab.intToString.values());

		Assert.assertEquals(vocab.size(), numBuiltInSymbols);
		Assert.assertEquals(vocab.getWord(Vocabulary.UNKNOWN_WORD), Vocabulary.UNKNOWN_WORD_STRING);

		Assert.assertFalse(vocab.terminalToInt.isEmpty());
		Assert.assertEquals(vocab.terminalToInt.size(), this.numBuiltInTerminals);
	}

	/**
	 * Loads a corpus file into a vocabulary and asserts the reported counts
	 * and the resulting vocabulary size. The test fails, with the underlying
	 * exception attached, if the file cannot be read.
	 *
	 * @param filename     path of the corpus file to load
	 * @param vocab        vocabulary to populate
	 * @param numWords     expected first element of the returned array
	 * @param numSentences expected second element of the returned array
	 * @param numUniqWords expected number of distinct corpus words, excluding
	 *                     the built-in symbols
	 */
	private void assertLoadsCorpus(String filename, Vocabulary vocab,
			int numWords, int numSentences, int numUniqWords) {
		try {
			int[] result = Vocabulary.initializeVocabulary(filename, vocab, true);
			Assert.assertNotNull(result);
			Assert.assertEquals(result.length, 2);
			Assert.assertEquals(result[0], numWords);
			Assert.assertEquals(result[1], numSentences);

//			Assert.assertTrue(vocab.isFixed);
			Assert.assertEquals(vocab.size(), numUniqWords + numBuiltInSymbols);

		} catch (IOException e) {
			// Keep the cause so the failure report shows why loading failed.
			Assert.fail("Could not load file " + filename, e);
		}
	}
}