package edu.berkeley.cs.nlp.ocular.gsm; import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.TILDE_COMBINING; import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet; import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.junit.Test; import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory; import tberg.murphy.indexer.HashMapIndexer; import tberg.murphy.indexer.Indexer; /** * @author Dan Garrette (dhgarrette@gmail.com) */ public class BasicGlyphSubstitutionModelTests { @Test public void test_getSmoothingValue() { double gsmSmoothingCount = 0.1; double gsmElisionSmoothingCountMultiplier = 500.0; Indexer<String> langIndexer = new HashMapIndexer<String>(); langIndexer.index(new String[] {"spanish", "latin"}); langIndexer.lock(); String[] chars = new String[] {" ","-","a","b","c","d","e","f","k","n","o","s","\\'o"}; Indexer<String> charIndexer = new HashMapIndexer<String>(); charIndexer.index(chars); List<Integer> charIndices = new ArrayList<Integer>(); for (String c : chars) charIndices.add(charIndexer.getIndex(c)); Set<Integer> fullCharSet = makeSet(charIndices); @SuppressWarnings("unchecked") Set<Integer>[] activeCharacterSets = new Set[] {fullCharSet, fullCharSet}; charIndexer.getIndex("z"); charIndexer.getIndex(Charset.LONG_S); for (String c : new String[] {"a","b","c","d","e","f","k","n","o","s","z"}) charIndices.add(charIndexer.getIndex(c+TILDE_COMBINING)); charIndexer.lock(); double gsmPower = 2.0; int minCountsForEvalGsm = 2; String outputPath = ""; BasicGlyphSubstitutionModelFactory gsmf = new BasicGlyphSubstitutionModelFactory( gsmSmoothingCount, gsmElisionSmoothingCountMultiplier, langIndexer, charIndexer, activeCharacterSets, gsmPower, minCountsForEvalGsm, outputPath); assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("\\'o"), gsmf.GLYPH_ELISION_TILDE), 1e-9); assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), charIndexer.getIndex("k")), 1e-9); assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9); assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_FIRST_ELIDED), 1e-9); assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("k"), gsmf.GLYPH_TILDE_ELIDED), 1e-9); assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9); assertEquals(gsmSmoothingCount*gsmElisionSmoothingCountMultiplier, gsmf.getSmoothingValue(0, charIndexer.getIndex("n"), gsmf.GLYPH_TILDE_ELIDED), 1e-9); assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("a")), 1e-9); assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex("z")), 1e-9); assertEquals(0.0, gsmf.getSmoothingValue(0, charIndexer.getIndex("a"), charIndexer.getIndex(Charset.LONG_S)), 1e-9); assertEquals(gsmSmoothingCount, gsmf.getSmoothingValue(0, charIndexer.getIndex("s"), charIndexer.getIndex(Charset.LONG_S)), 1e-9); } }