/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.core.api.embeddings.binary; import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; import org.dkpro.core.api.embeddings.VectorizerUtils; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import static org.dkpro.core.api.embeddings.binary.BinaryWordVectorUtils.convertWordVectorsToBinary; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; public class BinaryWordVectorUtilsTest { // TODO: test for very large data (>2GB should be chunked) @Rule public DkproTestContext testContext = new DkproTestContext(); private Map<String, float[]> vectors; @Before public void setUp() { vectors = new HashMap<>(); vectors.put("t1", new float[] { 0.1f, -0.2f, 0.9f }); vectors.put("t2", new float[] { 0.4f, 0.32f, -0.9f }); } @Test public void testConvertWordVectorsToBinary() throws Exception { File binaryTarget = writeBinaryFile(vectors); BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget); assertTrue(vec.contains("t1")); assertTrue(vec.contains("t2")); assertEquals(3, vec.dimensions()); assertEquals(2, vec.size()); assertTrue(vec.isCaseless()); for (String word : vectors.keySet()) { float[] orig = vectors.get(word); float[] conv = vec.vectorize(word); assertTrue("Vectors differ for " + word, Arrays.equals(orig, conv)); } } @Test public void testConvertWordVectorsToBinaryCaseSensitive() throws Exception { vectors.put("T1", new float[] { 0.1f, 0.2f, 0.3f }); File binaryTarget = writeBinaryFile(vectors); BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget); assertTrue(vec.contains("t1")); assertTrue(vec.contains("t2")); assertTrue(vec.contains("T1")); assertFalse(vec.contains("T2")); assertEquals(3, vec.dimensions()); assertEquals(3, vec.size()); assertFalse(vec.isCaseless()); for (String word : vectors.keySet()) { float[] orig = vectors.get(word); float[] conv = vec.vectorize(word); assertTrue("Vectors differ for " + word, Arrays.equals(orig, conv)); } } @Test public void testRandomVector() throws IOException { File binaryTarget = writeBinaryFile(vectors); BinaryVectorizer vec = BinaryVectorizer.load(binaryTarget); float[] randVector = VectorizerUtils.randomVector(3); float[] unk1 = vec.vectorize("unk1"); float[] unk2 = vec.vectorize("unk2"); assertTrue(Arrays.equals(randVector, unk1)); assertTrue(Arrays.equals(randVector, unk2)); assertTrue("Vectors or unknown words should always be the same.", Arrays.equals(unk1, unk2)); } /** * Write a binary vectors file to a testContext-dependent location. * * @return the binary vectors {@link File} * @throws IOException if an I/O error occurs */ private File writeBinaryFile(Map<String, float[]> vectors) throws IOException { File binaryTarget = new File(testContext.getTestOutputFolder(), "binaryTarget"); convertWordVectorsToBinary(vectors, binaryTarget); return binaryTarget; } }