/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.core.api.embeddings.text; import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext; import org.dkpro.core.api.embeddings.binary.BinaryVectorizer; import org.junit.Rule; import org.junit.Test; import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.util.Arrays; import java.util.Map; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class TextFormatVectorizerUtilsTest { @Rule public DkproTestContext testContext = new DkproTestContext(); @Test public void testReadEmbeddingFileTxt() throws IOException, URISyntaxException { File modelFile = new File("src/test/resources/dummy.vec"); int expectedSize = 699; int expectedDimensions = 50; boolean hasHeader = false; Map<String, float[]> embeddings = TextFormatVectorizerUtils .readEmbeddingFileTxt(modelFile, hasHeader); assertEquals(expectedSize, embeddings.size()); embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length)); } @Test public void testReadEmbeddingFileTxtWithHeader() throws IOException, URISyntaxException { File modelFile = new File("src/test/resources/dummy_with_header.vec"); int expectedSize = 699; int expectedDimensions = 50; boolean hasHeader = true; Map<String, float[]> embeddings = TextFormatVectorizerUtils .readEmbeddingFileTxt(modelFile, hasHeader); assertEquals(expectedSize, embeddings.size()); embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length)); } @Test public void testReadEmbeddingFileTxtCompressed() throws IOException, URISyntaxException { File modelFile = new File("src/test/resources/embeddings.gz"); int expectedSize = 699; int expectedDimensions = 50; boolean hasHeader = false; Map<String, float[]> embeddings = TextFormatVectorizerUtils .readEmbeddingFileTxt(modelFile, hasHeader); assertEquals(expectedSize, embeddings.size()); embeddings.values().forEach(vector -> assertEquals(expectedDimensions, vector.length)); } @Test public void testConvertMalletEmbeddingsToBinary() throws IOException { File modelFile = new File("src/test/resources/dummy.vec"); File targetFile = new File(testContext.getTestOutputFolder(), "binary"); Map<String, float[]> embeddings = TextFormatVectorizerUtils .readEmbeddingFileTxt(modelFile, false); TextFormatVectorizerUtils.convertMalletEmbeddingsToBinary(modelFile, targetFile); BinaryVectorizer vec = BinaryVectorizer.load(targetFile); for (String token : embeddings.keySet()) { assertTrue("Arrays to not match for token " + token, Arrays.equals(embeddings.get(token), vec.vectorize(token))); } } // TODO add tests for caseless }