/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.ditop;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.junit.Before;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelInferencer;
import de.tudarmstadt.ukp.dkpro.core.mallet.lda.MalletLdaTopicModelTrainer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
public class DiTopWriterTest
{
private static final String DITOP_CORPUSNAME = "test";
private static final String TARGET_DITOP = "target/ditop";
private static final File MODEL_FILE = new File("target/mallet/model");
private static final String CAS_DIR = "src/test/resources/txt";
private static final String CAS_FILE_PATTERN = "[+]*.txt";
private static final int N_TOPICS = 10;
private static final int N_ITERATIONS = 50;
private static final String LANGUAGE = "en";
@Before
public void setUp()
throws Exception
{
/* Generate model */
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription estimator = createEngineDescription(
MalletLdaTopicModelTrainer.class,
MalletLdaTopicModelTrainer.PARAM_TARGET_LOCATION, MODEL_FILE,
MalletLdaTopicModelTrainer.PARAM_N_ITERATIONS, N_ITERATIONS,
MalletLdaTopicModelTrainer.PARAM_N_TOPICS, N_TOPICS);
SimplePipeline.runPipeline(reader, segmenter, estimator);
MODEL_FILE.deleteOnExit();
}
@Test
public void testSimple()
throws UIMAException, IOException
{
int expectedNDocuments = 2;
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription inferencer = createEngineDescription(
MalletLdaTopicModelInferencer.class,
MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE);
AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class,
DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP,
DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE,
DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME);
SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter);
/* test whether target files and dirs exist */
File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS);
File topicsFile = new File(contentDir, "topics.csv");
File topicTermT15File = new File(contentDir, "topicTerm-T15.txt");
File topicTermFile = new File(contentDir, "topicTerm.txt");
File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt");
assertTrue(new File(TARGET_DITOP, "config.all").exists());
assertTrue(contentDir.isDirectory());
assertTrue(topicTermT15File.exists());
assertTrue(topicTermFile.exists());
assertTrue(topicTermMatrixFile.exists());
assertTrue(topicsFile.exists());
/* check that file lengths are correct */
assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size());
MODEL_FILE.delete();
}
@Test
public void testCollectionValuesExact()
throws UIMAException, IOException
{
int expectedNDocuments = 2;
String exactName = new File(CAS_DIR).toURI().toString();
String[] collectionValues = new String[] { exactName };
boolean exactMatch = true;
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription inferencer = createEngineDescription(
MalletLdaTopicModelInferencer.class,
MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE);
AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class,
DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP,
DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE,
DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME,
DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues,
DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch);
SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter);
/* test whether target files and dirs exist */
File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS);
File topicsFile = new File(contentDir, "topics.csv");
File topicTermT15File = new File(contentDir, "topicTerm-T15.txt");
File topicTermFile = new File(contentDir, "topicTerm.txt");
File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt");
assertTrue(new File(TARGET_DITOP, "config.all").exists());
assertTrue(contentDir.isDirectory());
assertTrue(topicTermT15File.exists());
assertTrue(topicTermFile.exists());
assertTrue(topicTermMatrixFile.exists());
assertTrue(topicsFile.exists());
/* check that file lengths are correct */
assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size());
MODEL_FILE.delete();
}
@Test
public void testCollectionValuesExactNoMatch()
throws UIMAException, IOException
{
int expectedNDocuments = 0;
String[] collectionValues = new String[] { "file:/home/schnober/workspace/de.tudarmstadt.ukp.dkpro.core-asl/de.tudarmstadt.ukp.dkpro.core.io.ditop/src/test/resources/" };
boolean exactMatch = true;
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription inferencer = createEngineDescription(
MalletLdaTopicModelInferencer.class,
MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE);
AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class,
DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP,
DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE,
DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME,
DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues,
DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch);
SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter);
/* test whether target files and dirs exist */
File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS);
File topicsFile = new File(contentDir, "topics.csv");
File topicTermT15File = new File(contentDir, "topicTerm-T15.txt");
File topicTermFile = new File(contentDir, "topicTerm.txt");
File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt");
assertTrue(new File(TARGET_DITOP, "config.all").exists());
assertTrue(contentDir.isDirectory());
assertTrue(topicTermT15File.exists());
assertTrue(topicTermFile.exists());
assertTrue(topicTermMatrixFile.exists());
assertTrue(topicsFile.exists());
/* check that file lengths are correct */
assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size());
MODEL_FILE.delete();
}
@Test
public void testCollectionValuesNotExact()
throws UIMAException, IOException
{
int expectedNDocuments = 2;
String[] collectionValues = new String[] { "txt" };
boolean exactMatch = false;
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription inferencer = createEngineDescription(
MalletLdaTopicModelInferencer.class,
MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE);
AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class,
DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP,
DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE,
DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME,
DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues,
DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch);
SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter);
/* test whether target files and dirs exist */
File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS);
File topicsFile = new File(contentDir, "topics.csv");
File topicTermT15File = new File(contentDir, "topicTerm-T15.txt");
File topicTermFile = new File(contentDir, "topicTerm.txt");
File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt");
assertTrue(new File(TARGET_DITOP, "config.all").exists());
assertTrue(contentDir.isDirectory());
assertTrue(topicTermT15File.exists());
assertTrue(topicTermFile.exists());
assertTrue(topicTermMatrixFile.exists());
assertTrue(topicsFile.exists());
/* check that file lengths are correct */
assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size());
MODEL_FILE.delete();
}
@Test
public void testCollectionValuesNotExactNoMatch()
throws UIMAException, IOException
{
int expectedNDocuments = 0;
String[] collectionValues = new String[] { "abcd" };
boolean exactMatch = false;
CollectionReaderDescription reader = createReaderDescription(TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, CAS_DIR,
TextReader.PARAM_PATTERNS, CAS_FILE_PATTERN,
TextReader.PARAM_LANGUAGE, LANGUAGE);
AnalysisEngineDescription segmenter = createEngineDescription(BreakIteratorSegmenter.class);
AnalysisEngineDescription inferencer = createEngineDescription(
MalletLdaTopicModelInferencer.class,
MalletLdaTopicModelInferencer.PARAM_MODEL_LOCATION, MODEL_FILE);
AnalysisEngineDescription ditopwriter = createEngineDescription(DiTopWriter.class,
DiTopWriter.PARAM_TARGET_LOCATION, TARGET_DITOP,
DiTopWriter.PARAM_MODEL_LOCATION, MODEL_FILE,
DiTopWriter.PARAM_CORPUS_NAME, DITOP_CORPUSNAME,
DiTopWriter.PARAM_COLLECTION_VALUES, collectionValues,
DiTopWriter.PARAM_COLLECTION_VALUES_EXACT_MATCH, exactMatch);
SimplePipeline.runPipeline(reader, segmenter, inferencer, ditopwriter);
/* test whether target files and dirs exist */
File contentDir = new File(TARGET_DITOP, DITOP_CORPUSNAME + "_" + N_TOPICS);
File topicsFile = new File(contentDir, "topics.csv");
File topicTermT15File = new File(contentDir, "topicTerm-T15.txt");
File topicTermFile = new File(contentDir, "topicTerm.txt");
File topicTermMatrixFile = new File(contentDir, "topicTermMatrix.txt");
assertTrue(new File(TARGET_DITOP, "config.all").exists());
assertTrue(contentDir.isDirectory());
assertTrue(topicTermT15File.exists());
assertTrue(topicTermFile.exists());
assertTrue(topicTermMatrixFile.exists());
assertTrue(topicsFile.exists());
/* check that file lengths are correct */
assertEquals(expectedNDocuments + 1, FileUtils.readLines(topicsFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermT15File).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermFile).size());
assertEquals(N_TOPICS, FileUtils.readLines(topicTermMatrixFile).size());
MODEL_FILE.delete();
}
}