/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.text;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.junit.Rule;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Unit tests for {@link TokenizedTextWriter}.
 * <p>
 * Each test runs the writer over a small document and compares the produced output
 * file(s) against pre-tokenized reference files under
 * {@code src/test/resources/tokenizedTexts/}.
 */
public class TokenizedTextWriterTest
{
    @Rule
    public DkproTestContext context = new DkproTestContext();

    /**
     * With {@code PARAM_SINGULAR_TARGET}, all output is written to a single target file
     * whose content matches the reference tokenization.
     */
    @Test
    public void testDefault()
        throws UIMAException, IOException
    {
        String text = "This is the 1st sentence .\nHere is another sentence .";
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterTest.out");
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    /**
     * Without {@code PARAM_SINGULAR_TARGET}, the writer creates one file per document
     * (named after the document id) inside the target directory.
     */
    @Test
    public void testMultipleFiles()
        throws UIMAException, IOException
    {
        String text = "This is the 1st sentence .\nHere is another sentence .";
        File targetDir = context.getTestOutputFolder();
        File targetFile = new File(targetDir, "id.txt");
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetDir,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, false,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(targetDir.isDirectory());
        assertTrue(targetFile.exists());
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    /**
     * Explicitly selecting the {@link Token} type via {@code PARAM_FEATURE_PATH} must
     * yield the same output as the default configuration.
     */
    @Test
    public void testTokens()
        throws UIMAException, IOException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterTokensTest.out");
        String text = "This is the 1st sentence .\nHere is another sentence .";
        // getName() is equivalent to getTypeName() for a plain class; used for
        // consistency with testLemmas().
        String typeName = Token.class.getName();
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_FEATURE_PATH, typeName,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    /**
     * A feature path into {@code Token/lemma/value} writes the lemma values instead of
     * the covered token text.
     */
    @Test
    public void testLemmas()
        throws IOException, UIMAException
    {
        File targetFile = new File(context.getTestOutputFolder(), "lemmas.out");
        targetFile.deleteOnExit();
        String expected = "lemma1 lemma2";
        int expectedLines = 1;
        String featurePath = Token.class.getName() + "/lemma/value";

        // Build a minimal CAS by hand: two tokens with attached lemmas in one sentence.
        JCas jCas = JCasFactory.createJCas();
        jCas.setDocumentText("token1 token2");
        DocumentMetaData metaData = DocumentMetaData.create(jCas);
        metaData.setDocumentId("lemmasTest");
        metaData.addToIndexes(jCas);

        Token token1 = new Token(jCas, 0, 6);
        Token token2 = new Token(jCas, 7, 13);
        Lemma lemma1 = new Lemma(jCas, 0, 6);
        lemma1.setValue("lemma1");
        Lemma lemma2 = new Lemma(jCas, 7, 13);
        lemma2.setValue("lemma2");
        token1.setLemma(lemma1);
        token2.setLemma(lemma2);
        token1.addToIndexes(jCas);
        token2.addToIndexes(jCas);
        lemma1.addToIndexes(jCas);
        lemma2.addToIndexes(jCas);

        Sentence sentence = new Sentence(jCas, 0, 13);
        sentence.addToIndexes(jCas);

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_FEATURE_PATH, featurePath,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        SimplePipeline.runPipeline(jCas, writer);

        List<String> output = Files.readAllLines(targetFile.toPath());
        assertEquals(expectedLines, output.size());
        assertEquals(expected, output.get(0));
    }

    /**
     * Tokens listed in the stopwords file must be omitted from the output.
     */
    @Test
    public void testStopwords()
        throws UIMAException, IOException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNoStopwords.out");
        targetFile.deleteOnExit();
        File tokenized = new File(
                "src/test/resources/tokenizedTexts/textTokenizedNoStopwords.txt");
        String text = "This is the 1st sentence .\nHere is another sentence .";
        String stopwordsFile = "src/test/resources/stopwords_en.txt";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_STOPWORDS_FILE, stopwordsFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    /**
     * Tokens matching {@code PARAM_NUMBER_REGEX} are replaced/removed according to the
     * writer's number handling.
     */
    @Test
    public void testNumbers()
        throws UIMAException, IOException
    {
        // Fixed copy-paste defect: the output file was previously named
        // "TokenizedTextWriterNoStopwords.out", shadowing the stopwords test.
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNumbers.out");
        targetFile.deleteOnExit();
        File tokenized = new File(
                "src/test/resources/tokenizedTexts/textTokenizedNoNumbers.txt");
        String text = "This is 1 sentence .\nHere is 2 sentences , or even 2.5 .";
        String numbersRegex = "^[0-9]+(\\.[0-9]*)?$";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_NUMBER_REGEX, numbersRegex,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    /**
     * With {@code PARAM_COVERING_TYPE} set to {@code null}, output is not segmented by
     * sentence — the whole document goes onto one line.
     */
    @Test
    public void testNoSentences()
        throws IOException, UIMAException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNoSentences.out");
        File tokenized = new File("src/test/resources/tokenizedTexts/textNoSentences.txt");
        String text = "This is the 1st sentence . Here is another sentence .";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true,
                TokenizedTextWriter.PARAM_COVERING_TYPE, null);
        TestRunner.runTest("id", writer, "en", text);

        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }
}