//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.uima.UimaContext;
import org.apache.uima.fit.factory.UimaContextFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.junit.Test;

import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.uima.BaleenContentExtractor;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;

public class TikaContentExtractorTest {

    // Extract a Word document and check both the document text and the Tika metadata annotations
    @Test
    public void testTikaWord() throws Exception {
        UimaContext context = UimaContextFactory.createUimaContext();
        JCas jCas = JCasSingleton.getJCasInstance();

        BaleenContentExtractor contentExtractor = new TikaContentExtractor();

        File f = new File(getClass().getResource("test.docx").getPath());

        contentExtractor.initialize(context, Collections.emptyMap());
        try (InputStream is = new FileInputStream(f)) {
            contentExtractor.processStream(is, f.getPath(), jCas);
        }
        contentExtractor.destroy();

        assertEquals(
                "Test Document\nThis is a simple test document, with a title and a single sentence.\n",
                jCas.getDocumentText());

        Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class);
        assertEquals(44, metadata.size());

        Map<String, String> metadataMap = new HashMap<>();
        for (Metadata md : metadata) {
            metadataMap.put(md.getKey(), md.getValue());
        }

        assertTrue(metadataMap.containsKey("Page-Count"));
        assertEquals("1", metadataMap.get("Page-Count"));
        assertTrue(metadataMap.containsKey("meta:author"));
        assertEquals("James Baker", metadataMap.get("meta:author"));
    }

    // Extract a plain text file and check the text and the metadata count
    @Test
    public void testTikaText() throws Exception {
        UimaContext context = UimaContextFactory.createUimaContext();
        JCas jCas = JCasSingleton.getJCasInstance();

        BaleenContentExtractor contentExtractor = new TikaContentExtractor();

        File f = new File(getClass().getResource("test.txt").getPath());

        contentExtractor.initialize(context, Collections.emptyMap());
        try (InputStream is = new FileInputStream(f)) {
            contentExtractor.processStream(is, f.getPath(), jCas);
        }
        contentExtractor.destroy();

        assertEquals("Hello World\n", jCas.getDocumentText());
        assertEquals(4, JCasUtil.select(jCas, Metadata.class).size());
    }

    // Lines that wrap in the source document should be joined into a single line in the extracted text
    @Test
    public void testTikaWrappingDocx() throws Exception {
        UimaContext context = UimaContextFactory.createUimaContext();
        JCas jCas = JCasSingleton.getJCasInstance();

        BaleenContentExtractor contentExtractor = new TikaContentExtractor();

        File f = new File(getClass().getResource("wrappingLines.docx").getPath());

        contentExtractor.initialize(context, Collections.emptyMap());
        try (InputStream is = new FileInputStream(f)) {
            contentExtractor.processStream(is, f.getPath(), jCas);
        }
        contentExtractor.destroy();

        assertEquals(
                "Test Document\nThis is my test document, which has a sentence that is long enough to wrap "
                        + "over two lines but we want it to appear as a single line when we extract the "
                        + "content.\nThis is a second paragraph. This is a third sentence, but still the "
                        + "second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n",
                jCas.getDocumentText());
    }

    // A corrupt file should yield the extractor's placeholder text rather than an exception
    @Test
    public void testTikaCorruptFile() throws Exception {
        UimaContext context = UimaContextFactory.createUimaContext();
        JCas jCas = JCasSingleton.getJCasInstance();

        BaleenContentExtractor contentExtractor = new TikaContentExtractor();

        File f = new File(getClass().getResource("corrupt.docx").getPath());

        contentExtractor.initialize(context, Collections.emptyMap());
        try (InputStream is = new FileInputStream(f)) {
            contentExtractor.processStream(is, f.getPath(), jCas);
        }
        contentExtractor.destroy();

        assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, jCas.getDocumentText());
    }
}