//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentextractors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.apache.uima.UimaContext; import org.apache.uima.fit.factory.UimaContextFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.junit.Test; import uk.gov.dstl.baleen.types.metadata.Metadata; import uk.gov.dstl.baleen.uima.BaleenContentExtractor; import uk.gov.dstl.baleen.uima.testing.JCasSingleton; public class TearlineContentExtractorTest { @Test public void testTearline() throws Exception{ UimaContext context = UimaContextFactory.createUimaContext(); JCas jCas = JCasSingleton.getJCasInstance(); BaleenContentExtractor contentExtractor = new TearlineContentExtractor(); contentExtractor.initialize(context, Collections.emptyMap()); String[] files = new String[]{"1.docx", "2.docx", "3.docx", "4.docx", "5.doc", "6.pdf"}; for(String file : files){ File f = new File(getClass().getResource("tearline/" + file).getPath()); try( InputStream is = new FileInputStream(f); ){ contentExtractor.processStream(is, f.getPath(), jCas); assertEquals("This is the first tearline.", jCas.getDocumentText()); jCas.reset(); } } contentExtractor.destroy(); } @Test public void testNoTearline() throws Exception{ UimaContext context = UimaContextFactory.createUimaContext(); JCas jCas = JCasSingleton.getJCasInstance(); BaleenContentExtractor contentExtractor = new TearlineContentExtractor(); contentExtractor.initialize(context, Collections.emptyMap()); File f = new File(getClass().getResource("tearline/notearline.docx").getPath()); try( InputStream is = new FileInputStream(f); ){ contentExtractor.processStream(is, f.getPath(), jCas); assertEquals("This document has no tearline.", jCas.getDocumentText()); jCas.reset(); } contentExtractor.destroy(); } @Test public void testBoilerplate() throws Exception{ UimaContext context = UimaContextFactory.createUimaContext(); JCas jCas = JCasSingleton.getJCasInstance(); Map<String, Object> params = new HashMap<>(); params.put("boilerplate", new String[]{"[aeiou]"}); BaleenContentExtractor contentExtractor = new TearlineContentExtractor(); contentExtractor.initialize(context, params); File f = new File(getClass().getResource("tearline/notearline.docx").getPath()); try( InputStream is = new FileInputStream(f); ){ contentExtractor.processStream(is, f.getPath(), jCas); assertEquals("Ths dcmnt hs n trln.", jCas.getDocumentText()); jCas.reset(); } contentExtractor.destroy(); } @Test public void testMetadata() throws Exception{ UimaContext context = UimaContextFactory.createUimaContext(); JCas jCas = JCasSingleton.getJCasInstance(); BaleenContentExtractor contentExtractor = new TearlineContentExtractor(); contentExtractor.initialize(context, Collections.emptyMap()); File f = new File(getClass().getResource("tearline/1.docx").getPath()); try( InputStream is = new FileInputStream(f); ){ contentExtractor.processStream(is, f.getPath(), jCas); assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty()); } contentExtractor.destroy(); } @Test public void testCustomTearline() throws Exception{ UimaContext context = UimaContextFactory.createUimaContext(); JCas jCas = JCasSingleton.getJCasInstance(); Map<String, Object> params = new HashMap<>(); params.put("tearline", "Customer Form:"); BaleenContentExtractor contentExtractor = new TearlineContentExtractor(); contentExtractor.initialize(context, params); File f = new File(getClass().getResource("tearline/customtearline.docx").getPath()); try( InputStream is = new FileInputStream(f); ){ contentExtractor.processStream(is, f.getPath(), jCas); assertEquals("This is the first tearline.", jCas.getDocumentText()); jCas.reset(); } contentExtractor.destroy(); } }