//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.factory.UimaContextFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.Test;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import io.committed.krill.extraction.Extraction;
import io.committed.krill.extraction.exception.ExtractionException;
import io.committed.krill.extraction.impl.DefaultExtraction;
import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.uima.BaleenContentExtractor;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;
/**
 * Tests for {@link StructureContentExtractor}.
 *
 * <p>All tests run against {@link TestStructureContentExtractor}, which ignores the supplied
 * stream and always yields the same small HTML document, so the expectations below are fixed.
 */
public class StructureContentExtractorTest {

  /** Source name handed to the extractor in every test. */
  private static final String SOURCE = "source";

  /** Document text expected from the stubbed HTML (heading and paragraph on separate lines). */
  private static final String EXPECTED_TEXT = "Title\nExample";

  /**
   * Extractor stub whose {@link #extract(InputStream, String)} returns a hard-coded HTML document
   * and a single {@code test=true} metadata entry, regardless of its arguments.
   */
  public static class TestStructureContentExtractor extends StructureContentExtractor {

    /**
     * Returns the canned extraction; neither {@code stream} nor {@code source} is read.
     *
     * @param stream ignored (tests pass {@code null})
     * @param source ignored
     * @return a fixed extraction containing one heading and one paragraph
     * @throws ExtractionException never thrown by this stub; declared to match the overridden
     *     method
     */
    @Override
    protected Extraction extract(InputStream stream, String source)
        throws ExtractionException {
      Multimap<String, String> metadata = LinkedHashMultimap.create();
      metadata.put("test", "true");
      return new DefaultExtraction(
          "<html><head><meta name=\"test\" content=\"true\" /></head><body><h1>Title</h1>\n<p>Example</p></body></html>",
          metadata);
    }
  }

  /**
   * Creates a stub extractor and initialises it with the given parameters, without processing
   * anything.
   *
   * @param params configuration parameters passed to {@code initialize}
   * @return the initialised extractor
   * @throws UIMAException if the context cannot be created or initialisation fails
   * @throws IOException declared for parity with the test methods
   */
  private static BaleenContentExtractor initializeExtractor(Map<String, Object> params)
      throws UIMAException, IOException {
    UimaContext context = UimaContextFactory.createUimaContext();
    BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
    contentExtractor.initialize(context, params);
    return contentExtractor;
  }

  /**
   * Initialises a stub extractor with the given parameters and runs it once against the JCas
   * obtained from {@link JCasSingleton}.
   *
   * @param params configuration parameters passed to {@code initialize}
   * @return the JCas populated by the extractor
   * @throws UIMAException if the context/JCas cannot be obtained or initialisation fails
   * @throws IOException if processing the (stubbed) stream fails
   */
  private static JCas runExtraction(Map<String, Object> params)
      throws UIMAException, IOException {
    UimaContext context = UimaContextFactory.createUimaContext();
    // NOTE(review): presumably returns a shared, reset JCas - confirm against JCasSingleton.
    JCas jCas = JCasSingleton.getJCasInstance();
    BaleenContentExtractor contentExtractor = new TestStructureContentExtractor();
    contentExtractor.initialize(context, params);
    // The stream is null on purpose: the stub's extract() never reads it.
    contentExtractor.processStream(null, SOURCE, jCas);
    return jCas;
  }

  /**
   * Counts the {@link Metadata} annotations whose key equals {@code key} and whose value contains
   * {@code valueFragment}.
   *
   * @param jCas the JCas to search
   * @param key exact metadata key to match
   * @param valueFragment substring that the metadata value must contain
   * @return the number of matching annotations
   */
  private static long countMetadata(JCas jCas, String key, String valueFragment) {
    return JCasUtil.select(jCas, Metadata.class).stream()
        // Constant-first equals so an annotation with a null key cannot throw.
        .filter(m -> key.equals(m.getKey()) && m.getValue().contains(valueFragment))
        .count();
  }

  /**
   * With default settings the stubbed HTML becomes the expected text, exactly one paragraph
   * covering "Example", and three {@code baleen:content-*} metadata entries.
   */
  @Test
  public void test() throws UIMAException, IOException {
    JCas jCas = runExtraction(Collections.emptyMap());

    assertEquals(EXPECTED_TEXT, jCas.getDocumentText());

    Collection<Paragraph> paragraphs = JCasUtil.select(jCas, Paragraph.class);
    // Expected value first, so a failure message reports expected/actual the right way round.
    assertEquals(1, paragraphs.size());

    Paragraph paragraph = paragraphs.iterator().next();
    // "Title" occupies offsets 0-5 and the newline offset 5, so "Example" spans 6-13.
    assertEquals(6, paragraph.getBegin());
    assertEquals(13, paragraph.getEnd());

    List<Metadata> contentMeta = JCasUtil.select(jCas, Metadata.class).stream()
        .filter(m -> m.getKey().startsWith("baleen:content-"))
        .collect(Collectors.toList());
    assertEquals(3, contentMeta.size());
  }

  /** A configured content manipulator is reported in the content-manipulators metadata. */
  @Test
  public void testInitializingManipulator() throws UIMAException, IOException {
    Map<String, Object> params = new HashMap<>();
    params.put("contentManipulators", new String[] {"RemoveEmptyText"});

    JCas jCas = runExtraction(params);

    assertEquals(1, countMetadata(jCas, "baleen:content-manipulators", "RemoveEmptyText"));
  }

  /** A configured content mapper is reported in the content-mappers metadata. */
  @Test
  public void testInitializingMapper() throws UIMAException, IOException {
    Map<String, Object> params = new HashMap<>();
    params.put("contentMappers", new String[] {"MetaTags"});

    JCas jCas = runExtraction(params);

    assertEquals(1, countMetadata(jCas, "baleen:content-mappers", "MetaTags"));
  }

  /** An unknown mapper name must make initialisation fail. */
  @Test(expected = ResourceInitializationException.class)
  public void testInitializingBadMapper() throws UIMAException, IOException {
    Map<String, Object> params = new HashMap<>();
    params.put("contentMappers", new String[] {"DoesNotExist"});

    initializeExtractor(params);
  }

  /** Supplying a manipulator class where a mapper is expected must not fail initialisation. */
  @Test
  public void testInitializingManipulatorAsMapper() throws UIMAException, IOException {
    Map<String, Object> params = new HashMap<>();
    params.put("contentMappers",
        new String[] {"uk.gov.dstl.baleen.contentmanipulators.HeaderAndFooterRemover"});

    initializeExtractor(params);
    // TODO Could test its not actually used here...
  }

  /** With default settings at least one {@link Text} annotation is produced. */
  @Test
  public void testTextBlocksEnabled() throws Exception {
    JCas jCas = runExtraction(Collections.emptyMap());

    assertEquals(EXPECTED_TEXT, jCas.getDocumentText());

    Collection<Text> textBlocks = JCasUtil.select(jCas, Text.class);
    assertTrue(textBlocks.size() > 0);
  }

  /** Setting the extract-text-blocks field to "false" suppresses all {@link Text} annotations. */
  @Test
  public void testDisableTextBlocks() throws Exception {
    Map<String, Object> params = new HashMap<>();
    params.put(StructureContentExtractor.FIELD_EXTRACT_TEXT_BLOCKS, "false");

    JCas jCas = runExtraction(params);

    assertEquals(EXPECTED_TEXT, jCas.getDocumentText());

    Collection<Text> textBlocks = JCasUtil.select(jCas, Text.class);
    assertTrue(textBlocks.isEmpty());
  }
}