//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors.helpers;
import static org.junit.Assert.assertEquals;
import java.util.Collection;
import java.util.Collections;
import org.apache.uima.UIMAException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;
import uk.gov.dstl.baleen.contentmappers.helpers.AnnotationCollector;
import uk.gov.dstl.baleen.contentmappers.helpers.ContentMapper;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;
/**
 * Tests for {@link DocumentToJCasConverter}: checks the document text written to the JCas and,
 * when a {@link ContentMapper} is supplied, the annotations that result from it.
 */
public class DocumentToJCasConverterTest {

  /** HTML body fragment shared by the tests: one paragraph followed by preformatted text. */
  private static final String HTML = "<p>Hello</p><pre>Something\nFormatted</pre>";

  /** Document text the tests expect the converter to produce for {@link #HTML}. */
  private static final String EXPECTED_TEXT = "HelloSomething\nFormatted";

  /**
   * Converts the fragment with an empty mapper list and checks the resulting document text.
   *
   * @throws UIMAException if the JCas cannot be obtained
   */
  @Test
  public void test() throws UIMAException {
    // NOTE(review): presumably JCasSingleton hands back a clean JCas for each call - confirm.
    JCas jCas = JCasSingleton.getJCasInstance();
    DocumentToJCasConverter converter = new DocumentToJCasConverter(Collections.emptyList());

    Document doc = Jsoup.parseBodyFragment(HTML);
    converter.apply(doc, jCas);

    assertEquals(EXPECTED_TEXT, jCas.getDocumentText());
  }

  /**
   * Converts the fragment with a single {@link MapOnlyP} mapper and checks that the document text
   * is unchanged and that exactly one {@link Paragraph} annotation exists, covering "Hello".
   *
   * @throws UIMAException if the JCas cannot be obtained
   */
  @Test
  public void testWithSimpleMapper() throws UIMAException {
    JCas jCas = JCasSingleton.getJCasInstance();
    DocumentToJCasConverter converter =
        new DocumentToJCasConverter(Collections.singletonList(new MapOnlyP()));

    Document doc = Jsoup.parseBodyFragment(HTML);
    converter.apply(doc, jCas);

    assertEquals(EXPECTED_TEXT, jCas.getDocumentText());

    Collection<Paragraph> paragraphs = JCasUtil.select(jCas, Paragraph.class);
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
    // failure messages report the right value as "expected".
    assertEquals(1, paragraphs.size());

    Paragraph paragraph = paragraphs.iterator().next();
    assertEquals("Hello", paragraph.getCoveredText());
  }

  /**
   * Minimal {@link ContentMapper} used by the tests: it adds a {@link Paragraph} annotation for
   * elements whose tag name is "p" (compared case-insensitively) and ignores every other element.
   */
  public static class MapOnlyP implements ContentMapper {

    /**
     * Adds a new {@link Paragraph} to the collector when the element is a "p" tag.
     *
     * @param jCas the JCas the annotation is created against
     * @param element the HTML element being mapped
     * @param collector receives the annotation created for the element
     */
    @Override
    public void map(JCas jCas, Element element, AnnotationCollector collector) {
      if ("p".equalsIgnoreCase(element.tagName())) {
        // No begin/end is set here; presumably the converter assigns the span - confirm.
        collector.add(new Paragraph(jCas));
      }
    }
  }
}