//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmappers; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.uima.UIMAException; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.jsoup.nodes.Element; import org.jsoup.parser.Tag; import org.junit.Test; import com.beust.jcommander.internal.Maps; import uk.gov.dstl.baleen.contentmappers.helpers.AnnotationCollector; import uk.gov.dstl.baleen.types.structure.Anchor; import uk.gov.dstl.baleen.types.structure.Aside; import uk.gov.dstl.baleen.types.structure.Caption; import uk.gov.dstl.baleen.types.structure.DefinitionDescription; import uk.gov.dstl.baleen.types.structure.DefinitionItem; import uk.gov.dstl.baleen.types.structure.DefinitionList; import uk.gov.dstl.baleen.types.structure.Details; import uk.gov.dstl.baleen.types.structure.Document; import uk.gov.dstl.baleen.types.structure.Figure; import uk.gov.dstl.baleen.types.structure.Footer; import uk.gov.dstl.baleen.types.structure.Header; import uk.gov.dstl.baleen.types.structure.Heading; import uk.gov.dstl.baleen.types.structure.Link; import uk.gov.dstl.baleen.types.structure.ListItem; import uk.gov.dstl.baleen.types.structure.Ordered; import uk.gov.dstl.baleen.types.structure.Page; import uk.gov.dstl.baleen.types.structure.Paragraph; import uk.gov.dstl.baleen.types.structure.Preformatted; import uk.gov.dstl.baleen.types.structure.Quotation; import uk.gov.dstl.baleen.types.structure.Section; import uk.gov.dstl.baleen.types.structure.Sheet; import uk.gov.dstl.baleen.types.structure.Slide; import uk.gov.dstl.baleen.types.structure.SlideShow; import uk.gov.dstl.baleen.types.structure.SpreadSheet; import uk.gov.dstl.baleen.types.structure.Style; import uk.gov.dstl.baleen.types.structure.Summary; import 
uk.gov.dstl.baleen.types.structure.Table;
import uk.gov.dstl.baleen.types.structure.TableBody;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.types.structure.TableFooter;
import uk.gov.dstl.baleen.types.structure.TableHeader;
import uk.gov.dstl.baleen.types.structure.TableRow;
import uk.gov.dstl.baleen.types.structure.Unordered;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;

/**
 * Tests for {@link StructuralAnnotations}: each test builds bare jsoup {@link Element}s, passes
 * them through {@code map} and checks the annotations gathered by an {@link AnnotationCollector}.
 */
public class StructuralAnnotationsTest {

  /**
   * Expected annotation types per HTML tag, in the order they should be collected. An empty array
   * means the tag is expected to produce no annotations.
   */
  private static final Map<Tag, Class<?>[]> EXPECTED = new HashMap<>();

  static {
    addExpected("p", Paragraph.class);
    addExpected("h1", Heading.class);
    addExpected("h2", Heading.class);
    addExpected("h3", Heading.class);
    addExpected("h4", Heading.class);
    addExpected("h5", Heading.class);
    addExpected("h6", Heading.class);
    addExpected("ul", Unordered.class);
    addExpected("ol", Ordered.class);
    addExpected("li", ListItem.class);
    addExpected("dl", DefinitionList.class);
    addExpected("dt", DefinitionItem.class);
    addExpected("dd", DefinitionDescription.class);

    // Table
    addExpected("table", Table.class);
    addExpected("thead", TableHeader.class);
    addExpected("tfoot", TableFooter.class);
    addExpected("tbody", TableBody.class);
    addExpected("tr", TableRow.class);
    addExpected("th", TableCell.class);
    addExpected("td", TableCell.class);

    // Images
    addExpected("audio", Figure.class);
    addExpected("video", Figure.class);
    addExpected("embed", Figure.class);
    addExpected("object", Figure.class);
    addExpected("img", Figure.class);
    addExpected("map", Figure.class);
    addExpected("area", Figure.class);
    addExpected("canvas", Figure.class);
    addExpected("figure", Figure.class);
    addExpected("caption", Caption.class);
    addExpected("figcaption", Caption.class);

    // Styling
    addExpected("ins", Style.class);
    addExpected("i", Style.class);
    addExpected("em", Style.class);
    addExpected("b", Style.class);
    addExpected("strong", Style.class);
    addExpected("strike", Style.class);
    addExpected("s", Style.class);
    addExpected("del", Style.class);
    addExpected("sup", Style.class);
    addExpected("sub", Style.class);
    addExpected("small", Style.class);
    addExpected("big", Style.class);
    addExpected("mark", Style.class);

    // Purely structural
    addExpected("aside", Aside.class);
    addExpected("details", Details.class);
    addExpected("summary", Summary.class);
    addExpected("section", Section.class);
    addExpected("div", Section.class);
    addExpected("header", Header.class);
    addExpected("footer", Footer.class);

    addExpected("kbd", Preformatted.class);
    addExpected("samp", Preformatted.class);
    addExpected("code", Preformatted.class);
    addExpected("pre", Preformatted.class);

    // blockquote is the only tag expected to yield two annotations, in this order.
    addExpected("blockquote", Section.class, Quotation.class);
    addExpected("q", Quotation.class);

    addExpectedEmpty(
        "span", "time", "meter", "dfn", "address", "abbr", "cite", "html", "head", "title",
        "meta", "base", "style", "script", "noscript", "link", "hr", "dialog", "nav", "menu",
        "menuitem", "param", "track", "source", "iframe", "form", "input", "textarea", "button",
        "select", "optgroup", "option", "label", "fieldset", "legend", "datalist", "keygen",
        "output", "ruby", "rt", "rp", "progress", "bdo", "bdi");
  }

  /**
   * Checks every entry of {@link #EXPECTED}: an attribute-less element of the given tag must
   * produce exactly the listed annotation types, in order (or nothing for empty entries).
   */
  @Test
  public void testMap() throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();

    for (final Map.Entry<Tag, Class<?>[]> entry : EXPECTED.entrySet()) {
      final Tag tag = entry.getKey();
      final Class<?>[] classes = entry.getValue();
      // Every assertion names the tag so a failure in this data-driven loop is diagnosable.
      final String label = "<" + tag.getName() + ">";

      final AnnotationCollector collector = new AnnotationCollector();
      sa.map(jCas, new Element(tag, ""), collector);
      final List<Annotation> annotations = collector.getAnnotations();

      if (classes == null || classes.length == 0) {
        // A null list and an empty list are both accepted as "no annotations".
        assertTrue(
            label + " should produce no annotations",
            annotations == null || annotations.isEmpty());
        continue;
      }

      assertTrue(label + " should produce annotations", annotations != null);
      assertEquals(label + " annotation count", classes.length, annotations.size());
      for (int i = 0; i < classes.length; i++) {
        assertTrue(
            label + " annotation " + i + " should be a " + classes[i].getSimpleName(),
            classes[i].isInstance(annotations.get(i)));
      }
    }
  }

  /** An {@code a} element without an {@code href} must be mapped to an {@link Anchor}. */
  @Test
  public void testAnchor() throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();

    final Element anchor = new Element(Tag.valueOf("a"), "");
    final AnnotationCollector collector = new AnnotationCollector();
    sa.map(jCas, anchor, collector);

    assertTrue(collector.getAnnotations().get(0) instanceof Anchor);
  }

  /** {@code h1}..{@code h4} must yield {@link Heading}s whose level matches the tag number. */
  @Test
  public void testHeadings() throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();
    final AnnotationCollector collector = new AnnotationCollector();
    final int maxLevel = 4;

    // All headings share one collector, so annotation i corresponds to <h(i+1)>.
    for (int level = 1; level <= maxLevel; level++) {
      sa.map(jCas, new Element(Tag.valueOf("h" + level), ""), collector);
    }

    final List<Annotation> annotations = collector.getAnnotations();
    for (int level = 1; level <= maxLevel; level++) {
      final Heading heading = (Heading) annotations.get(level - 1);
      assertEquals("level of <h" + level + ">", level, heading.getLevel());
    }
  }

  /**
   * An {@code a} element with an {@code href} must be mapped to a {@link Link} whose target is
   * the attribute value, for both an absolute URL and a site-relative path.
   */
  @Test
  public void testLink() throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();

    final Element absolute = new Element(Tag.valueOf("a"), "");
    absolute.attr("href", "http://example.com");
    final Element relative = new Element(Tag.valueOf("a"), "");
    relative.attr("href", "/example.com");

    final AnnotationCollector collector = new AnnotationCollector();
    sa.map(jCas, absolute, collector);
    sa.map(jCas, relative, collector);

    final Annotation first = collector.getAnnotations().get(0);
    assertTrue(first instanceof Link);
    assertEquals("http://example.com", ((Link) first).getTarget());

    final Annotation second = collector.getAnnotations().get(1);
    assertTrue(second instanceof Link);
    assertEquals("/example.com", ((Link) second).getTarget());
  }

  /** An {@code img} element must be mapped to a {@link Figure} whose target is its {@code src}. */
  @Test
  public void testFigure() throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();

    final Element img = new Element(Tag.valueOf("img"), "");
    img.attr("src", "test");

    final AnnotationCollector collector = new AnnotationCollector();
    sa.map(jCas, img, collector);

    final Annotation fig = collector.getAnnotations().get(0);
    assertTrue(fig instanceof Figure);
    assertEquals("test", ((Figure) fig).getTarget());
  }

  /**
   * A {@code main} element is typed by its {@code class} attribute; an unrecognised class
   * ("Another") is expected to fall back to {@link Document}.
   */
  @Test
  public void testMain() throws UIMAException {
    final Map<String, Class<?>> expectedMain = new HashMap<>();
    expectedMain.put("Document", Document.class);
    expectedMain.put("SlideShow", SlideShow.class);
    expectedMain.put("SpreadSheet", SpreadSheet.class);
    expectedMain.put("Another", Document.class);

    assertMappedByCssClass("main", expectedMain);
  }

  /**
   * An {@code article} element is typed by its {@code class} attribute; an unrecognised class
   * ("Another") is expected to fall back to {@link Page}.
   */
  @Test
  public void testArticle() throws UIMAException {
    final Map<String, Class<?>> expectedArticle = new HashMap<>();
    expectedArticle.put("Sheet", Sheet.class);
    expectedArticle.put("Slide", Slide.class);
    expectedArticle.put("Page", Page.class);
    expectedArticle.put("Another", Page.class);

    assertMappedByCssClass("article", expectedArticle);
  }

  /**
   * For each entry, maps a {@code tagName} element carrying the entry's key as its {@code class}
   * attribute and asserts that the first collected annotation is an instance of the entry's value.
   *
   * @param tagName the HTML tag of the element to create
   * @param expectedByCssClass class-attribute value to expected annotation type (no null values)
   * @throws UIMAException if the JCas cannot be obtained or the mapping fails
   */
  private static void assertMappedByCssClass(
      final String tagName, final Map<String, Class<?>> expectedByCssClass) throws UIMAException {
    final JCas jCas = JCasSingleton.getJCasInstance();
    final StructuralAnnotations sa = new StructuralAnnotations();

    for (final Map.Entry<String, Class<?>> entry : expectedByCssClass.entrySet()) {
      final String label = "<" + tagName + " class=\"" + entry.getKey() + "\">";

      final Element element = new Element(Tag.valueOf(tagName), "");
      element.attr("class", entry.getKey());

      final AnnotationCollector collector = new AnnotationCollector();
      sa.map(jCas, element, collector);

      final List<Annotation> annotations = collector.getAnnotations();
      assertTrue(
          label + " should produce an annotation",
          annotations != null && !annotations.isEmpty());
      assertTrue(
          label + " should be a " + entry.getValue().getSimpleName(),
          entry.getValue().isInstance(annotations.get(0)));
    }
  }

  /**
   * Registers the annotation types expected for a tag.
   *
   * @param tagName the HTML tag name
   * @param classes expected annotation types in order; none means "no annotations expected"
   */
  private static void addExpected(final String tagName, final Class<?>... classes) {
    EXPECTED.put(Tag.valueOf(tagName), classes);
  }

  /**
   * Registers tags that are expected to produce no annotations at all.
   *
   * @param tagNames the HTML tag names
   */
  private static void addExpectedEmpty(final String... tagNames) {
    for (final String tagName : tagNames) {
      addExpected(tagName);
    }
  }
}