//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmappers.structure; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.List; import org.apache.uima.UIMAException; import org.junit.Test; import uk.gov.dstl.baleen.types.structure.Structure; import uk.gov.dstl.baleen.uima.utils.select.Node; import uk.gov.dstl.baleen.uima.utils.select.NodeVisitor; import uk.gov.dstl.baleen.uima.utils.select.Nodes; public class StructureNodesTest extends AbstractHtmlToStructureTest { @Test public void filter() throws UIMAException { String h = "<p>Excl</p><div class=headline><p>Hello</p><p>There</p></div><div class=headline><h1>Headline</h1></div>"; Node<Structure> doc = createStructure(h); Nodes<Structure> els = doc.select(".headline").select("Paragraph"); assertEquals(2, els.size()); assertEquals("Hello", els.get(0).text()); assertEquals("There", els.get(1).text()); } @Test public void attributes() throws UIMAException { String h = "<img src=foo><img src=bar><img class=foo><img class=bar>"; Node<Structure> doc = createStructure(h); Nodes<Structure> withTarget = doc.select("Figure[target]"); assertEquals(2, withTarget.size()); assertTrue(withTarget.hasAttr("target")); assertFalse(withTarget.hasAttr("class")); assertEquals("foo", withTarget.attr("target")); } @Test public void hasAttr() throws UIMAException { Node<Structure> doc = createStructure("<p title=foo><p title=bar><p class=foo><p class=bar>"); Nodes<Structure> ps = doc.select("Paragraph"); assertTrue(ps.hasAttr("class")); assertFalse(ps.hasAttr("style")); } @Test public void attr() throws UIMAException { Node<Structure> doc = createStructure("<h1>test</h1>"); String classVal = doc.select("Heading").attr("level"); assertEquals("1", classVal); } @Test public void text() throws UIMAException { String h = "<div><p>Hello<p>there<p>world</div>"; Node<Structure> doc = createStructure(h); assertEquals("Hello there world", doc.select("Section > *").text()); } @Test public void hasText() throws UIMAException { Node<Structure> doc = createStructure("<div><p>Hello</p></div><div><p></p></div>"); Nodes<Structure> divs = doc.select("Section"); assertTrue(divs.hasText()); assertFalse(doc.select("Section + Section").hasText()); } @Test public void eq() throws UIMAException { String h = "<p>Hello<p>there<p>world"; Node<Structure> doc = createStructure(h); assertEquals("there", doc.select("Paragraph").eq(1).text()); assertEquals("there", doc.select("Paragraph").get(1).text()); } @Test public void is() throws UIMAException { String h = "<h1>Hello</h1><h2>there</h2>"; Node<Structure> doc = createStructure(h); Nodes<Structure> ps = doc.select("Heading"); assertTrue(ps.is("[level=1]")); assertFalse(ps.is("[level=3]")); } @Test public void parents() throws UIMAException { Node<Structure> doc = createStructure("<div><p>Hello</p></div><p>There</p>"); Nodes<Structure> parents = doc.select("Paragraph").parents(); assertEquals(2, parents.size()); assertEquals("Section", parents.get(0).getTypeName()); assertEquals("Root", parents.get(1).getTypeName()); } @Test public void not() throws UIMAException { Node<Structure> doc = createStructure("<div id=1><p>One</p></div> <div id=2><p><em>Two</em></p></div>"); Nodes<Structure> div1 = doc.select("Section").not(":has(Paragraph > Style)"); assertEquals(1, div1.size()); assertEquals("One", div1.first().text()); Nodes<Structure> div2 = doc.select("Section").not("#1"); assertEquals(1, div2.size()); assertEquals("Two", div2.first().text()); } @Test public void traverse() throws UIMAException { Node<Structure> doc = createStructure("<div><p>Hello</p></div><div>There</div>"); final StringBuilder accum = new StringBuilder(); doc.select("Section").traverse(new NodeVisitor<Structure>() { @Override public void head(Node<Structure> node, int depth) { accum.append("<" + node.getTypeName() + ">"); } @Override public void tail(Node<Structure> node, int depth) { accum.append("</" + node.getTypeName() + ">"); } }); assertEquals("<Section><Paragraph></Paragraph></Section><Section></Section>", accum.toString()); } @Test public void siblings() throws UIMAException { Node<Structure> doc = createStructure( "<div><p>1<p>2<p>3<p>4<p>5<p>6</div><div><p>7<p>8<p>9<p>10<p>11<p>12</div>"); Nodes<Structure> els = doc.select("Paragraph:eq(3)"); // gets p4 and p10 assertEquals(2, els.size()); Nodes<Structure> next = els.next(); assertEquals(2, next.size()); assertEquals("5", next.first().text()); assertEquals("11", next.last().text()); assertEquals(0, els.next("Paragraph:contains(6)").size()); final Nodes<Structure> nextF = els.next("Paragraph:contains(5)"); assertEquals(1, nextF.size()); assertEquals("5", nextF.first().text()); Nodes<Structure> nextA = els.nextAll(); assertEquals(4, nextA.size()); assertEquals("5", nextA.first().text()); assertEquals("12", nextA.last().text()); Nodes<Structure> nextAF = els.nextAll("Paragraph:contains(6)"); assertEquals(1, nextAF.size()); assertEquals("6", nextAF.first().text()); Nodes<Structure> prev = els.prev(); assertEquals(2, prev.size()); assertEquals("3", prev.first().text()); assertEquals("9", prev.last().text()); assertEquals(0, els.prev("Paragraph:contains(1)").size()); final Nodes<Structure> prevF = els.prev("Paragraph:contains(3)"); assertEquals(1, prevF.size()); assertEquals("3", prevF.first().text()); Nodes<Structure> prevA = els.prevAll(); assertEquals(6, prevA.size()); assertEquals("3", prevA.first().text()); assertEquals("7", prevA.last().text()); Nodes<Structure> prevAF = els.prevAll("Paragraph:contains(1)"); assertEquals(1, prevAF.size()); assertEquals("1", prevAF.first().text()); } @Test public void eachText() throws UIMAException { Node<Structure> doc = createStructure( "<div><p>1<p>2<p>3<p>4<p>5<p>6</div><div><p>7<p>8<p>9<p>10<p>11<p>12<p></p></div>"); List<String> divText = doc.select("Section").eachText(); assertEquals(2, divText.size()); assertEquals("123456", divText.get(0)); assertEquals("789101112", divText.get(1)); List<String> pText = doc.select("Paragraph").eachText(); Nodes<Structure> ps = doc.select("Paragraph"); assertEquals(13, ps.size()); assertEquals(12, pText.size()); // not 13, as last doesn't have text assertEquals("1", pText.get(0)); assertEquals("2", pText.get(1)); assertEquals("5", pText.get(4)); assertEquals("7", pText.get(6)); assertEquals("12", pText.get(11)); } @Test public void eachAttr() throws UIMAException { Node<Structure> doc = createStructure( "<div><a href='/foo'>1</a><a href='http://example.com/bar'>2</a><a href=''>3</a><a href='/foo'>4</a><a>5</a>"); List<String> hrefAttrs = doc.select("Link").eachAttr("target"); assertEquals(3, hrefAttrs.size()); assertEquals("/foo", hrefAttrs.get(0)); assertEquals("http://example.com/bar", hrefAttrs.get(1)); assertEquals("/foo", hrefAttrs.get(2)); assertEquals(3, doc.select("Link").size()); } @Test public void empty() throws UIMAException { Nodes<Structure> doc = new Nodes<>(); assertNull(doc.first()); assertNull(doc.last()); assertTrue(doc.isEmpty()); assertTrue(doc.parents().isEmpty()); assertTrue(doc.eachAttr("test").isEmpty()); assertTrue(doc.attr("test").isEmpty()); assertTrue(doc.eq(0).isEmpty()); assertFalse(doc.hasAttr("class")); } }