//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmappers.structure; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.apache.uima.UIMAException; import org.junit.Test; import uk.gov.dstl.baleen.types.structure.Structure; import uk.gov.dstl.baleen.uima.utils.select.Node; import uk.gov.dstl.baleen.uima.utils.select.Nodes; public class SelectorTest extends AbstractHtmlToStructureTest { @Test public void testByTypeName() throws UIMAException { // should be case insensitive Nodes<Structure> els = createStructure("<div><div id=2><p>Hello</p></div></div><DIV id=3>").select("Section"); assertEquals(3, els.size()); assertEquals("Hello", els.get(0).text()); assertEquals("Hello", els.get(1).text()); assertEquals("", els.get(2).text()); Nodes<Structure> none = createStructure("<div id=1><div id=2><p>Hello</p></div></div><div id=3>").select("Style"); assertEquals(0, none.size()); } @Test public void testByAttribute() throws UIMAException { String h = "<a href=Foo />1<a href=Bar />2<a Style=Qux />3<a href=Bam />4<a href=SLAM />5" + "<a href='with spaces'/>"; Node<Structure> doc = createStructure(h); Nodes<Structure> withTitle = doc.select("[target]"); assertEquals(5, withTitle.size()); Nodes<Structure> foo = doc.select("[TARGET=foo]"); assertEquals(1, foo.size()); Nodes<Structure> foo2 = doc.select("[target=\"foo\"]"); assertEquals(1, foo2.size()); Nodes<Structure> foo3 = doc.select("[target=\"Foo\"]"); assertEquals(1, foo3.size()); Nodes<Structure> dataName = doc.select("[target=\"with spaces\"]"); assertEquals(1, dataName.size()); assertEquals("with spaces", dataName.first().attr("target")); Nodes<Structure> not = doc.select("Link[target!=bar]"); assertEquals(4, not.size()); assertEquals("Foo", not.first().attr("target")); Nodes<Structure> starts = doc.select("[target^=ba]"); assertEquals(2, starts.size()); assertEquals("Bar", starts.first().attr("target")); assertEquals("Bam", starts.last().attr("target")); Nodes<Structure> ends = doc.select("[target$=am]"); assertEquals(2, ends.size()); assertEquals("Bam", ends.first().attr("target")); assertEquals("SLAM", ends.last().attr("target")); Nodes<Structure> contains = doc.select("[target*=a]"); assertEquals(4, contains.size()); assertEquals("Bar", contains.first().attr("target")); assertEquals("with spaces", contains.last().attr("target")); } @Test public void testByAttributeStarting() throws UIMAException { Node<Structure> doc = createStructure("<a id=1 href=jsoup>Hello</a><em>There</em><a href=3>No</a>"); Nodes<Structure> withData = doc.select("[^tar]"); assertEquals(2, withData.size()); assertEquals("Hello", withData.first().text()); assertEquals("No", withData.last().text()); withData = doc.select("Style[^decor]"); assertEquals(1, withData.size()); assertEquals("There", withData.first().text()); } @Test public void testByAttributeRegex() throws UIMAException { Node<Structure> doc = createStructure( "<p><img src=foo.png id=1>1<img src=bar.jpg id=2>2<img src=qux.JPEG id=3>3<img src=old.gif>4<img>5</p>"); Nodes<Structure> imgs = doc.select("Figure[target~=(?i)\\.(png|jpe?g)]"); assertEquals(3, imgs.size()); assertEquals("foo.png", imgs.get(0).attr("target")); assertEquals("bar.jpg", imgs.get(1).attr("target")); assertEquals("qux.JPEG", imgs.get(2).attr("target")); } @Test public void testByAttributeRegexCharacterClass() throws UIMAException { Node<Structure> doc = createStructure( "<p><img src=foo.png id=1><img src=bar.jpg id=2><img src=qux.JPEG id=3><img src=old.gif id=4></p>"); Nodes<Structure> imgs = doc.select("Figure[target~=[o]]"); assertEquals(2, imgs.size()); assertEquals("foo.png", imgs.get(0).attr("target")); assertEquals("old.gif", imgs.get(1).attr("target")); } @Test public void testByAttributeRegexCombined() throws UIMAException { Node<Structure> doc = createStructure("<div><a href=x>Hello</a></div>"); Nodes<Structure> els = doc.select("Section Link[target~=x|y]"); assertEquals(1, els.size()); assertEquals("Hello", els.text()); } @Test public void testCombinedWithContains() throws UIMAException { Node<Structure> doc = createStructure("<p id=1>One</p><p>Two +</p><p>Three +</p>"); Nodes<Structure> els = doc.select("Paragraph:nth-of-type(1) + :contains(+)"); assertEquals(1, els.size()); assertEquals("Two +", els.text()); assertEquals("Paragraph", els.first().getTypeName()); } @Test public void testAllNodes() throws UIMAException { String h = "<div><p>Hello</p><p><b>there</b></p></div>"; Node<Structure> doc = createStructure(h); Nodes<Structure> allDoc = doc.select("*"); Nodes<Structure> allUnderDiv = doc.select("Section *"); assertEquals(5, allDoc.size()); assertEquals(3, allUnderDiv.size()); assertEquals("Paragraph", allUnderDiv.first().getTypeName()); } @Test public void testGroupOr() throws UIMAException { String h = "<a href=foo />1<a href=bar />2<div />3<p>4</p>5<img />6<img src=qux>"; Node<Structure> doc = createStructure(h); Nodes<Structure> els = doc.select("Link,Section,[target]"); assertEquals(4, els.size()); assertEquals("Link", els.get(0).getTypeName()); assertEquals("foo", els.get(0).attr("target")); assertEquals("Link", els.get(1).getTypeName()); assertEquals("bar", els.get(1).attr("target")); assertEquals("Section", els.get(2).getTypeName()); assertTrue(els.get(2).attr("target").length() == 0); // missing attributes come back as empty // string assertFalse(els.get(2).hasAttr("title")); assertEquals("Figure", els.get(3).getTypeName()); assertEquals("qux", els.get(3).attr("target")); } @Test public void testGroupOrAttribute() throws UIMAException { String h = "<h1 /><h2 /><img src=foo /><img src=bar />"; Nodes<Structure> els = createStructure(h).select("[level],[target=foo]"); assertEquals(3, els.size()); assertEquals("1", els.get(0).attr("level")); assertEquals("2", els.get(1).attr("level")); assertEquals("foo", els.get(2).attr("target")); } @Test public void descendant() throws UIMAException { String h = "<div class=head><p class=first>Hello</p><p>There</p></div><p>None</p>"; Node<Structure> root = createStructure(h); Nodes<Structure> els = root.select("Section Paragraph"); assertEquals(2, els.size()); assertEquals("Hello", els.get(0).text()); assertEquals("There", els.get(1).text()); Nodes<Structure> p = root.select("Paragraph:nth-of-type(1)"); assertEquals(2, p.size()); assertEquals("Hello", p.get(0).text()); assertEquals("None", p.get(1).text()); Nodes<Structure> empty = root.select("Paragraph Paragraph"); assertEquals(0, empty.size()); } @Test public void and() throws UIMAException { String h = "<a href=foo><img src=foo>Hello</img></a>"; Node<Structure> doc = createStructure(h); Nodes<Structure> a = doc.select("Link[target=foo]"); assertEquals(1, a.size()); assertEquals("Link", a.first().getTypeName()); Nodes<Structure> img = doc.select("Link [target=foo]"); assertEquals(1, img.size()); assertEquals("Figure", img.first().getTypeName()); } @Test public void deeperDescendant() throws UIMAException { String h = "<div class=head><p><a href=first>Hello</a></div><div class=head><p class=first><a>Another</a><p>Again</div>"; Node<Structure> root = createStructure(h); Nodes<Structure> els = root.select("Section Paragraph [target=first]"); assertEquals(1, els.size()); assertEquals("Hello", els.first().text()); assertEquals("Link", els.first().getTypeName()); Nodes<Structure> aboveRoot = root.select("Document Paragraph [target=first]"); assertEquals(0, aboveRoot.size()); } @Test public void parentChildElement() throws UIMAException { String h = "<div id=1>1<div>2<div>3</div></div></div><div>4</div>"; Node<Structure> doc = createStructure(h); Nodes<Structure> divs = doc.select("Section > Section"); assertEquals(2, divs.size()); assertEquals("23", divs.get(0).text()); // 2 is child of 1 assertEquals("3", divs.get(1).text()); // 3 is child of 2 Nodes<Structure> div2 = doc.select("Section#1 > Section"); assertEquals(1, div2.size()); assertEquals("23", div2.get(0).text()); } @Test public void parentWithClassChild() throws UIMAException { String h = "<h1 class=foo><a href=1 />1</h1><h2 class=foo><a href=2 class=bar />2</h2><h1><a href=3 />3</h1>"; Node<Structure> doc = createStructure(h); Nodes<Structure> allAs = doc.select("Heading > Link"); assertEquals(3, allAs.size()); assertEquals("Link", allAs.first().getTypeName()); Nodes<Structure> fooAs = doc.select("Heading[level=1] > Link"); assertEquals(2, fooAs.size()); assertEquals("Link", fooAs.first().getTypeName()); Nodes<Structure> barAs = doc.select("Heading[level=2] > Link[class=bar]"); assertEquals(1, barAs.size()); } @Test public void parentChildStar() throws UIMAException { String h = "<div id=1><p>Hello<p><b>there</b></p></div><div id=2><a>Hi</a></div>"; Node<Structure> doc = createStructure(h); Nodes<Structure> divChilds = doc.select("Section > *"); assertEquals(3, divChilds.size()); assertEquals("Paragraph", divChilds.get(0).getTypeName()); assertEquals("Paragraph", divChilds.get(1).getTypeName()); assertEquals("Anchor", divChilds.get(2).getTypeName()); } @Test public void multiChildDescent() throws UIMAException { String h = "<div id=foo><h1 class=bar><a href=http://example.com/>One</a></h1></div>"; Node<Structure> doc = createStructure(h); Nodes<Structure> els = doc.select("Section > Heading[level=1] > Link[target*=example]"); assertEquals(1, els.size()); assertEquals("Link", els.first().getTypeName()); } @Test public void caseInsensitive() throws UIMAException { String h = "<A href=bAr><a href=foo>"; // mixed case so a simple toLowerCase() on value doesn't // catch Node<Structure> doc = createStructure(h); assertEquals(2, doc.select("Link").size()); assertEquals(2, doc.select("LInk[taRget]").size()); assertEquals(1, doc.select("LINK[Target=BAR]").size()); assertEquals(0, doc.select("Link[TARGET=BARBARELLA]").size()); } @Test public void adjacentSiblings() throws UIMAException { String h = "<ol><li>One<li>Two<li>Three</ol>"; Node<Structure> doc = createStructure(h); Nodes<Structure> sibs = doc.select("ListItem + ListItem"); assertEquals(2, sibs.size()); assertEquals("Two", sibs.get(0).text()); assertEquals("Three", sibs.get(1).text()); } @Test public void notAdjacent() throws UIMAException { String h = "<h1>One</h1><h2>Two</h2><h3>Three</h3>"; Node<Structure> doc = createStructure(h); Nodes<Structure> sibs = doc.select("Heading[level=1] + Heading[level=3]"); assertEquals(0, sibs.size()); } @Test public void mixCombinator() throws UIMAException { String h = "<h1><div class=foo><ol><li>One<li>Two<li>Three</ol></div></h1>"; Node<Structure> doc = createStructure(h); Nodes<Structure> sibs = doc.select("Heading > Section ListItem + ListItem"); assertEquals(2, sibs.size()); assertEquals("Two", sibs.get(0).text()); assertEquals("Three", sibs.get(1).text()); } @Test public void mixCombinatorGroup() throws UIMAException { String h = "<h1><ol><li>One<li>Two<li>Three</ol></h1>"; Node<Structure> doc = createStructure(h); Nodes<Structure> els = doc.select("[level=1] > Ordered, Ordered > ListItem + ListItem"); assertEquals(3, els.size()); assertEquals("Ordered", els.get(0).getTypeName()); assertEquals("Two", els.get(1).text()); assertEquals("Three", els.get(2).text()); } @Test public void generalSiblings() throws UIMAException { String h = "<h1>One</h1><h2>Two</h2><h3>Three</h3>"; Node<Structure> doc = createStructure(h); Nodes<Structure> sibs = doc.select("Heading[level=1] ~ Heading[level=3]"); assertEquals(1, sibs.size()); assertEquals("Three", sibs.first().text()); } // for http://github.com/jhy/jsoup/issues#issue/13 @Test public void testSupportsLeadingCombinator() throws UIMAException { String h = "<div><p><a>One</a><a>Two</a></p></div>"; Node<Structure> doc = createStructure(h); Node<Structure> p = doc.select("Section > Paragraph").first(); Nodes<Structure> spans = p.select("> Anchor"); assertEquals(2, spans.size()); assertEquals("One", spans.first().text()); // make sure doesn't get nested h = "<div id=1>1<div id=2>2<div id=3>3</div></div></div>"; doc = createStructure(h); Node<Structure> div = doc.select("Section").select(" > Section").first(); assertEquals("23", div.text()); } @Test public void testPseudoLessThan() throws UIMAException { Node<Structure> doc = createStructure("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>"); Nodes<Structure> ps = doc.select("Section Paragraph:lt(2)"); assertEquals(3, ps.size()); assertEquals("One", ps.get(0).text()); assertEquals("Two", ps.get(1).text()); assertEquals("Four", ps.get(2).text()); } @Test public void testPseudoGreaterThan() throws UIMAException { Node<Structure> doc = createStructure("<div><p>One</p><p>Two</p><p>Three</p></div><div><p>Four</p>"); Nodes<Structure> ps = doc.select("Section Paragraph:gt(0)"); assertEquals(2, ps.size()); assertEquals("Two", ps.get(0).text()); assertEquals("Three", ps.get(1).text()); } @Test public void testPseudoEquals() throws UIMAException { Node<Structure> doc = createStructure("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>"); Nodes<Structure> ps = doc.select("Section Paragraph:eq(0)"); assertEquals(2, ps.size()); assertEquals("One", ps.get(0).text()); assertEquals("Four", ps.get(1).text()); Nodes<Structure> ps2 = doc.select("Section:eq(0) Paragraph:eq(0)"); assertEquals(1, ps2.size()); assertEquals("One", ps2.get(0).text()); assertEquals("Paragraph", ps2.get(0).getTypeName()); } @Test public void testPseudoBetween() throws UIMAException { Node<Structure> doc = createStructure("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>"); Nodes<Structure> ps = doc.select("Section Paragraph:gt(0):lt(2)"); assertEquals(1, ps.size()); assertEquals("Two", ps.get(0).text()); } @Test public void testPseudoCombined() throws UIMAException { Node<Structure> doc = createStructure( "<h1 class='foo'><p>One</p><p>Two</p></h1><h1><p>Three</p><p>Four</p></h1>"); Nodes<Structure> ps = doc.select("Heading.foo Paragraph:gt(0)"); assertEquals(1, ps.size()); assertEquals("Two", ps.get(0).text()); } @Test public void testPseudoHas() throws UIMAException { Node<Structure> doc = createStructure( "<div id=0><p><img>Hello</img></p></div> <div id=1><img src=foo>There</img></div> <div id=2><p>Not</p></div>"); Nodes<Structure> divs1 = doc.select("Section:has(Figure)"); assertEquals(2, divs1.size()); assertEquals("Hello", divs1.get(0).text()); assertEquals("There", divs1.get(1).text()); Nodes<Structure> divs2 = doc.select("Section:has([target])"); assertEquals(1, divs2.size()); assertEquals("There", divs2.get(0).text()); Nodes<Structure> divs3 = doc.select("Section:has(Figure, Paragraph)"); assertEquals(3, divs3.size()); assertEquals("Hello", divs3.get(0).text()); assertEquals("There", divs3.get(1).text()); assertEquals("Not", divs3.get(2).text()); Nodes<Structure> els1 = doc.select(":has(Paragraph)"); assertEquals(3, els1.size()); // body, div, dib assertEquals("Root", els1.first().getTypeName()); assertEquals("Hello", els1.get(1).text()); assertEquals("Not", els1.get(2).text()); } @Test public void testNestedHas() throws UIMAException { Node<Structure> doc = createStructure("<div><p><a>One</a></p></div> <div><p>Two</p></div>"); Nodes<Structure> divs = doc.select("Section:has(Paragraph:has(Anchor))"); assertEquals(1, divs.size()); assertEquals("One", divs.first().text()); // test matches in has divs = doc.select("Section:has(Paragraph:matches((?i)two))"); assertEquals(1, divs.size()); assertEquals("Section", divs.first().getTypeName()); assertEquals("Two", divs.first().text()); // test contains in has divs = doc.select("Section:has(Paragraph:contains(two))"); assertEquals(1, divs.size()); assertEquals("Section", divs.first().getTypeName()); assertEquals("Two", divs.first().text()); } @Test public void testPseudoContains() throws UIMAException { Node<Structure> doc = createStructure( "<div><p>The Rain.</p> <p class=light>The <i>rain</i>.</p> <p>Rain, the.</p></div>"); Nodes<Structure> ps1 = doc.select("Paragraph:contains(Rain)"); assertEquals(3, ps1.size()); Nodes<Structure> ps2 = doc.select("Paragraph:contains(the rain)"); assertEquals(2, ps2.size()); assertEquals("The Rain.", ps2.first().text()); assertEquals("The rain.", ps2.last().text()); Nodes<Structure> ps3 = doc.select("Paragraph:contains(the Rain):has(Style)"); assertEquals(1, ps3.size()); assertEquals("The rain.", ps3.first().text()); Nodes<Structure> ps5 = doc.select(":contains(rain)"); assertEquals(5, ps5.size()); // Section, Paragraph, Paragraph, Style, Paragraph } @Test public void testPsuedoContainsWithParentheses() throws UIMAException { Node<Structure> doc = createStructure("<div><p id=1>This (is good)</p><p id=2>This is bad)</p>"); Nodes<Structure> ps1 = doc.select("Paragraph:contains(this (is good))"); assertEquals(1, ps1.size()); assertEquals("This (is good)", ps1.first().text()); Nodes<Structure> ps2 = doc.select("Paragraph:contains(this is bad\\))"); assertEquals(1, ps2.size()); assertEquals("This is bad)", ps2.first().text()); } @Test public void containsOwn() throws UIMAException { Node<Structure> doc = createStructure("<p id=1>Hello <b>there</b> now</p>"); Nodes<Structure> ps = doc.select("Paragraph:containsOwn(Hello now)"); assertEquals(1, ps.size()); assertEquals("Hello there now", ps.first().text()); assertEquals(0, doc.select("Paragraph:containsOwn(there)").size()); } @Test public void testMatches() throws UIMAException { Node<Structure> doc = createStructure( "<p id=1>The <i>Rain</i></p> <p id=2>There are 99 bottles.</p> <p id=3>Harder (this)</p> <p id=4>Rain</p>"); Nodes<Structure> p1 = doc.select("Paragraph:matches(The rain)"); // no match, // case // sensitive assertEquals(0, p1.size()); Nodes<Structure> p2 = doc.select("Paragraph:matches((?i)the rain)"); // case // insense. // should // include root, // html, // body assertEquals(1, p2.size()); assertEquals("The Rain", p2.first().text()); Nodes<Structure> p4 = doc.select("Paragraph:matches((?i)^rain$)"); // bounding assertEquals(1, p4.size()); assertEquals("Rain", p4.first().text()); Nodes<Structure> p5 = doc.select("Paragraph:matches(\\d+)"); assertEquals(1, p5.size()); assertEquals("There are 99 bottles.", p5.first().text()); Nodes<Structure> p6 = doc.select("Paragraph:matches(\\w+\\s+\\(\\w+\\))"); // test // bracket // matching assertEquals(1, p6.size()); assertEquals("Harder (this)", p6.first().text()); Nodes<Structure> p7 = doc.select("Paragraph:matches((?i)the):has(Style)"); // multi assertEquals(1, p7.size()); assertEquals("The Rain", p7.first().text()); } @Test public void matchesOwn() throws UIMAException { Node<Structure> doc = createStructure("<p id=1>Hello <b>there</b> now</p>"); Nodes<Structure> p1 = doc.select("Paragraph:matchesOwn((?i)hello now)"); assertEquals(1, p1.size()); assertEquals("Hello there now", p1.first().text()); assertEquals(0, doc.select("Paragraph:matchesOwn(there)").size()); } @Test public void notParas() throws UIMAException { Node<Structure> doc = createStructure("<h1 id=1>One</h1> <h2>Two</h2> <h3><a>Three</a></h3>"); Nodes<Structure> el1 = doc.select("Heading:not([level=1])"); assertEquals(2, el1.size()); assertEquals("Two", el1.first().text()); assertEquals("Three", el1.last().text()); Nodes<Structure> el2 = doc.select("Heading:not(:has(Anchor))"); assertEquals(2, el2.size()); assertEquals("One", el2.first().text()); assertEquals("Two", el2.last().text()); } @Test public void notAll() throws UIMAException { Node<Structure> doc = createStructure("<p>Two</p> <p><a>Three</a></p>"); Nodes<Structure> el1 = doc.select(":not(Paragraph)"); // should just be the a assertEquals(2, el1.size()); assertEquals("Root", el1.first().getTypeName()); assertEquals("Anchor", el1.last().getTypeName()); } @Test public void handlesCommasInSelector() throws UIMAException { Node<Structure> doc = createStructure("<a href='1,2'>One</a><div>Two</div><ol><li>123</li><li>Text</li></ol>"); Nodes<Structure> ps = doc.select("[target=1,2]"); assertEquals(1, ps.size()); Nodes<Structure> containers = doc.select("Section, ListItem:matches([0-9,]+)"); assertEquals(2, containers.size()); assertEquals("Section", containers.get(0).getTypeName()); assertEquals("ListItem", containers.get(1).getTypeName()); assertEquals("123", containers.get(1).text()); } @Test public void selectClassWithSpace() throws UIMAException { final String html = "<a href=\"value\">class without space</a>\n" + "<a href=\"value \">class with space</a>"; Node<Structure> doc = createStructure(html); Nodes<Structure> found = doc.select("Link[target=value ]"); assertEquals(2, found.size()); assertEquals("class without space", found.get(0).text()); assertEquals("class with space", found.get(1).text()); found = doc.select("Link[target=\"value \"]"); assertEquals(2, found.size()); assertEquals("class without space", found.get(0).text()); assertEquals("class with space", found.get(1).text()); found = doc.select("Link[target=\"value\\ \"]"); assertEquals(0, found.size()); } @Test public void selectSameNodes() throws UIMAException { final String html = "<div>one</div><div>one</div>"; Node<Structure> doc = createStructure(html); Nodes<Structure> els = doc.select("Section"); assertEquals(2, els.size()); Nodes<Structure> subSelect = els.select(":contains(one)"); assertEquals(2, subSelect.size()); } @Test public void attributeWithBrackets() throws UIMAException { String html = "<a href='End]'>One</a> <a href='[Another)]]'>Two</a>"; Node<Structure> doc = createStructure(html); assertEquals("One", doc.select("Link[target='End]']").first().text()); assertEquals("Two", doc.select("Link[target='[Another)]]']").first().text()); assertEquals("One", doc.select("Link[target=\"End]\"]").first().text()); assertEquals("Two", doc.select("Link[target=\"[Another)]]\"]").first().text()); } @Test public void containsWithQuote() throws UIMAException { String html = "<p>One'One</p><p>One'Two</p>"; Node<Structure> doc = createStructure(html); Nodes<Structure> els = doc.select("Paragraph:contains(One\\'One)"); assertEquals(1, els.size()); assertEquals("One'One", els.text()); } }