package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 上午8:42
*/
public class HtmlTest {
@Test
public void testRegexSelector() {
Html selectable = new Html("aaaaaaab");
assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
}
@Ignore("not work in jsoup 1.8.x")
@Test
public void testDisableJsoupHtmlEntityEscape() throws Exception {
Html.DISABLE_HTML_ENTITY_ESCAPE = true;
Html html = new Html("aaaaaaa&b");
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
}
@Test
public void testEnableJsoupHtmlEntityEscape() throws Exception {
Html.DISABLE_HTML_ENTITY_ESCAPE = false;
Html html = new Html("aaaaaaa&b");
assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
}
@Test
public void testAHrefExtract(){
Html html = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");
assertThat(html.links().all()).contains("/xx/xx");
}
@Test
public void testNthNodesGet(){
Html html = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");
assertThat(html.xpath("//a[1]/@href").get()).isEqualTo("/xx/xx");
Selectable selectable = html.xpath("//a[1]").nodes().get(0);
assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx");
}
@Test
public void testGetHrefsByJsoup(){
Html html = new Html("<html><a href='issues'>issues</a><img src='webmagic.jpg'/></html>","https://github.com/code4craft/webmagic/");
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
html = new Html("<html><base href='https://github.com/code4craft/webmagic/'><a href='issues'>issues</a><img src='webmagic.jpg'/></base></html>");
assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues");
assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg");
}
}