package com.fpcms.common.webcrawler.htmlparser; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.List; import org.apache.commons.lang.StringUtils; import org.junit.Test; import org.mockito.Mockito; import com.fpcms.common.webcrawler.htmlparser.HtmlPage.Anchor; public class SinglePageCrawlerTest extends Mockito{ SinglePageCrawler c = new SinglePageCrawler(); @Test public void test_extractArticleByJsoup() throws IOException { Anchor anchor = newAnchor("http://badqiu.iteye.com/blog/1776090"); HtmlPage page = c.extractArticleByJsoup(anchor); assertTrue(page.getContent().contains("合理的内链,便于蜘蛛爬行,如文章的:上一篇,下一篇")); assertTrue(page.getContent().contains("纯文本链接格式: 锚文本http://www.iteye.com")); assertTrue(page.getKeywords().equals("SEO经验及教训")); assertTrue(page.getDescription().contains("合理的内链,便于蜘蛛爬行,如文章的:上一篇,下一篇")); assertTrue(page.getTitle().equals("SEO经验及教训")); } @Test public void test_extractArticleByJsoup2() throws IOException { Anchor anchor = newAnchor("http://news.163.com/13/0204/02/8MR9K0MR00014AED.html"); HtmlPage page = c.extractArticleByJsoup(anchor); assertTrue(page.getContent().contains("汽车行业内一直呼吁的“省部级领导带头乘坐自主品牌汽车”")); assertTrue(page.getContent().contains("否则在用车辆直接淘汰,会造成资源浪费。")); assertTrue(page.getKeywords().equals("国产,公车,官员")); assertTrue(page.getDescription().contains("核心提示:近日,宁夏、湖南、甘肃、新疆等多省党委政府规定:")); assertEquals(page.getTitle(),"多省市出台改作风规定 要求领导乘国产品牌汽车"); } @Test public void test_getShoudVisitAnchorList() throws IOException { c.setAcceptUrlRegexList(".*/\\d{2}/\\d{4}/\\d{2}/.*.html\\??.*"); List<Anchor> list = c.getShoudVisitAnchorList("http://news.163.com"); System.out.println(StringUtils.join(list,"\n")); assertTrue(list.size() > 50); } @Test public void test_getShoudVisitAnchorList_yahoo() throws IOException { c.setAcceptUrlRegexList("http://.*.yahoo.com/.*-\\d{6,}.html.*"); List<Anchor> list = c.getShoudVisitAnchorList("http://news.yahoo.com/"); System.out.println(StringUtils.join(list,"\n")); assertTrue(list.size() > 20); } @Test public void test() throws IOException { verifyExtractArticleByJsoup("http://www.arabnews.com/news/445285", 1000); verifyExtractArticleByJsoup("http://blogs.wsj.com/venturecapital/2013/03/14/mobeam-brings-digital-coupons-to-new-samsung-phone/", 1000); verifyExtractArticleByJsoup("http://www.washingtonpost.com/local/education/cava-java-offers-more-than-just-coffee-for-students-in-silver-spring/2013/03/17/4f24bf96-8cc0-11e2-9f54-f3fdd70acad2_story.html", 1000); } private void verifyExtractArticleByJsoup(String url, int expectedLength) throws IOException { HtmlPage page = c.extractArticleByJsoup(newAnchor(url)); System.out.println(page.getContent().length() + " "+page.getContent()); assertTrue(page.getContent().length() > expectedLength); } @Test public void test_isAcceptUrl() throws IOException { assertFalse(c.isAcceptUrl(null)); assertFalse(c.isAcceptUrl(" ")); assertFalse(c.isAcceptUrl("httpbs://www.163.com")); c.setAcceptUrlRegexList(".*163.com.*",".*sina.com.*"); assertTrue(c.isAcceptUrl("https://www.163.com/news/111.html")); assertTrue(c.isAcceptUrl("https://www.sina.com/news/111.html")); assertFalse(c.isAcceptUrl("https://www.qq.com/news/111.html")); c.setExcludeUriRegexList(".*blog/111.html"); assertFalse(c.isAcceptUrl("https://www.163.com/blog/111.html")); } private Anchor newAnchor(String url) { Anchor anchor = new Anchor(); anchor.setHref(url); anchor.setText(url); return anchor; } }