package guang.crawler.crawlWorker; import guang.crawler.commons.Page; import guang.crawler.commons.parserData.HtmlParseData; import guang.crawler.commons.parserData.ParseData; import guang.crawler.crawlWorker.pageProcessor.DownloadPlugin; import guang.crawler.crawlWorker.url.URLCanonicalizer; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; public class EchoCommentURLPlugin implements DownloadPlugin { private static String viewURL = "http://comment.ifeng.com/view.php"; private static String viewSpecialuRL = "http://comment.ifeng.com/viewspecial.php"; /** * 在HTML页面中查找下面的内容: <code> * var comment_json = { 'docUrl':'http://news.ifeng.com/opinion/special/shenghuodabaozha/', 'docName':'生活大爆炸遭禁播的背后', 'skey':'190dd4', 'pagesize':parseInt(0), 'isSpecial':parseInt(1), 'isMatch':parseInt(0), 'cmtBox':parseInt(1), 'banner':'', 'sns':parseInt(1), 'status':parseInt(1), 'countIds':[], 'links':[] }; * </code> 然后用该内容组合成URL,访问该URL. */ @Override public boolean work(final Page page) { ParseData parseData = page.getParseData(); if (parseData instanceof HtmlParseData) { HtmlParseData htmlData = (HtmlParseData) parseData; Pattern findPattern = Pattern .compile("var\\s+comment_json\\s*=\\s*(\\{[^}]+\\})\\s*;"); Matcher findMatcher = findPattern.matcher(htmlData.getHtml()); if (findMatcher.find()) { String jsonString = findMatcher.group(1).replaceAll( "parseInt\\(([.0-9]+)\\)", "$1"); JSONObject result = JSON.parseObject(jsonString); StringBuilder urlBuilder = new StringBuilder(); if (result.getInteger("isSpecial") == 1) { urlBuilder.append(EchoCommentURLPlugin.viewSpecialuRL); } else { urlBuilder.append(EchoCommentURLPlugin.viewURL); } urlBuilder.append("?doc_url=") .append(result.getString("docUrl")) .append("&doc_name=") .append(result.getString("docName")).append("&skey=") .append(result.getString("skey")).append("&p=1"); String firstPageUrl = URLCanonicalizer .getCanonicalURL(urlBuilder.toString()); System.out.println(firstPageUrl); return true; } else { System.out.println(htmlData.getHtml()); } } return false; } }