package guang.crawler.extension.urlExtractor.qq;

import guang.crawler.commons.Page;
import guang.crawler.commons.WebURL;
import guang.crawler.commons.parserData.HtmlParseData;
import guang.crawler.commons.parserData.ParseData;
import guang.crawler.extension.urlExtractor.URLsExtractor;

import java.util.List;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;

/**
 * Derives the address of the next comment request from a comment response.
 * <p>
 * The response body is expected to be a JSON document with a {@code data}
 * object carrying the keys {@code last}, {@code targetid} and {@code retnum}.
 * When the response reports at least one comment, a follow-up URL built from
 * {@code targetid} and {@code last} is appended to the page's links to
 * follow.
 *
 * @author sun
 *
 */
public class QQNewsCommentURLsExtractor implements URLsExtractor {

    /** URL property naming the document the comments belong to. */
    private static final String COMMENTED_DOC_ID_KEY = "commentedDocID";

    /** Fixed parts of the follow-up comment request URL. */
    private static final String COMMENT_URL_PREFIX = "http://coral.qq.com/article/";
    private static final String COMMENT_URL_INFIX = "/comment?commentid=";

    /**
     * Adds the URL of the next comment request to {@code page.getLinksToFollow()}.
     * <p>
     * Nothing is added when any of the following holds: the page URL has no
     * {@code commentedDocID} property, the parse data is not an
     * {@link HtmlParseData}, the body is empty or is not valid JSON, the
     * {@code data} object is missing, {@code last} or {@code targetid} is
     * missing or empty, or {@code retnum} is absent, non-numeric or zero.
     *
     * @param page
     *            the fetched comment page whose link list is extended
     */
    @Override
    public void extractURLs(final Page page) {
        List<WebURL> urlList = page.getLinksToFollow();

        // Only pages that were queued as comment requests carry this property.
        String commentedDocID = (String) page.getWebURL()
                .getProperty(COMMENTED_DOC_ID_KEY);
        if (commentedDocID == null) {
            return;
        }

        // The JSON payload is delivered as the raw text of an HtmlParseData.
        ParseData parseData = page.getParseData();
        if (!(parseData instanceof HtmlParseData)) {
            return;
        }
        String jsonString = ((HtmlParseData) parseData).getHtml();
        if (isNullOrEmpty(jsonString)) {
            return;
        }

        JSONObject data = parseDataObject(jsonString);
        if (data == null) {
            return;
        }

        String lastID = data.getString("last");
        String targetID = data.getString("targetid");
        // Without both ids the concatenation below would yield a bogus URL
        // (e.g. ".../article/null/comment?commentid=null").
        if (isNullOrEmpty(lastID) || isNullOrEmpty(targetID)) {
            return;
        }

        if (parseCount(data.getString("retnum")) == 0) {
            // This news item has no further comments.
            return;
        }

        urlList.add(WebURL.newWebURL()
                .setURL(COMMENT_URL_PREFIX + targetID + COMMENT_URL_INFIX + lastID)
                .setProperty(COMMENTED_DOC_ID_KEY, commentedDocID)
                .setShouldDepthIncrease(false));
    }

    /**
     * Parses the response text and returns its {@code data} object, or
     * {@code null} when the text is not valid JSON or has no such object.
     */
    private static JSONObject parseDataObject(final String jsonString) {
        try {
            JSONObject response = JSON.parseObject(jsonString);
            return (response == null) ? null : response.getJSONObject("data");
        } catch (JSONException ignored) {
            // A malformed body (e.g. an HTML error page) simply yields no
            // follow-up link instead of aborting the extraction.
            return null;
        }
    }

    /**
     * Converts the {@code retnum} value to an int, treating an absent or
     * non-numeric value as zero.
     */
    private static int parseCount(final String retnum) {
        try {
            return Integer.parseInt(retnum);
        } catch (NumberFormatException ignored) {
            // Integer.parseInt also reports null input this way.
            return 0;
        }
    }

    /** Returns {@code true} when the string is {@code null} or has length zero. */
    private static boolean isNullOrEmpty(final String value) {
        return (value == null) || (value.length() == 0);
    }
}