package guang.crawler.extension.filedExtractor.qq; import guang.crawler.commons.DataFields; import guang.crawler.commons.Page; import guang.crawler.commons.parserData.HtmlParseData; import guang.crawler.commons.parserData.ParseData; import guang.crawler.connector.WebDataTableConnector; import guang.crawler.extension.filedExtractor.FieldsExtractor; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; /** * 处理类似于 <code>http://coral.qq.com/article/1009470758/commentnum</code> * URL的页面的信息。 * * @author sun * */ public class QQCommentCountFieldsExtractor implements FieldsExtractor { @Override public void extractFields(final Page page) { ParseData parseData = page.getParseData(); if (parseData instanceof HtmlParseData) { // 确保当前读到的是JSON数据 HtmlParseData data = (HtmlParseData) parseData; String jsonString = data.getHtml(); JSONObject responseObj = null; // 检查JSON字符串的合法性 if ((jsonString == null) || ((jsonString.length() == 0) || ((responseObj = JSON.parseObject(jsonString)) == null)) || ((responseObj = responseObj.getJSONObject("data")) == null)) { return; } String commentNum = responseObj.getString("commentnum"); DataFields fields = page.getDataToSave(); String commentedDocID = (String) page.getWebURL() .getProperty("commentedDocID"); if (commentedDocID != null) { fields.addFiled(commentedDocID, WebDataTableConnector.FAMILY_SUPPORT_DATA, "cmtCount", commentNum); } } } }