package guang.crawler.extension.filedExtractor.qq; import guang.crawler.commons.DataFields; import guang.crawler.commons.Page; import guang.crawler.commons.parserData.HtmlParseData; import guang.crawler.commons.parserData.ParseData; import guang.crawler.connector.WebDataTableConnector; import guang.crawler.extension.filedExtractor.FieldsExtractor; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; /** * 从QQ评论页面中抽取评论内容,处理类似 * <code>http://coral.qq.com/article/1007777754/comment?commentid=5875472061804651770</code> * URL的页面。 * * @author sun * */ public class QQCommentFieldsExtractor implements FieldsExtractor { @Override public void extractFields(final Page page) { ParseData parseData = page.getParseData(); // 检测数据内容 if (parseData instanceof HtmlParseData) { // 必须是JSON数据 HtmlParseData data = (HtmlParseData) parseData; String jsonString = data.getHtml(); if ((jsonString == null) || (jsonString.length() == 0)) { return; } JSONObject responseObject = JSON.parseObject(jsonString); JSONArray jsonArray = null; if ((responseObject == null) || ((responseObject = responseObject.getJSONObject("data")) == null) || ((jsonArray = responseObject.getJSONArray("commentid")) == null)) { return; } // 处理每条评论 DataFields dataFileds = page.getDataToSave(); String commentedDocID = (String) page.getWebURL() .getProperty("commentedDocID"); int size = jsonArray.size(); for (int i = 0; i < size; i++) { JSONObject comment = jsonArray.getJSONObject(i); String commentId = comment.getString("id"); String commentData = comment.toJSONString(); // 将评论内容添加到HBase中,每个评论一列。 dataFileds.addFiled(commentedDocID, WebDataTableConnector.FAMILY_SUPPORT_DATA, "cmt" + commentId, commentData); } } } }