package guang.crawler.extension.urlExtractor.qq;
import guang.crawler.commons.Page;
import guang.crawler.commons.WebURL;
import guang.crawler.commons.parserData.HtmlParseData;
import guang.crawler.commons.parserData.ParseData;
import guang.crawler.extension.urlExtractor.URLsExtractor;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 获取QQ新闻页面中应当获取的所有URL,包括静态链接URL,第一条评论的URL地址,以及评论数等信息。
*
* @author sun
*
*/
public class QQNewsURLsExtractor implements URLsExtractor {
private Pattern cmtIdPattern;
public QQNewsURLsExtractor() {
this.cmtIdPattern = Pattern.compile("cmt_id\\s*=\\s*([0-9]+)\\s*;");
}
@Override
public void extractURLs(final Page page) {
List<WebURL> urlList = page.getLinksToFollow();
ParseData data = page.getParseData();
if (data instanceof HtmlParseData) {
HtmlParseData htmlData = (HtmlParseData) data;
// 1. 获取静态URL列表
// urlList.addAll(htmlData.getOutgoingUrls());
// 2. 获取动态URL列表
String html = htmlData.getHtml();
// 2.1 获取cmt_id的值
String cmtId = this.getCmtId(html);
// 2.2 构建需要爬去的动态URL的值
// 评论数URL
String cmtCountURLString = "http://coral.qq.com/article/" + cmtId
+ "/commentnum";
WebURL cmtCountURL = WebURL.newWebURL()
.setURL(cmtCountURLString)
.setShouldDepthIncrease(false)
.setProperty("commentedDocID",
page.getWebURL()
.getDocid());
// 第一条评论URL
String firstCmtURLString = "http://coral.qq.com/article/" + cmtId
+ "/comment?commentid=0";
WebURL firstCmtURL = WebURL.newWebURL()
.setURL(firstCmtURLString)
.setShouldDepthIncrease(false)
.setProperty("commentedDocID",
page.getWebURL()
.getDocid());
// 2.3 将构建的动态URL添加到最终的列表中
urlList.add(cmtCountURL);
urlList.add(firstCmtURL);
}
}
private String getCmtId(final String html) {
Matcher cmtIdMatcher = this.cmtIdPattern.matcher(html);
if (cmtIdMatcher.find()) {
String cmtId = cmtIdMatcher.group(1);
return cmtId;
}
return null;
}
}