/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.seo.rank.impl;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.Ranker;
import org.seo.rank.tools.DynamicIp;
import org.seo.rank.list.UrlTools;
import org.seo.rank.list.impl.DefaultParser;
import org.seo.rank.model.Article;
import org.seo.rank.model.Rank;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* 判断网页是否被搜索引擎收录以及收录之后的排名情况
* @author 杨尚川
*/
public class BaiduRanker implements Ranker{
private static final Logger LOGGER = LoggerFactory.getLogger(BaiduRanker.class);
private static final String ACCEPT = "text/html, */*; q=0.01";
private static final String ENCODING = "gzip, deflate";
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
private static final String CONNECTION = "keep-alive";
private static final String HOST = "www.baidu.com";
private static final String REFERER = "http://www.baidu.com";
private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0";
//获取多少页
private static final int PAGE = 15;
private static final int PAGESIZE = 10;
@Override
public void rank(List<Rank> ranks) {
for(Rank rank : ranks){
rank(rank);
}
}
@Override
public void rank(Rank rank){
doRank(rank);
}
/**
* 查询网页在百度的排名
* @param rank 排名数据结构
*/
public void doRank(Rank rank){
if(StringUtils.isBlank(rank.getKeyword()) || StringUtils.isBlank(rank.getUrl())){
return ;
}
//检查是否被百度收录
searchBaiduIndex(rank);
if(!rank.isIndex()){
return;
}
//检查百度排名
String query = null;
try {
query = URLEncoder.encode(rank.getKeyword(), "UTF-8");
} catch (UnsupportedEncodingException e) {
LOGGER.error("url构造失败", e);
return ;
}
if(StringUtils.isBlank(query)){
return ;
}
for (int i = 0; i < PAGE; i++) {
String path = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query+"&oq="+query+"&usm=3&f=8&bs="+query+"&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn=" + i * PAGESIZE;
LOGGER.debug(path);
int r = searchBaiduRank(path, rank);
if (r > 0){
rank.setRank(r+i*10);
//找到排名
return;
}
}
}
/**
* 检查百度是否收录
* @param rank
*/
private void searchBaiduIndex(Rank rank) {
String url = "url:"+rank.getUrl();
url = "http://www.baidu.com/s?wd=" + url;
LOGGER.debug(url);
try {
Document document = Jsoup.connect(url)
.header("Accept", ACCEPT)
.header("Accept-Encoding", ENCODING)
.header("Accept-Language", LANGUAGE)
.header("Connection", CONNECTION)
.header("User-Agent", USER_AGENT)
.header("Host", HOST)
.get();
String notFoundCssQuery = "html body div div div div div p";
Elements elements = document.select(notFoundCssQuery);
for(Element element : elements){
String text = element.text();
if(text.contains("抱歉,没有找到与") && text.contains("相关的网页。")){
//未被百度收录
LOGGER.debug("未被百度收录");
rank.setIndex(false);
return;
}
}
String numberCssQuery = "html body div div div div.nums";
elements = document.select(numberCssQuery);
for(Element element : elements){
String text = element.text();
if(text.equals("百度为您找到相关结果约1个")){
//百度收录
LOGGER.debug("被百度收录");
rank.setIndex(true);
return;
}
}
} catch (IOException ex) {
LOGGER.error("搜索出错",ex);
}
LOGGER.debug("未被百度收录");
}
/**
* 检查百度排名
* @param url 检查百度的URL
* @param rank 网页排名
* @return
*/
private int searchBaiduRank(String url, Rank rank) {
String targetUrl = rank.getUrl();
try {
Document document = Jsoup.connect(url)
.header("Accept", ACCEPT)
.header("Accept-Encoding", ENCODING)
.header("Accept-Language", LANGUAGE)
.header("Connection", CONNECTION)
.header("Host", HOST)
.header("Referer", REFERER)
.header("User-Agent", USER_AGENT)
.get();
String titleCssQuery = "html body div div div div div h3.t a";
Elements elements = document.select(titleCssQuery);
int i=0;
for(Element element : elements){
String title = element.text();
if(StringUtils.isBlank(title)){
continue;
}
i++;
LOGGER.debug(i+":"+title);
if(!title.contains(rank.getKeyword())){
LOGGER.debug("搜索结果标题不包括关键词,忽略");
continue;
}
String href = element.attr("href");
href = UrlTools.normalizeUrl(url, href);
String realUrl = urlConvert(href);
LOGGER.debug("url:"+url);
LOGGER.debug("realUrl:"+realUrl);
LOGGER.debug("targetUrl:"+targetUrl);
if(targetUrl.equals(realUrl)){
return i;
}
}
} catch (Exception ex) {
LOGGER.error("搜索出错",ex);
}
return -1;
}
/**
* 将百度的链接转换为网页的链接
* @param url 百度链接
* @return 网页链接
*/
private static String urlConvert(String url){
try{
if(!url.startsWith("http://www.baidu.com/link?url=")){
//不需要转换URL
return url;
}
LOGGER.debug("转换前的URL:"+url);
Connection.Response response = getResponse(url);
//这里要处理爬虫限制
if(response==null || response.body().contains("请您点击按钮解除封锁")
|| response.body().contains("请输入以下验证码")){
//使用新的IP地址
DynamicIp.toNewIp();
response = getResponse(url);
}
String realUrl = response.header("Location");
LOGGER.debug("转换后的URL:"+realUrl);
//检查网页是否被重定向
//这个检查会导致速度有点慢
//这个检测基本没有必要,除非是那种极其特殊的网站,ITEYE曾经就是,后来在我的建议下改进了
/*
LOGGER.debug("检查是否有重定向:"+realUrl);
Connection.Response response = getResponse(realUrl);
//这里要处理爬虫限制
if(response==null || response.body().contains("请您点击按钮解除封锁")
|| response.body().contains("请输入以下验证码")){
//使用新的IP地址
DynamicIp.toNewIp();
response = getResponse(realUrl);
}
String realUrl2 = response.header("Location");
if(!StringUtils.isBlank(realUrl2)){
LOGGER.debug("检查到重定向到:"+realUrl2);
return realUrl2;
}
*/
return realUrl;
}catch(Exception e){
LOGGER.error("URL转换异常", e);
}
return url;
}
private static Connection.Response getResponse(String url) {
try{
Connection.Response response = Jsoup.connect(url)
.header("Accept", ACCEPT)
.header("Accept-Encoding", ENCODING)
.header("Accept-Language", LANGUAGE)
.header("Connection", CONNECTION)
.header("Host", HOST)
.header("Referer", REFERER)
.header("User-Agent", USER_AGENT)
.ignoreContentType(true)
.timeout(30000)
.followRedirects(false)
.execute();
return response;
} catch (Exception e){
LOGGER.debug("获取页面失败:", e);
}
return null;
}
public static void main(String[] args){
BaiduRanker ranker = new BaiduRanker();
/*
Rank rank = new Rank();
rank.setKeyword("Java应用级产品开发平台APDPlat作者杨尚川专访");
rank.setUrl("http://www.iteye.com/magazines/113");
ranker.searchBaiduIndex(rank);
System.out.println(rank);
rank = new Rank();
rank.setKeyword("Java应用级产品开发平台APDPlat作者杨尚川专访");
rank.setUrl("http://www.iteye.com/magazines/113");
ranker.rank(rank);
System.out.println(rank);
rank = new Rank();
rank.setKeyword("QuestionAnsweringSystem v1.1 发布,人机问答系统");
rank.setUrl("http://yangshangchuan.iteye.com/blog/2101533");
ranker.searchBaiduIndex(rank);
System.out.println(rank);
rank = new Rank();
rank.setKeyword("天天向上");
rank.setUrl("http://www.manmankan.com/dy2013/zongyi/201306/6.shtml");
ranker.rank(rank);
System.out.println(rank);
*/
//计算OSCHINA博文在百度的收录与排名情况
//List<Article> articles = DefaultParser.oschinaBlog();
//计算ITEYE博文在百度的收录与排名情况
List<Article> articles = DefaultParser.iteyeBlog();
//将博文转换为排名对象
List<Rank> ranks = new ArrayList<>();
articles.forEach(blog -> {
Rank rank = new Rank();
rank.setKeyword(blog.getTitle());
rank.setUrl(blog.getUrl());
ranks.add(rank);
});
//获取排名信息
ranker.rank(ranks);
//按排名排序
Map<String, Integer> map = new HashMap<>();
ranks.forEach(rank -> map.put(rank.getKeyword(), rank.getRank()));
LOGGER.info("排名博文数目:" + ranks.size());
LOGGER.info("<ol>");
map.entrySet().stream().sorted((a,b)->a.getValue()-b.getValue()).forEach(e -> {
String query = null;
try {
query = URLEncoder.encode(e.getKey(), "UTF-8");
} catch (UnsupportedEncodingException ex) {
LOGGER.error("url构造失败", ex);
return ;
}
LOGGER.info("<li><a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey() + "(" + e.getValue() + ")</a></li>");
});
LOGGER.info("</ol>");
}
}