/* * * * * * * APDPlat - Application Product Development Platform * * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * */ package org.seo.rank.impl; import org.apache.commons.lang.StringUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.seo.rank.CopyChecker; import org.seo.rank.tools.DynamicIp; import org.seo.rank.list.UrlTools; import org.seo.rank.list.impl.DefaultParser; import org.seo.rank.model.Article; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * 检查文章抄袭情况 * @author 杨尚川 */ public class BaiduCopyChecker implements CopyChecker { private static final Logger LOGGER = LoggerFactory.getLogger(BaiduCopyChecker.class); private static final String ACCEPT = "text/html, */*; q=0.01"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String HOST = "www.baidu.com"; private static final String REFERER = "http://www.baidu.com"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0"; //获取多少页 private static final int PAGE = 15; private static final int PAGESIZE = 10; @Override public Map<Article, Set<String>> check(List<Article> articles) { Map<Article, Set<String>> data = new HashMap<>(); articles.forEach(article -> { data.put(article, doCheck(article)); }); return data; } public Set<String> doCheck(Article article){ Set<String> data = new HashSet<>(); if(StringUtils.isBlank(article.getTitle()) || StringUtils.isBlank(article.getUrl())){ return data; } String query = null; try { query = URLEncoder.encode(article.getTitle(), "UTF-8"); } catch (UnsupportedEncodingException e) { LOGGER.error("url构造失败", e); return data; } if(StringUtils.isBlank(query)){ return data; } for (int i = 0; i < PAGE; i++) { String url = "http://www.baidu.com/s?tn=monline_5_dg&ie=utf-8&wd=" + query+"&oq="+query+"&usm=3&f=8&bs="+query+"&rsv_bp=1&rsv_sug3=1&rsv_sug4=141&rsv_sug1=1&rsv_sug=1&pn=" + i * PAGESIZE; LOGGER.debug(url); data.addAll(doCheck(url, article)); } return data; } private Set<String> doCheck(String url, Article article) { Set<String> data = new HashSet<>(); try { Document document = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Host", HOST) .header("Referer", REFERER) .header("User-Agent", USER_AGENT) .get(); String titleCssQuery = "html body div div div div div h3.t a"; Elements elements = document.select(titleCssQuery); int i=0; for(Element element : elements){ String _title = element.text(); if(StringUtils.isBlank(_title)){ continue; } i++; LOGGER.debug(i+":"+_title); if(_title.contains("百度翻译") || !contains(_title, article.getTitle())){ LOGGER.debug("搜索结果检查通过"); continue; } String href = element.attr("href"); href = UrlTools.normalizeUrl(url, href); String realUrl = urlConvert(href); LOGGER.debug("url:"+url); LOGGER.debug("realUrl:"+realUrl); String[] target = new URL(realUrl).getHost().split("\\."); String[] source = new URL(article.getUrl()).getHost().split("\\."); if(target.length>1 && source.length>1 && !(target[target.length-2]+target[target.length-1]).equals(source[source.length-2]+source[source.length-1])) { data.add(realUrl); } } } catch (Exception ex) { LOGGER.error("搜索出错",ex); } return data; } /** * 判断title2是否包含title1,去除标题中的特殊字符 * @param title2 * @param title1 * @return */ private static boolean contains(String title2, String title1){ StringBuilder str2 = new StringBuilder(); StringBuilder str1 = new StringBuilder(); for(char c : title2.toCharArray()){ if(Character.isLetter(c)){ str2.append(c); } } for(char c : title1.toCharArray()){ if(Character.isLetter(c)){ str1.append(c); } } LOGGER.debug("转换标题前:"+title2); LOGGER.debug("转换标题后:"+str2.toString()); LOGGER.debug("转换标题前:"+title1); LOGGER.debug("转换标题后:"+str1.toString()); if(str2.toString().contains(str1.toString())){ LOGGER.debug(title2+" 【包含】 "+title1); return true; } LOGGER.debug(title2+" 【不包含】 "+title1); return false; } /** * 将百度的链接转换为网页的链接 * @param url 百度链接 * @return 网页链接 */ private static String urlConvert(String url){ try{ if(!url.startsWith("http://www.baidu.com/link?url=")){ //不需要转换URL return url; } LOGGER.debug("转换前的URL:"+url); Connection.Response response = getResponse(url); //这里要处理爬虫限制 if(response==null || response.body().contains("请您点击按钮解除封锁") || response.body().contains("请输入以下验证码")){ //使用新的IP地址 DynamicIp.toNewIp(); response = getResponse(url); } String realUrl = response.header("Location"); LOGGER.debug("转换后的URL:"+realUrl); //检查网页是否被重定向 //这个检查会导致速度有点慢 //这个检测基本没有必要,除非是那种极其特殊的网站,ITEYE曾经就是,后来在我的建议下改进了 /* LOGGER.debug("检查是否有重定向:"+realUrl); Connection.Response response = getResponse(realUrl); //这里要处理爬虫限制 if(response==null || response.body().contains("请您点击按钮解除封锁") || response.body().contains("请输入以下验证码")){ //使用新的IP地址 DynamicIp.toNewIp(); response = getResponse(realUrl); } String realUrl2 = response.header("Location"); if(!StringUtils.isBlank(realUrl2)){ LOGGER.debug("检查到重定向到:"+realUrl2); return realUrl2; } */ return realUrl; }catch(Exception e){ LOGGER.error("URL转换异常", e); } return url; } private static Connection.Response getResponse(String url) { try{ Connection.Response response = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Host", HOST) .header("Referer", REFERER) .header("User-Agent", USER_AGENT) .ignoreContentType(true) .timeout(30000) .followRedirects(false) .execute(); return response; } catch (Exception e){ LOGGER.debug("获取页面失败:", e); } return null; } public static void main(String[] args){ CopyChecker copyChecker = new BaiduCopyChecker(); //计算OSCHINA博文被抄袭的情况 //List<Article> articles = DefaultParser.oschinaBlog(); //计算ITEYE博文被抄袭的情况 List<Article> articles = DefaultParser.iteyeBlog(); //这里排除不统计的博文 articles=articles.stream().filter(article -> !(article.getTitle().contains("idioms") || article.getTitle().contains("分布式内存文件系统:Tachyon") || article.getTitle().contains("Nutch视频") || article.getTitle().contains("如何解决BUG?") || article.getTitle().contains("采集电子报纸") || article.getTitle().contains("汉英双语的差异") || article.getTitle().contains("分布式搜索算法") || article.getTitle().contains("The Future of Compass & ElasticSearch") || article.getTitle().contains("1208个合成词") || article.getTitle().contains("Java远程调试") || article.getTitle().contains("What a Wonderful Code") || article.getTitle().contains("代码评审脚本") || article.getTitle().contains("Linux Netcat command – The swiss army knife of net") || article.getTitle().contains("common prefix different suffix")) ).collect(Collectors.toList()); //检查 Map<Article, Set<String>> result = copyChecker.check(articles); //输出检查报告 LOGGER.info("<h4>检查博文数目:" + articles.size()+"</h4>"); AtomicInteger i = new AtomicInteger(); result.entrySet().stream().sorted((a,b)->b.getValue().size()-a.getValue().size()).forEach(e -> { String query = null; try { query = URLEncoder.encode(e.getKey().getTitle(), "UTF-8"); } catch (UnsupportedEncodingException ex) { LOGGER.error("url构造失败", ex); return; } String originURL = e.getKey().getUrl(); if(e.getValue().size()>0) { LOGGER.info("<h4>"+i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey().getTitle() + "</a> 抄袭链接有("+e.getValue().size()+")个</h4>"); LOGGER.info("原文链接:<a target=\"_blank\" href=\"" + originURL + "\">" + originURL + "</a><br/>"); LOGGER.info("抄袭链接:"); LOGGER.info("<ol>"); e.getValue().stream().sorted().forEach(url-> LOGGER.info("<li><a target=\"_blank\" href=\"" + url + "\">" + url + "</a></li>")); LOGGER.info("</ol>"); }else{ LOGGER.info(i.incrementAndGet()+"、<a target=\"_blank\" href=\"http://www.baidu.com/s?wd=" + query + "\">" + e.getKey().getTitle() + "</a><br/>"); LOGGER.info("原文链接:<a target=\"_blank\" href=\"" + originURL + "\">" + originURL + "</a> 无抄袭链接<br/>"); } }); } }