/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.seo.rank.impl; import org.apdplat.word.analysis.*; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.seo.rank.SimilarChecker; import org.seo.rank.list.impl.DefaultParser; import org.seo.rank.model.Article; import org.seo.rank.tools.DynamicIp; import org.seo.rank.tools.ProxyIp; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.math.BigDecimal; import java.net.URL; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * 基于word分词提供的文本相似度算法来实现通用的网页相似度检测 * @author 杨尚川 */ public class WordBasedGenericWebPageSimilarChecker implements SimilarChecker { private static final Logger LOGGER = LoggerFactory.getLogger(WordBasedGenericWebPageSimilarChecker.class); private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; private static final float THRESHOLD_RATE = 0.5F; private TextSimilarity textSimilarity = new EditDistanceTextSimilarity(); public WordBasedGenericWebPageSimilarChecker(){} public WordBasedGenericWebPageSimilarChecker(TextSimilarity textSimilarity){ this.textSimilarity = textSimilarity; } public void setTextSimilarity(TextSimilarity textSimilarity) { this.textSimilarity = textSimilarity; } @Override public boolean isSimilar(String url1, String url2) { return similarScore(url1, url2)>=THRESHOLD_RATE; } @Override public double similarScore(String url1, String url2) { WebPage webPage1 = getWebPage(url1); if(webPage1!=null) { WebPage webPage2 = getWebPage(url2); if(webPage2!=null) { double score = textSimilarity.similarScore(webPage1.getContent(), webPage2.getContent()); return score; } } return 0.0; } public String contrastSimilarScore(String url1, String url2, List<TextSimilarity> textSimilarities) { StringBuilder result = new StringBuilder(); WebPage webPage1 = getWebPage(url1); if(webPage1!=null) { WebPage webPage2 = getWebPage(url2); if(webPage2!=null) { textSimilarities.forEach(textSimilarity -> { double score = textSimilarity.similarScore(webPage1.getContent(), webPage2.getContent()); result.append(textSimilarity.getClass().getSimpleName().replace("TextSimilarity", "")) .append("=") .append(BigDecimal.valueOf(score).toString()) .append(" "); }); } } return result.toString(); } private WebPage getWebPage(String url){ WebPage webPage = getWebPageInternal(url); int times = 1; while (webPage==null && times<4){ times++; //使用新的IP地址 DynamicIp.toNewIp(); webPage = getWebPageInternal(url); } if(webPage==null){ return null; } times = 1; //LOGGER.debug("获取到的HTML:" +html); while((webPage.getContent().contains("非常抱歉,来自您ip的请求异常频繁") || webPage.getContent().contains("请您点击按钮解除封锁") || webPage.getContent().contains("请输入以下验证码")) && times<4){ times++; //使用新的IP地址 ProxyIp.toNewIp(); webPage = getWebPageInternal(url); } return webPage; } private WebPage getWebPageInternal(String url) { try { Document doc = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Referer", "http://"+new URL(url).getHost()) .header("Host", new URL(url).getHost()) .header("User-Agent", USER_AGENT) .header("X-Forwarded-For", getRandomIp()) .header("Proxy-Client-IP", getRandomIp()) .header("WL-Proxy-Client-IP", getRandomIp()) .ignoreContentType(true) .timeout(30000) .get(); WebPage webPage = new WebPage(); webPage.setUrl(url); webPage.setContent(doc.text()); webPage.setTitle(doc.title()); return webPage; } catch (Exception e) { LOGGER.error("获取网页失败", e); } return null; } private String getRandomIp(){ int first = new Random().nextInt(254)+1; //排除A类私有地址0.0.0.0--10.255.255.255 while(first==10){ first = new Random().nextInt(254)+1; } int second = new Random().nextInt(254)+1; //排除B类私有地址172.16.0.0--172.31.255.255 while(first==172 && (second>=16 && second<=31)){ first = new Random().nextInt(254)+1; second = new Random().nextInt(254)+1; } //排除C类私有地址192.168.0.0--192.168.255.255 while(first==192 && second==168){ first = new Random().nextInt(254)+1; second = new Random().nextInt(254)+1; } int third = new Random().nextInt(254)+1; int forth = new Random().nextInt(254)+1; return first+"."+second+"."+second+"."+forth; } private static class WebPage{ private String url; private String title; private String content; public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } } /** * 我的ITEYE和OSCHINA博客有很多同样的博文,主要目的是备份 * 这里刚好用来测试相似性检测算法的效果 * http://yangshangchuan.iteye.com * http://my.oschina.net/apdplat/blog */ private void verifyYscBlog(List<TextSimilarity> textSimilarities){ List<Article> ob = DefaultParser.oschinaBlog(); List<Article> ib = DefaultParser.iteyeBlog(); Map<String, String> om = new HashMap<>(); Map<String, String> im = new HashMap<>(); ob.stream().forEach(b->om.put(b.getTitle(), b.getUrl())); ib.stream().forEach(b->im.put(b.getTitle(), b.getUrl())); List<String> oschinaBlog = ob.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList()); List<String> iteyeBlog = ib.stream().map(b -> b.getTitle()).sorted().collect(Collectors.toList()); List<String> commons = oschinaBlog.stream().filter(b -> iteyeBlog.contains(b)).collect(Collectors.toList()); commons.remove("自动更改IP地址反爬虫封锁,支持多线程"); Map<String, String> result = new HashMap<>(); AtomicInteger similarCount = new AtomicInteger(); AtomicInteger j = new AtomicInteger(); commons.forEach(title -> { String contrastResult = contrastSimilarScore(om.get(title), im.get(title), textSimilarities); LOGGER.info(contrastResult+" "+title+" "+om.get(title)+" "+im.get(title)); result.put(title, contrastResult); LOGGER.info("进度:" + commons.size() + "/" + j.incrementAndGet()); }); LOGGER.info("<h4>检查的博文数:" + commons.size() + "</h4>"); AtomicInteger i = new AtomicInteger(); result .entrySet() .stream() .forEach(e -> { LOGGER.info(""); LOGGER.info("<h4>"+i.incrementAndGet() + "、检查博文" + ":" + e.getKey()+",相似度分值:"+e.getValue()+"</h4>"); LOGGER.info("\t博文地址1:<a target=\"_blank\" href=\""+om.get(e.getKey())+"\">"+om.get(e.getKey())+"</a><br/>"); LOGGER.info("\t博文地址2:<a target=\"_blank\" href=\""+im.get(e.getKey())+"\">"+im.get(e.getKey())+"</a><br/>"); }); } public static void main(String[] args) throws Exception{ List<TextSimilarity> textSimilarities = Arrays.asList(new SimpleTextSimilarity(), new CosineTextSimilarity(), new EditDistanceTextSimilarity(), new EuclideanDistanceTextSimilarity(), new ManhattanDistanceTextSimilarity(), new JaccardTextSimilarity(), new JaroDistanceTextSimilarity(), new JaroWinklerDistanceTextSimilarity(), new SørensenDiceCoefficientTextSimilarity(), new SimHashPlusHammingDistanceTextSimilarity()); WordBasedGenericWebPageSimilarChecker similarChecker = new WordBasedGenericWebPageSimilarChecker(); similarChecker.verifyYscBlog(textSimilarities); } }