package xyz.anduo.crawler;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;

/**
 * Static helpers that score how similar two web pages are, or how well a
 * page body matches its title.
 */
public class StringUtils {

    /**
     * Fetches two pages, tokenizes each into lexer nodes, and scores their
     * similarity as {@code 2 * |LCS| / (n1 + n2)}, where {@code |LCS|} is the
     * size of the result of {@code PageDistance.longestCommonSubsequence} and
     * {@code n1}, {@code n2} are the node counts of the two pages.
     *
     * @param urlStr1 URL of the first page
     * @param urlStr2 URL of the second page
     * @return the similarity score; {@code 0.0} when neither page yields any
     *         node (previously this case produced {@code NaN})
     * @throws ParserException if the lexer fails to open or tokenize a page
     * @throws IOException if a URL is malformed or a connection cannot be opened
     */
    public static double getPageSim(String urlStr1, String urlStr2) throws ParserException, IOException {
        Lexer lexer1 = null;
        Lexer lexer2 = null;
        try {
            lexer1 = openLexer(urlStr1);
            List<Node> pageNodes1 = readAllNodes(lexer1);

            lexer2 = openLexer(urlStr2);
            List<Node> pageNodes2 = readAllNodes(lexer2);

            double totalNodes = (double) pageNodes1.size() + (double) pageNodes2.size();
            if (totalNodes == 0.0) {
                // Nothing to compare: avoid 0/0 = NaN. Returning 0.0 keeps
                // "score > threshold" checks false, as they were with NaN.
                return 0.0;
            }

            double distance = PageDistance
                    .longestCommonSubsequence(pageNodes1.toArray(), pageNodes2.toArray())
                    .size();
            return (2.0 * distance) / totalNodes;
        } finally {
            // NOTE(review): the pages are closed only after the comparison
            // because nodes may read their text lazily from the page - confirm.
            closeQuietly(lexer1);
            closeQuietly(lexer2);
        }
    }

    /** Opens a connection to {@code urlStr} and wraps it in a lexer using a {@link PrototypicalNodeFactory}. */
    private static Lexer openLexer(String urlStr) throws ParserException, IOException {
        Lexer lexer = new Lexer(new URL(urlStr).openConnection());
        lexer.setNodeFactory(new PrototypicalNodeFactory());
        return lexer;
    }

    /** Drains {@code lexer} and returns every node it produced, in order. */
    private static List<Node> readAllNodes(Lexer lexer) throws ParserException {
        List<Node> nodes = new ArrayList<Node>();
        Node node;
        while (null != (node = lexer.nextNode())) {
            nodes.add(node);
        }
        return nodes;
    }

    /** Releases the page behind {@code lexer}, if any, without masking an in-flight exception. */
    private static void closeQuietly(Lexer lexer) {
        if (lexer == null || lexer.getPage() == null) {
            return;
        }
        try {
            lexer.getPage().close();
        } catch (IOException ignored) {
            // Best-effort cleanup: the score (or the original failure) matters
            // more to the caller than a failure to close the stream.
        }
    }

    /**
     * Scores how well a page body matches its title: the fraction of the
     * title's characters (counted per {@code char}, duplicates included) that
     * occur anywhere in the body.
     *
     * @param title the page title
     * @param body the page body text
     * @return a value in {@code [0.0, 1.0]}; {@code 0.0} when the title is
     *         {@code null} or empty, or the body is {@code null}
     */
    public static double getSimiarity(String title, String body) {
        if (title == null || title.length() == 0 || body == null) {
            // An empty title used to yield 0/0 = NaN and null inputs threw
            // NullPointerException; treat all of these as "no match".
            return 0.0;
        }
        int matchNum = 0;
        for (int i = 0; i < title.length(); ++i) {
            if (body.indexOf(title.charAt(i)) >= 0) {
                ++matchNum;
            }
        }
        return (double) matchNum / (double) title.length();
    }
}