package us.codecraft.webmagic.utils;

import org.apache.commons.lang3.StringUtils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility methods for handling URLs and the links found in HTML.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-4-21
 * Time: 1:52 PM
 */
public class UrlUtils {

    /** Matches a leading URI scheme such as {@code http:}, {@code mailto:} or {@code javascript:}. */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.\\-]*:");

    /** Matches a leading {@code scheme://} prefix, i.e. everything in front of the authority part. */
    private static final Pattern SCHEME_AUTHORITY_PREFIX = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.\\-]*://");

    /** Matches a leading {@code protocol://} prefix; anchored so embedded URLs are left alone. */
    private static final Pattern PROTOCOL_PATTERN = Pattern.compile("^[\\w]+://");

    /** Captures the tag prefix up to {@code href=} (group 1) and the optionally quoted link (group 2). */
    private static final Pattern HREF_PATTERN =
            Pattern.compile("(<a[^<>]*href=)[\"']{0,1}([^\"'<>\\s]*)[\"']{0,1}", Pattern.CASE_INSENSITIVE);

    /** Captures the value of a {@code charset=} parameter; name is case-insensitive, value may be quoted. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset=[\"']?([^\\s;\"']*)", Pattern.CASE_INSENSITIVE);

    /**
     * Converts a (possibly relative) link into an absolute URL.
     *
     * <p>Links that already carry a scheme ({@code http:}, {@code mailto:}, {@code javascript:}, ...)
     * are returned unchanged. Protocol-relative ({@code //host/x}), host-relative ({@code /x}),
     * query-only ({@code ?a=b}), fragment-only ({@code #top}) and path-relative links
     * ({@code x}, {@code ./x}, {@code ../../x}) are resolved against {@code refer}.
     *
     * @param url   the link to resolve
     * @param refer the URL of the page the link was found on
     * @return the absolute URL, or {@code url} unchanged when either argument is blank
     */
    public static String canonicalizeUrl(String url, String refer) {
        if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
            return url;
        }
        if (SCHEME_PATTERN.matcher(url).lookingAt()) {
            return url;
        }
        // A '/' inside the referrer's query or fragment must not be mistaken for a path separator.
        String base = stripQueryAndFragment(refer);
        if (url.startsWith("//")) {
            // Protocol-relative link: only the scheme is inherited from the referrer.
            Matcher scheme = SCHEME_PATTERN.matcher(base);
            return scheme.lookingAt() ? scheme.group() + url : url;
        }
        if (url.startsWith("/")) {
            return getHost(base) + url;
        }
        if (url.startsWith("?")) {
            // Same page, different query string.
            return base + url;
        }
        if (url.startsWith("#")) {
            // Same page (query included), different fragment.
            return stripFragment(refer) + url;
        }
        String directory = reversePath(base, 1);
        String remainder = url;
        while (true) {
            if (remainder.startsWith("./")) {
                remainder = remainder.substring(2);
            } else if (remainder.startsWith("../")) {
                remainder = remainder.substring(3);
                // reversePath never climbs above the host, so surplus "../" segments are absorbed.
                directory = reversePath(directory, 1);
            } else {
                break;
            }
        }
        return directory + "/" + remainder;
    }

    /**
     * Cuts {@code url} at its {@code depth}-th slash counted from the end, i.e. climbs
     * {@code depth} path levels up. The result never gets shorter than the
     * {@code scheme://host} prefix.
     *
     * @param url   the URL to shorten
     * @param depth how many trailing path levels to drop
     * @return the shortened URL without a trailing slash, or {@code null} if {@code url} is {@code null}
     */
    public static String reversePath(String url, int depth) {
        if (url == null) {
            return null;
        }
        int cut = StringUtils.lastOrdinalIndexOf(url, "/", depth);
        // Clamp at the end of the authority instead of relying on a fixed character offset.
        return url.substring(0, Math.max(cut, authorityEnd(url)));
    }

    /**
     * Returns everything in front of the third slash of {@code url}; for a URL of the form
     * {@code scheme://host/path} that is {@code scheme://host}.
     *
     * @param url the URL to inspect
     * @return the prefix before the third slash, or {@code url} itself when it has fewer than three slashes
     */
    public static String getHost(String url) {
        int thirdSlash = StringUtils.ordinalIndexOf(url, "/", 3);
        return thirdSlash > 0 ? StringUtils.substring(url, 0, thirdSlash) : url;
    }

    /**
     * Removes a leading {@code protocol://} prefix. Occurrences further inside the URL
     * (for example in a redirect query parameter) are kept.
     *
     * @param url the URL to strip
     * @return the URL without its leading protocol, or {@code null} if {@code url} is {@code null}
     */
    public static String removeProtocol(String url) {
        if (url == null) {
            return null;
        }
        return PROTOCOL_PATTERN.matcher(url).replaceFirst("");
    }

    /**
     * Extracts the domain part of a URL: the protocol is removed and the result is cut at
     * the first slash found after its first character. A port, if present, is kept.
     *
     * @param url the URL to inspect
     * @return the domain part of {@code url}
     */
    public static String getDomain(String url) {
        String domain = removeProtocol(url);
        int slash = StringUtils.indexOf(domain, "/", 1);
        if (slash > 0) {
            domain = StringUtils.substring(domain, 0, slash);
        }
        return domain;
    }

    /**
     * Rewrites the {@code href} of every {@code <a>} tag in {@code html} to an absolute,
     * double-quoted URL resolved against {@code url}.
     *
     * @param html the HTML text to process
     * @param url  the URL of the page the HTML came from
     * @return the HTML with rewritten links; {@code html} itself when it is {@code null} or empty
     */
    public static String fixAllRelativeHrefs(String html, String url) {
        if (StringUtils.isEmpty(html)) {
            return html;
        }
        StringBuilder fixed = new StringBuilder(html.length() + 64);
        Matcher matcher = HREF_PATTERN.matcher(html);
        int lastEnd = 0;
        while (matcher.find()) {
            // Copy the untouched text between the previous match and this one.
            fixed.append(html, lastEnd, matcher.start());
            fixed.append(matcher.group(1));
            fixed.append('"').append(canonicalizeUrl(matcher.group(2), url)).append('"');
            lastEnd = matcher.end();
        }
        fixed.append(html, lastEnd, html.length());
        return fixed.toString();
    }

    /**
     * Extracts the charset from a {@code Content-Type} style value such as
     * {@code text/html; charset=UTF-8}. The parameter name is matched case-insensitively
     * and surrounding quotes around the value are dropped.
     *
     * @param contentType the value to inspect
     * @return the charset name, or {@code null} when {@code contentType} is {@code null}
     *         or contains no charset parameter
     */
    public static String getCharset(String contentType) {
        if (contentType == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(contentType);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Returns the index of the first slash after the {@code scheme://} prefix, or the length of {@code url} if none. */
    private static int authorityEnd(String url) {
        Matcher prefix = SCHEME_AUTHORITY_PREFIX.matcher(url);
        int from = prefix.lookingAt() ? prefix.end() : 0;
        int slash = url.indexOf('/', from);
        return slash >= 0 ? slash : url.length();
    }

    /** Returns {@code url} without its {@code #fragment} part. */
    private static String stripFragment(String url) {
        int hash = url.indexOf('#');
        return hash >= 0 ? url.substring(0, hash) : url;
    }

    /** Returns {@code url} without its {@code ?query} and {@code #fragment} parts. */
    private static String stripQueryAndFragment(String url) {
        String withoutFragment = stripFragment(url);
        int question = withoutFragment.indexOf('?');
        return question >= 0 ? withoutFragment.substring(0, question) : withoutFragment;
    }
}