package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Request; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * url and html utils. * * @author code4crafter@gmail.com <br> * @since 0.1.0 */ public class UrlUtils { /** * canonicalizeUrl * <br> * Borrowed from Jsoup. * * @param url url * @param refer refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { URL base; try { try { base = new URL(refer); } catch (MalformedURLException e) { // the base is unsuitable, but the attribute may be abs on its own, so try that URL abs = new URL(refer); return abs.toExternalForm(); } // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); return encodeIllegalCharacterInUrl(abs.toExternalForm()); } catch (MalformedURLException e) { return ""; } } /** * * @param url url * @return new url */ public static String encodeIllegalCharacterInUrl(String url) { //TODO more charator support return url.replace(" ", "%20"); } public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); if (i > 0) { host = StringUtils.substring(url, 0, i); } return host; } private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); } return removePort(domain); } public static String removePort(String domain) { int portIndex = domain.indexOf(":"); if (portIndex != -1) { return domain.substring(0, portIndex); }else { return domain; } } public static List<Request> convertToRequests(Collection<String> urls) { List<Request> requestList = new ArrayList<Request>(urls.size()); for (String url : urls) { requestList.add(new Request(url)); } return requestList; } public static List<String> convertToUrls(Collection<Request> requests) { List<String> urlList = new ArrayList<String>(requests.size()); for (Request request : requests) { urlList.add(request.getUrl()); } return urlList; } private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } return null; } }