package org.meaningfulweb.util; import java.net.MalformedURLException; import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.meaningfulweb.util.domain.DomainSuffix; import org.meaningfulweb.util.domain.DomainSuffixes; /** Utility class for URL analysis */ public class URLUtil { private final static String TOP_LEVEL_DOMAINS = "((?i)aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|" + "[a-z]{2})"; private final static String URL_HTTP = "(ht|f)tp(s?)\\:\\/\\/"; private final static String URL_REGEX_STRING_POST = // example (group #) "([\\w]+:\\w+@)?" + // username:password@ (4) "(" + "((www\\.)?" + // www. (7) "(([a-zA-Z0-9][\\w\\-]*\\.)+" + // ddrfreak. (8) TOP_LEVEL_DOMAINS + "))" + // com (8)(9) "|" + "((\\d{1,3})(\\.\\d{1,3}){3})" + // 64.71.156.35 ")" + "(:[\\d]{1,5})?" + // :80 "(" + // optional // (15) "(" + // any directory or filename block, if present, must start with "/" "(/+)" + // / "([\\S&&[^</\"]]+/+)*" + // locations/ "([\\S&&[^</\"]]*" + "([\\S&&[^</\".,;:!>'\\])\\?]])" + // locations "(\\.[\\w]{3,4})?" + // .php ")?" + ")?" + "(" + "(\\?\\w+(=[%\\w]+)?)" + // ?action=displayLocation "(&\\w+(=\\w+)?)*" + // &locationID=265 ")?" + ")?"; private final static Pattern URL_INCOMPLETE_REGEX = Pattern.compile("(" + URL_HTTP + ")?" + // http:// (optional) URL_REGEX_STRING_POST); public final static int DOMAIN_NAME = 8; public static String extractDomainFromUrl(String url) { if (url==null || url.length() == 0) return null; Matcher matcher = URL_INCOMPLETE_REGEX.matcher(url); if(matcher == null) return null; if (matcher.matches()) { String group = matcher.group(DOMAIN_NAME); if (group == null) return null; return group.trim().toLowerCase(); } return null; } private static Pattern IP_PATTERN = Pattern .compile("(\\d{1,3}\\.){3}(\\d{1,3})"); /** Returns the domain name of the url. The domain name of a url is * the substring of the url's hostname, w/o subdomain names. As an * example <br><code> * getDomainName(conf, new URL(http://lucene.apache.org/)) * </code><br> * will return <br><code> apache.org</code> * */ public static String getDomainName(URL url) { DomainSuffixes tlds = DomainSuffixes.getInstance(); String host = url.getHost(); // it seems that java returns hostnames ending with . if (host.endsWith(".")) host = host.substring(0, host.length() - 1); if (IP_PATTERN.matcher(host).matches()) return host; int index = 0; String candidate = host; for (; index >= 0;) { index = candidate.indexOf('.'); String subCandidate = candidate.substring(index + 1); if (tlds.isDomainSuffix(subCandidate)) { return candidate; } candidate = subCandidate; } return candidate; } /** Returns the domain name of the url. The domain name of a url is * the substring of the url's hostname, w/o subdomain names. As an * example <br><code> * getDomainName(conf, new http://lucene.apache.org/) * </code><br> * will return <br><code> apache.org</code> * @throws MalformedURLException */ public static String getDomainName(String url) throws MalformedURLException { return getDomainName(new URL(url)); } /** Returns whether the given urls have the same domain name. * As an example, <br> * <code> isSameDomain(new URL("http://lucene.apache.org") * , new URL("http://people.apache.org/")) * <br> will return true. </code> * * @return true if the domain names are equal */ public static boolean isSameDomainName(URL url1, URL url2) { return getDomainName(url1).equalsIgnoreCase(getDomainName(url2)); } /**Returns whether the given urls have the same domain name. * As an example, <br> * <code> isSameDomain("http://lucene.apache.org" * ,"http://people.apache.org/") * <br> will return true. </code> * @return true if the domain names are equal * @throws MalformedURLException */ public static boolean isSameDomainName(String url1, String url2) throws MalformedURLException { return isSameDomainName(new URL(url1), new URL(url2)); } /** Returns the {@link DomainSuffix} corresponding to the * last public part of the hostname */ public static DomainSuffix getDomainSuffix(URL url) { DomainSuffixes tlds = DomainSuffixes.getInstance(); String host = url.getHost(); if (IP_PATTERN.matcher(host).matches()) return null; int index = 0; String candidate = host; for (; index >= 0;) { index = candidate.indexOf('.'); String subCandidate = candidate.substring(index + 1); DomainSuffix d = tlds.get(subCandidate); if (d != null) { return d; } candidate = subCandidate; } return null; } /** Returns the {@link DomainSuffix} corresponding to the * last public part of the hostname */ public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException { return getDomainSuffix(new URL(url)); } /** Partitions of the hostname of the url by "." */ public static String[] getHostSegments(URL url) { String host = url.getHost(); // return whole hostname, if it is an ipv4 // TODO : handle ipv6 if (IP_PATTERN.matcher(host).matches()) return new String[]{host}; return host.split("\\."); } /** Partitions of the hostname of the url by "." * @throws MalformedURLException */ public static String[] getHostSegments(String url) throws MalformedURLException { return getHostSegments(new URL(url)); } /** * Returns the lowercased hostname for the url or null if the url is not well * formed. * * @param url The url to check. * @return String The hostname for the url. */ public static String getHost(String url) { try { return new URL(url).getHost().toLowerCase(); } catch (MalformedURLException e) { return null; } } /** * Returns the page for the url. The page consists of the protocol, host, * and path, but does not include the query string. The host is lowercased * but the path is not. * * @param url The url to check. * @return String The page for the url. */ public static String getPage(String url) { try { // get the full url, and replace the query string with and empty string url = url.toLowerCase(); String queryStr = new URL(url).getQuery(); return (queryStr != null) ? url.replace("?" + queryStr, "") : url; } catch (MalformedURLException e) { return null; } } public static String getProtocol(String url) { try { // get the full url, and replace the query string with and empty string url = url.toLowerCase(); String protocolStr = new URL(url).getProtocol(); return protocolStr; } catch (MalformedURLException e) { return null; } } public static String stripProtocol(String url) { try { // get the full url, and replace the query string with and empty string url = url.toLowerCase(); String protocolStr = new URL(url).getProtocol(); return (protocolStr != null) ? url.replace(protocolStr + "://", "") : url; } catch (MalformedURLException e) { return null; } } public static String getExtension(String filename) { int extIndex = StringUtils.lastIndexOf(filename, "."); return extIndex > 0 && extIndex + 1 < filename.length() ? StringUtils .substring(filename, extIndex + 1) : null; } public static String toAbsoluteURL(String baseURL, String contextUrl,String relativeURL) { // first we try to use java.net.URL to perform the conversion, if that // fails we can try using our own routine. try { URL base = relativeURL.startsWith("/") ? new URL(baseURL) : new URL(contextUrl); String absolute = new URL(base, relativeURL).toExternalForm(); return absolute; } catch (MalformedURLException e) { if (baseURL == null || baseURL.length() < 8) throw new IllegalArgumentException("baseURL must be a valid URL"); if (relativeURL == null) return null; // rooted relative URL if (relativeURL.startsWith("/")) { int pos = baseURL.indexOf("/", 8); if (pos > -1) { baseURL = baseURL.substring(0, pos); } } else { int slashPosition = baseURL.lastIndexOf('/'); if (slashPosition < 0) throw new IllegalArgumentException("baseURL must be a valid URL"); baseURL = baseURL.substring(0, slashPosition); relativeURL = "/" + relativeURL; } return baseURL + relativeURL; } } }