/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.util;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Pattern;

import org.apache.nutch.util.domain.DomainSuffix;
import org.apache.nutch.util.domain.DomainSuffixes;

/** Utility class for URL analysis */
public class URLUtil {

  /** Matches a dotted-quad IPv4 literal such as <code>192.168.0.1</code>. */
  private static final Pattern IP_PATTERN =
      Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");

  /**
   * Removes a single trailing "." from a hostname, if present.
   * {@link URL#getHost()} can return hostnames ending with a dot, and such
   * names would otherwise never line up with a registered domain suffix.
   */
  private static String stripTrailingDot(String host) {
    if (host.endsWith(".")) {
      return host.substring(0, host.length() - 1);
    }
    return host;
  }

  /**
   * Returns the domain name of the url. The domain name of a url is
   * the substring of the url's hostname, w/o subdomain names. As an
   * example <br><code>
   * getDomainName(new URL("http://lucene.apache.org/"))
   * </code><br>
   * will return <br><code>apache.org</code>
   *
   * <p>If the hostname is an IPv4 address it is returned unchanged. If no
   * part of the hostname is a known domain suffix, the last dot-separated
   * label of the hostname is returned.</p>
   *
   * @param url the url whose hostname is analysed
   * @return the domain name of the url's hostname
   */
  public static String getDomainName(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    // it seems that java returns hostnames ending with .
    String host = stripTrailingDot(url.getHost());
    if (IP_PATTERN.matcher(host).matches()) {
      return host;
    }

    // Drop one leading label per iteration; as soon as what remains after
    // the first dot is a known suffix, the current candidate is the domain.
    String candidate = host;
    int index;
    do {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index + 1);
      if (tlds.isDomainSuffix(subCandidate)) {
        return candidate;
      }
      candidate = subCandidate;
    } while (index >= 0);
    return candidate;
  }

  /**
   * Returns the domain name of the url. The domain name of a url is
   * the substring of the url's hostname, w/o subdomain names. As an
   * example <br><code>
   * getDomainName("http://lucene.apache.org/")
   * </code><br>
   * will return <br><code>apache.org</code>
   *
   * @param url the url, as a string
   * @return the domain name of the url's hostname
   * @throws MalformedURLException if the string cannot be parsed as a URL
   */
  public static String getDomainName(String url) throws MalformedURLException {
    return getDomainName(new URL(url));
  }

  /**
   * Returns whether the given urls have the same domain name.
   * As an example, <br>
   * <code>isSameDomainName(new URL("http://lucene.apache.org"),
   * new URL("http://people.apache.org/"))</code>
   * <br>will return true. The comparison ignores case.
   *
   * @return true if the domain names are equal
   */
  public static boolean isSameDomainName(URL url1, URL url2) {
    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
  }

  /**
   * Returns whether the given urls have the same domain name.
   * As an example, <br>
   * <code>isSameDomainName("http://lucene.apache.org",
   * "http://people.apache.org/")</code>
   * <br>will return true. The comparison ignores case.
   *
   * @return true if the domain names are equal
   * @throws MalformedURLException if either string cannot be parsed as a URL
   */
  public static boolean isSameDomainName(String url1, String url2)
      throws MalformedURLException {
    return isSameDomainName(new URL(url1), new URL(url2));
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the
   * last public part of the hostname.
   *
   * @param url the url whose hostname is analysed
   * @return the matching suffix, or <code>null</code> if the hostname is an
   *         IPv4 address or no part of it is a known suffix
   */
  public static DomainSuffix getDomainSuffix(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    // Strip the trailing dot exactly as getDomainName does, so both methods
    // agree on hostnames such as "www.apache.org.".
    String host = stripTrailingDot(url.getHost());
    if (IP_PATTERN.matcher(host).matches()) {
      return null;
    }

    String candidate = host;
    int index;
    do {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index + 1);
      DomainSuffix d = tlds.get(subCandidate);
      if (d != null) {
        return d;
      }
      candidate = subCandidate;
    } while (index >= 0);
    return null;
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the
   * last public part of the hostname.
   *
   * @param url the url, as a string
   * @return the matching suffix, or <code>null</code> if none is found
   * @throws MalformedURLException if the string cannot be parsed as a URL
   */
  public static DomainSuffix getDomainSuffix(String url)
      throws MalformedURLException {
    return getDomainSuffix(new URL(url));
  }

  /**
   * Partitions of the hostname of the url by ".".
   *
   * @param url the url whose hostname is split
   * @return the dot-separated parts of the hostname, or a single-element
   *         array holding the whole hostname if it is an IPv4 address
   */
  public static String[] getHostSegments(URL url) {
    String host = url.getHost();
    // return whole hostname, if it is an ipv4
    // TODO : handle ipv6
    if (IP_PATTERN.matcher(host).matches()) {
      return new String[] {host};
    }
    return host.split("\\.");
  }

  /**
   * Partitions of the hostname of the url by ".".
   *
   * @param url the url, as a string
   * @return the dot-separated parts of the hostname
   * @throws MalformedURLException if the string cannot be parsed as a URL
   */
  public static String[] getHostSegments(String url)
      throws MalformedURLException {
    return getHostSegments(new URL(url));
  }

  /**
   * Given two urls (source and destination of the redirect),
   * returns the representative one.
   *
   * <p>Implements the algorithm described here:
   * <br>
   * <a href="http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html">
   * How does the Yahoo! webcrawler handle redirects?</a>
   * </p>
   * The algorithm is as follows:
   * <ol>
   * <li>Choose target url if either url is malformed.</li>
   * <li>When a page in one domain redirects to a page in another domain,
   * choose the "target" URL.</li>
   * <li>When a top-level page in a domain presents a permanent redirect
   * to a page deep within the same domain, choose the "source" URL.</li>
   * <li>When a page deep within a domain presents a permanent redirect
   * to a page deep within the same domain, choose the "target" URL.</li>
   * <li>When a page in a domain presents a temporary redirect to
   * another page in the same domain, choose the "source" URL.</li>
   * </ol>
   *
   * @param src Source url of redirect
   * @param dst Destination url of redirect
   * @param temp Flag to indicate if redirect is temporary
   * @return Representative url (either src or dst)
   */
  public static String chooseRepr(String src, String dst, boolean temp) {
    URL srcUrl;
    URL dstUrl;
    try {
      srcUrl = new URL(src);
      dstUrl = new URL(dst);
    } catch (MalformedURLException e) {
      // Rule 1: a malformed url on either side means the target wins.
      return dst;
    }

    // Rule 2: cross-domain redirect. Host names are case-insensitive, so use
    // the same comparison as the rest of this class.
    if (!isSameDomainName(srcUrl, dstUrl)) {
      return dst;
    }

    // Rule 3: permanent redirect away from a top-level page. URL.getFile()
    // is "" for "http://host" and "/" for "http://host/"; both are the root.
    String srcFile = srcUrl.getFile();
    boolean srcIsRoot = srcFile.length() == 0 || srcFile.equals("/");
    if (!temp && srcIsRoot) {
      return src;
    }

    // Rules 4 and 5: within one domain, temporary keeps the source and
    // permanent follows the target.
    return temp ? src : dst;
  }

  /** For testing */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.err.println("Usage : URLUtil <url>");
      return;
    }

    String url = args[0];
    try {
      System.out.println(URLUtil.getDomainName(new URL(url)));
    } catch (MalformedURLException ex) {
      ex.printStackTrace();
    }
  }
}