URLUtil.java example

Explorer

meaningfulweb-master
- meaningfulweb-app
  - src
    - main
      - java
        org
        meaningfulweb
        servlet
        FileFactoryBean.java
        HtmlExtractorController.java
        MeaningfulWebServlet.java
- meaningfulweb-core
  - src
    - main
      - java
        org
        meaningfulweb
        api
        MeaningfulWebObject.java
        MetaContentExtractor.java
        StressTest.java
        cext
        Extract.java
        ExtractForm.java
        ExtractUtils.java
        HtmlContentPipeline.java
        HtmlContentProcessor.java
        HtmlContentProcessorFactory.java
        HtmlExtractor.java
        processors
        ArticleProcessor.java
        BestImageProcessor.java
        BoilerpipeArticleProcessor.java
        DomainSpecifiedImageProcessor.java
        ElementProcessor.java
        FullContentProcessor.java
        HyperlinkProcessor.java
        ImageProcessor.java
        MainContentProcessor.java
        MeaningfulwebCompositeProcessor.java
        OpengraphContentProcessor.java
        ParagraphProcessor.java
        RegexProcessor.java
        ScriptProcessor.java
        SystemCommandProcessor.java
        TwitpicExtractionHandler.java
        XPathCleanerProcessor.java
        XPathProcessor.java
        detector
        DetectorFactory.java
        imgext
        ExtractedContents.java
        ImageFetcher.java
        ImageFilter.java
        ImageHeader.java
        ImageInfo.java
        ImageMeta.java
        ImageProp.java
        ImageSelector.java
        ImageSizeExtractor.java
        util
        EncodingUtils.java
        HTMLOutputter.java
        HtmlExtractUtils.java
        ImageUtil.java
        JDomUtils.java
        JsonUtils.java
        ProcessResponse.java
        ProcessUtils.java
        SystemCommand.java
        TempDirUtils.java
        URIUtils.java
        URLUtil.java
        XMLUtils.java
        domain
        DomainSuffix.java
        DomainSuffixes.java
        DomainSuffixesReader.java
        TopLevelDomain.java
        http
        HttpClientFactory.java
        HttpClientService.java
        HttpComponentsServiceImpl.java
        HttpException.java
        security
        AuthenticationService.java
        ReloadableFileAuthenticationServiceImpl.java
    - test
      - java
        org
        meaningfulweb
        core
        test
        MWCoreTest.java
- meaningfulweb-opengraph
  - src
    - main
      - java
        org
        meaningfulweb
        opengraph
        OGObject.java
        OpenGraphContentHandler.java
        OpenGraphParser.java
        OpenGraphVocabulary.java
    - test
      - java
        org
        meaningfulweb
        opengraph
        test
        Og4jTestCase.java
        Og4jTestSuite.java

package org.meaningfulweb.util;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.meaningfulweb.util.domain.DomainSuffix;
import org.meaningfulweb.util.domain.DomainSuffixes;

/** Utility class for URL analysis */
public class URLUtil {
	
	 /**
	  * Regex fragment for the top-level domain: one capturing group holding
	  * either one of the listed generic TLDs or any two characters matching
	  * {@code [a-z]}. The inline {@code (?i)} flag sits at the start of the
	  * first alternative.
	  */
	 private final static String           TOP_LEVEL_DOMAINS =
		    "((?i)aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|" +
		    "[a-z]{2})";
	  /**
	   * Regex fragment for the scheme prefix: {@code http://}, {@code https://},
	   * {@code ftp://} or {@code ftps://}. Contributes two capturing groups.
	   */
	  private final static String           URL_HTTP = "(ht|f)tp(s?)\\:\\/\\/";
	  
	  
	  /**
	   * Regex fragment for everything after the scheme: optional
	   * {@code user:password@}, a host (named host or dotted-quad address),
	   * optional port, optional path and optional query string.
	   * Group numbers in the comments below are the indices as seen in
	   * {@link #URL_INCOMPLETE_REGEX}, where the optional scheme prefix
	   * occupies groups 1-3.
	   */
	  private final static String           URL_REGEX_STRING_POST = 
	      // example               (group # in URL_INCOMPLETE_REGEX)
	     "([\\w]+:\\w+@)?" +                             // username:password@    (4)
	     "(" +                                           // host alternatives     (5)
	     "((www\\.)?" +                                // named host (6), www.  (7)
	     "(([a-zA-Z0-9][\\w\\-]*\\.)+" +             // opens (8); ddrfreak.  (9)
	     TOP_LEVEL_DOMAINS + "))" +                  // com (10); closes (8) and (6)
	     "|" +
	     "((\\d{1,3})(\\.\\d{1,3}){3})" +              // 64.71.156.35          (11)(12)(13)
	     ")" +                                           
	     "(:[\\d]{1,5})?" +                              // :80                   (14)
	     "(" +  // optional path and query                                       (15)

	     "(" +  // any directory or filename block, if present, must start with "/"  (16)
	     "(/+)" +                                    // /                     (17)
	     "([\\S&&[^</\"]]+/+)*" +                    // locations/            (18)
	     "([\\S&&[^</\"]]*" +                            
	     "([\\S&&[^</\".,;:!>'\\])\\?]])" +        // locations; last char may not be one of the listed punctuation marks (20)
	     "(\\.[\\w]{3,4})?" +                      // .php                  (21)
	     ")?" +
	     ")?" +

	     "(" +                                       // query string          (22)
	     "(\\?\\w+(=[%\\w]+)?)" +                    // ?action=displayLocation
	     "(&\\w+(=\\w+)?)*" +                        // &locationID=265
	     ")?" +
	     ")?";
	  
	  /**
	   * Compiled pattern for a URL whose scheme prefix is optional:
	   * {@link #URL_HTTP} wrapped in an optional group, followed by
	   * {@link #URL_REGEX_STRING_POST}.
	   */
	  private final static Pattern          URL_INCOMPLETE_REGEX = 
		    Pattern.compile("(" + URL_HTTP + ")?" +             // http:// (optional)
		                    URL_REGEX_STRING_POST);
	  
	  /**
	   * Index of the capturing group in {@link #URL_INCOMPLETE_REGEX} that holds
	   * the named host without a leading {@code www.} (dot-terminated labels
	   * followed by the top-level domain). The group does not participate in
	   * the match when the host is a dotted-quad address.
	   */
	  public final static int DOMAIN_NAME = 8;
	  
	  /**
	   * Extracts the host name, without a leading {@code www.}, from a URL-like
	   * string whose scheme prefix is optional.
	   *
	   * @param url the text to analyse; it must match {@code URL_INCOMPLETE_REGEX}
	   *            in its entirety
	   * @return the trimmed, lowercased content of capturing group
	   *         {@code DOMAIN_NAME}, or {@code null} when {@code url} is
	   *         {@code null} or empty, does not match, or has a dotted-quad
	   *         host (that group is then unset)
	   */
	  public static String extractDomainFromUrl(String url)
	  {
	    if (url == null || url.isEmpty()) {
	      return null;
	    }
	    // Pattern.matcher never returns null, so only the match result is checked.
	    Matcher matcher = URL_INCOMPLETE_REGEX.matcher(url);
	    if (!matcher.matches()) {
	      return null;
	    }
	    String domain = matcher.group(DOMAIN_NAME);
	    if (domain == null) {
	      return null;
	    }
	    // Locale.ROOT keeps the result stable in locales with special casing
	    // rules (e.g. Turkish, where 'I' would otherwise become a dotless 'ı').
	    return domain.trim().toLowerCase(Locale.ROOT);
	  }

  /**
   * Matches a dotted-quad address: four groups of one to three digits
   * separated by dots. The numeric range of each group is not validated.
   * Declared {@code final} so the shared pattern cannot be reassigned.
   */
  private static final Pattern IP_PATTERN = Pattern
    .compile("(\\d{1,3}\\.){3}(\\d{1,3})");

  /**
   * Returns the domain name of the url, i.e. the url's hostname without
   * subdomain names. A single trailing dot on the hostname is removed first.
   * A hostname that looks like a dotted-quad address is returned as is.
   * Otherwise leading labels are dropped one at a time until what follows
   * the first remaining dot is a suffix known to {@link DomainSuffixes}.
   * As an example <br><code>
   *  getDomainName(new URL("http://lucene.apache.org/"))
   *  </code><br>
   *  will return <br><code> apache.org</code>
   *  (provided <code>org</code> is a known suffix).
   *
   * @param url the url whose hostname is examined
   * @return the domain name, or the last label of the hostname when no
   *         known suffix is found
   */
  public static String getDomainName(URL url) {
    final DomainSuffixes suffixes = DomainSuffixes.getInstance();

    String hostName = url.getHost();
    // hostnames may come back fully qualified, i.e. ending with "."
    if (hostName.endsWith(".")) {
      hostName = hostName.substring(0, hostName.length() - 1);
    }
    if (IP_PATTERN.matcher(hostName).matches()) {
      return hostName;
    }

    String remaining = hostName;
    int dot;
    do {
      dot = remaining.indexOf('.');
      // with no dot left, the tail is the remaining text itself
      String tail = remaining.substring(dot + 1);
      if (suffixes.isDomainSuffix(tail)) {
        return remaining;
      }
      remaining = tail;
    } while (dot >= 0);
    return remaining;
  }

  /**
   * Returns the domain name of the url given as a string, by parsing it
   * and delegating to {@link #getDomainName(URL)}. As an example <br><code>
   *  getDomainName("http://lucene.apache.org/")
   *  </code><br>
   *  will return <br><code> apache.org</code>
   *
   * @param url the url text to parse
   * @return the domain name of the parsed url
   * @throws MalformedURLException if <code>url</code> cannot be parsed
   */
  public static String getDomainName(String url)
    throws MalformedURLException {
    URL parsed = new URL(url);
    return getDomainName(parsed);
  }

  /**
   * Tells whether two urls share the same domain name, as computed by
   * {@link #getDomainName(URL)} and compared ignoring case.
   * As an example, <br>
   * <code> isSameDomainName(new URL("http://lucene.apache.org")
   * , new URL("http://people.apache.org/"))
   * <br> will return true. </code>
   *
   * @param url1 the first url
   * @param url2 the second url
   * @return true if the domain names are equal
   */
  public static boolean isSameDomainName(URL url1, URL url2) {
    String firstDomain = getDomainName(url1);
    String secondDomain = getDomainName(url2);
    return firstDomain.equalsIgnoreCase(secondDomain);
  }

  /**
   * Tells whether two urls, given as strings, share the same domain name.
   * Both strings are parsed and then compared with
   * {@link #isSameDomainName(URL, URL)}.
   * As an example, <br>
   * <code> isSameDomainName("http://lucene.apache.org"
   * ,"http://people.apache.org/")
   * <br> will return true. </code>
   *
   * @param url1 the first url text
   * @param url2 the second url text
   * @return true if the domain names are equal
   * @throws MalformedURLException if either string cannot be parsed
   */
  public static boolean isSameDomainName(String url1, String url2)
    throws MalformedURLException {
    URL first = new URL(url1);
    URL second = new URL(url2);
    return isSameDomainName(first, second);
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the
   * last public part of the hostname.
   * Candidate suffixes are tried from longest to shortest: first everything
   * after the first dot of the hostname, then everything after the next
   * dot, and so on; the first one known to {@link DomainSuffixes} wins.
   * A single trailing dot on the hostname is ignored, matching
   * {@link #getDomainName(URL)}.
   *
   * @param url the url whose hostname is examined
   * @return the matching suffix, or <code>null</code> when the hostname
   *         looks like a dotted-quad address or no known suffix is found
   */
  public static DomainSuffix getDomainSuffix(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    String host = url.getHost();
    // A fully-qualified hostname ("example.com.") would otherwise produce
    // the candidates "com." and "", so the suffix lookup could never match.
    if (host.endsWith("."))
      host = host.substring(0, host.length() - 1);
    if (IP_PATTERN.matcher(host).matches())
      return null;

    String candidate = host;
    int index;
    do {
      index = candidate.indexOf('.');
      // with no dot left, the candidate itself is looked up
      String subCandidate = candidate.substring(index + 1);
      DomainSuffix d = tlds.get(subCandidate);
      if (d != null) {
        return d;
      }
      candidate = subCandidate;
    } while (index >= 0);
    return null;
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the last public part
   * of the hostname of the url given as a string. The string is parsed and
   * handed to {@link #getDomainSuffix(URL)}.
   *
   * @param url the url text to parse
   * @return whatever {@link #getDomainSuffix(URL)} returns for the parsed url
   * @throws MalformedURLException if <code>url</code> cannot be parsed
   */
  public static DomainSuffix getDomainSuffix(String url)
    throws MalformedURLException {
    URL parsed = new URL(url);
    return getDomainSuffix(parsed);
  }

  /**
   * Splits the hostname of the url at each ".".
   *
   * @param url the url whose hostname is split
   * @return the dot-separated parts of the hostname, or a one-element array
   *         holding the whole hostname when it looks like a dotted-quad
   *         address
   */
  public static String[] getHostSegments(URL url) {
    final String hostName = url.getHost();
    // TODO : handle ipv6
    boolean dottedQuad = IP_PATTERN.matcher(hostName).matches();
    if (!dottedQuad) {
      return hostName.split("\\.");
    }
    // an ipv4 address is kept in one piece
    return new String[]{hostName};
  }

  /**
   * Splits the hostname of the url, given as a string, at each ".".
   * The string is parsed and handed to {@link #getHostSegments(URL)}.
   *
   * @param url the url text to parse
   * @return the hostname segments of the parsed url
   * @throws MalformedURLException if <code>url</code> cannot be parsed
   */
  public static String[] getHostSegments(String url)
    throws MalformedURLException {
    URL parsed = new URL(url);
    return getHostSegments(parsed);
  }

  /**
   * Returns the lowercased hostname for the url or null if the url is null
   * or not well formed.
   * 
   * @param url The url to check.
   * @return String The hostname for the url, lowercased independently of the
   *         default locale.
   */
  public static String getHost(String url) {
    if (url == null) {
      return null;
    }
    try {
      // Locale.ROOT: hostnames are ASCII-case-insensitive; the default locale
      // must not influence the result (Turkish would map 'I' to dotless 'ı').
      return new URL(url).getHost().toLowerCase(Locale.ROOT);
    }
    catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the page for the url.  The page consists of the protocol, host,
   * and path, but does not include the query string.  The protocol and host
   * (with port, if any) are lowercased; user info, path and fragment keep
   * their original case because they may be case-sensitive.
   * 
   * @param url The url to check.
   * @return String The page for the url, or null if the url is null or not
   *         well formed.
   */
  public static String getPage(String url) {
    if (url == null) {
      return null;
    }
    try {
      URL parsed = new URL(url);
      StringBuilder page = new StringBuilder(url.length());
      page.append(parsed.getProtocol().toLowerCase(Locale.ROOT)).append(':');

      String authority = parsed.getAuthority();
      if (authority != null) {
        // authority is [userinfo@]host[:port]; only host[:port] is lowercased
        int hostStart = authority.lastIndexOf('@') + 1;
        page.append("//")
            .append(authority, 0, hostStart)
            .append(authority.substring(hostStart).toLowerCase(Locale.ROOT));
      }

      String path = parsed.getPath();
      if (path != null) {
        page.append(path);
      }
      // the query string is simply never appended; a fragment is retained
      String fragment = parsed.getRef();
      if (fragment != null) {
        page.append('#').append(fragment);
      }
      return page.toString();
    }
    catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the lowercased protocol (scheme) of the url.
   *
   * @param url The url to check.
   * @return String The protocol, e.g. "http", or null if the url is null or
   *         not well formed.
   */
  public static String getProtocol(String url) {
    if (url == null) {
      return null;
    }
    try {
      // Only the protocol is lowercased, and with Locale.ROOT, so the default
      // locale can neither break parsing nor alter the result.
      return new URL(url).getProtocol().toLowerCase(Locale.ROOT);
    }
    catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the lowercased url with its leading "protocol://" removed.
   * Only the first occurrence of that prefix is removed, so a url embedded
   * in the query string keeps its own scheme. A url whose protocol is not
   * followed by "://" (e.g. "file:/tmp/x") is returned lowercased but
   * otherwise unchanged.
   *
   * @param url The url to strip.
   * @return String The lowercased url without its protocol prefix, or null
   *         if the url is null or not well formed.
   */
  public static String stripProtocol(String url) {
    if (url == null) {
      return null;
    }
    try {
      // Lowercasing up front lets the case-sensitive search below find the
      // scheme however it was written; Locale.ROOT keeps that locale-neutral.
      String lowered = url.toLowerCase(Locale.ROOT);
      String prefix = new URL(lowered).getProtocol() + "://";
      int at = lowered.indexOf(prefix);
      if (at < 0) {
        return lowered;
      }
      return lowered.substring(0, at) + lowered.substring(at + prefix.length());
    }
    catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the extension of a file name: the text after its last ".".
   *
   * @param filename The file name or path to inspect; may be null.
   * @return String The extension without the dot, or null when
   *         <code>filename</code> is null, has no ".", starts with its only
   *         "." (e.g. ".bashrc"), ends with ".", or when the last "." lies
   *         in a directory part (i.e. before the last "/" or "\").
   */
  public static String getExtension(String filename) {
    if (filename == null) {
      return null;
    }
    int extIndex = filename.lastIndexOf('.');
    if (extIndex <= 0 || extIndex + 1 >= filename.length()) {
      return null;
    }
    // A dot that precedes the last path separator belongs to a directory
    // name ("/v1.2/download"), not to the file name.
    int separatorIndex = Math.max(filename.lastIndexOf('/'),
                                  filename.lastIndexOf('\\'));
    if (separatorIndex > extIndex) {
      return null;
    }
    return filename.substring(extIndex + 1);
  }

  /**
   * Resolves a relative url into an absolute one.
   * A relative url starting with "/" is resolved against
   * <code>baseURL</code>, any other against <code>contextUrl</code>, using
   * {@link URL#URL(URL, String)}. If that fails with a
   * {@link MalformedURLException}, a plain string concatenation based on
   * <code>baseURL</code> is used instead.
   *
   * @param baseURL The url supplying scheme and host; also the basis of the
   *        string fallback.
   * @param contextUrl The url of the document the relative url appears in.
   * @param relativeURL The url to resolve; may be null.
   * @return String The absolute url, or null if <code>relativeURL</code> is
   *         null.
   * @throws IllegalArgumentException if the string fallback is needed and
   *         <code>baseURL</code> is null, shorter than 8 characters, or (for
   *         a non-rooted relative url) contains no "/".
   */
  public static String toAbsoluteURL(String baseURL, String contextUrl,String relativeURL) {

    // nothing to resolve; checked first so the startsWith below cannot throw
    if (relativeURL == null)
      return null;

    // first we try to use java.net.URL to perform the conversion, if that
    // fails we can try using our own routine.
    try {
      URL base = relativeURL.startsWith("/") ? new URL(baseURL) : new URL(contextUrl);
      return new URL(base, relativeURL).toExternalForm();
    }
    catch (MalformedURLException e) {

      if (baseURL == null || baseURL.length() < 8)
        throw new IllegalArgumentException("baseURL must be a valid URL");

      // position of the "://" that ends the scheme, or -1 if there is none
      int schemeSeparator = baseURL.indexOf("://");

      // rooted relative URL: keep scheme and host of the base only
      if (relativeURL.startsWith("/")) {
        // start looking for the path after "scheme://"; without a scheme
        // separator fall back to offset 8 (the length of "https://")
        int searchFrom = schemeSeparator < 0 ? 8 : schemeSeparator + 3;
        int pathStart = baseURL.indexOf('/', searchFrom);
        if (pathStart > -1) {
          baseURL = baseURL.substring(0, pathStart);
        }
        return baseURL + relativeURL;
      }

      // non-rooted relative URL: replace the last path segment of the base
      int slashPosition = baseURL.lastIndexOf('/');
      if (slashPosition < 0)
        throw new IllegalArgumentException("baseURL must be a valid URL");
      int authorityStart = schemeSeparator < 0 ? 0 : schemeSeparator + 3;
      // a slash inside "://" is not a path separator; in that case the base
      // has no path and is kept whole ("http://a.com" + "/page.html")
      if (slashPosition >= authorityStart) {
        baseURL = baseURL.substring(0, slashPosition);
      }
      return baseURL + "/" + relativeURL;
    }
  }
}