URLUtils.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.util.GoogleURL;
import org.junit.Assert;

import com.google.common.collect.ImmutableMultimap;

/**
 * 
 * @author rana
 * 
 */
public class URLUtils {

  /** logger shared by the static helpers in this class **/
  private static final Log              LOG                  = LogFactory
                                                                 .getLog(URLUtils.class);

  /**
   * session id normalizer; canonicalizeURL runs every assembled url through
   * its normalize method. The field is public and not final, so other code is
   * able to reassign it.
   **/
  public static SessionIDURLNormalizer sessionIdNormalizer = new SessionIDURLNormalizer();

  /**
   * Canonicalizes a url given as text.
   * <p>
   * The text is parsed into a GoogleURL; a valid parse is handed to the
   * GoogleURL overload of canonicalizeURL, an invalid one is rejected.
   * 
   * @param incomingURL
   *          the url text to canonicalize
   * @param stripLeadingWWW
   *          set to true to strip the www. prefix from the host if present
   * @return a canonical representation of the passed in URL that can be safely
   *         used as a replacement for the original url
   * @throws MalformedURLException
   *           when the parsed GoogleURL reports itself as not valid
   */
  public static String canonicalizeURL(String incomingURL,
      boolean stripLeadingWWW) throws MalformedURLException {

    GoogleURL parsedURL = new GoogleURL(incomingURL);

    if (parsedURL.isValid()) {
      return canonicalizeURL(parsedURL, stripLeadingWWW);
    }
    throw new MalformedURLException("URL:" + incomingURL + " is invalid");
  }

  /**
   * Builds the canonical string form of an already parsed url.
   * <p>
   * The result is assembled as scheme://[user[:password]@]host[:port][path][?query]
   * with the following adjustments:
   * <ul>
   * <li>a single trailing '.' is removed from the host</li>
   * <li>when stripLeadingWWW is set, a leading "www." is removed unless
   * extractRootDomainName reports the complete host as its own root name</li>
   * <li>a port equal to "80" is left out (the scheme is not consulted)</li>
   * <li>the path is cut at its first ';'</li>
   * <li>the assembled string is passed through sessionIdNormalizer</li>
   * <li>the ref (fragment) is appended again only when it starts with '!'</li>
   * </ul>
   * A component counts as absent when its getter hands back the
   * GoogleURL.emptyString sentinel; the test is a reference comparison, exactly
   * as in the code this replaces.
   * 
   * @param urlObject
   *          the parsed url; its validity is not checked here
   * @param stripLeadingWWW
   *          set to true to strip the www. prefix from the host if present
   * @return the canonical url string
   * @throws MalformedURLException
   *           part of the declared contract; no statement in this method
   *           throws it explicitly
   */
  public static String canonicalizeURL(GoogleURL urlObject,
      boolean stripLeadingWWW) throws MalformedURLException {

    StringBuilder assembled = new StringBuilder();

    // scheme
    assembled.append(urlObject.getScheme()).append("://");

    // optional user[:password]@
    String userName = urlObject.getUserName();
    if (userName != GoogleURL.emptyString) {
      assembled.append(userName);
      String password = urlObject.getPassword();
      if (password != GoogleURL.emptyString) {
        assembled.append(":").append(password);
      }
      assembled.append("@");
    }

    // host, minus one trailing dot
    String host = urlObject.getHost();
    if (host.endsWith(".")) {
      host = host.substring(0, host.length() - 1);
    }

    if (stripLeadingWWW && host.startsWith("www.")) {
      // hack: keep the prefix only when the host as a whole is the root name
      String rootName = extractRootDomainName(host);
      boolean hostIsItsOwnRoot = rootName != null && rootName.equals(host);
      if (!hostIsItsOwnRoot) {
        host = host.substring(4);
      }
    }
    assembled.append(host);

    // port, unless absent or "80"
    String port = urlObject.getPort();
    if (port != GoogleURL.emptyString && !port.equals("80")) {
      assembled.append(":").append(port);
    }

    // path, truncated at the first ';'
    String path = urlObject.getPath();
    if (path != GoogleURL.emptyString) {
      int semiColonPos = path.indexOf(';');
      assembled.append(semiColonPos == -1 ? path : path.substring(0,
          semiColonPos));
    }

    // query
    String query = urlObject.getQuery();
    if (query != GoogleURL.emptyString) {
      assembled.append("?").append(query);
    }

    // phase 2 - remove common session id patterns
    String result = sessionIdNormalizer.normalize(assembled.toString(), "");

    // phase 3 - stir back in ref if #!
    String ref = urlObject.getRef();
    if (ref.length() != 0 && ref.charAt(0) == '!') {
      result = result + "#" + ref;
    }
    return result;
  }

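  /**
   * www
   * 
   * (documents the disabled single-argument canonicalizeURL that is kept in
   * the block comment directly below)
   * 
   * @param a
   *          url
   * 
   * @return canonical representation of the url that can be used to identify
   *         all possible occurrences of the specified url.
   */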

  /*
   * public static String canonicalizeURL(String incomingURL) throws
   * MalformedURLException { //TODO: Make this thread safe !!! synchronized
   * (URLUtils.class) { //phase 1 String normalizedURL =
   * URLNormalizer.normalizeString(incomingURL); // phase 2 normalize host name
   * and remove www
   * 
   * // get hostname URLUtils.fastGetResult hostNameLoc =
   * URLUtils.fastGetHostFromURL(normalizedURL);
   * 
   * if (hostNameLoc != null) { String hostName =
   * normalizedURL.substring(hostNameLoc.offset,hostNameLoc.offset +
   * hostNameLoc.length); String normalizedName =
   * URLUtils.normalizeHostName(hostName);
   * 
   * String newNormalizedURL = normalizedURL.substring(0,hostNameLoc.offset) +
   * normalizedName; newNormalizedURL +=
   * normalizedURL.substring(hostNameLoc.offset+hostNameLoc.length);
   * 
   * // phase 3 - remove common session id patterns normalizedURL =
   * _sessionIdNormalizer.normalize(newNormalizedURL, "");
   * 
   * // return the url return normalizedURL; } return null; } }
   */

  /**
   * get the canonical url fingerprint for the given url
   * 
   * @param incomingURL
   *          the url text; it is canonicalized before being fingerprinted
   * @param stripLeadingWWW
   *          forwarded unchanged to canonicalizeURL
   * @return the 64 bit fingerprint of the canonicalized url, or 0 when
   *         canonicalization produced null
   * @throws MalformedURLException
   *           propagated from canonicalizeURL
   */
  public static long getCanonicalURLFingerprint(String incomingURL,
      boolean stripLeadingWWW) throws MalformedURLException {
    String canonicalForm = canonicalizeURL(incomingURL, stripLeadingWWW);
    if (canonicalForm == null) {
      return 0;
    }
    return URLFingerprint.generate64BitURLFPrint(canonicalForm);
  }

  /**
   * get a full url fingerprint (domain hash and url fingerprint) for the passed
   * in url
   * 
   * @param urlString
   *          the url text; it is canonicalized first
   * @param stripLeadingWWW
   *          forwarded unchanged to canonicalizeURL
   * @return the fingerprint record, or null when the url is malformed or could
   *         not be fingerprinted
   */
  public static URLFP getURLFPFromURL(String urlString, boolean stripLeadingWWW) {

    String canonicalForm;
    try {
      // canonicalize the incoming url ...
      canonicalForm = URLUtils.canonicalizeURL(urlString, stripLeadingWWW);
    } catch (MalformedURLException ignored) {
      // best effort api: a malformed url is reported as null
      return null;
    }
    return (canonicalForm != null) ? getURLFPFromCanonicalURL(canonicalForm)
        : null;
  }

  /**
   * get a full url fingerprint for an already parsed url. The url is
   * canonicalized with stripLeadingWWW set to false.
   * 
   * @param urlObject
   *          the parsed url
   * @return the fingerprint record, or null when canonicalization fails or the
   *         canonical url could not be fingerprinted
   */
  public static URLFP getURLFPFromURLObject(GoogleURL urlObject) {
    String canonicalForm = null;
    try {
      // canonicalize the incoming url ...
      canonicalForm = URLUtils.canonicalizeURL(urlObject, false);
    } catch (MalformedURLException ignored) {
      // best effort api: fall through and report null
    }
    if (canonicalForm == null) {
      return null;
    }
    return getURLFPFromCanonicalURL(canonicalForm);
  }

  /**
   * build a URLFP record (url hash, domain hash, root domain hash) from a url
   * that is already in canonical form.
   * 
   * @param canonicalURL
   *          the canonical url text
   * @return the populated record, or null (after logging a warning) when the
   *         url does not parse as valid, has no host, or no root domain name
   *         can be extracted from the host
   */
  public static URLFP getURLFPFromCanonicalURL(String canonicalURL) {
    // get url object
    GoogleURL urlObject = new GoogleURL(canonicalURL);

    if (urlObject.isValid()) {

      String hostName = urlObject.getHost();
      // only resolve the root name once the host is known to be present:
      // extractRootDomainName dereferences its argument, so a null host would
      // otherwise surface as a NullPointerException instead of the warning
      // and null return below
      String rootDomainName = (hostName != null) ? URLUtils
          .extractRootDomainName(hostName) : null;

      if (rootDomainName != null) {
        // create a url fp record
        URLFP urlFP = new URLFP();

        urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));
        urlFP.setDomainHash(URLFingerprint.generate32BitHostFP(hostName));
        urlFP.setRootDomainHash(URLFingerprint
            .generate32BitHostFP(rootDomainName));
        return urlFP;
      }
    }
    LOG.warn("####FAILED TO CANONCALIZER INVALID URL:" + canonicalURL);
    return null;
  }

  /**
   * get URLFPV2 for a host. The host is wrapped as "http://" + host + "/" and
   * fingerprinted like any other url.
   * 
   * @param host
   *          the host name
   * @return whatever getURLFPV2FromURL returns for the wrapped host
   */
  public static URLFPV2 getURLFPV2FromHost(String host) {
    String hostAsURL = "http://" + host + "/";
    return getURLFPV2FromURL(hostAsURL);
  }

  /**
   * get new urlfp from url string. The url itself is canonicalized with
   * stripLeadingWWW set to false; the result is then handed to
   * getURLFPV2FromCanonicalURL.
   * 
   * @param urlString
   *          the url text
   * @return the fingerprint record, or null when the url is malformed or could
   *         not be fingerprinted
   */
  public static URLFPV2 getURLFPV2FromURL(String urlString) {

    String canonicalForm;
    try {
      // canonicalize the incoming url ...
      canonicalForm = URLUtils.canonicalizeURL(urlString, false);
    } catch (MalformedURLException ignored) {
      // best effort api: a malformed url is reported as null
      return null;
    }
    if (canonicalForm == null) {
      return null;
    }
    return getURLFPV2FromCanonicalURL(canonicalForm);
  }

  /**
   * get new urlfp from an already parsed url. The url is canonicalized with
   * stripLeadingWWW set to false.
   * 
   * @param urlObject
   *          the parsed url
   * @return the fingerprint record, or null when canonicalization fails or the
   *         canonical url could not be fingerprinted
   */
  public static URLFPV2 getURLFPV2FromURLObject(GoogleURL urlObject) {
    String canonicalForm = null;
    try {
      // canonicalize the incoming url ...
      canonicalForm = URLUtils.canonicalizeURL(urlObject, false);
    } catch (MalformedURLException ignored) {
      // best effort api: fall through and report null
    }
    return (canonicalForm == null) ? null
        : getURLFPV2FromCanonicalURL(canonicalForm);
  }

  /**
   * build a URLFPV2 record from a url that is already in canonical form.
   * <p>
   * The url hash covers the canonical url as given. The domain hash is taken
   * over the host with a leading "www." removed, except when the host as a
   * whole is its own root domain name; the root domain hash is taken over the
   * extracted root domain name.
   * 
   * @param canonicalURL
   *          the canonical url text
   * @return the populated record, or null when no host or no root domain name
   *         could be extracted
   */
  public static URLFPV2 getURLFPV2FromCanonicalURL(String canonicalURL) {

    // create a url fp record
    URLFPV2 fingerprint = new URLFPV2();
    fingerprint.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));

    String host = fastGetHostFromURL(canonicalURL);
    if (host == null) {
      return null;
    }

    String rootDomain = URLUtils.extractRootDomainName(host);
    if (rootDomain == null) {
      return null;
    }

    // strip the leading www. unless the host itself is the root name
    if (host.startsWith("www.") && !rootDomain.equals(host)) {
      host = host.substring(4);
    }

    fingerprint.setDomainHash(FPGenerator.std64.fp(host));
    fingerprint.setRootDomainHash(FPGenerator.std64.fp(rootDomain));
    return fingerprint;
  }

  /**
   * pull the host out of url text without fully parsing the url.
   * <p>
   * Scanning starts after the first ':' and skips any run of '/', '\',
   * newline, carriage return, tab and space characters. The host candidate
   * then extends to the next '/', '?', ';' or '#' (or the end of the text). A
   * '@' inside that range moves the start past the user info, and anything
   * from the first ':' of the candidate on (the port) is cut off. The
   * candidate is finally validated by parsing "http://" + candidate as a
   * GoogleURL.
   * 
   * @param urlString
   *          the url text
   * @return the host as reported by GoogleURL, or null when the text has no
   *         ':', nothing follows the skipped separators, or the candidate does
   *         not form a valid url
   */
  public static String fastGetHostFromURL(String urlString) {

    int schemeColon = urlString.indexOf(":");
    if (schemeColon == -1) {
      return null;
    }

    final int totalLength = urlString.length();

    // skip separators and whitespace that follow the scheme colon
    int begin = schemeColon + 1;
    while (begin < totalLength
        && "/\\\n\r\t ".indexOf(urlString.charAt(begin)) != -1) {
      begin++;
    }
    if (begin >= totalLength) {
      return null;
    }

    // the first candidate character is always accepted; stop at the first
    // terminator after it
    int end = begin + 1;
    while (end < totalLength && "/?;#".indexOf(urlString.charAt(end)) == -1) {
      end++;
    }

    // step over user info when the '@' lies inside the candidate range
    int atPos = urlString.indexOf('@', begin);
    if (atPos != -1 && atPos < end) {
      begin = atPos + 1;
    }

    String candidate = urlString.substring(begin, end);

    // drop the port
    int portColon = candidate.indexOf(':');
    if (portColon != -1) {
      candidate = candidate.substring(0, portColon);
    }

    GoogleURL probe = new GoogleURL("http://" + candidate);
    return probe.isValid() ? probe.getHost() : null;
  }

  /**
   * location of a host name inside a byte buffer, as produced by
   * fastGetHostFromTextURL.
   */
  public static class fastGetResult {

    public fastGetResult(int offset, int length) {
      this.offset = offset;
      this.length = length;
    }

    /**
     * start of the host, counted from the offset argument that was passed to
     * fastGetHostFromTextURL
     */
    public int offset;
    /** number of bytes in the host */
    public int length;
  }

  /**
   * locate the host portion of a url held as raw bytes, without allocating a
   * string.
   * <p>
   * The authority starts right after the first "://" and ends at the earliest
   * '/', '?' or '#' (or at the end of the range). Inside the authority,
   * everything up to and including the last '@' is treated as user info and
   * skipped, and the host ends at the first ':' that follows (the port
   * separator). Bracketed IPv6 literals are not given special treatment.
   * 
   * @param charStream
   *          buffer holding the url bytes
   * @param offset
   *          index in charStream of the first url byte
   * @param length
   *          number of url bytes
   * @return host position and length, where the position is relative to
   *         offset; the length is 0 when the host is empty; null when the
   *         range contains no "://"
   */
  public static fastGetResult fastGetHostFromTextURL(byte[] charStream,
      int offset, int length) {

    char schemeEnd[] = { ':', '/', '/' };

    int indexOfSchemeEnd = indexOf(charStream, offset, length, schemeEnd, 0, 3,
        0);

    if (indexOfSchemeEnd == -1) {
      return null;
    }

    // every position below is relative to offset
    int hostStart = indexOfSchemeEnd + 3;

    // the authority ends at whichever of '/', '?' and '#' comes FIRST; looking
    // for them one after the other would let a '/' inside the query win over
    // an earlier '?'
    int authorityEnd = length;
    for (int i = hostStart; i < length; i++) {
      byte current = charStream[offset + i];
      if (current == '/' || current == '?' || current == '#') {
        authorityEnd = i;
        break;
      }
    }

    // skip user info BEFORE looking for the port colon, otherwise the ':' in
    // "user:password@host" is mistaken for the port separator
    for (int i = authorityEnd - 1; i >= hostStart; i--) {
      if (charStream[offset + i] == '@') {
        hostStart = i + 1;
        break;
      }
    }

    // cut off the port
    int hostEnd = authorityEnd;
    for (int i = hostStart; i < authorityEnd; i++) {
      if (charStream[offset + i] == ':') {
        hostEnd = i;
        break;
      }
    }

    return new fastGetResult(hostStart, hostEnd - hostStart);
  }

  /**
   * byte buffer counterpart of String.indexOf: find the first occurrence of a
   * character sequence inside a range of a byte array.
   * 
   * @param source
   *          the bytes being searched
   * @param sourceOffset
   *          index in source where the searched range starts
   * @param sourceCount
   *          number of bytes in the searched range
   * @param target
   *          the characters being searched for
   * @param targetOffset
   *          index in target of the first character to match
   * @param targetCount
   *          number of target characters to match
   * @param fromIndex
   *          position inside the range (relative to sourceOffset) at which
   *          the search starts; negative values are treated as 0
   * @return position of the match relative to sourceOffset, or -1 when there
   *         is none
   */
  static int indexOf(byte[] source, int sourceOffset, int sourceCount,
      char[] target, int targetOffset, int targetCount, int fromIndex) {
    if (fromIndex >= sourceCount) {
      return (targetCount == 0 ? sourceCount : -1);
    }
    if (fromIndex < 0) {
      fromIndex = 0;
    }
    if (targetCount == 0) {
      return fromIndex;
    }

    char first = target[targetOffset];
    // last index at which a complete match can still begin
    int max = sourceOffset + (sourceCount - targetCount);

    for (int i = sourceOffset + fromIndex; i <= max; i++) {
      /* Look for first character. */
      if (source[i] != first) {
        while (++i <= max && source[i] != first)
          ;
      }

      /* Found first character, now look at the rest of v2 */
      if (i <= max) {
        int j = i + 1;
        int end = j + targetCount - 1;
        for (int k = targetOffset + 1; j < end && source[j] == target[k]; j++, k++)
          ;

        if (j == end) {
          /* Found whole string. */
          return i - sourceOffset;
        }
      }
    }
    return -1;
  }

  /**
   * reverse the order of the dot separated labels of a host name, e.g.
   * "www.example.com" becomes "com.example.www".
   * <p>
   * The name is walked from its last character to its first. Each time a '.'
   * is met, the label that follows it is emitted; empty labels (consecutive or
   * trailing dots) are dropped.
   * 
   * @param hostNameIn
   *          the host name to invert
   * @return the host name with its labels in reverse order
   */
  public static String invertHostName(String hostNameIn) {
    final char[] chars = hostNameIn.toCharArray();
    final StringBuilder inverted = new StringBuilder();

    // index of the last character of the label that has not been emitted yet
    int labelEnd = chars.length - 1;

    for (int pos = chars.length - 1; pos >= 0; pos--) {
      if (chars[pos] == '.') {
        int labelLength = labelEnd - pos;
        if (labelLength != 0) {
          inverted.append(chars, pos + 1, labelLength);
          // a dot at the very start of the name has no label in front of it,
          // so nothing will follow in the output
          if (pos != 0) {
            inverted.append('.');
          }
        }
        labelEnd = pos - 1;
      } else if (pos == 0) {
        // reached the start of the name: emit the first label
        int labelLength = labelEnd + 1;
        if (labelLength != 0) {
          inverted.append(chars, 0, labelLength);
        }
      }
    }
    return inverted.toString();
  }

  /**
   * byte buffer variant of invertHostName: write the dot separated labels
   * found in tokens[offset .. offset + length) to destinationBuffer in reverse
   * order, e.g. "www.example.com" becomes "com.example.www". Empty labels
   * (consecutive or trailing dots) are dropped.
   * 
   * @param tokens
   *          buffer holding the host name bytes
   * @param offset
   *          index in tokens of the first host name byte
   * @param length
   *          number of host name bytes
   * @param destinationBuffer
   *          receives the inverted name starting at index 0; the output is
   *          never longer than length bytes, so the buffer has to provide at
   *          least that much room
   * @return the number of bytes written to destinationBuffer
   */
  public static int invertHostNameFast(byte[] tokens, int offset, int length,
      byte[] destinationBuffer) {

    int lastScanStart = offset + length - 1;
    int currentIndex = lastScanStart;
    int destinationOffset = 0;

    while (currentIndex >= offset) {
      if (tokens[currentIndex] == '.') {
        if (lastScanStart - currentIndex != 0) {
          System.arraycopy(tokens, currentIndex + 1, destinationBuffer,
              destinationOffset, lastScanStart - currentIndex);
          destinationOffset += (lastScanStart - currentIndex);
          // the name starts at offset, not at index 0: a dot sitting on the
          // first byte of the name has no label in front of it, so no
          // separator may follow (comparing against 0 left a trailing '.'
          // whenever offset was non-zero)
          if (currentIndex != offset) {
            destinationBuffer[destinationOffset++] = '.';
          }
        }
        lastScanStart = currentIndex - 1;
      } else if (currentIndex == offset) {
        // reached the start of the name: emit the first label
        if (lastScanStart - currentIndex + 1 != 0) {
          System.arraycopy(tokens, offset, destinationBuffer,
              destinationOffset, (lastScanStart - currentIndex + 1));
          destinationOffset += (lastScanStart - currentIndex + 1);
        }
      }
      currentIndex--;
    }

    return destinationOffset;
  }

  /**
   * normalize a host name: lower case it, trim leading and trailing dots,
   * remove whitespace and "%20" sequences and optionally strip a leading
   * "www." label.
   * <p>
   * A host that matches the ip address pattern is returned untouched. The
   * "www." prefix is only stripped when a root domain name can be extracted
   * and the www label is part of the sub domain in front of that root name.
   * 
   * @param hostName
   *          the host name to normalize
   * @param stripLeadingWWW
   *          set to true to strip the www. prefix if present
   * @return the normalized host name, or null when the invalid character
   *         pattern matches the normalized name
   */
  public static String normalizeHostName(String hostName,
      boolean stripLeadingWWW) {

    if (ipAddressRegEx.matcher(hostName).matches()) {
      return hostName;
    }

    // lower case with a fixed locale: host names are ascii case-insensitive,
    // and the default-locale variant turns 'I' into a dotless i under e.g. a
    // Turkish locale
    String normalizedHostName = hostName.toLowerCase(java.util.Locale.ROOT);
    // next check for trailing .
    while (normalizedHostName.endsWith(".")) {
      normalizedHostName = normalizedHostName.substring(0, normalizedHostName
          .length() - 1);
    }
    while (normalizedHostName.startsWith(".")) {
      normalizedHostName = normalizedHostName.substring(1);
    }

    normalizedHostName = normalizedHostName.replaceAll("((%20)|\\s)", "");

    // NOTE(review): matches() only succeeds when the whole name is one single
    // invalid character, whereas extractRootDomainName uses find() on the same
    // pattern. Left as is because tightening it would change what callers get
    // back - confirm the intent.
    if (invalidDomainCharactersRegEx.matcher(normalizedHostName).matches()) {
      return null;
    }

    if (stripLeadingWWW) {
      String rootName = extractRootDomainName(normalizedHostName);

      if (rootName != null
          && rootName.length() != normalizedHostName.length()) {
        // everything in front of the root name
        String subDomain = normalizedHostName.substring(0, normalizedHostName
            .length()
            - rootName.length());

        if (subDomain.startsWith("www.")) {
          normalizedHostName = normalizedHostName.substring(4);
        }
      }
    }
    return normalizedHostName;
  }

  /**
   * extract the host name from a url stored in a Text key.
   * 
   * @param key
   *          Text holding the url
   * @return the host name, or null when fastGetHostFromTextURL finds no host
   *         or an empty one
   */
  public static String getHostNameFromURLKey(Text key) {

    fastGetResult result = fastGetHostFromTextURL(key.getBytes(), 0, key
        .getLength());

    if (result != null && result.length != 0) {
      // Text keeps its content as UTF-8 encoded bytes, so decode with that
      // charset explicitly instead of relying on the platform default
      return new String(key.getBytes(), result.offset, result.length,
          java.nio.charset.Charset.forName("UTF-8"));
    }
    return null;
  }

  /**
   * test helper: check getHostNameFromURLKey against java.net.URL for one url.
   * When java.net.URL rejects the url, the fast extraction is expected to
   * yield null; otherwise both hosts have to agree, with a null fast result
   * standing for an empty host.
   * 
   * @param url
   *          the url to check
   */
  private static void testURL(String url) {
    String extractedHost = getHostNameFromURLKey(new Text(url));

    URL reference;
    try {
      reference = new URL(url);
    } catch (MalformedURLException e) {
      Assert.assertTrue(getHostNameFromURLKey(new Text(url)) == null);
      return;
    }

    if (extractedHost != null) {
      Assert.assertTrue(reference.getHost().equals(extractedHost));
    } else {
      Assert.assertTrue(reference.getHost().length() == 0);
    }
  }

  /**
   * find where the marked name ends inside a byte range: the position of the
   * first '.' that comes after the first '!' marker.
   * 
   * @param stream
   *          buffer holding the marked name
   * @param offset
   *          index in stream of the first byte to look at
   * @param length
   *          number of bytes to look at
   * @return position (relative to offset) of the first '.' after the first
   *         '!', or length when there is no marker or no '.' after it
   */
  private static int findTLDNameEndLength(byte[] stream, int offset, int length) {
    int pos = 0;
    boolean markerSeen = false;
    while (pos < length) {
      byte current = stream[offset + pos];
      if (!markerSeen) {
        if (current == '!') {
          markerSeen = true;
        }
      } else if (current == '.') {
        return pos;
      }
      pos++;
    }
    return pos;
  }

  // NOTE(review): initialized to null and not referenced by any code in the
  // visible part of this file - possibly a leftover; confirm before relying on it
  private static ImmutableMultimap<String, String> gTLDMultiMap = null;

  /**
   * walk backwards from startPos to the start of the dot separated token that
   * ends there.
   * 
   * @param candidate
   *          the dotted name
   * @param startPos
   *          position to start from
   * @return the closest position at or before startPos that is directly
   *         preceded by a '.', or 0 when no dot is found; a startPos that is
   *         not positive is returned unchanged
   */
  private static int getNextTokenPos(String candidate, int startPos) {
    int pos = startPos;
    for (; pos > 0; pos--) {
      if (candidate.charAt(pos - 1) == '.') {
        return pos;
      }
    }
    return pos;
  }

  /**
   * check whether TLDNamesCollection has any secondary names registered for
   * the candidate.
   * 
   * @param candidate
   *          the label to look up
   * @return true when at least one secondary name is registered
   */
  public static boolean isTLDStopWord(String candidate) {
    Collection<String> secondaryNames = TLDNamesCollection
        .getSecondaryNames(candidate);
    return !secondaryNames.isEmpty();
  }

  /**
   * cut the tail of candidateString that corresponds to parts[rootNameIndex]
   * and everything after it, including the dots in between.
   * 
   * @param candidateString
   *          the full dotted name
   * @param parts
   *          the labels of candidateString
   * @param rootNameIndex
   *          index in parts of the first label to keep
   * @return the suffix of candidateString made up of the kept labels
   */
  private static String buildRootNameString(String candidateString,
      String[] parts, int rootNameIndex) {
    // one dot between each pair of kept labels ...
    int suffixLength = parts.length - rootNameIndex - 1;
    // ... plus the labels themselves
    int index = rootNameIndex;
    while (index < parts.length) {
      suffixLength += parts[index].length();
      index++;
    }
    int suffixStart = candidateString.length() - suffixLength;
    return candidateString.substring(suffixStart);
  }

  /**
   * extract the TLD portion (public suffix) of a host name, using the
   * secondary name rules that TLDNamesCollection returns for the last label.
   * <p>
   * Before matching, one trailing '.' and one leading '*' are removed. The
   * rule entries consulted are: a plain secondary label, "*" (any secondary
   * label), "!label" (explicit exclusion), "*.label" (two component secondary
   * name), "!label.label" (exclusion for the two component form) and the empty
   * string.
   * 
   * @param candidate
   *          the host name
   * @return the last one, two or three labels of the name depending on the
   *         matching rule; the fixed string "inaddr-arpa.arpa" for ip
   *         addresses; null when the name is empty, contains invalid
   *         characters, has fewer than two labels or no rule applies
   */
  public static String extractTLDName(String candidate) {

    // special case for ip addresses
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return "inaddr-arpa.arpa";
    }

    // drop a single trailing dot
    if (candidate.endsWith(".")) {
      candidate = candidate.substring(0, candidate.length() - 1);
    }
    // drop a single leading wildcard character
    if (candidate.startsWith("*") && candidate.length() > 1) {
      candidate = candidate.substring(1);
    }
    if (candidate.length() != 0) {
      // reject names containing any character outside the allowed set
      if (!invalidDomainCharactersRegEx.matcher(candidate).find()) {
        String parts[] = candidate.split("\\.");
        if (parts.length >= 2) {
          // rules registered for the last label
          Collection<String> secondaryNames = TLDNamesCollection
              .getSecondaryNames(parts[parts.length - 1]);

          if (secondaryNames.size() != 0) {
            // see if second to last part matches secondary names for this TLD
            // or there is a wildcard expression for secondary name in rule set
            if (secondaryNames.contains(parts[parts.length - 2])
                || secondaryNames.contains("*")) {
              // ok secondary part is potentially part of secondary name ...

              // check to see that the part is not explicitly excluded ...
              if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                // in this case, second to last part is NOT part of secondary
                // name
                return buildRootNameString(candidate, parts, parts.length - 1);
              } else {
                // otherwise, TLD contains 2 parts
                return buildRootNameString(candidate, parts, parts.length - 2);
              }
            }
            // ok second to last part does not match set of known secondary
            // names
            else {
              // make a wildcard string matching secondary name
              String extendedWildcard = "*." + parts[parts.length - 2];
              // if match, then this implies secondary name has two components
              if (secondaryNames.contains(extendedWildcard)) {

                if (parts.length >= 3) {
                  // this implies that there must be four parts to the name to
                  // extract root
                  // unless exclusion rule applies
                  String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                      + parts[parts.length - 2];

                  // if exclusion rule is present ...
                  if (secondaryNames.contains(exclusionRule2)) {
                    // third part is NOT part of secondary name
                    return buildRootNameString(candidate, parts,
                        parts.length - 2);
                  } else {
                    // ok extended wildcard matched. last 3 parts are part of
                    // the TLD
                    if (parts.length >= 4) {
                      return buildRootNameString(candidate, parts,
                          parts.length - 3);
                    }
                  }
                }
              }
              // at this point ... if the null name exists ...
              else if (secondaryNames.contains("")) {
                // only last item is part of TLD
                return buildRootNameString(candidate, parts, parts.length - 1);
              }
            }
          }
        }
      }
    }
    // no rule applied
    return null;
  }

  /**
   * extract the root domain name of a host name: the TLD portion plus the one
   * label in front of it, using the secondary name rules that
   * TLDNamesCollection returns for the last label.
   * <p>
   * Before matching, one trailing '.' and one leading '*' are removed. The
   * rule entries consulted are: a plain secondary label, "*" (any secondary
   * label), "!label" (explicit exclusion), "*.label" (two component secondary
   * name), "!label.label" (exclusion for the two component form) and the empty
   * string.
   * 
   * @param candidate
   *          the host name
   * @return the last two, three or four labels of the name depending on the
   *         matching rule; the candidate itself for ip addresses; null when
   *         the name is empty, contains invalid characters, has too few labels
   *         for the matching rule or no rule applies
   */
  public static String extractRootDomainName(String candidate) {

    // special case for ip addresses
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return candidate;
    }

    // drop a single trailing dot
    if (candidate.endsWith(".")) {
      candidate = candidate.substring(0, candidate.length() - 1);
    }
    // drop a single leading wildcard character
    if (candidate.startsWith("*") && candidate.length() > 1) {
      candidate = candidate.substring(1);
    }
    if (candidate.length() != 0) {
      // reject names containing any character outside the allowed set
      if (!invalidDomainCharactersRegEx.matcher(candidate).find()) {
        String parts[] = candidate.split("\\.");
        if (parts.length >= 2) {
          // rules registered for the last label
          Collection<String> secondaryNames = TLDNamesCollection
              .getSecondaryNames(parts[parts.length - 1]);

          if (secondaryNames.size() != 0) {
            // see if second to last part matches secondary names for this TLD
            // or there is a wildcard expression for secondary name in rule set
            if (secondaryNames.contains(parts[parts.length - 2])
                || secondaryNames.contains("*")) {
              // ok secondary part is potentially part of secondary name ...

              // check to see that the part is not explicitly excluded ...
              if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                // in this case, this is an explicit override. second to last
                // part is NOT part of secondary name
                return buildRootNameString(candidate, parts, parts.length - 2);
              } else {
                // otherwise, we need at least three parts
                if (parts.length >= 3) {
                  return buildRootNameString(candidate, parts, parts.length - 3);
                }
              }
            }
            // ok second to last part does not match set of known secondary
            // names
            else {
              // make a wildcard string matching secondary name
              String extendedWildcard = "*." + parts[parts.length - 2];
              // if match, then this implies secondary name has two components
              if (secondaryNames.contains(extendedWildcard)) {

                if (parts.length >= 3) {
                  // this implies that there must be four parts to the name to
                  // extract root
                  // unless exclusion rule applies
                  String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                      + parts[parts.length - 2];

                  // if exclusion rule is present ...
                  if (secondaryNames.contains(exclusionRule2)) {
                    // third part is NOT part of secondary name
                    return buildRootNameString(candidate, parts,
                        parts.length - 3);
                  } else {
                    // ok extended wildcard matched. we need 4 parts minimum
                    if (parts.length >= 4) {
                      return buildRootNameString(candidate, parts,
                          parts.length - 4);
                    }
                  }
                }
              }
              // at this point ... if the null name exists ...
              else if (secondaryNames.contains("")) {
                // return second part as root name
                return buildRootNameString(candidate, parts, parts.length - 2);
              }
            }
          }
        }
      }
    }
    // no rule applied (or too few labels for the rule that matched)
    return null;
  }

  /** The maximum length of a Name */
  private static final int MAXNAME                      = 255;

  /**
   * The maximum length of a label in a Name. NOTE(review): not referenced by
   * any code in the visible part of this file.
   */
  private static final int MAXLABEL                     = 63;

  /** The maximum number of labels in a Name */
  private static final int MAXLABELS                    = 128;

  /**
   * matches one character that is not a digit, a lower case ascii letter, '-',
   * '.' or '_'
   */
  static Pattern           invalidDomainCharactersRegEx = Pattern
                                                            .compile("[^0-9a-z\\-\\._]");
  /**
   * matches four dot separated groups of digits; the value of each group is
   * not range checked
   */
  static Pattern           ipAddressRegEx               = Pattern
                                                            .compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$");
  /**
   * digits up to the end of the input; in the visible part of this file it is
   * only mentioned in commented out code
   */
  static Pattern           numericOnly                  = Pattern
                                                            .compile("[0-9]*$");

  /**
   * check whether a name is acceptable as a domain name.
   * <p>
   * A name passes when it is at most MAXNAME characters long and either
   * matches the ip address pattern or has at most MAXLABELS dot separated
   * parts and a root domain name can be extracted from its lower cased form.
   * 
   * @param name
   *          the name to check
   * @return true when the name passes all checks
   */
  public static boolean isValidDomainName(String name) {

    // check for invalid length (max 255 characters)
    if (name.length() > MAXNAME) {
      return false;
    }

    // lower case with a fixed locale: the default-locale variant turns 'I'
    // into a dotless i under e.g. a Turkish locale, which the invalid
    // character check in extractRootDomainName would then reject
    String candidate = name.toLowerCase(java.util.Locale.ROOT);

    // check to see if this is an ip address
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return true;
    }

    // check for invalid characters
    // NOTE(review): matches() only succeeds when the whole name is one single
    // invalid character; names with invalid characters elsewhere are rejected
    // later by extractRootDomainName, which uses find(). Switching to find()
    // here would also reject wildcard names such as "*.example.com" that
    // extractRootDomainName accepts, so the check is left as is.
    if (invalidDomainCharactersRegEx.matcher(candidate).matches()) {
      return false;
    }
    // split into parts
    String[] parts = name.split("\\.");

    // check for max labels constraint
    if (parts.length > MAXLABELS) {
      return false;
    }
    return extractRootDomainName(candidate) != null;
  }

  /**
   * invert a host name and mark the first label that is not a TLD stop word:
   * that label is joined to its predecessor with '!' instead of '.'. All other
   * labels stay joined with '.'. No marker is written when the first label of
   * the inverted name is already the non stop word label, because the first
   * label has no separator in front of it.
   * 
   * @param hostName
   *          the host name in its usual order
   * @return the inverted, marked host name
   */
  public static String invertAndMarkTLDNameStartInString(String hostName) {
    // invert it ...
    String inverted = URLUtils.invertHostName(hostName);
    StringBuilder marked = new StringBuilder(inverted.length());

    // walk the labels; stop words are only looked up until the marker is set
    boolean markerPlaced = false;
    for (StringTokenizer labels = new StringTokenizer(inverted, "."); labels
        .hasMoreTokens();) {
      String label = labels.nextToken();
      char separator = '.';
      if (!markerPlaced && !URLUtils.isTLDStopWord(label)) {
        markerPlaced = true;
        separator = '!';
      }
      if (marked.length() != 0) {
        marked.append(separator);
      }
      marked.append(label);
    }
    return marked.toString();
  }

  /**
   * find where the marked name ends inside a marked string: the position of
   * the first '.' that comes after the first '!' marker.
   * 
   * @param markedString
   *          the marked name
   * @return index of the first '.' after the first '!', or the length of the
   *         string when there is no marker or no '.' after it
   */
  public static int findTLDNameEndLengthInMarkedString(String markedString) {
    int markerPos = markedString.indexOf('!');
    if (markerPos == -1) {
      return markedString.length();
    }
    int dotAfterMarker = markedString.indexOf('.', markerPos + 1);
    return (dotAfterMarker == -1) ? markedString.length() : dotAfterMarker;
  }

  /**
   * find where the marked name ends inside a byte range: the position of the
   * first '.' that comes after the first '!' marker.
   * 
   * @param stream
   *          buffer holding the marked name
   * @param offset
   *          index in stream of the first byte to look at
   * @param length
   *          number of bytes to look at
   * @return position (relative to offset) of the first '.' after the first
   *         '!', or length when there is no marker or no '.' after it
   */
  public static int findTLDNameEndLengthInMarkedStream(byte[] stream,
      int offset, int length) {
    int pos = 0;
    boolean markerSeen = false;
    while (pos < length) {
      byte current = stream[offset + pos];
      if (!markerSeen) {
        if (current == '!') {
          markerSeen = true;
        }
      } else if (current == '.') {
        return pos;
      }
      pos++;
    }
    return pos;
  }

  /**
   * test helper: print a host name next to its inverted form.
   * 
   * @param name
   *          the host name to invert
   */
  private static void testURLNameInversion(String name) {
    String inverted = invertHostName(name);
    System.out.println("Inverting name:" + name + " result:" + inverted);
  }

  /**
   * test helper: run findTLDNameEndLength over the UTF-8 bytes of a marked
   * name and print the part of the name in front of the reported position.
   * 
   * @param name
   *          the marked name
   */
  private static void testTLDNameDetection(String name) {
    // a Charset object cannot fail the way a charset name can, so there is no
    // checked exception to swallow and no null byte array to trip over
    java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
    byte[] bytes = name.getBytes(utf8);

    int nameLen = findTLDNameEndLength(bytes, 0, bytes.length);
    // nameLen counts bytes, not characters: decode the byte prefix instead of
    // using it as a character index into the string
    System.out.println("TLD Name for:" + name + " is:"
        + new String(bytes, 0, nameLen, utf8));
  }

  /**
   * test helper: place the host name bytes at the given offset inside a larger
   * buffer, invert them with invertHostNameFast and print the outcome.
   * 
   * @param offset
   *          position of the host name inside the source buffer
   * @param hostName
   *          the host name to invert
   */
  private static void testHostNameInvertFast(int offset, String hostName) {
    byte[] hostBytes = hostName.getBytes();
    int hostLength = hostBytes.length;

    // source buffer with the name shifted to offset and some slack at the end
    byte[] padded = new byte[hostLength + offset + 4];
    System.arraycopy(hostBytes, 0, padded, offset, hostLength);

    byte[] inverted = new byte[hostLength + 4];
    int invertedLength = invertHostNameFast(padded, offset, hostLength,
        inverted);

    String produced = new String(inverted, 0, invertedLength);
    System.out.println("Inverted:" + hostName + " Produced:" + produced);
  }

  /**
   * invert a host name and prefix every label with a separator: '!' for the
   * first label that is not a TLD stop word, '.' for all others. Unlike
   * invertAndMarkTLDNameStartInString, the first label gets a separator in
   * front of it as well.
   * 
   * @param hostNameIn
   *          the host name in its usual order
   * @return the inverted name with a separator in front of every label
   */
  static String replicateNameNormalization(String hostNameIn) {
    // invert it ...
    String inverted = URLUtils.invertHostName(hostNameIn);
    StringBuilder marked = new StringBuilder(inverted.length());

    // walk the labels; stop words are only looked up until the marker is set
    boolean markerPlaced = false;
    for (StringTokenizer labels = new StringTokenizer(inverted, "."); labels
        .hasMoreTokens();) {
      String label = labels.nextToken();
      char separator = '.';
      if (!markerPlaced && !URLUtils.isTLDStopWord(label)) {
        markerPlaced = true;
        separator = '!';
      }
      marked.append(separator).append(label);
    }
    return marked.toString();
  }

  /**
   * One canonicalization fixture: an input URL together with the canonical
   * form {@code canonicalizeURL} is expected to produce for it.
   */
  static class CanonicalizationTestCase {
    /** URL handed to {@code canonicalizeURL}. */
    String originalURL;
    /**
     * Expected canonical form. When null, a {@link MalformedURLException}
     * from {@code canonicalizeURL} is tolerated.
     */
    String expectedURL;

    CanonicalizationTestCase(String originalURL, String expectedURL) {
      this.originalURL = originalURL;
      this.expectedURL = expectedURL;
    }

    /**
     * Canonicalizes {@link #originalURL} and asserts the outcome.
     *
     * @throws AssertionError if the result differs from {@link #expectedURL},
     *           or if canonicalization throws while a non-null result was
     *           expected
     */
    void validate() {
      try {
        String resultingURL = canonicalizeURL(originalURL, false);
        // JUnit order is (message, expected, actual); with the arguments
        // swapped a failure report labels the two values the wrong way round
        Assert.assertEquals("canonical form of <" + originalURL + ">",
            expectedURL, resultingURL);
      } catch (MalformedURLException e) {
        if (expectedURL != null) {
          Assert.fail("canonicalizing <" + originalURL + "> threw " + e
              + " but expected <" + expectedURL + ">");
        }
      }
    }

  }

  /**
   * Orders {@link URLFPV2} fingerprints by domain hash first and by url hash
   * second, either on deserialized objects or directly on their serialized
   * bytes.
   *
   * <p>
   * The raw {@code compare} reuses the readers and scratch fingerprints held
   * in the fields below and mutates them on every call, so a single instance
   * should not be shared between threads that compare concurrently.
   */
  public static class URLFPV2RawComparator implements RawComparator<URLFPV2> {

    DataInputBuffer keyReader1 = new DataInputBuffer();
    DataInputBuffer keyReader2 = new DataInputBuffer();
    URLFPV2         fp1        = new URLFPV2();
    URLFPV2         fp2        = new URLFPV2();

    /**
     * Three-way comparison of two primitive longs without boxing; yields
     * exactly -1, 0 or 1.
     */
    private static int compareLongs(long left, long right) {
      return (left < right) ? -1 : ((left == right) ? 0 : 1);
    }

    /**
     * Compares two serialized fingerprints without fully deserializing them.
     *
     * @return a negative, zero or positive value as the first key sorts
     *         before, equal to or after the second
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *           reading either key
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      keyReader1.reset(b1, s1, l1);
      keyReader2.reset(b2, s2, l2);

      try {
        // read first byte of both streams
        int s1FirstByte = keyReader1.read();
        int s2FirstByte = keyReader2.read();

        // a leading 0 (or -1, which read() reports at end of stream) is taken
        // to mean the key was written in the old format
        // NOTE(review): presumably the old format is the two-byte (short)
        // field id encoding - confirm against BinaryProtocol
        boolean s1IsOldFormat = (s1FirstByte == 0 || s1FirstByte == -1);
        boolean s2IsOldFormat = (s2FirstByte == 0 || s2FirstByte == -1);

        keyReader1.skip(1); // skip next byte
        fp1.setDomainHash(WritableUtils.readVLong(keyReader1));
        keyReader2.skip(1); // skip next byte
        fp2.setDomainHash(WritableUtils.readVLong(keyReader2));

        int result = compareLongs(fp1.getDomainHash(), fp2.getDomainHash());

        if (result == 0) {
          // same domain: step over the id field (two bytes in the old format,
          // one otherwise) and fall back to the url hash
          keyReader1.skip((s1IsOldFormat) ? 2 : 1); // id field only
          fp1.setUrlHash(WritableUtils.readVLong(keyReader1));

          keyReader2.skip((s2IsOldFormat) ? 2 : 1); // id field only
          fp2.setUrlHash(WritableUtils.readVLong(keyReader2));

          result = compareLongs(fp1.getUrlHash(), fp2.getUrlHash());
        }
        return result;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Compares two deserialized fingerprints: domain hash first, url hash as
     * the tie breaker.
     */
    @Override
    public int compare(URLFPV2 fp1, URLFPV2 fp2) {
      int result = compareLongs(fp1.getDomainHash(), fp2.getDomainHash());

      if (result == 0) {
        result = compareLongs(fp1.getUrlHash(), fp2.getUrlHash());
      }
      return result;
    }

    /**
     * Self check: verifies that the object comparison and the raw byte
     * comparison agree on a small set of fingerprints, serializing them under
     * both field id encoding modes.
     *
     * @throws AssertionError if any ordering expectation is violated
     * @throws RuntimeException if a fingerprint cannot be serialized
     */
    static void validateComparator() {
      URLFPV2 fp1 = new URLFPV2();
      URLFPV2 fp2 = new URLFPV2();
      URLFPV2 fp3 = new URLFPV2();
      URLFPV2 fp4 = new URLFPV2();

      fp1.setDomainHash(1L);
      fp2.setDomainHash(1L);
      fp3.setDomainHash(2L);
      fp4.setDomainHash(2L);

      fp1.setUrlHash(10L);
      fp2.setUrlHash(9L);
      fp3.setUrlHash(18L);
      fp4.setUrlHash(20L);

      DataOutputBuffer buffer1 = new DataOutputBuffer();
      DataOutputBuffer buffer2 = new DataOutputBuffer();
      DataOutputBuffer buffer3 = new DataOutputBuffer();
      DataOutputBuffer buffer4 = new DataOutputBuffer();

      URLFPV2RawComparator comparator = new URLFPV2RawComparator();

      Assert.assertTrue(comparator.compare(fp1, fp2) == 1);
      Assert.assertTrue(comparator.compare(fp2, fp1) == -1);
      Assert.assertTrue(comparator.compare(fp1, fp3) == -1);
      Assert.assertTrue(comparator.compare(fp3, fp1) == 1);
      Assert.assertTrue(comparator.compare(fp4, fp3) == 1);
      Assert.assertTrue(comparator.compare(fp3, fp4) == -1);

      // The encoding mode is a process wide static; remember it so this self
      // check does not leave a different mode behind for later code.
      int savedEncodingMode = BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE;
      try {
        // fp1 / fp3 are written with SHORT field ids, fp2 / fp4 with VINT ones
        // (presumably write() picks up the current default mode)
        BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
        fp1.write(buffer1);
        BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
        fp2.write(buffer2);
        BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
        fp3.write(buffer3);
        BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
        fp4.write(buffer4);

      } catch (IOException e) {
        // the raw comparisons below are meaningless on partially written
        // buffers, so surface the failure instead of carrying on
        throw new RuntimeException(e);
      } finally {
        BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = savedEncodingMode;
      }

      Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1
          .getLength(), buffer2.getData(), 0, buffer2.getLength()) == 1);
      Assert.assertTrue(comparator.compare(buffer2.getData(), 0, buffer2
          .getLength(), buffer1.getData(), 0, buffer1.getLength()) == -1);
      Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1
          .getLength(), buffer3.getData(), 0, buffer3.getLength()) == -1);
      Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3
          .getLength(), buffer1.getData(), 0, buffer1.getLength()) == 1);
      Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3
          .getLength(), buffer4.getData(), 0, buffer4.getLength()) == -1);
      Assert.assertTrue(comparator.compare(buffer4.getData(), 0, buffer4
          .getLength(), buffer3.getData(), 0, buffer3.getLength()) == 1);

    }

  }

  /**
   * Canonicalization fixtures run by {@link #validatateCanonicalization()}.
   * Each entry pairs an input URL (first argument) with the canonical form
   * expected for it (second argument). The inputs cover a trailing dot on the
   * host name, "#..." fragments, an empty "?" query, and ";..." path
   * parameters (including jsessionid values) appearing with and without a
   * query string.
   */
  public static CanonicalizationTestCase[] testCases = {
      new CanonicalizationTestCase("http://foo.bar.com.#?",
          "http://foo.bar.com/"),
      new CanonicalizationTestCase(
          "http://foo.bar.com./;msg1234FDF FDFDFDF FDFD?param1=test",
          "http://foo.bar.com/?param1=test"),
      new CanonicalizationTestCase(
          "http://foo.bar.com./;msg1234FDF FDFDFDF FDFD", "http://foo.bar.com/"),
      new CanonicalizationTestCase(
          "http://foo.bar.com/subpath/;msg1234FDF FDFDFDF FDFD",
          "http://foo.bar.com/subpath/"),
      new CanonicalizationTestCase(
          "http://foo.bar.com/subpath/;msg1234FDF FDFDFDF FDFD?param=1",
          "http://foo.bar.com/subpath/?param=1"),
      new CanonicalizationTestCase("http://foo.bar.com.#REF=24242",
          "http://foo.bar.com/"),
      new CanonicalizationTestCase(
          "http://www.lakeshorelearning.com/order/onlineOrder.jsp;jsessionid=KxMMpRGgPpC1ktZ1pJJCZF1MmmFxZHPnyrNJhBmWJGHkhcL5Hd4p!-617247554!NONE?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941",
          "http://www.lakeshorelearning.com/order/onlineOrder.jsp?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941"),
      new CanonicalizationTestCase(
          "http://www.emeraldinsight.com/Insight/menuNavigation.do;jsessionid=A17FC93E864C2F8B3709F63558BA69DB?hdAction=InsightHome",
          "http://www.emeraldinsight.com/Insight/menuNavigation.do?hdAction=InsightHome")

                                                     };

  /**
   * Runs {@code validate()} on every fixture in {@link #testCases}, in array
   * order.
   */
  public static void validatateCanonicalization() {
    // snapshot the array reference once, as an enhanced for loop would
    CanonicalizationTestCase[] fixtures = testCases;
    for (int index = 0; index < fixtures.length; ++index) {
      fixtures[index].validate();
    }
  }

  /**
   * Ad-hoc self test entry point: runs the comparator check, a fingerprint
   * domain hash check, the root domain extractor checks, a few dotted-quad
   * host name checks and finally the canonicalization fixtures.
   *
   * @param args ignored
   */
  public static void main(String[] args) {

    // 1. raw / object comparator agreement
    URLFPV2RawComparator.validateComparator();

    // 2. host with and without the "www." prefix must share a domain hash
    URLFPV2 prefixedFingerprint = getURLFPV2FromURL("http://www.gmail.fr/");
    URLFPV2 bareFingerprint = getURLFPV2FromURL("http://gmail.fr/");
    Assert.assertTrue(
        prefixedFingerprint.getDomainHash() == bareFingerprint.getDomainHash());

    // 3. root domain / TLD extraction
    testRootDomainExtractor();

    // 4. dotted-quad handling: four groups are accepted, five are not, and
    // normalization leaves a four group address untouched
    Assert.assertTrue(isValidDomainName("192.168.0.1"));
    Assert.assertFalse(isValidDomainName("192.168.0.1.1"));
    Assert.assertTrue(
        URLUtils.normalizeHostName("192.168.0.1", false).equals("192.168.0.1"));

    // 5. URL canonicalization fixtures
    validatateCanonicalization();
  }

  /**
   * Exercises {@code extractRootDomainName} / {@code extractTLDName}: first
   * prints "root,tld" for a list of sample hosts, then asserts the expected
   * root domain (or null) for a second list.
   */
  private static void testRootDomainExtractor() {

    // hosts whose extraction result is only printed, not asserted
    String[] printedHosts = { "www.ret.gov.au", "www.jobshop.ro", "www.ne.jp",
        "foo.ac.jp", "aichi.jp", "bochi.aichi.jp", "more.bochi.aichi.jp",
        "metro.tokyo.jp", "fluff.metro.tokyo.jp", "www.pref.hokkaido.jp",
        "www.subdomain.pref2.hokkaido.jp", "gigaom.com", "www.gigaom.com.cn",
        "www.foobar.idf.il", "192.168.0.1" };

    for (String host : printedHosts) {
      System.out.println(extractRootDomainName(host) + ","
          + extractTLDName(host));
    }

    // { host, expected root domain } - a null expectation means the extractor
    // must return null for that host
    String[][] expectations = {
        { ".gigaom.com", "gigaom.com" },
        { "*.gigaom.com", "gigaom.com" },
        { "www.gigaom.com", "gigaom.com" },
        { "foobar.foo.cn", "foo.cn" },
        { "foobar.google.com.cn", "google.com.cn" },
        { "google.com.cn", "google.com.cn" },
        { "cn", null },
        { "ab.ca", null },
        { "somedomain.ab.ca", "somedomain.ab.ca" },
        { "www.somedomain.ab.ca", "somedomain.ab.ca" },
        { "www.somedomain .ab.ca", null } };

    for (String[] expectation : expectations) {
      String host = expectation[0];
      String expectedRoot = expectation[1];
      if (expectedRoot == null) {
        Assert.assertTrue(extractRootDomainName(host) == null);
      } else {
        Assert.assertTrue(extractRootDomainName(host).equals(expectedRoot));
      }
    }

  }

  /**
   * Asserts the result of {@code normalizeHostName} for a table of host names
   * (leading / trailing dots, embedded space, "%20" prefix, "www." prefix)
   * under both values of its boolean argument.
   *
   * <p>
   * The print-only helpers in this class ({@code testHostNameInvertFast},
   * {@code testURLNameInversion}, {@code testTLDNameDetection}) are not
   * invoked here; call them by hand, e.g.
   * {@code testHostNameInvertFast(4, "www.google.com")},
   * {@code testURLNameInversion("google.com.")} or
   * {@code testTLDNameDetection("au.com!google.www")}, when their output
   * needs to be inspected.
   */
  private static void utilsTest() throws Exception {

    // the three arrays are parallel: hosts[i] normalized with flags[i] must
    // equal expected[i]
    String[] hosts = { ".gigaom. com", "%20gigaom.com", "www.gigaom.com",
        "www.gigaom.com.", "www.gigaom.com", "www.gigaom.com.", ".com.",
        "..gigaom.com..", "aisa.org.af." };
    boolean[] flags = { true, true, true, true, false, false, true, true,
        true };
    String[] expected = { "gigaom.com", "gigaom.com", "gigaom.com",
        "gigaom.com", "www.gigaom.com", "www.gigaom.com", "com", "gigaom.com",
        "aisa.org.af" };

    for (int index = 0; index < hosts.length; ++index) {
      Assert.assertTrue(normalizeHostName(hosts[index], flags[index]).equals(
          expected[index]));
    }
  }

}