DomainNameUtils.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.util.Collection;
import java.util.Locale;
import java.util.regex.Pattern;


/**
 * Utility class for validating domain names and extracting top level domain 
 * names.
 * 
 * @author rana
 *
 */
public class DomainNameUtils {

  /** Matches any single character outside the set 0-9, a-z, '-', '.', '_'. */
  static Pattern           invalidDomainCharactersRegEx = Pattern
                                                            .compile("[^0-9a-z\\-\\._]");
  /** Matches a dotted quad of decimal digit groups (octet ranges are not checked). */
  static Pattern           ipAddressRegEx               = Pattern
                                                            .compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$");
  /** Not referenced inside this class; kept for code outside this class. */
  static Pattern           numericOnly                  = Pattern
                                                            .compile("[0-9]*$");

  /** The maximum length of a Name */
  private static final int MAXNAME                      = 255;

  /** The maximum length of a label in a Name */
  private static final int MAXLABEL                     = 63;

  /** The maximum number of labels in a Name */
  private static final int MAXLABELS                    = 128;

  /**
   * Checks whether the given string is an acceptable host name.
   * 
   * A dotted-quad numeric address is accepted as-is. Any other name must
   * respect the total length, label count and label length limits, contain
   * only the characters 0-9, a-z (case-insensitively), '-', '.' and '_', and
   * must yield a non-null result from {@link #extractRootDomainName(String)}.
   * A single trailing '.' and a single leading '*' are tolerated, mirroring
   * the normalization done by {@link #extractRootDomainName(String)}.
   * 
   * @param name
   *          the name to validate; may be null
   * @return true if the name passes all checks; false otherwise (including
   *         for a null name)
   */
  public static boolean isValidDomainName(String name) {

    if (name == null) {
      return false;
    }

    // check for invalid length (max 255 characters)
    if (name.length() > MAXNAME) {
      return false;
    }

    // Locale.ROOT keeps the ASCII mapping stable: under a Turkish default
    // locale 'I' would lower-case to a dotless i and fail the character check
    String candidate = name.toLowerCase(Locale.ROOT);

    // check to see if this is an ip address
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return true;
    }

    // check for invalid characters anywhere in the name. find() is required
    // here: matches() would only ever reject a one-character string
    if (invalidDomainCharactersRegEx.matcher(
        stripRootDotAndWildcard(candidate)).find()) {
      return false;
    }

    // split into labels
    String[] labels = candidate.split("\\.");

    // check for max labels constraint
    if (labels.length > MAXLABELS) {
      return false;
    }

    // check for max label length constraint
    for (String label : labels) {
      if (label.length() > MAXLABEL) {
        return false;
      }
    }
    return extractRootDomainName(candidate) != null;
  }

  /**
   * Removes one trailing '.' and then, if more than one character remains,
   * one leading '*'.
   */
  private static String stripRootDotAndWildcard(String candidate) {
    if (candidate.endsWith(".")) {
      candidate = candidate.substring(0, candidate.length() - 1);
    }
    if (candidate.startsWith("*") && candidate.length() > 1) {
      candidate = candidate.substring(1);
    }
    return candidate;
  }

  /**
   * Joins parts[rootNameIndex .. parts.length - 1] with '.' separators.
   * 
   * The name is rebuilt from the labels rather than cut from the tail of the
   * original string, because split() drops trailing empty labels and a tail
   * substring would therefore be misaligned for input such as "foo.com..".
   */
  private static String buildRootNameString(String[] parts, int rootNameIndex) {
    StringBuilder rootName = new StringBuilder();
    for (int i = rootNameIndex; i < parts.length; ++i) {
      if (i != rootNameIndex) {
        rootName.append('.');
      }
      rootName.append(parts[i]);
    }
    return rootName.toString();
  }

  /**
   * Extracts the root (registrable) portion of a domain name using the rule
   * set returned by TLDNamesCollection.getSecondaryNames for the name's last
   * label.
   * 
   * The input is not lower-cased here; upper-case letters are treated as
   * invalid characters. One trailing '.' and one leading '*' are removed
   * before the name is examined.
   * 
   * @param candidate
   *          the name to examine; may be null
   * @return the candidate itself if it is a dotted-quad numeric address; the
   *         root domain name if one can be determined; otherwise null
   */
  public static String extractRootDomainName(String candidate) {

    if (candidate == null) {
      return null;
    }

    // special case for ip addresses
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return candidate;
    }

    candidate = stripRootDotAndWildcard(candidate);

    if (candidate.length() == 0
        || invalidDomainCharactersRegEx.matcher(candidate).find()) {
      return null;
    }

    String parts[] = candidate.split("\\.");
    if (parts.length < 2) {
      return null;
    }

    Collection<String> secondaryNames = TLDNamesCollection
        .getSecondaryNames(parts[parts.length - 1]);

    // no rules for this TLD (a null collection is treated like an empty one)
    if (secondaryNames == null || secondaryNames.size() == 0) {
      return null;
    }

    String secondToLast = parts[parts.length - 2];

    // see if second to last part matches secondary names for this TLD
    // or there is a wildcard expression for secondary name in rule set
    if (secondaryNames.contains(secondToLast) || secondaryNames.contains("*")) {
      // ok secondary part is potentially part of secondary name ...

      // check to see the part is not explicitly excluded ...
      if (secondaryNames.contains("!" + secondToLast)) {
        // explicit override: second to last part is NOT part of secondary
        // name
        return buildRootNameString(parts, parts.length - 2);
      }
      // otherwise, we need at least three parts
      if (parts.length >= 3) {
        return buildRootNameString(parts, parts.length - 3);
      }
      return null;
    }

    // second to last part does not match the set of known secondary names;
    // a "*.<part>" rule implies the secondary name has two components
    if (secondaryNames.contains("*." + secondToLast)) {
      if (parts.length < 3) {
        return null;
      }

      // four parts are needed to extract the root unless an exclusion rule
      // applies
      String exclusionRule = "!" + parts[parts.length - 3] + "." + secondToLast;

      if (secondaryNames.contains(exclusionRule)) {
        // third part is NOT part of secondary name
        return buildRootNameString(parts, parts.length - 3);
      }
      // ok extended wildcard matched. we need 4 parts minimum
      if (parts.length >= 4) {
        return buildRootNameString(parts, parts.length - 4);
      }
      return null;
    }

    // at this point ... if the null name exists ...
    if (secondaryNames.contains("")) {
      // return second part as root name
      return buildRootNameString(parts, parts.length - 2);
    }
    return null;
  }

}