/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.util.Collection;
import java.util.Locale;
import java.util.regex.Pattern;
/**
* Utility class for validating domain names and extracting top level domain
* names.
*
* @author rana
*
*/
public class DomainNameUtils {

  /**
   * Matches any single character outside the set accepted in a name:
   * digits, lower-case ASCII letters, '-', '.' and '_'. Use
   * {@code find()} to detect an offending character anywhere in a string.
   */
  static final Pattern invalidDomainCharactersRegEx = Pattern
      .compile("[^0-9a-z\\-\\._]");

  /**
   * Matches four dot-separated runs of digits. The numeric range of each
   * run is not checked.
   */
  static final Pattern ipAddressRegEx = Pattern
      .compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$");

  /**
   * Zero or more digits anchored at the end of the input. Not referenced
   * inside this class; retained because it is package-visible.
   */
  static final Pattern numericOnly = Pattern
      .compile("[0-9]*$");

  /** The maximum length of a Name */
  private static final int MAXNAME = 255;

  /** The maximum length of a label in a Name */
  private static final int MAXLABEL = 63;

  /** The maximum number of labels in a Name */
  private static final int MAXLABELS = 128;

  /**
   * Checks whether the given string is acceptable as a domain name.
   *
   * <p>
   * A name is accepted when it is a dotted quad of digits, or when it is at
   * most {@value #MAXNAME} characters long, contains only allowed characters
   * after lower-casing, has at most {@value #MAXLABELS} labels of at most
   * {@value #MAXLABEL} characters each, and
   * {@link #extractRootDomainName(String)} can derive a root name from it.
   * One leading '*' is tolerated because the root name extraction strips
   * it as well.
   *
   * @param name
   *          the name to validate; may be null
   * @return true if the name is considered valid, false otherwise (including
   *         for null input)
   */
  public static boolean isValidDomainName(String name) {
    // reject missing input and names above the overall length limit
    if (name == null || name.length() > MAXNAME) {
      return false;
    }

    // Lower-case with a fixed locale. The default-locale variant maps 'I' to
    // a dotless i under Turkish/Azeri locales, which the ASCII-only character
    // check below would then reject.
    String candidate = name.toLowerCase(Locale.ROOT);

    // anything shaped like an ip address is accepted as-is
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return true;
    }

    // extractRootDomainName drops one leading '*', so it must not count as
    // an invalid character or towards the first label's length here
    String checked = candidate;
    if (checked.startsWith("*") && checked.length() > 1) {
      checked = checked.substring(1);
    }

    // find(), not matches(): a single offending character anywhere in the
    // name is enough to reject it
    if (invalidDomainCharactersRegEx.matcher(checked).find()) {
      return false;
    }

    // enforce label count and per-label length limits
    String[] labels = checked.split("\\.");
    if (labels.length > MAXLABELS) {
      return false;
    }
    for (String label : labels) {
      if (label.length() > MAXLABEL) {
        return false;
      }
    }

    return extractRootDomainName(candidate) != null;
  }

  /**
   * Returns the tail of {@code candidateString} made up of the labels from
   * {@code rootNameIndex} through the last label, joined by their dots.
   *
   * @param candidateString
   *          the full name that {@code parts} was split from
   * @param parts
   *          the labels of {@code candidateString}
   * @param rootNameIndex
   *          index into {@code parts} of the first label to keep
   * @return the suffix of {@code candidateString} covering the kept labels
   */
  private static String buildRootNameString(String candidateString,
      String[] parts, int rootNameIndex) {
    // start with one separating dot between each pair of kept labels
    int rootNameLength = parts.length - rootNameIndex - 1;
    for (int i = rootNameIndex; i < parts.length; ++i) {
      rootNameLength += parts[i].length();
    }
    return candidateString.substring(candidateString.length() - rootNameLength);
  }

  /**
   * Extracts the root domain name from a host name using the secondary name
   * rules registered for its top level domain in {@code TLDNamesCollection}.
   *
   * <p>
   * A dotted quad of digits is returned unchanged. Otherwise one trailing '.'
   * and one leading '*' are removed, and the rule set for the last label is
   * consulted with the second-to-last label {@code L2}:
   * <ul>
   * <li>{@code L2} or "*" is listed: the root is the last two labels if
   * "!L2" is also listed, otherwise the last three labels;</li>
   * <li>"*.L2" is listed: the root is the last three labels if
   * "!L3.L2" is also listed, otherwise the last four labels;</li>
   * <li>the empty name is listed: the root is the last two labels.</li>
   * </ul>
   * The input is not lower-cased here, so upper-case letters count as invalid
   * characters.
   *
   * @param candidate
   *          the host name to examine; may be null
   * @return the root domain name, or null if the input is null, empty,
   *         contains invalid characters, has too few labels for the matching
   *         rule, or no rule applies
   */
  public static String extractRootDomainName(String candidate) {
    if (candidate == null) {
      return null;
    }

    // special case for ip addresses
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return candidate;
    }

    // normalize: drop one trailing dot, then one leading wildcard marker
    String name = candidate;
    if (name.endsWith(".")) {
      name = name.substring(0, name.length() - 1);
    }
    if (name.startsWith("*") && name.length() > 1) {
      name = name.substring(1);
    }

    if (name.length() == 0
        || invalidDomainCharactersRegEx.matcher(name).find()) {
      return null;
    }

    String[] parts = name.split("\\.");
    if (parts.length < 2) {
      return null;
    }

    int count = parts.length;
    String secondLevel = parts[count - 2];

    Collection<String> secondaryNames = TLDNamesCollection
        .getSecondaryNames(parts[count - 1]);
    // NOTE(review): the null check is defensive; whether getSecondaryNames can
    // return null for an unknown TLD is not visible from this file.
    if (secondaryNames == null || secondaryNames.isEmpty()) {
      return null;
    }

    // second to last part matches a secondary name for this TLD, or the rule
    // set holds a wildcard for the secondary name
    if (secondaryNames.contains(secondLevel) || secondaryNames.contains("*")) {
      // an explicit exclusion means the second to last part is NOT part of
      // the secondary name
      if (secondaryNames.contains("!" + secondLevel)) {
        return buildRootNameString(name, parts, count - 2);
      }
      // otherwise we need at least three parts
      return count >= 3 ? buildRootNameString(name, parts, count - 3) : null;
    }

    // a "*.<second level>" rule implies a secondary name with two components
    if (secondaryNames.contains("*." + secondLevel)) {
      if (count < 3) {
        return null;
      }
      // the exclusion rule takes the third to last part out of the
      // secondary name again
      String exclusionRule = "!" + parts[count - 3] + "." + secondLevel;
      if (secondaryNames.contains(exclusionRule)) {
        return buildRootNameString(name, parts, count - 3);
      }
      // extended wildcard matched: four parts minimum
      return count >= 4 ? buildRootNameString(name, parts, count - 4) : null;
    }

    // the null (empty) name is registered: the last two parts form the root
    if (secondaryNames.contains("")) {
      return buildRootNameString(name, parts, count - 2);
    }

    return null;
  }
}