/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.Collection; import java.util.StringTokenizer; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.commoncrawl.protocol.URLFP; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.rpc.base.shared.BinaryProtocol; import org.commoncrawl.util.GoogleURL; import org.junit.Assert; import com.google.common.collect.ImmutableMultimap; /** * * @author rana * */ public class URLUtils { private static final Log LOG = LogFactory .getLog(URLUtils.class); /** session id normalizer **/ public static SessionIDURLNormalizer sessionIdNormalizer = new SessionIDURLNormalizer(); /** * canonicalize url * * @param incomingURL * @param stripLeadingWWW * - set to true to string www. 
prefix from the domain if present * @return a canonical representation of the passed in URL that can be safely * used as a replacement for the original url * @throws MalformedURLException */ public static String canonicalizeURL(String incomingURL, boolean stripLeadingWWW) throws MalformedURLException { GoogleURL urlObject = new GoogleURL(incomingURL); if (!urlObject.isValid()) { throw new MalformedURLException("URL:" + incomingURL + " is invalid"); } return canonicalizeURL(urlObject, stripLeadingWWW); } public static String canonicalizeURL(GoogleURL urlObject, boolean stripLeadingWWW) throws MalformedURLException { StringBuilder urlOut = new StringBuilder(); urlOut.append(urlObject.getScheme()); urlOut.append("://"); if (urlObject.getUserName() != GoogleURL.emptyString) { urlOut.append(urlObject.getUserName()); if (urlObject.getPassword() != GoogleURL.emptyString) { urlOut.append(":"); urlOut.append(urlObject.getPassword()); } urlOut.append("@"); } String host = urlObject.getHost(); if (host.endsWith(".")) { host = host.substring(0, host.length() - 1); } if (stripLeadingWWW) { if (host.startsWith("www.")) { // ok now. one nasty hack ... :-( // if root name is null or root name does not equal full host name ... String rootName = extractRootDomainName(host); if (rootName == null || !rootName.equals(host)) { // striping the www. 
prefix host = host.substring(4); } } } urlOut.append(host); if (urlObject.getPort() != GoogleURL.emptyString && !urlObject.getPort().equals("80")) { urlOut.append(":"); urlOut.append(urlObject.getPort()); } if (urlObject.getPath() != GoogleURL.emptyString) { int indexOfSemiColon = urlObject.getPath().indexOf(';'); if (indexOfSemiColon != -1) { urlOut.append(urlObject.getPath().substring(0, indexOfSemiColon)); } else { urlOut.append(urlObject.getPath()); } } if (urlObject.getQuery() != GoogleURL.emptyString) { urlOut.append("?"); urlOut.append(urlObject.getQuery()); } String canonicalizedURL = urlOut.toString(); // phase 2 - remove common session id patterns canonicalizedURL = sessionIdNormalizer.normalize(canonicalizedURL, ""); // phase 3 - stir back in ref if #! if (urlObject.getRef().length() != 0 && urlObject.getRef().charAt(0) == '!') { canonicalizedURL += "#" + urlObject.getRef(); } return canonicalizedURL; } /** * www * * @param a * url * * @return canonical representation of the url that can be used to identify * all possible occurunces of the specified url. This is */ /* * public static String canonicalizeURL(String incomingURL) throws * MalformedURLException { //TODO: Make this thread safe !!! 
synchronized * (URLUtils.class) { //phase 1 String normalizedURL = * URLNormalizer.normalizeString(incomingURL); // phase 2 normalize host name * and remove www * * // get hostname URLUtils.fastGetResult hostNameLoc = * URLUtils.fastGetHostFromURL(normalizedURL); * * if (hostNameLoc != null) { String hostName = * normalizedURL.substring(hostNameLoc.offset,hostNameLoc.offset + * hostNameLoc.length); String normalizedName = * URLUtils.normalizeHostName(hostName); * * String newNormalizedURL = normalizedURL.substring(0,hostNameLoc.offset) + * normalizedName; newNormalizedURL += * normalizedURL.substring(hostNameLoc.offset+hostNameLoc.length); * * // phase 3 - remove common session id patterns normalizedURL = * _sessionIdNormalizer.normalize(newNormalizedURL, ""); * * // return the url return normalizedURL; } return null; } } */ /** * get canonical url fingerprint for the the given url * * @param urlString * @return canonicalized url's fingerprint id */ public static long getCanonicalURLFingerprint(String incomingURL, boolean stripLeadingWWW) throws MalformedURLException { String canonicalURL = canonicalizeURL(incomingURL, stripLeadingWWW); if (canonicalURL != null) { return URLFingerprint.generate64BitURLFPrint(canonicalURL); } return 0; } /** * get a full url fingerprint (domain hash and url fingeprint) for the passed * in url * * @param urlString * @return */ public static URLFP getURLFPFromURL(String urlString, boolean stripLeadingWWW) { try { // canonicalize the incoming url ... String canonicalURL = URLUtils .canonicalizeURL(urlString, stripLeadingWWW); if (canonicalURL != null) { return getURLFPFromCanonicalURL(canonicalURL); } } catch (MalformedURLException e) { } return null; } public static URLFP getURLFPFromURLObject(GoogleURL urlObject) { try { // canonicalize the incoming url ... 
String canonicalURL = URLUtils.canonicalizeURL(urlObject, false); if (canonicalURL != null) { return getURLFPFromCanonicalURL(canonicalURL); } } catch (MalformedURLException e) { } return null; } public static URLFP getURLFPFromCanonicalURL(String canonicalURL) { // get url object GoogleURL urlObject = new GoogleURL(canonicalURL); if (urlObject.isValid()) { // create a url fp record URLFP urlFP = new URLFP(); urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL)); String hostName = urlObject.getHost(); String rootDomainName = URLUtils.extractRootDomainName(hostName); if (hostName != null && rootDomainName != null) { urlFP.setDomainHash(URLFingerprint.generate32BitHostFP(hostName)); urlFP.setRootDomainHash(URLFingerprint .generate32BitHostFP(rootDomainName)); return urlFP; } } LOG.warn("####FAILED TO CANONCALIZER INVALID URL:" + canonicalURL); return null; } /** * get URLFPV2 for a host * */ public static URLFPV2 getURLFPV2FromHost(String host) { return getURLFPV2FromURL("http://" + host + "/"); } /** * get new urlfp from urstring... always string leading www * * @param urlString * @return */ public static URLFPV2 getURLFPV2FromURL(String urlString) { try { // canonicalize the incoming url ... String canonicalURL = URLUtils.canonicalizeURL(urlString, false); if (canonicalURL != null) { return getURLFPV2FromCanonicalURL(canonicalURL); } } catch (MalformedURLException e) { } return null; } public static URLFPV2 getURLFPV2FromURLObject(GoogleURL urlObject) { try { // canonicalize the incoming url ... 
String canonicalURL = URLUtils.canonicalizeURL(urlObject, false); if (canonicalURL != null) { return getURLFPV2FromCanonicalURL(canonicalURL); } } catch (MalformedURLException e) { } return null; } public static URLFPV2 getURLFPV2FromCanonicalURL(String canonicalURL) { // create a url fp record URLFPV2 urlFP = new URLFPV2(); urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL)); String hostName = fastGetHostFromURL(canonicalURL); String rootDomainName = null; if (hostName != null) rootDomainName = URLUtils.extractRootDomainName(hostName); if (hostName != null && rootDomainName != null) { // ok we want to strip the leading www. if necessary if (hostName.startsWith("www.")) { // ok now. one nasty hack ... :-( // if root name does not equal full host name ... if (!rootDomainName.equals(hostName)) { // strip the www. prefix hostName = hostName.substring(4); } } urlFP.setDomainHash(FPGenerator.std64.fp(hostName)); urlFP.setRootDomainHash(FPGenerator.std64.fp(rootDomainName)); return urlFP; } return null; } public static String fastGetHostFromURL(String urlString) { int hostStart = urlString.indexOf(":"); if (hostStart != -1) { hostStart++; int urlLength = urlString.length(); while (hostStart < urlString.length()) { char nextChar = urlString.charAt(hostStart); if (nextChar != '/' && nextChar != '\\' && nextChar != '\n' && nextChar != '\r' && nextChar != '\t' && nextChar != ' ') { break; } hostStart++; } if (hostStart < urlLength) { int hostEnd = hostStart + 1; while (hostEnd < urlLength) { char nextChar = urlString.charAt(hostEnd); if (nextChar == '/' || nextChar == '?' 
|| nextChar == ';' || nextChar == '#') break; hostEnd++; } int indexOfAt = urlString.indexOf("@", hostStart); if (indexOfAt != -1 && indexOfAt < hostEnd) { hostStart = indexOfAt + 1; } String host = urlString.substring(hostStart, hostEnd); int hostLength = host.length(); int colonEnd = host.indexOf(":"); if (colonEnd != -1) { hostLength = colonEnd; host = urlString.substring(hostStart, hostStart + hostLength); } GoogleURL urlObject = new GoogleURL("http://" + host); if (urlObject.isValid()) { return urlObject.getHost(); } } /* * host = host.replaceAll("((%20)|\\s)", ""); * * if (!invalidDomainCharactersRegEx.matcher(host).matches()) { * * if (host.length() >= 1) { if (host.charAt(0) >= '0' && host.charAt(0) * <= '9') { if (numericOnly.matcher(host).matches()) { try { int * ipAddress = (int) Long.parseLong(host); return * IPAddressUtils.IntegerToIPAddressString(ipAddress); } catch * (NumberFormatException e) { return null; } } } } return host; } */ } return null; } public static class fastGetResult { public fastGetResult(int offset, int length) { this.offset = offset; this.length = length; } public int offset; public int length; } public static fastGetResult fastGetHostFromTextURL(byte[] charStream, int offset, int length) { char schemeEnd[] = { ':', '/', '/' }; char at[] = { '@' }; char slash[] = { '/' }; char questionMark[] = { '?' 
}; char hashMark[] = { '#' }; char colon[] = { ':' }; int indexOfSchemeEnd = indexOf(charStream, offset, length, schemeEnd, 0, 3, 0); if (indexOfSchemeEnd != -1) { int hostStart = indexOfSchemeEnd + 3; int lengthRemaining = length - hostStart; int hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, slash, 0, 1, 0); if (hostEnd == -1) { hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, questionMark, 0, 1, 0); } if (hostEnd == -1) { hostEnd = indexOf(charStream, offset + hostStart, lengthRemaining, hashMark, 0, 1, 0); } if (hostEnd != -1) { lengthRemaining = hostEnd; } int indexOfColon = indexOf(charStream, offset + hostStart, lengthRemaining, colon, 0, 1, 0); if (indexOfColon != -1) { lengthRemaining = indexOfColon; } int indexOfAt = indexOf(charStream, offset + hostStart, lengthRemaining, at, 0, 1, 0); if (indexOfAt != -1) { lengthRemaining = lengthRemaining - (indexOfAt + 1); hostStart = hostStart + indexOfAt + 1; } return new fastGetResult(hostStart, lengthRemaining); } return null; } static int indexOf(byte[] source, int sourceOffset, int sourceCount, char[] target, int targetOffset, int targetCount, int fromIndex) { if (fromIndex >= sourceCount) { return (targetCount == 0 ? sourceCount : -1); } if (fromIndex < 0) { fromIndex = 0; } if (targetCount == 0) { return fromIndex; } char first = target[targetOffset]; int max = sourceOffset + (sourceCount - targetCount); for (int i = sourceOffset + fromIndex; i <= max; i++) { /* Look for first character. */ if (source[i] != first) { while (++i <= max && source[i] != first) ; } /* Found first character, now look at the rest of v2 */ if (i <= max) { int j = i + 1; int end = j + targetCount - 1; for (int k = targetOffset + 1; j < end && source[j] == target[k]; j++, k++) ; if (j == end) { /* Found whole string. 
*/ return i - sourceOffset; } } } return -1; } public static String invertHostName(String hostNameIn) { StringBuffer hostNameOut = new StringBuffer(); char tokens[] = hostNameIn.toCharArray(); int lastScanStart = hostNameIn.length() - 1; int currentIndex = lastScanStart; while (currentIndex != -1) { if (tokens[currentIndex] == '.') { if (lastScanStart - currentIndex != 0) { hostNameOut.append(tokens, currentIndex + 1, lastScanStart - currentIndex); if (currentIndex != 0) hostNameOut.append('.'); } lastScanStart = currentIndex - 1; } else if (currentIndex == 0) { if (lastScanStart - currentIndex + 1 != 0) { hostNameOut.append(tokens, 0, lastScanStart + 1); } } currentIndex--; } return hostNameOut.toString(); } public static int invertHostNameFast(byte[] tokens, int offset, int length, byte[] destinationBuffer) { int lastScanStart = offset + length - 1; int currentIndex = lastScanStart; int destinationOffset = 0; while (currentIndex >= offset) { if (tokens[currentIndex] == '.') { if (lastScanStart - currentIndex != 0) { System.arraycopy(tokens, currentIndex + 1, destinationBuffer, destinationOffset, lastScanStart - currentIndex); destinationOffset += (lastScanStart - currentIndex); if (currentIndex != 0) { destinationBuffer[destinationOffset++] = '.'; } } lastScanStart = currentIndex - 1; } else if (currentIndex == offset) { if (lastScanStart - currentIndex + 1 != 0) { System.arraycopy(tokens, offset, destinationBuffer, destinationOffset, (lastScanStart - currentIndex + 1)); destinationOffset += (lastScanStart - currentIndex + 1); } } currentIndex--; } return destinationOffset; } public static String normalizeHostName(String hostName, boolean stripLeadingWWW) { if (ipAddressRegEx.matcher(hostName).matches()) { return hostName; } // we are going to normalize it first , so make a copy String normalizedHostName = hostName.toLowerCase(); // next check for trailing . 
while (normalizedHostName.endsWith(".")) { normalizedHostName = normalizedHostName.substring(0, normalizedHostName .length() - 1); } while (normalizedHostName.startsWith(".")) { normalizedHostName = normalizedHostName.substring(1); } normalizedHostName = normalizedHostName.replaceAll("((%20)|\\s)", ""); if (!invalidDomainCharactersRegEx.matcher(normalizedHostName).matches()) { if (stripLeadingWWW) { String rootName = extractRootDomainName(normalizedHostName); if (rootName != null) { String subDomain = ""; if (rootName.length() != normalizedHostName.length()) { subDomain = normalizedHostName.substring(0, normalizedHostName .length() - rootName.length()); if (subDomain.startsWith("www.")) { normalizedHostName = normalizedHostName.substring(4); } } } } return normalizedHostName; } return null; } public static String getHostNameFromURLKey(Text key) { fastGetResult result = fastGetHostFromTextURL(key.getBytes(), 0, key .getLength()); if (result != null && result.length != 0) { String hostName = new String(key.getBytes(), result.offset, result.length); return hostName; } return null; } private static void testURL(String url) { String hostName = getHostNameFromURLKey(new Text(url)); try { URL urlObject = new URL(url); if (hostName == null) { Assert.assertTrue(urlObject.getHost().length() == 0); } else { Assert.assertTrue(urlObject.getHost().equals(hostName)); } } catch (MalformedURLException e) { Assert.assertTrue(getHostNameFromURLKey(new Text(url)) == null); } } private static int findTLDNameEndLength(byte[] stream, int offset, int length) { boolean foundTLDStartMarker = false; int i = 0; for (i = 0; i < length; ++i) { if (stream[offset + i] == '!' && !foundTLDStartMarker) { foundTLDStartMarker = true; } else if (stream[offset + i] == '.' 
/**
 * Extracts the TLD portion (one, two or three trailing labels) of a host
 * name, driven by the secondary-name rules that TLDNamesCollection returns
 * for the last label.
 *
 * Before matching, one trailing '.' and one leading '*' are removed from the
 * candidate. The rule strings consulted below are: a plain secondary label,
 * "*" (any secondary label), "!label" (exclusion), "*.label" (two-level
 * secondary name), "!label.label" (exclusion for the two-level case) and ""
 * (the bare last label is a TLD on its own).
 *
 * @param candidate
 *          host name to inspect
 * @return "inaddr-arpa.arpa" when the candidate looks like a dotted-quad ip
 *         address; otherwise the trailing labels identified as the TLD; or
 *         null when the candidate is empty, contains a character outside
 *         [0-9a-z-._], has fewer than two labels, or no rule applies
 */
public static String extractTLDName(String candidate) {

  // special case for ip addresses
  if (ipAddressRegEx.matcher(candidate).matches()) {
    return "inaddr-arpa.arpa";
  }

  // drop a single trailing dot
  if (candidate.endsWith(".")) {
    candidate = candidate.substring(0, candidate.length() - 1);
  }

  // drop a leading wildcard marker (e.g. "*.example.com")
  if (candidate.startsWith("*") && candidate.length() > 1) {
    candidate = candidate.substring(1);
  }

  if (candidate.length() != 0) {
    // reject anything containing a character outside the allowed set
    if (!invalidDomainCharactersRegEx.matcher(candidate).find()) {
      String parts[] = candidate.split("\\.");
      if (parts.length >= 2) {
        // rules registered for the last label
        Collection<String> secondaryNames = TLDNamesCollection
            .getSecondaryNames(parts[parts.length - 1]);

        if (secondaryNames.size() != 0) {
          // see if second to last part matches secondary names for this TLD
          // or there is a wildcard expression for secondary name in rule set
          if (secondaryNames.contains(parts[parts.length - 2])
              || secondaryNames.contains("*")) {
            // the second to last part is potentially part of the secondary
            // name ... check that it is not explicitly excluded
            if (secondaryNames.contains("!" + parts[parts.length - 2])) {
              // excluded: the TLD is the last label only
              return buildRootNameString(candidate, parts, parts.length - 1);
            } else {
              // otherwise, the TLD is the last two labels
              return buildRootNameString(candidate, parts, parts.length - 2);
            }
          }
          // second to last part does not match the set of known secondary
          // names
          else {
            // make a wildcard string matching secondary name
            String extendedWildcard = "*." + parts[parts.length - 2];
            // if match, then this implies secondary name has two components
            if (secondaryNames.contains(extendedWildcard)) {
              if (parts.length >= 3) {
                // build the exclusion rule for the two-level case from the
                // third-to-last and second-to-last labels
                String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                    + parts[parts.length - 2];
                // if exclusion rule is present ...
                if (secondaryNames.contains(exclusionRule2)) {
                  // third part is NOT part of the secondary name: the TLD
                  // is the last two labels
                  return buildRootNameString(candidate, parts,
                      parts.length - 2);
                } else {
                  // extended wildcard matched: the last three labels form
                  // the TLD, which is only reported when a fourth label
                  // exists; otherwise control falls through to return null
                  if (parts.length >= 4) {
                    return buildRootNameString(candidate, parts,
                        parts.length - 3);
                  }
                }
              }
            }
            // at this point ... if the empty ("null") name rule exists ...
            else if (secondaryNames.contains("")) {
              // only last item is part of TLD
              return buildRootNameString(candidate, parts, parts.length - 1);
            }
          }
        }
      }
    }
  }
  return null;
}
+ parts[parts.length - 3] + "." + parts[parts.length - 2]; // if exclusion rule is present ... if (secondaryNames.contains(exclusionRule2)) { // third part is NOT part of secondary name return buildRootNameString(candidate, parts, parts.length - 3); } else { // ok extended wildcard matched. we need 4 parts minimum if (parts.length >= 4) { return buildRootNameString(candidate, parts, parts.length - 4); } } } } // at this point ... if the null name exists ... else if (secondaryNames.contains("")) { // return second part as root name return buildRootNameString(candidate, parts, parts.length - 2); } } } } } } return null; } /** The maximum length of a Name */ private static final int MAXNAME = 255; /** The maximum length of a label a Name */ private static final int MAXLABEL = 63; /** The maximum number of labels in a Name */ private static final int MAXLABELS = 128; static Pattern invalidDomainCharactersRegEx = Pattern .compile("[^0-9a-z\\-\\._]"); static Pattern ipAddressRegEx = Pattern .compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$"); static Pattern numericOnly = Pattern .compile("[0-9]*$"); public static boolean isValidDomainName(String name) { // check for invalid length (max 255 characters) if (name.length() > MAXNAME) { return false; } String candidate = name.toLowerCase(); // check to see if this is an ip address if (ipAddressRegEx.matcher(candidate).matches()) { return true; } // check for invalid characters if (invalidDomainCharactersRegEx.matcher(candidate).matches()) { return false; } // split into parts String[] parts = name.split("\\."); // check for max labels constraint if (parts.length > MAXLABELS) { return false; } return extractRootDomainName(candidate) != null; } public static String invertAndMarkTLDNameStartInString(String hostName) { // and invert it ... 
hostName = URLUtils.invertHostName(hostName); // LOG.info("Inverted HostName for Key:" + key.toString() +":" + hostName); // create a buffer StringBuffer tempBuffer = new StringBuffer(hostName.length()); // now walk it skipping tld names StringTokenizer tokenizer = new StringTokenizer(hostName, "."); boolean foundTLDName = false; while (tokenizer.hasMoreElements()) { char delimiterToUse = '.'; String token = tokenizer.nextToken(); if (!foundTLDName) { if (!URLUtils.isTLDStopWord(token)) { foundTLDName = true; delimiterToUse = '!'; } } if (tempBuffer.length() != 0) tempBuffer.append(delimiterToUse); tempBuffer.append(token); } return tempBuffer.toString(); } public static int findTLDNameEndLengthInMarkedString(String markedString) { boolean foundTLDStartMarker = false; int i = 0; for (i = 0; i < markedString.length(); ++i) { if (markedString.charAt(i) == '!' && !foundTLDStartMarker) { foundTLDStartMarker = true; } else if (markedString.charAt(i) == '.' && foundTLDStartMarker) { break; } } return i; } public static int findTLDNameEndLengthInMarkedStream(byte[] stream, int offset, int length) { boolean foundTLDStartMarker = false; int i = 0; for (i = 0; i < length; ++i) { if (stream[offset + i] == '!' && !foundTLDStartMarker) { foundTLDStartMarker = true; } else if (stream[offset + i] == '.' 
&& foundTLDStartMarker) { break; } } return i; } private static void testURLNameInversion(String name) { System.out.println("Inverting name:" + name + " result:" + invertHostName(name)); } private static void testTLDNameDetection(String name) { byte[] bytes = null; try { bytes = name.getBytes("UTF8"); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } int nameLen = findTLDNameEndLength(bytes, 0, bytes.length); System.out.println("TLD Name for:" + name + " is:" + name.substring(0, nameLen)); } private static void testHostNameInvertFast(int offset, String hostName) { byte[] tokenArray = hostName.getBytes(); byte[] duplicate = new byte[tokenArray.length + offset + 4]; System.arraycopy(tokenArray, 0, duplicate, offset, tokenArray.length); byte[] destinationArray = new byte[tokenArray.length + 4]; int destinationBufferSize = invertHostNameFast(duplicate, offset, tokenArray.length, destinationArray); System.out.println("Inverted:" + hostName + " Produced:" + new String(destinationArray, 0, destinationBufferSize)); } static String replicateNameNormalization(String hostNameIn) { // and invert it ... 
/**
 * Comparator for URLFPV2 keys that orders by domain hash first and by url
 * hash second, both for deserialized objects and directly on serialized
 * bytes.
 *
 * NOTE(review): the readers and scratch fingerprints below are instance
 * fields that every raw compare call overwrites, so sharing one instance
 * between threads would presumably make calls interfere - confirm usage.
 */
public static class URLFPV2RawComparator implements RawComparator<URLFPV2> {

  // reusable readers positioned over the two serialized keys
  DataInputBuffer keyReader1 = new DataInputBuffer();
  DataInputBuffer keyReader2 = new DataInputBuffer();

  // scratch objects that receive the hashes decoded from each key
  URLFPV2 fp1 = new URLFPV2();
  URLFPV2 fp2 = new URLFPV2();

  /**
   * Compares two serialized URLFPV2 records without fully deserializing
   * them.
   *
   * A record whose first byte reads as 0 (or -1, i.e. nothing to read) is
   * treated as "old format"; that flag only changes how many bytes are
   * skipped in front of the url hash (2 instead of 1).
   *
   * NOTE(review): the skip counts presumably mirror the field-id encodings
   * written by BinaryProtocol (short vs. vint ids) - confirm against the
   * serializer.
   *
   * @return negative, zero or positive following Long.compareTo of the
   *         domain hashes, falling back to the url hashes on a tie
   * @throws RuntimeException
   *           wrapping any IOException raised while reading the buffers
   */
  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    keyReader1.reset(b1, s1, l1);
    keyReader2.reset(b2, s2, l2);

    try {
      // read first byte of both streams
      int s1FirstByte = keyReader1.read();
      int s2FirstByte = keyReader2.read();

      boolean s1IsOldFormat = false;
      boolean s2IsOldFormat = false;

      // a leading 0 byte (or -1 from read()) marks the old format
      if (s1FirstByte == 0 || s1FirstByte == -1) {
        s1IsOldFormat = true;
      }
      if (s2FirstByte == 0 || s2FirstByte == -1) {
        s2IsOldFormat = true;
      }

      keyReader1.skip(1); // skip next byte
      fp1.setDomainHash(WritableUtils.readVLong(keyReader1));

      keyReader2.skip(1); // skip next byte
      fp2.setDomainHash(WritableUtils.readVLong(keyReader2));

      int result = ((Long) fp1.getDomainHash())
          .compareTo(fp2.getDomainHash());

      // url hashes are only decoded when the domain hashes tie
      if (result == 0) {
        keyReader1.skip((s1IsOldFormat) ? 2 : 1); // id field only
        fp1.setUrlHash(WritableUtils.readVLong(keyReader1));

        keyReader2.skip((s2IsOldFormat) ? 2 : 1); // id field only
        fp2.setUrlHash(WritableUtils.readVLong(keyReader2));

        result = ((Long) fp1.getUrlHash()).compareTo(fp2.getUrlHash());
      }
      return result;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Compares two deserialized fingerprints: domain hash first, url hash as
   * the tie breaker. Note that the parameters shadow the scratch fields of
   * the same name.
   */
  @Override
  public int compare(URLFPV2 fp1, URLFPV2 fp2) {
    int result = ((Long) fp1.getDomainHash()).compareTo(fp2.getDomainHash());
    if (result == 0) {
      result = ((Long) fp1.getUrlHash()).compareTo(fp2.getUrlHash());
    }
    return result;
  }

  /**
   * Self check: builds four fingerprints, asserts the object comparison
   * results, serializes them while alternating
   * BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE between the SHORT and VINT
   * field-id modes, and asserts that the raw comparison agrees.
   *
   * Side effect: the static encoding mode is left at
   * FIELD_ID_ENCODING_MODE_VINT when this method returns.
   */
  static void validateComparator() {
    URLFPV2 fp1 = new URLFPV2();
    URLFPV2 fp2 = new URLFPV2();
    URLFPV2 fp3 = new URLFPV2();
    URLFPV2 fp4 = new URLFPV2();

    // fp1/fp2 share domain 1, fp3/fp4 share domain 2
    fp1.setDomainHash(1L);
    fp2.setDomainHash(1L);
    fp3.setDomainHash(2L);
    fp4.setDomainHash(2L);
    fp1.setUrlHash(10L);
    fp2.setUrlHash(9L);
    fp3.setUrlHash(18L);
    fp4.setUrlHash(20L);

    DataOutputBuffer buffer1 = new DataOutputBuffer();
    DataOutputBuffer buffer2 = new DataOutputBuffer();
    DataOutputBuffer buffer3 = new DataOutputBuffer();
    DataOutputBuffer buffer4 = new DataOutputBuffer();

    URLFPV2RawComparator comparator = new URLFPV2RawComparator();

    // object-level comparisons
    Assert.assertTrue(comparator.compare(fp1, fp2) == 1);
    Assert.assertTrue(comparator.compare(fp2, fp1) == -1);
    Assert.assertTrue(comparator.compare(fp1, fp3) == -1);
    Assert.assertTrue(comparator.compare(fp3, fp1) == 1);
    Assert.assertTrue(comparator.compare(fp4, fp3) == 1);
    Assert.assertTrue(comparator.compare(fp3, fp4) == -1);

    try {
      // fp1 and fp3 are written with SHORT field ids, fp2 and fp4 with VINT
      BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
      fp1.write(buffer1);
      BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
      fp2.write(buffer2);
      BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_SHORT;
      fp3.write(buffer3);
      BinaryProtocol.DEFAULT_PROTOCOL_ENCODING_MODE = BinaryProtocol.FIELD_ID_ENCODING_MODE_VINT;
      fp4.write(buffer4);
    } catch (IOException e) {
      // a failed write is only printed; the raw assertions below still run
      e.printStackTrace();
    }

    // raw (serialized) comparisons across the two encodings
    Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1
        .getLength(), buffer2.getData(), 0, buffer2.getLength()) == 1);
    Assert.assertTrue(comparator.compare(buffer2.getData(), 0, buffer2
        .getLength(), buffer1.getData(), 0, buffer1.getLength()) == -1);
    Assert.assertTrue(comparator.compare(buffer1.getData(), 0, buffer1
        .getLength(), buffer3.getData(), 0, buffer3.getLength()) == -1);
    Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3
        .getLength(), buffer1.getData(), 0, buffer1.getLength()) == 1);
    Assert.assertTrue(comparator.compare(buffer3.getData(), 0, buffer3
        .getLength(), buffer4.getData(), 0, buffer4.getLength()) == -1);
    Assert.assertTrue(comparator.compare(buffer4.getData(), 0, buffer4
        .getLength(), buffer3.getData(), 0, buffer3.getLength()) == 1);
  }
}
"http://www.lakeshorelearning.com/order/onlineOrder.jsp?FOLDER%3C%3Efolder_id=2534374302096766&ASSORTMENT%3C%3East_id=1408474395181113&bmUID=1257311436941"), new CanonicalizationTestCase( "http://www.emeraldinsight.com/Insight/menuNavigation.do;jsessionid=A17FC93E864C2F8B3709F63558BA69DB?hdAction=InsightHome", "http://www.emeraldinsight.com/Insight/menuNavigation.do?hdAction=InsightHome") }; public static void validatateCanonicalization() { for (CanonicalizationTestCase testCase : testCases) { testCase.validate(); } } public static void main(String[] args) { URLFPV2RawComparator.validateComparator(); URLFPV2 fingerprint = getURLFPV2FromURL("http://www.gmail.fr/"); URLFPV2 fingerprint2 = getURLFPV2FromURL("http://gmail.fr/"); Assert.assertTrue(fingerprint.getDomainHash() == fingerprint2 .getDomainHash()); testRootDomainExtractor(); Assert.assertTrue(isValidDomainName("192.168.0.1")); Assert.assertFalse(isValidDomainName("192.168.0.1.1")); Assert.assertTrue(URLUtils.normalizeHostName("192.168.0.1", false).equals( "192.168.0.1")); validatateCanonicalization(); } private static void testRootDomainExtractor() { System.out.println(extractRootDomainName("www.ret.gov.au") + "," + extractTLDName("www.ret.gov.au")); System.out.println(extractRootDomainName("www.jobshop.ro") + "," + extractTLDName("www.jobshop.ro")); System.out.println(extractRootDomainName("www.ne.jp") + "," + extractTLDName("www.ne.jp")); System.out.println(extractRootDomainName("foo.ac.jp") + "," + extractTLDName("foo.ac.jp")); System.out.println(extractRootDomainName("aichi.jp") + "," + extractTLDName("aichi.jp")); System.out.println(extractRootDomainName("bochi.aichi.jp") + "," + extractTLDName("bochi.aichi.jp")); System.out.println(extractRootDomainName("more.bochi.aichi.jp") + "," + extractTLDName("more.bochi.aichi.jp")); System.out.println(extractRootDomainName("metro.tokyo.jp") + "," + extractTLDName("metro.tokyo.jp")); System.out.println(extractRootDomainName("fluff.metro.tokyo.jp") + "," + 
extractTLDName("fluff.metro.tokyo.jp")); System.out.println(extractRootDomainName("www.pref.hokkaido.jp") + "," + extractTLDName("www.pref.hokkaido.jp")); System.out.println(extractRootDomainName("www.subdomain.pref2.hokkaido.jp") + "," + extractTLDName("www.subdomain.pref2.hokkaido.jp")); System.out.println(extractRootDomainName("gigaom.com") + "," + extractTLDName("gigaom.com")); System.out.println(extractRootDomainName("www.gigaom.com.cn") + "," + extractTLDName("www.gigaom.com.cn")); System.out.println(extractRootDomainName("www.foobar.idf.il") + "," + extractTLDName("www.foobar.idf.il")); System.out.println(extractRootDomainName("192.168.0.1") + "," + extractTLDName("192.168.0.1")); Assert .assertTrue(extractRootDomainName(".gigaom.com").equals("gigaom.com")); Assert.assertTrue(extractRootDomainName("*.gigaom.com") .equals("gigaom.com")); Assert.assertTrue(extractRootDomainName("www.gigaom.com").equals( "gigaom.com")); Assert.assertTrue(extractRootDomainName("foobar.foo.cn").equals("foo.cn")); Assert.assertTrue(extractRootDomainName("foobar.google.com.cn").equals( "google.com.cn")); Assert.assertTrue(extractRootDomainName("google.com.cn").equals( "google.com.cn")); Assert.assertTrue(extractRootDomainName("cn") == null); Assert.assertTrue(extractRootDomainName("ab.ca") == null); Assert.assertTrue(extractRootDomainName("somedomain.ab.ca").equals( "somedomain.ab.ca")); Assert.assertTrue(extractRootDomainName("www.somedomain.ab.ca").equals( "somedomain.ab.ca")); Assert.assertTrue(extractRootDomainName("www.somedomain .ab.ca") == null); } private static void utilsTest() throws Exception { /* * testHostNameInvertFast(4,"www.google.com"); * testHostNameInvertFast(4,"google.com."); * * System.out.println(invertHostName("news.bbc.co.uk.")); * System.out.println(invertHostName("www.zubia-alam.blogspot.com")+"."); * System.out.println(invertHostName("zubia-alam.blogspot.com")+"."); * System.out.println("compareTo returned:" + * (invertHostName("zubia-alam.blogspot.com" * 
)+".").compareTo((invertHostName("zubia-alam.blogspot.com")))); * System.out.println("compareTo returned:" + * "x-factor-e.".compareTo("x-factor.")); * * testHostNameInvertFast(4,"www.google.com"); * testHostNameInvertFast(4,"google.com."); * testHostNameInvertFast(4,".google.com"); * testHostNameInvertFast(4,"google.com."); * * testHostNameInvertFast(4,"www.google.com"); * testHostNameInvertFast(4,"google.com."); * testHostNameInvertFast(4,".google.com"); * testHostNameInvertFast(4,"google.com."); */ Assert.assertTrue(normalizeHostName(".gigaom. com", true).equals( "gigaom.com")); Assert.assertTrue(normalizeHostName("%20gigaom.com", true).equals( "gigaom.com")); Assert.assertTrue(normalizeHostName("www.gigaom.com", true).equals( "gigaom.com")); Assert.assertTrue(normalizeHostName("www.gigaom.com.", true).equals( "gigaom.com")); Assert.assertTrue(normalizeHostName("www.gigaom.com", false).equals( "www.gigaom.com")); Assert.assertTrue(normalizeHostName("www.gigaom.com.", false).equals( "www.gigaom.com")); Assert.assertTrue(normalizeHostName(".com.", true).equals("com")); Assert.assertTrue(normalizeHostName("..gigaom.com..", true).equals( "gigaom.com")); Assert.assertTrue(normalizeHostName("aisa.org.af.", true).equals( "aisa.org.af")); /* * testTLDNameDetection("com!google.www"); * testTLDNameDetection("au.com!google.www"); * testTLDNameDetection("com.google.www"); * * testURLNameInversion("www.google.com"); * testURLNameInversion("google.com."); testURLNameInversion(".google.com"); * testURLNameInversion("google.com."); * * testURLNameInversion(invertHostName("www.google.com")); * testURLNameInversion(invertHostName("google.com.")); * testURLNameInversion(invertHostName(".google.com")); * testURLNameInversion(invertHostName("google.com.")); * * testURL("http://www.google.com/"); testURL("http://google.com:8080/"); * testURL("http://google.com:8080"); testURL("http://google.com"); * testURL("http://ahad@google.com"); * testURL("http://ahad:password@google.com"); * 
testURL("http://ahad:password@google.com/"); testURL("http:///"); */ } }