/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.junit.Test;
/**
* helper routines dealing with url normalization
*
* @author rana
*/
public class URLNormalize {
public static String wwwNormalizeHost(String url) {
if (url.startsWith("www.") || url.startsWith("WWW.")) {
return url.substring(4);
}
return url;
}
public static String wwwNormalize(String url) {
return tweakURL(true, url);
}
public static String wwwDeNormalize(String url) {
return tweakURL(false, url);
}
public static boolean isWWWNormalized(String url) {
return url.startsWith("+");
}
public static String stripNormalizationMetadata(String url) {
if (isWWWNormalized(url)) {
return url.substring(1);
}
return url;
}
private static final String tweakURL(boolean normalize,String url) {
boolean modify = false;
if (!normalize && url.startsWith("+")) {
modify = true;
url = url.substring(1);
}
if (modify || normalize) {
GoogleURL urlObject = new GoogleURL(url);
if (urlObject.isValid()) {
if (normalize && (urlObject.getHost().startsWith("www.") || urlObject.getHost().startsWith("WWW."))) {
modify = true;
}
if (modify) {
StringBuilder urlOut = new StringBuilder();
if (normalize) {
urlOut.append("+");
}
urlOut.append(urlObject.getScheme());
urlOut.append("://");
if (urlObject.getUserName() != GoogleURL.emptyString) {
urlOut.append(urlObject.getUserName());
if (urlObject.getPassword() != GoogleURL.emptyString) {
urlOut.append(":");
urlOut.append(urlObject.getPassword());
}
urlOut.append("@");
}
if (normalize) {
urlOut.append(urlObject.getHost().substring(4));
}
else {
urlOut.append("www.");
urlOut.append(urlObject.getHost());
}
if (urlObject.getPort() != GoogleURL.emptyString) {
urlOut.append(":");
urlOut.append(urlObject.getPort());
}
if (urlObject.getPath() != GoogleURL.emptyString) {
urlOut.append(urlObject.getPath());
}
if (urlObject.getQuery() != GoogleURL.emptyString) {
urlOut.append("?");
urlOut.append(urlObject.getQuery());
}
if (urlObject.getRef() != GoogleURL.emptyString) {
urlOut.append("#");
urlOut.append(urlObject.getRef());
}
return urlOut.toString();
}
}
}
return url;
}
public static class WWWNormalizedURLComparator extends WritableComparator {
public WWWNormalizedURLComparator() {
super(Text.class,true);
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
// adjust for the + metadata symbol in source and target urls ...
if (b1[s1] == '+') {
s1 += 1;
l1 -= 1;
}
if (b2[s2] == '+') {
s2 += 1;
l2 -= 1;
}
return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
}
}
@Test
public void testname() throws Exception {
String preNormalizedURL = "http:///ahad:password@www.google.com/././../.../foobar http://foobarz.com/foobarz{1}?FOO=10#1282838383838";
System.out.println("Pre-Normalized URL:"+ preNormalizedURL);
System.out.println("Normalized URL:"+ wwwNormalize(preNormalizedURL));
System.out.println("Stripping Normalized Metadata for Normalized URL:"+ stripNormalizationMetadata(wwwNormalize(preNormalizedURL)));
System.out.println("DeNormalized URL:"+ wwwDeNormalize(wwwNormalize(preNormalizedURL)));
}
}