package org.archive.url;
import java.net.IDN;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.net.InetAddresses;
public class GoogleURLCanonicalizer implements URLCanonicalizer {
// Pattern OCTAL_IP = Pattern.compile("^(0[0-7]*)(\\.[0-7]+)*$");
// Pattern DECIMAL_IP = Pattern.compile("^([0-9]+)(\\.[0-9]+)*$");
Pattern OCTAL_IP = Pattern.compile("^(0[0-7]*)(\\.[0-7]+)?(\\.[0-7]+)?(\\.[0-7]+)?$");
Pattern DECIMAL_IP = Pattern.compile("^([1-9][0-9]*)(\\.[0-9]+)?(\\.[0-9]+)?(\\.[0-9]+)?$");
public void canonicalize(HandyURL url) {
url.setHash(null);
url.setAuthUser(minimalEscape(url.getAuthUser()));
url.setAuthPass(minimalEscape(url.getAuthPass()));
url.setQuery(minimalEscape(url.getQuery()));
String hostE = unescapeRepeatedly(url.getHost());
String host = null;
if (hostE != null) {
try {
host = IDN.toASCII(hostE);
} catch(IllegalArgumentException e) {
if(!e.getMessage().contains("A prohibited code point was found")) {
// TODO: What to do???
// throw e;
}
host = hostE;
}
host = host.replaceAll("^\\.+", "").
replaceAll("\\.\\.+", ".").
replaceAll("\\.$", "");
}
String ip = null;
// try {
ip = attemptIPFormats(host);
// } catch (URIException e) {
// e.printStackTrace();
// }
if(ip != null) {
host = ip;
} else if (host != null) {
host = escapeOnce(host.toLowerCase());
}
url.setHost(host);
// now the path:
String path = unescapeRepeatedly(url.getPath());
url.setPath(escapeOnce(normalizePath(path)));
}
private static final Pattern SINGLE_FORWARDSLASH_PATTERN =
Pattern.compile("/");
public String normalizePath(String path) {
if(path == null) {
path = "/";
} else {
// -1 gives an empty trailing element if path ends with '/':
String[] paths = SINGLE_FORWARDSLASH_PATTERN.split(path,-1);
ArrayList<String> keptPaths = new ArrayList<String>();
boolean first = true;
for(String p : paths) {
if(first) {
first = false;
continue;
} else if(p.compareTo(".") == 0) {
// skip
continue;
} else if(p.compareTo("..") == 0) {
// pop the last path, if present:
if(keptPaths.size() > 0) {
keptPaths.remove(keptPaths.size()-1);
} else {
// TODO: leave it? let's do for now...
keptPaths.add(p);
}
} else {
keptPaths.add(p);
}
}
int numKept = keptPaths.size();
if(numKept == 0) {
path = "/";
} else {
StringBuilder sb = new StringBuilder();
sb.append("/");
for(int i = 0; i < numKept - 1; i++) {
String p = keptPaths.get(i);
if(p.length() > 0) {
// this will omit multiple slashes:
sb.append(p).append("/");
}
}
sb.append(keptPaths.get(numKept-1));
path = sb.toString();
}
}
return path;
}
public String attemptIPFormats(String host) { //throws URIException {
if(host == null) {
return null;
}
if(host.matches("^\\d+$")) {
try {
Long l = Long.parseLong(host);
return InetAddresses.fromInteger(l.intValue()).getHostAddress();
} catch(NumberFormatException e) {
}
} else {
// check for octal:
Matcher m = OCTAL_IP.matcher(host);
if(m.matches()) {
int parts = m.groupCount();
if(parts > 4) {
// WHAT TO DO?
return null;
// throw new URIException("Bad Host("+host+")");
}
int[] ip = new int[]{0,0,0,0};
for(int i=0; i < parts; i++) {
int octet = Integer.parseInt(m.group(i+1).substring((i==0)?0:1),8);
if((octet < 0) || (octet > 255)) {
return null;
// throw new URIException("Bad Host("+host+")");
}
ip[i] = octet;
}
return String.format("%d.%d.%d.%d",ip[0],ip[1],ip[2],ip[3]);
} else {
Matcher m2 = DECIMAL_IP.matcher(host);
if(m2.matches()) {
int parts = m2.groupCount();
if(parts > 4) {
// WHAT TO DO?
return null;
// throw new URIException("Bad Host("+host+")");
}
int[] ip = new int[]{0,0,0,0};
for(int i=0; i < parts; i++) {
String m2Group = m2.group(i+1);
if(m2Group == null)
return null;
//int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1));
int octet;
try {
octet = Integer.parseInt(m2Group.substring((i==0)?0:1));
} catch (NumberFormatException e){
return null;
}
if((octet < 0) || (octet > 255)) {
return null;
// throw new URIException("Bad Host("+host+")");
}
ip[i] = octet;
}
return String.format("%d.%d.%d.%d",ip[0],ip[1],ip[2],ip[3]);
}
}
}
return null;
}
public String minimalEscape(String input) {
return escapeOnce(unescapeRepeatedly(input));
}
public String escapeOnce(String input) {
if(input == null) {
return null;
}
StringBuilder sb = null;
int len = input.length();
boolean ok = false;;
for(int i = 0; i < len; i++) {
char c = input.charAt(i);
ok = false;
if(c > 32) {
if(c < 128) {
if(c != '#') {
ok = (c != '%');
}
}
}
if(ok) {
if(sb != null) {
sb.append(c);
}
} else {
if(sb == null) {
sb = new StringBuilder(input.substring(0,i));
// } else {
// // BUGBUG: What about chars > 255?!
// sb.append('%').append(Integer.toHexString(c).toUpperCase());
// }
}
// BUGBUG: What about chars > 255?!
sb.append("%");
String hex = Integer.toHexString(c).toUpperCase();
if(hex.length() == 1) {
sb.append('0');
}
sb.append(hex);
}
}
if(sb == null) {
return input;
}
return sb.toString();
}
public String unescapeRepeatedly(String input) {
if(input == null) {
return null;
}
while(true) {
String un = decode(input);
if(un.compareTo(input) == 0) {
return input;
}
input = un;
}
}
public String decode(String input) {
int len = input.length();
int i = 0;
StringBuilder sb = null;
boolean foundHex = false;
while(i < len-2) {
char c = input.charAt(i);
foundHex = false;
if(c == '%') {
// are next two hex chars?
int h1 = getHex(input.charAt(i+1));
if(h1 > -1) {
int h2 = getHex(input.charAt(i+2));
if(h2 > -1) {
if(sb == null) {
sb = new StringBuilder(len);
if(i > 0) {
sb.append(input.substring(0,i));
}
}
foundHex = true;
i += 2;
char f = (char) ((h1 << 4) + h2);
sb.append(f);
}
}
}
if(!foundHex) {
if(sb != null) {
sb.append(c);
}
}
i++;
}
if(sb == null) {
return input;
}
// append the last chars if missed:
for(int i2 = i; i2 < len; i2++) {
sb.append(input.charAt(i2));
}
return sb.toString();
}
public int getHex(final char c) {
if(c < '0') {
return -1;
}
if(c <= '9') {
return c - '0';
}
if(c < 'A') {
return -1;
}
if(c <= 'F') {
return 10 + (c - 'A');
}
if(c < 'a') {
return -1;
}
if(c <= 'f') {
return 10 + (c - 'a');
}
return -1;
}
}