/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.util.url;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.util.ByteOp;

/**
 * Class that performs the standard Heritrix URL canonicalization. Eventually,
 * this should all be configurable, or perhaps be able to read the settings
 * used within a Heritrix crawler... or even multiple crawlers... this is hard.
 *
 * <p>Canonicalization happens in two stages: {@link #canonicalize(String)}
 * lower-cases the URL and strips userinfo, a leading "www" host label and a
 * number of well-known session-id patterns; {@link #urlStringToKey(String)}
 * then applies any configured {@link CanonicalizationRule}s and rebuilds a
 * scheme-less lookup key from the parsed URI.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class AggressiveUrlCanonicalizer implements UrlCanonicalizer {

	private static final Logger LOGGER = Logger.getLogger(
			AggressiveUrlCanonicalizer.class.getName());

	/** Prefix of a CDX header line; such lines pass through untouched with -cdx. */
	private static final String CDX_PREFIX = " CDX ";

	/*
	 * Each STRIP_*_REGEX below is built so that capture group 1 is exactly
	 * the text to delete from the URL. Each STRIP_*_CHOOSER is a cheap
	 * substring test performed first, so the regex is only run on URLs
	 * that could possibly match. Choosers are lower case because the URL
	 * is lower-cased before any of them is consulted.
	 */

	/**
	 * Strip leading 'www.' (optionally followed by digits, e.g. 'www3.').
	 * Only matches http/https URLs that have at least one character after
	 * the first '/' following the host.
	 */
	private static final Pattern STRIP_WWW_REGEX =
		Pattern.compile("(?i)^(?:https?://)(www[0-9]*\\.)(?:[^/]*/.+)$");
	private static final String STRIP_WWW_CHOOSER = "/www";

//	/**
//	 * Strip leading 'www44.', 'www3.', etc.
//	 */
//	private static final Pattern STRIP_WWWN_REGEX =
//		Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");

	/**
	 * Strip userinfo (everything up to and including the '@' in the
	 * authority) from http, https, ftp and ftps URLs.
	 */
	private static final Pattern STRIP_USERINFO_REGEX =
		Pattern.compile("^(?:(?:(?:https?)|(?:ftps?))://)([^/]+@)(?:.*)$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_USERINFO_CHOOSER = "@";

	/**
	 * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
	 */
	private static final Pattern STRIP_PHPSESSION_ID_REGEX =
		Pattern.compile("^(?:.+)(phpsessid=" +
				"[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE);
	private static final String STRIP_PHPSESSION_ID_CHOOSER = "phpsessid=";

	/**
	 * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
	 */
	private static final Pattern STRIP_JSESSION_ID_REGEX =
		Pattern.compile("^.*(jsessionid=[0-9a-zA-Z]{32}&?).*$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_JSESSION_ID_CHOOSER = "jsessionid=";

	/**
	 * Example: sid=9682993c8daa2c5497996114facdc805.
	 * 'sid=' can be tricky but all sid= followed by 32 byte string
	 * so far seen have been session ids.  Sid is a 32 byte string
	 * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
	 * so have to have it run after the phpsessid elimination.
	 */
	private static final Pattern STRIP_SID_REGEX =
		Pattern.compile("^(?:.+)" +
				"(sid=[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE);
	private static final String STRIP_SID_CHOOSER = "sid=";

	/**
	 * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
	 */
	private static final Pattern STRIP_ASPSESSION_REGEX =
		Pattern.compile("^(?:.+)" +
				"(ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24}&?)(?:(?:.*))?$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_ASPSESSION_CHOOSER = "aspsessionid";

	/**
	 * Strips a cookieless ASP.NET session path segment of the bare
	 * (.NET 1.0/1.1) form, i.e. 24 alphanumerics in one pair of parentheses:
	 *
	 * http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
	 *   => http://legislature.mi.gov/mileg.aspx
	 *
	 * The nested (.NET 2.0) form, e.g.
	 * http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx,
	 * is not matched by this pattern; it is matched by
	 * STRIP_ASPSESSION3_REGEX instead.
	 *
	 * For more info, see:
	 * http://msdn2.microsoft.com/en-us/library/aa479315.aspx
	 */
	private static final Pattern STRIP_ASPSESSION2_REGEX =
		Pattern.compile(".*/(\\([0-9a-z]{24}\\)/)(?:[^\\?]+\\.aspx.*)$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_ASPSESSION2_CHOOSER = ".aspx";

	/**
	 * Strips a cookieless ASP.NET (.NET 2.0) path segment made of one or
	 * more letter-tagged 24 character tokens inside an outer pair of
	 * parentheses. Because capture group 1 spans the whole parenthesised
	 * segment, the entire segment is removed:
	 *
	 * http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
	 *   => http://legislature.mi.gov/mileg.aspx?page=SessionSchedules
	 *
	 * NOTE(review): earlier documentation claimed only the S(...) token is
	 * dropped and the a(...)/f(...) tokens are kept; the pattern as written
	 * does not do that.
	 *
	 * For more info, see:
	 * http://msdn2.microsoft.com/en-us/library/aa479315.aspx
	 */
	private static final Pattern STRIP_ASPSESSION3_REGEX =
		Pattern.compile(".*/(\\((?:[a-z]\\([0-9a-z]{24}\\))+\\)/)[^\\?]+\\.aspx.*$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_ASPSESSION3_CHOOSER = ".aspx";

	/**
	 * Strip ColdFusion session IDs. Remove sessionids that look like the
	 * following:
	 * CFID=12412453&CFTOKEN=15501799
	 * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
	 */
	private static final Pattern STRIP_CFSESSION_REGEX =
		Pattern.compile(".+(cfid=[^&]+&cftoken=[^&]+(?:&jsessionid=[^&]+)?&?).*$",
				Pattern.CASE_INSENSITIVE);
	private static final String STRIP_CFSESSION_CHOOSER = "cftoken=";

	/**
	 * Chooser substrings, index-aligned with {@link #strippers}. The order
	 * matters: the 'sid=' rule must come after the 'phpsessid=' rule.
	 */
	private static final String[] choosers = {
		STRIP_USERINFO_CHOOSER,
		STRIP_WWW_CHOOSER,
		STRIP_PHPSESSION_ID_CHOOSER,
		STRIP_JSESSION_ID_CHOOSER,
		STRIP_ASPSESSION_CHOOSER,
		STRIP_ASPSESSION2_CHOOSER,
		STRIP_ASPSESSION3_CHOOSER,
		STRIP_SID_CHOOSER,
		STRIP_CFSESSION_CHOOSER
	};

	/** Strip patterns, index-aligned with {@link #choosers}. */
	private static final Pattern[] strippers = {
		STRIP_USERINFO_REGEX,
		STRIP_WWW_REGEX,
		STRIP_PHPSESSION_ID_REGEX,
		STRIP_JSESSION_ID_REGEX,
		STRIP_ASPSESSION_REGEX,
		STRIP_ASPSESSION2_REGEX,
		STRIP_ASPSESSION3_REGEX,
		STRIP_SID_REGEX,
		STRIP_CFSESSION_REGEX
	};

	/** Custom rules applied by urlStringToKey() before the core rules. */
	private List<CanonicalizationRule> processingRules =
		new ArrayList<CanonicalizationRule>();

	/**
	 * @return the custom rules applied, in list order, before the core
	 * canonicalization in {@link #urlStringToKey(String)}
	 */
	public List<CanonicalizationRule> getProcessingRules() {
		return processingRules;
	}

	/**
	 * @param processingRules custom rules to apply, in list order, before
	 * the core canonicalization; null is treated as "no custom rules"
	 */
	public void setProcessingRules(List<CanonicalizationRule> processingRules) {
		this.processingRules = processingRules;
	}

	/**
	 * Run a regex against a StringBuilder, removing group 1 if it matches.
	 *
	 * Assumes the regex has a form that wants to strip elements of the passed
	 * string.  Assumes that if a match, group 1 should be removed
	 * @param url Url to search in.
	 * @param matcher Matcher whose form yields a group to remove; may be
	 * null, in which case nothing is done
	 * @return true if the StringBuilder was modified
	 */
	protected boolean doStripRegexMatch(StringBuilder url, Matcher matcher) {
		if (matcher != null && matcher.matches()) {
			url.delete(matcher.start(1), matcher.end(1));
			return true;
		}
		return false;
	}

	/**
	 * Convert a URL string into the key form produced by this canonicalizer.
	 *
	 * <p>URLs starting with "dns:" are returned unchanged. Otherwise the URL
	 * is passed through {@link #canonicalize(String)}, given the default
	 * HTTP scheme if it has none and a "/" if it contains none after the
	 * scheme, run through the custom processing rules, and finally rebuilt
	 * as host basename, non-default port, path and query - without the
	 * scheme.
	 *
	 * @param urlString URL to convert; must not be null
	 * @return the key, or the partially processed URL if it cannot be parsed
	 * because of a StringIndexOutOfBoundsException
	 * @throws URIException if the URL cannot be turned into a UsableURI
	 */
	public String urlStringToKey(final String urlString) throws URIException {

		if (urlString.startsWith("dns:")) {
			return urlString;
		}
		String searchUrl = canonicalize(urlString);
		String scheme = UrlOperations.urlToScheme(searchUrl);
		if (scheme != null) {
			searchUrl = searchUrl.substring(scheme.length());
		} else {
			scheme = UrlOperations.HTTP_SCHEME;
		}

		// make sure there is at least a root path before parsing:
		if (-1 == searchUrl.indexOf("/")) {
			searchUrl = scheme + searchUrl + "/";
		} else {
			searchUrl = scheme + searchUrl;
		}

		// Custom rules (a null list simply means there are none):
		List<CanonicalizationRule> rules = getProcessingRules();
		if (rules != null) {
			for (CanonicalizationRule rule : rules) {
				searchUrl = rule.processIfMatches(
						new CanonicalizationInput(searchUrl));
			}
		}

		// Core rules

		// TODO: These next few lines look crazy -- need to be reworked.. This
		// was the only easy way I could find to get the correct unescaping
		// out of UsableURIs, possible a bug. Definitely needs some TLC in any
		// case, as building UsableURIs is *not* a cheap operation.

		// unescape anything that can be:
		UsableURI tmpURI = null;
		try {
			tmpURI = UsableURIFactory.getInstance(searchUrl);
		} catch (StringIndexOutOfBoundsException e) {
			LOGGER.warning(e.getMessage() + ": " + searchUrl);
			return searchUrl;
//		} catch(URIException e) {
//			LOGGER.warning(e.getMessage() + ": " + searchUrl);
//			return searchUrl;
		}
		tmpURI.setPath(tmpURI.getPath());

		// convert to UsableURI to perform required URI fixup:
		UsableURI searchURI = UsableURIFactory.getInstance(tmpURI.getURI());

		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
		String newPath = searchURI.getEscapedPath().replace("%20", "+");

		// replace multiple consecutive '/'s in the path.
		while (newPath.contains("//")) {
			newPath = newPath.replace("//", "/");
		}

		// this would remove trailing a '/' character, unless the path is empty
		// but we're not going to do this just yet..
//		if((newPath.length() > 1) && newPath.endsWith("/")) {
//			newPath = newPath.substring(0,newPath.length()-1);
//		}

		StringBuilder sb = new StringBuilder(searchUrl.length());
		sb.append(searchURI.getHostBasename());

		// omit port if scheme default:
		int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme);
		int port = searchURI.getPort();
		if (port != defaultSchemePort && port != -1) {
			sb.append(":").append(port);
		}

		sb.append(newPath);
		if (searchURI.getEscapedQuery() != null) {
			sb.append("?").append(searchURI.getEscapedQuery());
		}

		return sb.toString();
	}

	/**
	 * Operation, intended to be idempotent, that will determine the
	 * 'fuzziest' form of the url argument. This operation is done prior to
	 * adding records to the ResourceIndex, and prior to lookup. Current
	 * version is exactly the default found in Heritrix. When the
	 * configuration system for Heritrix stabilizes, hopefully this can use
	 * the system directly within Heritrix.
	 *
	 * <p>The URL is lower-cased (locale-independently), each strip rule is
	 * applied at most once in declaration order, and finally a dangling
	 * '?', a '&amp;' directly after the last '?', or a trailing '&amp;' on
	 * the query string is removed.
	 *
	 * @param url to be canonicalized.
	 * @return canonicalized version of url argument; null or empty input is
	 * returned unchanged.
	 */
	public String canonicalize(String url) {

		if (url == null || url.length() <= 0) {
			return url;
		}

		// hang on, we're about to get aggressive:
		// Locale.ROOT keeps the result independent of the JVM default
		// locale (e.g. a Turkish locale would map 'I' to dotless 'i').
		url = url.toLowerCase(Locale.ROOT);

		StringBuilder sb = new StringBuilder(url);
		boolean changed = false;
		for (int i = 0; i < choosers.length; i++) {
			// cheap substring test first, regex only when it could match:
			if (sb.indexOf(choosers[i]) != -1) {
				changed |= doStripRegexMatch(sb, strippers[i].matcher(sb));
			}
		}
		if (changed) {
			url = sb.toString();
		}

		int index = url.lastIndexOf('?');
		if (index > 0) {
			if (index == (url.length() - 1)) {
				// '?' is last char in url.  Strip it.
				url = url.substring(0, url.length() - 1);
			} else if (url.charAt(index + 1) == '&') {
				// Next char is '&'. Strip it.
				if (url.length() == (index + 2)) {
					// Then url ends with '?&'.  Strip them.
					url = url.substring(0, url.length() - 2);
				} else {
					// The '&' is redundant.  Strip it.
					url = url.substring(0, index + 1) +
						url.substring(index + 2);
				}
			} else if (url.charAt(url.length() - 1) == '&') {
				// If we have a lone '&' on end of query str,
				// strip it.
				url = url.substring(0, url.length() - 1);
			}
		}
		return url;
	}

	/** Print command line usage to stderr and exit with status 3. */
	private static void USAGE() {
		System.err.println("Usage: [-cdx] [-f FIELD] [-d DELIM]");
		System.exit(3);
	}

	/**
	 * Parse a 1-based field number given to -f. Anything that is not an
	 * integer >= 1 is reported and ends the program via USAGE().
	 */
	private static int parseFieldNumber(String val) {
		int field;
		try {
			field = Integer.parseInt(val);
		} catch (NumberFormatException ignored) {
			// reported just below together with out-of-range values
			field = -1;
		}
		if (field < 1) {
			System.err.println("Invalid field number: " + val);
			USAGE();
		}
		return field;
	}

	/** Report a field that could not be canonicalized, with its stack trace. */
	private static void reportInvalidUrl(long lineNumber, String line,
			String field, Exception e) {
		System.err.println("Invalid URL in line " + lineNumber + " (" +
				line + ") skipped (" + field + ")");
		e.printStackTrace();
	}

	/**
	 * Command line filter: reads delimited lines from stdin (UTF-8),
	 * replaces the selected fields with their urlStringToKey() form and
	 * writes the lines to stdout.
	 *
	 * <p>Options: "-f FIELD" selects a 1-based field (repeatable, default
	 * is field 1), "-d DELIM" sets the delimiter (default is a single
	 * space; it is passed to String.split() and so is interpreted as a
	 * regular expression), "-cdx" passes CDX header lines through
	 * unchanged.
	 *
	 * @param args program arguments
	 */
	public static void main(String[] args) {
		AggressiveUrlCanonicalizer canonicalizer =
			new AggressiveUrlCanonicalizer();
		List<Integer> columns = new ArrayList<Integer>();
		boolean cdxPassThru = false;
		String delimiter = " ";

		int n = 0;
		while (n < args.length) {
			String arg = args[n];
			if ("-cdx".equals(arg)) {
				cdxPassThru = true;
				n++;
				continue;
			}
			// every remaining option takes a value:
			if (n == (args.length - 1)) {
				USAGE();
			}
			String val = args[n + 1];
			if ("-f".equals(arg)) {
				columns.add(Integer.valueOf(parseFieldNumber(val)));
			} else if ("-d".equals(arg)) {
				delimiter = val;
			} else {
				USAGE();
			}
			n += 2;
		}

		// default to the first field (1-based) in case none specified:
		if (columns.size() == 0) {
			columns.add(Integer.valueOf(1));
		}

		// convert 1-based field numbers to 0-based int[] indexes:
		int[] cols = new int[columns.size()];
		for (int idx = 0; idx < columns.size(); idx++) {
			cols[idx] = columns.get(idx).intValue() - 1;
		}

		BufferedReader r = new BufferedReader(
				new InputStreamReader(System.in, ByteOp.UTF8));
		StringBuilder sb = new StringBuilder();
		long lineNumber = 0;
		String line = null;

		while (true) {
			try {
				line = r.readLine();
			} catch (IOException e) {
				e.printStackTrace();
				System.exit(1);
			}
			if (line == null) {
				break;
			}
			lineNumber++;
			if (cdxPassThru && line.startsWith(CDX_PREFIX)) {
				System.out.println(line);
				continue;
			}

			String[] parts = line.split(delimiter);
			// NOTE(review): the messages below say "skipped", but only the
			// offending field is left as-is - the line is still written to
			// stdout. Confirm whether dropping the line was intended.
			for (int column : cols) {
				if (column >= parts.length) {
					System.err.println("Invalid line " + lineNumber + " (" +
							line + ") skipped");
				} else {
					try {
						parts[column] =
							canonicalizer.urlStringToKey(parts[column]);
					} catch (URIException e) {
						reportInvalidUrl(lineNumber, line, parts[column], e);
					} catch (StringIndexOutOfBoundsException e) {
						reportInvalidUrl(lineNumber, line, parts[column], e);
					}
				}
			}

			// re-join the fields with the delimiter exactly as given:
			sb.setLength(0);
			for (int i = 0; i < parts.length; i++) {
				sb.append(parts[i]);
				if (i < (parts.length - 1)) {
					sb.append(delimiter);
				}
			}
			System.out.println(sb.toString());
		}
	}

	/**
	 * @return false; this canonicalizer does not report its keys as being
	 * in SURT form
	 */
	public boolean isSurtForm() {
		return false;
	}
}