/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.util.url; import org.apache.commons.httpclient.URIException; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import junit.framework.TestCase; /** * * * @author brad * @version $Date$, $Revision$ */ public class AggressiveUrlCanonicalizerTest extends TestCase { private AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); /** * Test method for 'org.archive.wayback.cdx.CDXRecord.urlStringToKey(String)' */ public void testUrlStringToKey() { // simple strip of http:// checkCanonicalization("http://foo.com/","foo.com/"); // simple strip of https:// checkCanonicalization("https://foo.com/","foo.com/"); // simple strip of ftp:// checkCanonicalization("ftp://foo.com/","foo.com/"); // simple strip of rtsp:// checkCanonicalization("rtsp://foo.com/","foo.com/"); // strip leading 'www.' checkCanonicalization("http://www.foo.com/","foo.com/"); // add trailing '/' with empty path checkCanonicalization("http://www.foo.com","foo.com/"); // strip leading 'www##.' checkCanonicalization("http://www12.foo.com/","foo.com/"); // strip leading 'www##.' with https checkCanonicalization("https://www12.foo.com/","foo.com/"); // strip leading 'www##.' with no protocol checkCanonicalization("www12.foo.com/","foo.com/"); checkCanonicalization("http://www.example.com/","example.com/"); checkCanonicalization("http://www.example.com","example.com/"); checkCanonicalization("http://www.example.com/index.html","example.com/index.html"); // leave alone an url with no protocol but non-empty path checkCanonicalization("foo.com/","foo.com/"); // add trailing '/' with empty path and without protocol checkCanonicalization("foo.com","foo.com/"); // add trailing '/' to with empty path and no protocol, plus massage checkCanonicalization("www12.foo.com","foo.com/"); // do not add trailing '/' non-empty path and without protocol checkCanonicalization("foo.com/boo","foo.com/boo"); // TEST // replace escaped ' ' with '+' in path plus keep trailing slash and query checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b"); // replace escaped ' ' with '+' in path checkCanonicalization("foo.com/pa%20th","foo.com/pa+th"); // replace escaped ' ' with '+' in path plus leave trailing slash checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/"); // replace multiple consecutive /'s in path checkCanonicalization("foo.com//goo","foo.com/goo"); // replace multiple consecutive /'s in path checkCanonicalization("foo.com///goo","foo.com/goo"); // leave alone consecutive /'s after ? checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo"); // replace multiple consecutive /'s in path, plus leave trailing / checkCanonicalization("foo.com///goo/","foo.com/goo/"); // replace escaped ' ' with '+' in path plus keep trailing slash and query checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b"); // replace escaped ' ' with '+' in path but not in query key checkCanonicalization("foo.com/pa%20th?a%20a=b","foo.com/pa+th?a%20a=b"); // replace escaped ' ' with '+' in path but not in query value checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b"); // no change in '!' escaping checkCanonicalization("foo.com/pa!th","foo.com/pa!th"); // no change in '+' escaping checkCanonicalization("foo.com/pa+th","foo.com/pa+th"); // unescape legal escaped '!' (%21) checkCanonicalization("foo.com/pa%21th","foo.com/pa!th"); // leave '%' (%25) checkCanonicalization("foo.com/pa%th","foo.com/pa%th"); // unescape '%' (%25) checkCanonicalization("foo.com/pa%25th","foo.com/pa%th"); //"http://wayback.archive-it.org/1726/20091231154920cs_/http://alumni.creighton.edu/atf/cf/%257B82F49357-B0BC-48DA-B47F-5701CAC6EDFE%257D/MENU-CSSPLAY.css" checkCanonicalization("foo.com/{a}b","foo.com/%7Ba%7Db"); checkCanonicalization("foo.com/%7Ba%7Db","foo.com/%7Ba%7Db"); // replace escaped ' ' with '+' in path, unescape legal '!' in path // no change in query escaping checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b"); // replace escaped ' ' with '+' in path, leave illegal '%02' in path // no change in query escaping checkCanonicalization("foo.com/pa%20t%02h?a%20a=b","foo.com/pa+t%02h?a%20a=b"); // strip jsessionid String sid1 = "jsessionid=0123456789abcdefghijklemopqrstuv"; String sid2 = "PHPSESSID=9682993c8daa2c5497996114facdc805"; String sid3 = "sid=9682993c8daa2c5497996114facdc805"; String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM"; String sid5 = "CFID=12412453&CFTOKEN=15501799"; String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A"; String fore = "http://foo.com/bar?bo=lo&"; String aft = "&gum=yum"; String want = "foo.com/bar?bo=lo&gum=yum"; // String fore = "http://www.archive.org/index.html?"; // String aft = ""; // String want = "archive.org/index.html"; checkCanonicalization(fore + sid1 + aft,want); checkCanonicalization(fore + sid2 + aft,want); checkCanonicalization(fore + sid3 + aft,want); checkCanonicalization(fore + sid4 + aft,want); checkCanonicalization(fore + sid5 + aft,want); checkCanonicalization(fore + sid6 + aft,want); // Check ASP_SESSIONID2: checkCanonicalization( "http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx", "legislature.mi.gov/mileg.aspx"); // Check ASP_SESSIONID2 (again): checkCanonicalization( "http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx", "legislature.mi.gov/mileg.aspx"); // Check ASP_SESSIONID3: checkCanonicalization( "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", "legislature.mi.gov/mileg.aspx?page=sessionschedules"); // '@' in path: checkCanonicalization( "http://www.flickr.com/photos/36050182@N05/", "flickr.com/photos/36050182@n05/"); // default port stripping: // FIRST the easy-on-the-eyes // strip port 80 checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo"); // but not other ports... checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo"); // but not other ports... with "www#." massage checkCanonicalization("http://www232.chub.org:8080/foo","chub.org:8080/foo"); // default HTTP (:80) stripping without a scheme: checkCanonicalization("www.chub.org:80/foo","chub.org/foo"); // no strip https port (443) without scheme: checkCanonicalization("www.chub.org:443/foo","chub.org:443/foo"); // yes strip https port (443) with scheme: checkCanonicalization("https://www.chub.org:443/foo","chub.org/foo"); // NEXT the exhaustive: String origHost = "www.chub.org"; String massagedHost = "chub.org"; String path = "/foo"; for(String scheme : UrlOperations.ALL_SCHEMES) { int defaultPort = UrlOperations.schemeToDefaultPort(scheme); int nonDefaultPort = 19991; String origDefault = scheme + origHost + ":" + defaultPort + path; String canonDefault = massagedHost + path; String origNonDefault = scheme + origHost + ":" + nonDefaultPort + path; String canonNonDefault = massagedHost + ":" + nonDefaultPort + path; checkCanonicalization(origDefault,canonDefault); checkCanonicalization(origNonDefault,canonNonDefault); } // should we try to pass all of these, too? // found in section 6.1 of: // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec // Canonicalize("http://host/%25%32%35") = "http://host/%25"; // Canonicalize("http://host/%25%32%35%25%32%35") = "http://host/%25%25"; // Canonicalize("http://host/%2525252525252525") = "http://host/%25"; // Canonicalize("http://host/asdf%25%32%35asd") = "http://host/asdf%25asd"; // Canonicalize("http://host/%%%25%32%35asd%%") = "http://host/%25%25%25asd%25%25"; // Canonicalize("http://www.google.com/") = "http://www.google.com/"; // Canonicalize("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/") = "http://168.188.99.26/.secure/www.ebay.com/"; // Canonicalize("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/") = "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/"; // Canonicalize("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B") = "http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+"; // Canonicalize("http://3279880203/blah") = "http://195.127.0.11/blah"; // Canonicalize("http://www.google.com/blah/..") = "http://www.google.com/"; // Canonicalize("www.google.com/") = "http://www.google.com/"; // Canonicalize("www.google.com") = "http://www.google.com/"; // Canonicalize("http://www.evil.com/blah#frag") = "http://www.evil.com/blah"; // Canonicalize("http://www.GOOgle.com/") = "http://www.google.com/"; // Canonicalize("http://www.google.com.../") = "http://www.google.com/"; // Canonicalize("http://www.google.com/foo\tbar\rbaz\n2") ="http://www.google.com/foobarbaz2"; // Canonicalize("http://www.google.com/q?") = "http://www.google.com/q?"; // Canonicalize("http://www.google.com/q?r?") = "http://www.google.com/q?r?"; // Canonicalize("http://www.google.com/q?r?s") = "http://www.google.com/q?r?s"; // Canonicalize("http://evil.com/foo#bar#baz") = "http://evil.com/foo"; // Canonicalize("http://evil.com/foo;") = "http://evil.com/foo;"; // Canonicalize("http://evil.com/foo?bar;") = "http://evil.com/foo?bar;"; // Canonicalize("http://\x01\x80.com/") = "http://%01%80.com/"; // Canonicalize("http://notrailingslash.com") = "http://notrailingslash.com/"; // Canonicalize("http://www.gotaport.com:1234/") = "http://www.gotaport.com:1234/"; // Canonicalize(" http://www.google.com/ ") = "http://www.google.com/"; // Canonicalize("http:// leadingspace.com/") = "http://%20leadingspace.com/"; // Canonicalize("http://%20leadingspace.com/") = "http://%20leadingspace.com/"; // Canonicalize("%20leadingspace.com/") = "http://%20leadingspace.com/"; // Canonicalize("https://www.securesite.com/") = "https://www.securesite.com/"; // Canonicalize("http://host.com/ab%23cd") = "http://host.com/ab%23cd"; // Canonicalize("http://host.com//twoslashes?more//slashes") = "http://host.com/twoslashes?more//slashes"; } private void checkCanonicalization(String orig, String want) { String got; try { got = canonicalizer.urlStringToKey(orig); assertEquals("Failed canonicalization (" + orig + ") => (" + got + ") and not (" + want + ") as expected",want,got); String got2 = canonicalizer.urlStringToKey(got); assertEquals("Failed 2nd canonicalization (" + got + ") => (" + got2 + ") and not (" + want + ") as expected",want,got2); } catch (URIException e) { e.printStackTrace(); assertTrue("Exception converting(" + orig + ")",false); } } }