/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.url; import java.util.Iterator; import java.util.TreeMap; import junit.framework.TestCase; import org.apache.commons.httpclient.URIException; import org.apache.commons.lang.SerializationUtils; import org.archive.url.UsableURI; import org.archive.url.UsableURIFactory; /** * Test UURIFactory for proper UURI creation across variety of * important/tricky cases. * * Be careful writing this file. Make sure you write it with UTF-8 encoding. * * @author igor stack gojomo */ public class UsableURIFactoryTest extends TestCase { public final void testEscaping() throws URIException { // Note: single quote is not being escaped by URI class. 
final String ESCAPED_URISTR = "http://archive.org/" + UsableURIFactory.ESCAPED_SPACE + UsableURIFactory.ESCAPED_SPACE + UsableURIFactory.ESCAPED_CIRCUMFLEX + UsableURIFactory.ESCAPED_QUOT + UsableURIFactory.SQUOT + UsableURIFactory.ESCAPED_APOSTROPH + UsableURIFactory.ESCAPED_LSQRBRACKET + UsableURIFactory.ESCAPED_RSQRBRACKET + UsableURIFactory.ESCAPED_LCURBRACKET + UsableURIFactory.ESCAPED_RCURBRACKET + UsableURIFactory.SLASH + "a.gif"; // NBSP and SPACE should be trimmed; final String URISTR = "http://archive.org/.././" + "\u00A0" + UsableURIFactory.SPACE + UsableURIFactory.CIRCUMFLEX + UsableURIFactory.QUOT + UsableURIFactory.SQUOT + UsableURIFactory.APOSTROPH + UsableURIFactory.LSQRBRACKET + UsableURIFactory.RSQRBRACKET + UsableURIFactory.LCURBRACKET + UsableURIFactory.RCURBRACKET + UsableURIFactory.BACKSLASH + "test/../a.gif" + "\u00A0" + UsableURIFactory.SPACE; UsableURI uuri = UsableURIFactory.getInstance(URISTR); final String uuriStr = uuri.toString(); assertEquals("expected escaping", ESCAPED_URISTR, uuriStr); } public final void testUnderscoreMakesPortParseFail() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://one-two_three:8080/index.html"); int port = uuri.getPort(); assertTrue("Failed find of port " + uuri, port == 8080); } public final void testRelativeURIWithTwoSlashes() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.archive.org"); UsableURI uuri = UsableURIFactory.getInstance(base, "one//index.html"); assertTrue("Doesn't do right thing with two slashes " + uuri, uuri.toString().equals( "http://www.archive.org/one//index.html")); } public final void testSchemelessURI() throws URIException { UsableURI base = UsableURIFactory.getInstance("https://www.archive.org"); UsableURI uuri = UsableURIFactory.getInstance(base, "//example.com/monkey?this:uri:has:colons"); assertTrue("Doesn't do right thing with a schemeless URI " + uuri, uuri.toString().equals( 
"https://example.com/monkey?this:uri:has:colons")); } public final void testTrailingEncodedSpace() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://www.nps-shoes.co.uk%20"); assertTrue("Doesn't strip trailing encoded space 1 " + uuri, uuri.toString().equals("http://www.nps-shoes.co.uk/")); uuri = UsableURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20"); assertTrue("Doesn't strip trailing encoded space 2 " + uuri, uuri.toString().equals("http://www.nps-shoes.co.uk/")); } public final void testPort0080is80() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://archive.org:0080"); assertTrue("Doesn't strip leading zeros " + uuri, uuri.toString().equals("http://archive.org/")); } // DISABLING TEST AS PRECURSOR TO ELIMINATION // the problematic input given -- specifically the "%6s" incomplete uri-escape, // shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least, // will attempt to fetch such an URL (getting, in this case against that ad // server, a bad-request error). Ideally, we'd generate exactly the same // request against the server as they do. However, with the most recent // fixup for stray '%' signs, we come close, but not exactly. That's enough // to cause this test to fail (it's not getting the expected exception) but // our almost-URI, which might be what was intended, is better than trying // nothing. 
// public final void testBadPath() { // String message = null; // try { // UURIFactory.getInstance("http://ads.as4x.tmcs.net/" + // "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" + // "generic&Params.richmedia=yes%26city%3Dseattle%26" + // "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" + // "%6state%3DWA"); // } catch (URIException e) { // message = e.getMessage(); // } // assertNotNull("Didn't get expected exception.", message); // } public final void testEscapeEncoding() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://www.y1y1.com/" + "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256"); uuri.getPath(); } public final void testTooLongAfterEscaping() { StringBuffer buffer = new StringBuffer("http://www.archive.org/a/"); // Append bunch of spaces. When escaped, they'll triple in size. for (int i = 0; i < 1024; i++) { buffer.append(" "); } buffer.append("/index.html"); String message = null; try { UsableURIFactory.getInstance(buffer.toString()); } catch (URIException e) { message = e.getMessage(); } assertTrue("Wrong or no exception: " + message, (message != null) && message.startsWith("Created (escaped) uuri >")); } public final void testFtpUris() throws URIException { final String FTP = "ftp"; final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn"; final String PATH = "/clzreceive/"; final String uri = FTP + "://" + AUTHORITY + PATH; UsableURI uuri = UsableURIFactory.getInstance(uri); assertTrue("Failed to get matching scheme: " + uuri.getScheme(), (uuri.getScheme()).equals(FTP)); assertTrue("Failed to get matching authority: " + uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY)); assertTrue("Failed to get matching path: " + uuri.getPath(), (uuri.getPath()).equals(PATH)); } public final void testWhitespaceEscaped() throws URIException { // Test that we get all whitespace even if the uri is // already escaped. 
String uri = "http://archive.org/index%25 .html"; String tgtUri = "http://archive.org/index%25%20.html"; UsableURI uuri = UsableURIFactory.getInstance(uri); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); uri = "http://archive.org/index%25\u001D.html"; tgtUri = "http://archive.org/index%25%1D.html".toLowerCase(); uuri = UsableURIFactory.getInstance(uri); assertEquals("whitespace escaping", tgtUri, uuri.toString()); uri = "http://gemini.info.usaid.gov/directory/" + "pbResults.cfm?&urlNameLast=Rumplestiltskin"; tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" + "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006"; uuri = UsableURIFactory.getInstance(UsableURIFactory.getInstance(uri), "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" + "RRB%20%20%20%205%2E08%2D006"); assertEquals("whitespace escaping", tgtUri, uuri.toString()); } // public final void testFailedGetPath() throws URIException { // final String path = "/RealMedia/ads/" + // "click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty"; // // decoding in getPath will interpret %CA as 8-bit escaped char, // // possibly incomplete // final String uri = "http://ads.nandomedia.com" + path; // final UURI uuri = UURIFactory.getInstance(uri); // String foundPath = uuri.getPath(); // assertEquals("unexpected path", path, foundPath); // } public final void testDnsHost() throws URIException { String uri = "dns://ads.nandomedia.com:81/one.html"; UsableURI uuri = UsableURIFactory.getInstance(uri); String host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com")); uri = "dns:ads.nandomedia.com"; uuri = UsableURIFactory.getInstance(uri); host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com")); uri = "dns:ads.nandomedia.com?a=b"; uuri = UsableURIFactory.getInstance(uri); host = uuri.getReferencedHost(); assertTrue("Host is wrong " + host, 
host.equals("ads.nandomedia.com")); } public final void testPercentEscaping() throws URIException { final String uri = "http://archive.org/%a%%%%%.html"; // tests indicate firefox (1.0.6) does not encode '%' at all final String tgtUri = "http://archive.org/%a%%%%%.html"; UsableURI uuri = UsableURIFactory.getInstance(uri); assertEquals("Not equal",tgtUri, uuri.toString()); } public final void testRelativeDblPathSlashes() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.archive.org/index.html"); UsableURI uuri = UsableURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM"); assertTrue("Double slash not working " + uuri.toString(), uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM")); } public final void testRelativeWithScheme() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.example.com/some/page"); UsableURI uuri = UsableURIFactory.getInstance(base, "http:boo"); assertTrue("Relative with scheme not working " + uuri.toString(), uuri.toString().equals("http://www.example.com/some/boo")); } public final void testBadBaseResolve() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://license.joins.com/board/" + "etc_board_list.asp?board_name=new_main&b_type=&nPage=" + "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" + "notice&gate=02"); UsableURIFactory.getInstance(base, "http://www.changeup.com/...</a"); } public final void testTilde() throws URIException { noChangeExpected("http://license.joins.com/~igor"); } public final void testCurlies() throws URIException { // Firefox allows curlies in the query string portion of a URL only // (converts curlies if they are in the path portion ahead of the // query string). UsableURI uuri = noChangeExpected("http://license.joins.com/igor?one={curly}"); assertEquals(uuri.getQuery(), "one={curly}"); assertEquals(UsableURIFactory. getInstance("http://license.joins.com/igor{curly}.html"). 
toString(), "http://license.joins.com/igor%7Bcurly%7D.html"); boolean exception = false; try { UsableURIFactory.getInstance("http://license.{curly}.com/igor.html"); } catch (URIException u) { exception = true; } assertTrue("Did not get exception.", exception); } protected UsableURI noChangeExpected(final String original) throws URIException { UsableURI uuri = UsableURIFactory.getInstance(original); assertEquals(original, uuri.toString()); return uuri; } public final void testTrimSpaceNBSP() throws URIException { final String uri = " http://archive.org/DIR WITH SPACES/" + UsableURIFactory.NBSP + "home.html " + UsableURIFactory.NBSP + " "; final String tgtUri = "http://archive.org/DIR%20WITH%20SPACES/%20home.html"; UsableURI uuri = UsableURIFactory.getInstance(uri); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(tgtUri)); } /** * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them). * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>. * @throws URIException */ public final void testSpaceDoubleEncoding() throws URIException { final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics"; final String encodedUri = "http://www.brook.edu/i.html?%20%20taxonomy=Politics"; UsableURI uuri = UsableURIFactory.getInstance(uri, "ISO-8859-1"); assertTrue("Not equal " + uuri.toString(), uuri.toString().equals(encodedUri)); } /** * Test for doubly-encoded sequences. * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>. 
* @throws URIException */ public final void testDoubleEncoding() throws URIException { final char ae = '\u00E6'; final String uri = "http://archive.org/DIR WITH SPACES/home" + ae + ".html"; final String encodedUri = "http://archive.org/DIR%20WITH%20SPACES/home%E6.html"; UsableURI uuri = UsableURIFactory.getInstance(uri, "ISO-8859-1"); assertEquals("single encoding", encodedUri, uuri.toString()); // Dbl-encodes. uuri = UsableURIFactory.getInstance(uuri.toString(), "ISO-8859-1"); uuri = UsableURIFactory.getInstance(uuri.toString(), "ISO-8859-1"); assertEquals("double encoding", encodedUri, uuri.toString()); // Do default utf-8 test. uuri = UsableURIFactory.getInstance(uri); final String encodedUtf8Uri = "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html"; assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString()); // Now dbl-encode. uuri = UsableURIFactory.getInstance(uuri.toString()); uuri = UsableURIFactory.getInstance(uuri.toString()); assertEquals("Not equal (dbl-encoding) utf8", encodedUtf8Uri, uuri.toString()); } /** * Test for syntax errors stop page parsing. * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a> * @throws URIException */ public final void testThreeSlashes() throws URIException { UsableURI goodURI = UsableURIFactory. getInstance("http://lcweb.loc.gov/rr/goodtwo.html"); String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html"; UsableURI rewrittenURI = UsableURIFactory.getInstance(uuri); assertTrue("Not equal " + goodURI + ", " + uuri, goodURI.toString().equals(rewrittenURI.toString())); uuri = "http:////lcweb.loc.gov/rr/goodtwo.html"; rewrittenURI = UsableURIFactory.getInstance(uuri); assertTrue("Not equal " + goodURI + ", " + uuri, goodURI.toString().equals(rewrittenURI.toString())); // Check https. goodURI = UsableURIFactory. 
getInstance("https://lcweb.loc.gov/rr/goodtwo.html"); uuri = "https:////lcweb.loc.gov/rr/goodtwo.html"; rewrittenURI = UsableURIFactory.getInstance(uuri); assertTrue("Not equal " + goodURI + ", " + uuri, goodURI.toString().equals(rewrittenURI.toString())); } public final void testNoScheme() { boolean expectedException = false; String uuri = "www.loc.gov/rr/european/egw/polishex.html"; try { UsableURIFactory.getInstance(uuri); } catch (URIException e) { // Expected exception. expectedException = true; } assertTrue("Didn't get expected exception: " + uuri, expectedException); } public final void testRelative() throws URIException { UsableURI uuriTgt = UsableURIFactory. getInstance("http://archive.org:83/home.html"); UsableURI uri = UsableURIFactory. getInstance("http://archive.org:83/one/two/three.html"); UsableURI uuri = UsableURIFactory. getInstance(uri, "/home.html"); assertTrue("Not equal", uuriTgt.toString().equals(uuri.toString())); } public void testSchemelessRelative() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.itsnicethat.com/articles/laura-hobson"); UsableURI test1 = UsableURIFactory.getInstance(base, "//www.facebook.com/plugins/like.php"); assertEquals("schemaless relative 1", "http://www.facebook.com/plugins/like.php", test1.toString()); // reported by Erin Staniland UsableURI test2 = UsableURIFactory.getInstance(base, "//www.facebook.com/plugins/like.php?href=http://www.itsnicethat.com/articles/laura-hobson"); assertEquals("schemeless relative 2", "http://www.facebook.com/plugins/like.php?href=http://www.itsnicethat.com/articles/laura-hobson", test2.toString()); } /** * Test that an empty uuri does the right thing -- that we get back the * base. * * @throws URIException */ public final void testRelativeEmpty() throws URIException { UsableURI uuriTgt = UsableURIFactory. getInstance("http://archive.org:83/one/two/three.html"); UsableURI uri = UsableURIFactory. 
getInstance("http://archive.org:83/one/two/three.html"); UsableURI uuri = UsableURIFactory. getInstance(uri, ""); assertTrue("Empty length don't work", uuriTgt.toString().equals(uuri.toString())); } public final void testAbsolute() throws URIException { UsableURI uuriTgt = UsableURIFactory. getInstance("http://archive.org:83/home.html"); UsableURI uri = UsableURIFactory. getInstance("http://archive.org:83/one/two/three.html"); UsableURI uuri = UsableURIFactory. getInstance(uri, "http://archive.org:83/home.html"); assertTrue("Not equal", uuriTgt.toString().equals(uuri.toString())); } /** * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts). * @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a> */ public final void testHostWithLessThan() { checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A"); checkExceptionOnIllegalDomainlabel( "http://C|/unzipped/426/spacer.gif"); checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\""); } /** * Test for [ 1012520 ] UURI.length() > 2k. * @throws URIException * @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() > 2k</a> */ public final void test2kURI() throws URIException { final StringBuffer buffer = new StringBuffer("http://a.b"); final String subPath = "/123456789"; for (int i = 0; i < 207; i++) { buffer.append(subPath); } // String should be 2080 characters long. Legal. UsableURIFactory.getInstance(buffer.toString()); boolean gotException = false; // Add ten more characters and make size illegal. 
buffer.append(subPath); try { UsableURIFactory.getInstance(buffer.toString()); } catch (URIException e) { gotException = true; } assertTrue("No expected exception complaining about long URI", gotException); } private void checkExceptionOnIllegalDomainlabel(String uuri) { boolean expectedException = false; try { UsableURIFactory.getInstance(uuri); } catch (URIException e) { // Expected exception. expectedException = true; } assertTrue("Didn't get expected exception: " + uuri, expectedException); } /** * Test for doing separate DNS lookup for same host * * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a> * @throws URIException */ public final void testHostWithPeriod() throws URIException { UsableURI uuri1 = UsableURIFactory. getInstance("http://www.loc.gov./index.html"); UsableURI uuri2 = UsableURIFactory. getInstance("http://www.loc.gov/index.html"); assertEquals("Failed equating hosts with dot", uuri1.getHost(), uuri2.getHost()); } /** * Test for NPE in java.net.URI.encode * * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a> * @throws URIException */ public final void testHostEncodedChars() throws URIException { String s = "http://g.msn.co.kr/0nwkokr0/00/19??" 
+ "PS=10274&NC=10009&CE=42&CP=949&HL=" + "���?��"; assertNotNull("Encoded chars " + s, UsableURIFactory.getInstance(s)); } /** * Test for java.net.URI parses %20 but getHost null * * See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a> */ public final void testSpaceInHost() { boolean expectedException = false; try { UsableURIFactory.getInstance( "http://www.local-regions.odpm%20.gov.uk" + "/lpsa/challenge/pdf/propect.pdf"); } catch (URIException e) { expectedException = true; } assertTrue("Did not fail with escaped space.", expectedException); expectedException = false; try { UsableURIFactory.getInstance( "http://www.local-regions.odpm .gov.uk" + "/lpsa/challenge/pdf/propect.pdf"); } catch (URIException e) { expectedException = true; } assertTrue("Did not fail with real space.", expectedException); } /** * Test for java.net.URI chokes on hosts_with_underscores. * * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a> * @throws URIException */ public final void testHostWithUnderscores() throws URIException { UsableURI uuri = UsableURIFactory.getInstance( "http://x_underscore_underscore.2u.com.tw/nonexistent_page.html"); assertEquals("Failed get of host with underscore", "x_underscore_underscore.2u.com.tw", uuri.getHost()); } /** * Two dots for igor. */ public final void testTwoDots() { boolean expectedException = false; try { UsableURIFactory.getInstance( "http://x_underscore_underscore..2u.com/nonexistent_page.html"); } catch (URIException e) { expectedException = true; } assertTrue("Two dots did not throw exception", expectedException); } /** * Test for java.net.URI#getHost fails when leading digit. 
* * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a> * @throws URIException */ public final void testHostWithDigit() throws URIException { UsableURI uuri = UsableURIFactory. getInstance("http://0204chat.2u.com.tw/nonexistent_page.html"); assertEquals("Failed get of host with digit", "0204chat.2u.com.tw", uuri.getHost()); } /** * Test for Constraining java URI class. * * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a> */ public final void testPort() { checkBadPort("http://www.tyopaikat.com:a/robots.txt"); checkBadPort("http://158.144.21.3:80808/robots.txt"); checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt"); checkBadPort( "https://webmail.gse.harvard.edu:9100robots.txt/robots.txt"); checkBadPort( "https://webmail.gse.harvard.edu:0/robots.txt/robots.txt"); } /** * Test bad port throws exception. * @param uri URI with bad port to check. */ private void checkBadPort(String uri) { boolean exception = false; try { UsableURIFactory.getInstance(uri); } catch (URIException e) { exception = true; } assertTrue("Didn't throw exception: " + uri, exception); } /** * Preserve userinfo capitalization. 
* @throws URIException */ public final void testUserinfo() throws URIException { final String authority = "stack:StAcK@www.tyopaikat.com"; final String uri = "http://" + authority + "/robots.txt"; UsableURI uuri = UsableURIFactory.getInstance(uri); assertEquals("Authority not equal", uuri.getAuthority(), authority); /* String tmp = uuri.toString(); assertTrue("URI not equal", tmp.equals(uri)); */ } /** * Test user info + port * @throws URIException */ public final void testUserinfoPlusPort() throws URIException { final String userInfo = "stack:StAcK"; final String authority = "www.tyopaikat.com"; final int port = 8080; final String uri = "http://" + userInfo + "@" + authority + ":" + port + "/robots.txt"; UsableURI uuri = UsableURIFactory.getInstance(uri); assertEquals("Host not equal", authority,uuri.getHost()); assertEquals("Userinfo Not equal",userInfo,uuri.getUserinfo()); assertEquals("Port not equal",port,uuri.getPort()); assertEquals("Authority wrong","stack:StAcK@www.tyopaikat.com:8080", uuri.getAuthority()); assertEquals("AuthorityMinusUserinfo wrong","www.tyopaikat.com:8080", uuri.getAuthorityMinusUserinfo()); } public final void testRFC3986RelativeChange() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://a/b/c/d;p?q"); tryRelative(base, "?y", "http://a/b/c/d;p?y"); } /** * Tests from rfc3986 * * <pre> * "g:h" = "g:h" * "g" = "http://a/b/c/g" * "./g" = "http://a/b/c/g" * "g/" = "http://a/b/c/g/" * "/g" = "http://a/g" * "//g" = "http://g" * "?y" = "http://a/b/c/d;p?y" * "g?y" = "http://a/b/c/g?y" * "#s" = "http://a/b/c/d;p?q#s" * "g#s" = "http://a/b/c/g#s" * "g?y#s" = "http://a/b/c/g?y#s" * ";x" = "http://a/b/c/;x" * "g;x" = "http://a/b/c/g;x" * "g;x?y#s" = "http://a/b/c/g;x?y#s" * "" = "http://a/b/c/d;p?q" * "." = "http://a/b/c/" * "./" = "http://a/b/c/" * ".." = "http://a/b/" * "../" = "http://a/b/" * "../g" = "http://a/b/g" * "../.." 
= "http://a/" * "../../" = "http://a/" * "../../g" = "http://a/g" * </pre> * * @throws URIException */ public final void testRFC3986Relative() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://a/b/c/d;p?q"); tryRelative(base, "g:h", "g:h"); tryRelative(base, "g", "http://a/b/c/g"); tryRelative(base, "./g", "http://a/b/c/g"); tryRelative(base, "g/", "http://a/b/c/g/"); tryRelative(base, "/g", "http://a/g"); tryRelative(base, "//g", "http://g"); tryRelative(base, "?y", "http://a/b/c/d;p?y"); tryRelative(base, "g?y", "http://a/b/c/g?y"); tryRelative(base, "#s", "http://a/b/c/d;p?q#s"); tryRelative(base, "g#s", "http://a/b/c/g#s"); tryRelative(base, "g?y#s", "http://a/b/c/g?y#s"); tryRelative(base, ";x", "http://a/b/c/;x"); tryRelative(base, "g;x", "http://a/b/c/g;x"); tryRelative(base, "g;x?y#s","http://a/b/c/g;x?y#s"); tryRelative(base, "", "http://a/b/c/d;p?q"); tryRelative(base, ".", "http://a/b/c/"); tryRelative(base, "./", "http://a/b/c/"); tryRelative(base, "..", "http://a/b/"); tryRelative(base, "../", "http://a/b/"); tryRelative(base, "../g", "http://a/b/g"); tryRelative(base, "../..", "http://a/"); tryRelative(base, "../../", "http://a/"); tryRelative(base, "../../g","http://a/g"); } protected void tryRelative(UsableURI base, String relative, String expected) throws URIException { UsableURI uuri = UsableURIFactory.getInstance(base, relative); assertEquals("Derelativized " + relative + " gave " + uuri + " not " + expected, UsableURIFactory.getInstance(expected),uuri); } /** * Tests from rfc2396 with amendments to accomodate differences * intentionally added to make our URI handling like IEs. * * <pre> * g:h = g:h * g = http://a/b/c/g * ./g = http://a/b/c/g * g/ = http://a/b/c/g/ * /g = http://a/g * //g = http://g * ?y = http://a/b/c/?y * g?y = http://a/b/c/g?y * #s = (current document)#s * g#s = http://a/b/c/g#s * g?y#s = http://a/b/c/g?y#s * ;x = http://a/b/c/;x * g;x = http://a/b/c/g;x * g;x?y#s = http://a/b/c/g;x?y#s * . 
= http://a/b/c/ * ./ = http://a/b/c/ * .. = http://a/b/ * ../ = http://a/b/ * ../g = http://a/b/g * ../.. = http://a/ * ../../ = http://a/ * ../../g = http://a/g * </pre> * * @throws URIException */ public final void testRFC2396Relative() throws URIException { UsableURI base = UsableURIFactory. getInstance("http://a/b/c/d;p?q"); TreeMap<String,String> m = new TreeMap<String,String>(); m.put("..", "http://a/b/"); m.put("../", "http://a/b/"); m.put("../g", "http://a/b/g"); m.put("../..", "http://a/"); m.put("../../", "http://a/"); m.put("../../g", "http://a/g"); m.put("g#s", "http://a/b/c/g#s"); m.put("g?y#s ", "http://a/b/c/g?y#s"); m.put(";x", "http://a/b/c/;x"); m.put("g;x", "http://a/b/c/g;x"); m.put("g;x?y#s", "http://a/b/c/g;x?y#s"); m.put(".", "http://a/b/c/"); m.put("./", "http://a/b/c/"); m.put("g", "http://a/b/c/g"); m.put("./g", "http://a/b/c/g"); m.put("g/", "http://a/b/c/g/"); m.put("/g", "http://a/g"); m.put("//g", "http://g"); // CHANGED BY RFC3986 // m.put("?y", "http://a/b/c/?y"); m.put("g?y", "http://a/b/c/g?y"); // EXTRAS beyond the RFC set. // TODO: That these resolve to a path of /a/g might be wrong. Perhaps // it should be '/g'?. m.put("/../../../../../../../../g", "http://a/g"); m.put("../../../../../../../../g", "http://a/g"); m.put("../G", "http://a/b/G"); for (Iterator<String> i = m.keySet().iterator(); i.hasNext();) { String key = (String)i.next(); String value = (String)m.get(key); UsableURI uuri = UsableURIFactory.getInstance(base, key); assertTrue("Unexpected " + key + " " + value + " " + uuri, uuri.equals(UsableURIFactory.getInstance(value))); } } /** * A UURI should always be without a 'fragment' segment, which is * unused and irrelevant for network fetches. * * See [ 970666 ] #anchor links not trimmed, and thus recrawled * * @throws URIException */ public final void testAnchors() throws URIException { UsableURI uuri = UsableURIFactory. 
getInstance("http://www.example.com/path?query#anchor"); assertEquals("Not equal", "http://www.example.com/path?query", uuri.toString()); } /** * Ensure that URI strings beginning with a colon are treated * the same as browsers do (as relative, rather than as absolute * with zero-length scheme). * * @throws URIException */ public void testStartsWithColon() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.example.com/path/page"); UsableURI uuri = UsableURIFactory.getInstance(base,":foo"); assertEquals("derelativize starsWithColon", uuri.getURI(), "http://www.example.com/path/:foo"); } /** * Ensure that relative URIs with colons in late positions * aren't mistakenly interpreted as absolute URIs with long, * illegal schemes. * * @throws URIException */ public void testLateColon() throws URIException { UsableURI base = UsableURIFactory.getInstance("http://www.example.com/path/page"); UsableURI uuri1 = UsableURIFactory.getInstance(base,"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value"); assertEquals("derelativize lateColon", uuri1.getURI(), "http://www.example.com/path/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value"); UsableURI uuri2 = UsableURIFactory.getInstance(base,"example.html?parameter=this:value"); assertEquals("derelativize lateColon", uuri2.getURI(), "http://www.example.com/path/example.html?parameter=this:value"); } /** * Ensure that stray trailing '%' characters do not prevent * UURI instances from being created, and are reasonably * escaped when encountered. 
* * @throws URIException */ public void testTrailingPercents() throws URIException { String plainPath = "http://www.example.com/path%"; UsableURI plainPathUuri = UsableURIFactory.getInstance(plainPath); assertEquals("plainPath getURI", plainPath, plainPathUuri.getURI()); assertEquals("plainPath getEscapedURI", "http://www.example.com/path%", // browsers don't escape '%' plainPathUuri.getEscapedURI()); String partiallyEscapedPath = "http://www.example.com/pa%20th%"; UsableURI partiallyEscapedPathUuri = UsableURIFactory.getInstance( partiallyEscapedPath); // assertEquals("partiallyEscapedPath getURI", // "http://www.example.com/pa th%", // TODO: is this desirable? //// partiallyEscapedPath, // partiallyEscapedPathUuri.getURI()); assertEquals("partiallyEscapedPath getEscapedURI", "http://www.example.com/pa%20th%", partiallyEscapedPathUuri.getEscapedURI()); String plainQueryString = "http://www.example.com/path?q=foo%"; UsableURI plainQueryStringUuri = UsableURIFactory.getInstance( plainQueryString); // assertEquals("plainQueryString getURI", // plainQueryString, // plainQueryStringUuri.getURI()); assertEquals("plainQueryString getEscapedURI", "http://www.example.com/path?q=foo%", plainQueryStringUuri.getEscapedURI()); String partiallyEscapedQueryString = "http://www.example.com/pa%20th?q=foo%"; UsableURI partiallyEscapedQueryStringUuri = UsableURIFactory.getInstance( partiallyEscapedQueryString); assertEquals("partiallyEscapedQueryString getURI", "http://www.example.com/pa th?q=foo%", partiallyEscapedQueryStringUuri.getURI()); assertEquals("partiallyEscapedQueryString getEscapedURI", "http://www.example.com/pa%20th?q=foo%", partiallyEscapedQueryStringUuri.getEscapedURI()); } /** * Ensure that stray '%' characters do not prevent * UURI instances from being created, and are reasonably * escaped when encountered. 
* * @throws URIException */ public void testStrayPercents() throws URIException { String oneStray = "http://www.example.com/pa%th"; UsableURI oneStrayUuri = UsableURIFactory.getInstance(oneStray); assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI()); assertEquals("oneStray getEscapedURI", "http://www.example.com/pa%th", // browsers don't escape '%' oneStrayUuri.getEscapedURI()); String precededByValidEscape = "http://www.example.com/pa%20th%way"; UsableURI precededByValidEscapeUuri = UsableURIFactory.getInstance( precededByValidEscape); assertEquals("precededByValidEscape getURI", "http://www.example.com/pa th%way", // getURI interprets escapes precededByValidEscapeUuri.getURI()); assertEquals("precededByValidEscape getEscapedURI", "http://www.example.com/pa%20th%way", precededByValidEscapeUuri.getEscapedURI()); String followedByValidEscape = "http://www.example.com/pa%th%20way"; UsableURI followedByValidEscapeUuri = UsableURIFactory.getInstance( followedByValidEscape); assertEquals("followedByValidEscape getURI", "http://www.example.com/pa%th way", // getURI interprets escapes followedByValidEscapeUuri.getURI()); assertEquals("followedByValidEscape getEscapedURI", "http://www.example.com/pa%th%20way", followedByValidEscapeUuri.getEscapedURI()); } public void testEscapingNotNecessary() throws URIException { String escapesUnnecessary = "http://www.example.com/misc;reserved:chars@that&don't=need" +"+escaping$even,though!you(might)initially?think#so"; // expect everything but the #fragment String expected = escapesUnnecessary.substring(0, escapesUnnecessary .length() - 3); assertEquals("escapes unnecessary", expected, UsableURIFactory.getInstance(escapesUnnecessary).toString()); } public void testIdn() throws URIException { // See http://www.josefsson.org/idn.php. 
// http://räksmörgås.josefßon.org/ String idn1 = "http://r\u00e4ksm\u00f6rg\u00e5s.josef\u00dfon.org/"; String puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/"; assertEquals("encoding of " + idn1, puny1, UsableURIFactory .getInstance(idn1).toString()); // http://www.pølse.dk/ String idn2 = "http://www.p\u00f8lse.dk/"; String puny2 = "http://www.xn--plse-gra.dk/"; assertEquals("encoding of " + idn2, puny2, UsableURIFactory .getInstance(idn2).toString()); // http://例子.測試 String idn3 = "http://\u4F8B\u5B50.\u6E2C\u8A66"; String puny3 = "http://xn--fsqu00a.xn--g6w251d/"; assertEquals("encoding of " + idn3, puny3, UsableURIFactory .getInstance(idn3).toString()); } public void testNewLineInURL() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://www.ar\rchive\n." + "org/i\n\n\r\rndex.html"); assertEquals("http://www.archive.org/index.html", uuri.toString()); } public void testTabsInURL() throws URIException { UsableURI uuri = UsableURIFactory.getInstance("http://www.ar\tchive\t." + "org/i\t\r\n\tndex.html"); assertEquals("http://www.archive.org/index.html", uuri.toString()); } public void testQueryEscaping() throws URIException { UsableURI uuri = UsableURIFactory.getInstance( "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<"); assertEquals( // tests in FF1.5 indicate it only escapes " < > "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C", uuri.toString()); } /** * Check that our 'normalization' does same as Nutch's * Below before-and-afters were taken from the nutch urlnormalizer-basic * TestBasicURLNormalizer class (December 2006, Nutch 0.9-dev). 
* @throws URIException
     */
    public void testSameAsNutchURLFilterBasic() throws URIException {
        // surrounding whitespace is dropped
        assertNormalizesTo(" http://foo.com/ ", "http://foo.com/");

        // check that protocol is lower cased
        assertNormalizesTo("HTTP://foo.com/", "http://foo.com/");

        // check that host is lower cased
        assertNormalizesTo("http://Foo.Com/index.html",
                "http://foo.com/index.html");
        assertNormalizesTo("http://Foo.Com/index.html",
                "http://foo.com/index.html");

        // check that port number is normalized
        assertNormalizesTo("http://foo.com:80/index.html",
                "http://foo.com/index.html");
        assertNormalizesTo("http://foo.com:81/", "http://foo.com:81/");

        // check that null path is normalized
        assertNormalizesTo("http://foo.com", "http://foo.com/");

        // check that references are removed
        assertNormalizesTo("http://foo.com/foo.html#ref",
                "http://foo.com/foo.html");

        //     // check that encoding is normalized
        //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");

        // check that unnecessary "../" are removed
        assertNormalizesTo("http://foo.com/aa/../", "http://foo.com/");
        assertNormalizesTo("http://foo.com/aa/bb/../", "http://foo.com/aa/");

        /* We fail this one.  Here we produce: 'http://foo.com/'.
        assertEquals(UURIFactory.
                getInstance("http://foo.com/aa/..").toString(),
            "http://foo.com/aa/..");
        */

        assertNormalizesTo("http://foo.com/aa/bb/cc/../../foo.html",
                "http://foo.com/aa/foo.html");
        assertNormalizesTo("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
                "http://foo.com/aa/cc/ee/foo.html");
        assertNormalizesTo("http://foo.com/../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/../../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/../aa/../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/aa/../../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/aa/../bb/../foo.html/../../",
                "http://foo.com/");
        assertNormalizesTo("http://foo.com/../aa/foo.html",
                "http://foo.com/aa/foo.html");
        assertNormalizesTo("http://foo.com/../aa/../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/a..a/foo.html",
                "http://foo.com/a..a/foo.html");
        assertNormalizesTo("http://foo.com/a..a/../foo.html",
                "http://foo.com/foo.html");
        assertNormalizesTo("http://foo.com/foo.foo/../foo.html",
                "http://foo.com/foo.html");
    }

    /**
     * Asserts that the UsableURI built from <code>input</code> has
     * <code>expected</code> as its string form. Passes the values to
     * assertEquals in (expected, actual) order so that a failure
     * message reports them the right way round.
     *
     * @param input raw URI string handed to the factory
     * @param expected string form the resulting URI must have
     * @throws URIException if the factory rejects <code>input</code>
     */
    private void assertNormalizesTo(String input, String expected)
    throws URIException {
        assertEquals("normalization of '" + input + "'", expected,
                UsableURIFactory.getInstance(input).toString());
    }

    /**
     * Asserts that the factory refuses to build a UsableURI from the
     * given string by throwing a URIException.
     *
     * @param uri string expected to be rejected
     */
    private void assertRejected(String uri) {
        try {
            UsableURIFactory.getInstance(uri);
        } catch (URIException expected) {
            return;
        }
        fail("Didn't throw exception when one expected: " + uri);
    }

    /**
     * An http(s) scheme followed only by a colon and slash(es) must be
     * rejected with a URIException.
     */
    public void testHttpSchemeColonSlash() {
        assertRejected("https:/");
        assertRejected("http://");
    }

    /**
     * A naked "https:" must be rejected with a URIException, both on its
     * own and when resolved against a base URI.
     */
    public void testNakedHttpsSchemeColon() {
        assertRejected("https:");
        try {
            UsableURI base = UsableURIFactory.getInstance("http://www.example.com");
            UsableURIFactory.getInstance(base, "https:");
            fail("Didn't throw exception when one expected");
        } catch (URIException expected) {
            // expected
        }
    }

    /**
     * Test motivated by [#HER-616] The UURI class may throw
     * NullPointerException in getReferencedHost()
     *
     * @throws URIException
     */
    public void testMissingHttpColon() throws URIException {
        String suspectUri = "http//www.test.foo";
        UsableURI base = UsableURIFactory.getInstance("http://www.example.com");
        try {
            UsableURI badUuri = UsableURIFactory.getInstance(suspectUri);
            badUuri.getReferencedHost(); // not reached
            // fail() rather than a flag checked in a finally block, so an
            // unexpected runtime exception is reported as itself instead of
            // being replaced by the assertion failure.
            fail("expected exception not thrown");
        } catch (URIException expected) {
            // should get relative-uri-no-base exception
        }
        // With a base to resolve against, the same string must be usable.
        UsableURI goodUuri = UsableURIFactory.getInstance(base, suspectUri);
        goodUuri.getReferencedHost();
    }

    /**
     * A UURI's string representation should be same after a
     * serialization roundtrip.
     *
     * @throws URIException
     */
    public final void testSerializationRoundtrip() throws URIException {
        UsableURI uuri = UsableURIFactory.
            getInstance("http://www.example.com/path?query#anchor");
        UsableURI uuri2 = (UsableURI) SerializationUtils.deserialize(
                SerializationUtils.serialize(uuri));
        assertEquals("Not equal", uuri.toString(), uuri2.toString());

        uuri = UsableURIFactory.
            getInstance("file://///boo_hoo/wwwroot/CMS/Images1/Banner.gif");
        uuri2 = (UsableURI) SerializationUtils.deserialize(
                SerializationUtils.serialize(uuri));
        assertEquals("Not equal", uuri.toString(), uuri2.toString());
    }

    /**
     * A UURI's string representation should be same after a
     * toCustomString-getInstance roundtrip.
     *
     * @throws URIException
     */
    public final void testToCustomStringRoundtrip() throws URIException {
        UsableURI uuri = UsableURIFactory.
            getInstance("http://www.example.com/path?query#anchor");
        UsableURI uuri2 = UsableURIFactory.getInstance(uuri.toCustomString());
        assertEquals("Not equal", uuri.toString(), uuri2.toString());
        // TODO: fix
        // see [HER-1470] UURI String roundtrip (UURIFactory.getInstance(uuri.toString()) results in different URI for file: (and perhaps other) URIs
        // http://webteam.archive.org/jira/browse/HER-1470
//        uuri = UURIFactory.
//            getInstance("file://///boo_hoo/wwwroot/CMS/Images1/Banner.gif");
//        uuri2 = UURIFactory.getInstance(uuri.toCustomString());
//        assertEquals("Not equal", uuri.toString(), uuri2.toString());
    }

    /**
     * A UURI obtained by resolving a bare "hostname:port" string against
     * a base should have the same string representation after a
     * toCustomString-getInstance roundtrip.
     *
     * @throws URIException
     */
    public final void testHostnamePortRoundtrip() throws URIException {
        UsableURI base = UsableURIFactory.
            getInstance("http://www.example.com/path?query#anchor");
        UsableURI test = UsableURIFactory.getInstance(base, "boom1.hostname.com:9999");
        UsableURI roundtrip = UsableURIFactory.getInstance(test.toCustomString());
        assertEquals("Not equal", test.toString(), roundtrip.toString());
    }

    /**
     * Test bad port throws URIException not NumberFormatException
     */
    public void testExtremePort() {
        try {
            UsableURI uuri = UsableURIFactory.getInstance("http://Tel.:010101010101");
            // Report the unexpectedly accepted URI in the failure message
            // instead of printing it to stdout.
            fail("expected exception not thrown, got: " + uuri);
        } catch (URIException ue) {
            // expected
        }
    }

    /**
     * Bars ('|') in path-segments aren't encoded by FF, preferred by some
     * RESTful-URI-ideas guides, so should work without error.
     *
     * @throws URIException
     */
    public void testBarsInRelativePath() throws URIException {
        UsableURI base = UsableURIFactory.getInstance("http://www.example.com");
        String relative = "foo/bar|baz|yorple";
        // Both resolution routes must complete without throwing.
        base.resolve(relative);
        UsableURIFactory.getInstance(base, relative);
    }

    /**
     * To match IE behavior, backslashes in path-info (really, anywhere
     * before the query string) are assumed to be slashes. In the
     * query-string, they are escaped to %5C.
     *
     * @throws URIException
     */
    public void testBackslashes() throws URIException {
        UsableURI uuri = UsableURIFactory.getInstance("http:\\/www.example.com\\a/b\\c/d?q\\r\\|s/t\\v");
        String expected = "http://www.example.com/a/b/c/d?q%5Cr%5C|s/t%5Cv";
        assertEquals(expected, uuri.toString());
    }
}