/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.url;
import java.util.Iterator;
import java.util.TreeMap;
import junit.framework.TestCase;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.SerializationUtils;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
/**
* Tests UsableURIFactory for proper UsableURI creation across a variety of
* important/tricky cases.
*
* Be careful writing this file. Make sure you write it with UTF-8 encoding.
*
* @author igor stack gojomo
*/
public class UsableURIFactoryTest extends TestCase {
/**
 * Feeds the factory a URI containing characters that need escaping, dot
 * segments, a backslash and surrounding NBSP/space characters, and compares
 * the string form of the result with the expected escaped URI.
 *
 * @throws URIException if the factory rejects the input
 */
public final void testEscaping() throws URIException {
    // Raw input; it ends with an NBSP and a SPACE.
    final String rawInput =
        "http://archive.org/.././"
        + "\u00A0"
        + UsableURIFactory.SPACE
        + UsableURIFactory.CIRCUMFLEX
        + UsableURIFactory.QUOT
        + UsableURIFactory.SQUOT
        + UsableURIFactory.APOSTROPH
        + UsableURIFactory.LSQRBRACKET
        + UsableURIFactory.RSQRBRACKET
        + UsableURIFactory.LCURBRACKET
        + UsableURIFactory.RCURBRACKET
        + UsableURIFactory.BACKSLASH
        + "test/../a.gif"
        + "\u00A0"
        + UsableURIFactory.SPACE;

    // Note: single quote is not being escaped by URI class.
    // The trailing NBSP and SPACE should be trimmed, so the expected URI
    // ends at "a.gif".
    final String expectedEscaped =
        "http://archive.org/"
        + UsableURIFactory.ESCAPED_SPACE
        + UsableURIFactory.ESCAPED_SPACE
        + UsableURIFactory.ESCAPED_CIRCUMFLEX
        + UsableURIFactory.ESCAPED_QUOT
        + UsableURIFactory.SQUOT
        + UsableURIFactory.ESCAPED_APOSTROPH
        + UsableURIFactory.ESCAPED_LSQRBRACKET
        + UsableURIFactory.ESCAPED_RSQRBRACKET
        + UsableURIFactory.ESCAPED_LCURBRACKET
        + UsableURIFactory.ESCAPED_RCURBRACKET
        + UsableURIFactory.SLASH
        + "a.gif";

    final UsableURI parsed = UsableURIFactory.getInstance(rawInput);
    assertEquals("expected escaping", expectedEscaped, parsed.toString());
}
/**
 * Checks that the port is still parsed when the host name contains an
 * underscore.
 *
 * @throws URIException if the factory rejects the URI
 */
public final void testUnderscoreMakesPortParseFail() throws URIException {
    UsableURI uuri = UsableURIFactory.getInstance("http://one-two_three:8080/index.html");
    // assertEquals reports the port that was actually found, which a bare
    // boolean comparison cannot do.
    assertEquals("Failed find of port " + uuri, 8080, uuri.getPort());
}
/**
 * Checks that a relative reference containing a double slash keeps both
 * slashes when resolved against a base.
 *
 * @throws URIException if either URI cannot be created
 */
public final void testRelativeURIWithTwoSlashes() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("http://www.archive.org");
    UsableURI uuri = UsableURIFactory.getInstance(base, "one//index.html");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Doesn't do right thing with two slashes " + uuri,
        "http://www.archive.org/one//index.html", uuri.toString());
}
/**
 * Checks that a scheme-less ("//host/...") reference whose query contains
 * colons takes its scheme from the https base.
 *
 * @throws URIException if either URI cannot be created
 */
public final void testSchemelessURI() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("https://www.archive.org");
    UsableURI uuri = UsableURIFactory.getInstance(base, "//example.com/monkey?this:uri:has:colons");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Doesn't do right thing with a schemeless URI " + uuri,
        "https://example.com/monkey?this:uri:has:colons", uuri.toString());
}
/**
 * Checks that one or several trailing "%20" sequences are stripped from
 * the end of a URI.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testTrailingEncodedSpace() throws URIException {
    final String expected = "http://www.nps-shoes.co.uk/";

    UsableURI uuri = UsableURIFactory.getInstance("http://www.nps-shoes.co.uk%20");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Doesn't strip trailing encoded space 1 " + uuri,
        expected, uuri.toString());

    uuri = UsableURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");
    assertEquals("Doesn't strip trailing encoded space 2 " + uuri,
        expected, uuri.toString());
}
/**
 * Checks that "http://archive.org:0080" is rendered as
 * "http://archive.org/", i.e. the zero-padded port does not survive in the
 * string form.
 *
 * @throws URIException if the URI cannot be created
 */
public final void testPort0080is80() throws URIException {
    UsableURI uuri = UsableURIFactory.getInstance("http://archive.org:0080");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Doesn't strip leading zeros " + uuri,
        "http://archive.org/", uuri.toString());
}
// DISABLING TEST AS PRECURSOR TO ELIMINATION
// the problematic input given -- specifically the "%6s" incomplete uri-escape,
// shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least,
// will attempt to fetch such a URL (getting, in this case against that ad
// server, a bad-request error). Ideally, we'd generate exactly the same
// request against the server as they do. However, with the most recent
// fixup for stray '%' signs, we come close, but not exactly. That's enough
// to cause this test to fail (it's not getting the expected exception) but
// our almost-URI, which might be what was intended, is better than trying
// nothing.
// public final void testBadPath() {
// String message = null;
// try {
// UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +
// "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +
// "generic&Params.richmedia=yes%26city%3Dseattle%26" +
// "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +
// "%6state%3DWA");
// } catch (URIException e) {
// message = e.getMessage();
// }
// assertNotNull("Didn't get expected exception.", message);
// }
/**
 * Creates a UsableURI from a URI with percent-escaped bytes, passing
 * "windows-1256" as the charset, and reads its path. Nothing is asserted:
 * the test passes as long as neither call throws.
 *
 * @throws URIException if creating the URI or reading its path fails
 */
public final void testEscapeEncoding() throws URIException {
    final String escapedUri = "http://www.y1y1.com/" +
        "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg";
    final UsableURI parsed =
        UsableURIFactory.getInstance(escapedUri, "windows-1256");
    parsed.getPath();
}
/**
 * Builds a URI with 1024 spaces in its path and expects the factory to
 * reject it with a message starting "Created (escaped) uuri >".
 */
public final void testTooLongAfterEscaping() {
    final StringBuilder longUri = new StringBuilder("http://www.archive.org/a/");
    // Append bunch of spaces. When escaped, they'll triple in size.
    int spacesLeft = 1024;
    while (spacesLeft > 0) {
        longUri.append(" ");
        spacesLeft--;
    }
    longUri.append("/index.html");

    // Stays null when no URIException is thrown.
    String failure = null;
    try {
        UsableURIFactory.getInstance(longUri.toString());
    } catch (URIException e) {
        failure = e.getMessage();
    }

    final boolean tooLongReported =
        failure != null && failure.startsWith("Created (escaped) uuri >");
    assertTrue("Wrong or no exception: " + failure, tooLongReported);
}
/**
 * Checks that scheme, authority (including user:password) and path of an
 * ftp URI are returned unchanged by the corresponding getters.
 *
 * @throws URIException if the URI cannot be created or queried
 */
public final void testFtpUris() throws URIException {
    final String FTP = "ftp";
    final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
    final String PATH = "/clzreceive/";
    final String uri = FTP + "://" + AUTHORITY + PATH;
    UsableURI uuri = UsableURIFactory.getInstance(uri);
    // assertEquals shows expected vs. actual and does not throw a
    // NullPointerException when a getter returns null.
    assertEquals("Failed to get matching scheme: " + uuri.getScheme(),
        FTP, uuri.getScheme());
    assertEquals("Failed to get matching authority: " + uuri.getAuthority(),
        AUTHORITY, uuri.getAuthority());
    assertEquals("Failed to get matching path: " + uuri.getPath(),
        PATH, uuri.getPath());
}
/**
 * Checks that whitespace and control characters are escaped even when the
 * URI already contains percent-escapes, both for absolute URIs and for a
 * relative reference resolved against a base.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testWhitespaceEscaped() throws URIException {
    // Test that we get all whitespace even if the uri is
    // already escaped.
    String uri = "http://archive.org/index%25 .html";
    String tgtUri = "http://archive.org/index%25%20.html";
    UsableURI uuri = UsableURIFactory.getInstance(uri);
    // Use assertEquals like the two checks below so a failure shows
    // expected vs. actual.
    assertEquals("Not equal " + uuri.toString(), tgtUri, uuri.toString());

    // U+001D control character; the expected string is lower-cased
    // before the comparison.
    uri = "http://archive.org/index%25\u001D.html";
    tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
    uuri = UsableURIFactory.getInstance(uri);
    assertEquals("whitespace escaping", tgtUri, uuri.toString());

    // A space inside a relative reference that also carries escapes.
    uri = "http://gemini.info.usaid.gov/directory/" +
        "pbResults.cfm?&urlNameLast=Rumplestiltskin";
    tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" +
        "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006";
    uuri = UsableURIFactory.getInstance(UsableURIFactory.getInstance(uri),
        "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" +
        "RRB%20%20%20%205%2E08%2D006");
    assertEquals("whitespace escaping", tgtUri, uuri.toString());
}
// public final void testFailedGetPath() throws URIException {
// final String path = "/RealMedia/ads/" +
// "click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";
// // decoding in getPath will interpret %CA as 8-bit escaped char,
// // possibly incomplete
// final String uri = "http://ads.nandomedia.com" + path;
// final UURI uuri = UURIFactory.getInstance(uri);
// String foundPath = uuri.getPath();
// assertEquals("unexpected path", path, foundPath);
// }
/**
 * Checks that getReferencedHost() returns "ads.nandomedia.com" for three
 * forms of dns URI: with "//" and a port, opaque, and opaque with a query.
 *
 * @throws URIException if a URI cannot be created or queried
 */
public final void testDnsHost() throws URIException {
    final String expectedHost = "ads.nandomedia.com";

    String uri = "dns://ads.nandomedia.com:81/one.html";
    UsableURI uuri = UsableURIFactory.getInstance(uri);
    String host = uuri.getReferencedHost();
    // assertEquals handles a null host as an ordinary failure instead of
    // throwing a NullPointerException from host.equals(...).
    assertEquals("Host is wrong " + host, expectedHost, host);

    uri = "dns:ads.nandomedia.com";
    uuri = UsableURIFactory.getInstance(uri);
    host = uuri.getReferencedHost();
    assertEquals("Host is wrong " + host, expectedHost, host);

    uri = "dns:ads.nandomedia.com?a=b";
    uuri = UsableURIFactory.getInstance(uri);
    host = uuri.getReferencedHost();
    assertEquals("Host is wrong " + host, expectedHost, host);
}
/**
 * Checks that a URI containing stray '%' characters comes back from the
 * factory unchanged.
 *
 * @throws URIException if the URI cannot be created
 */
public final void testPercentEscaping() throws URIException {
    final String input = "http://archive.org/%a%%%%%.html";
    // tests indicate firefox (1.0.6) does not encode '%' at all
    final String expected = "http://archive.org/%a%%%%%.html";
    final String actual = UsableURIFactory.getInstance(input).toString();
    assertEquals("Not equal", expected, actual);
}
/**
 * Checks that double slashes inside a relative path are preserved in the
 * path of the resolved URI.
 *
 * @throws URIException if a URI cannot be created or queried
 */
public final void testRelativeDblPathSlashes() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("http://www.archive.org/index.html");
    UsableURI uuri = UsableURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");
    // assertEquals shows expected vs. actual path on failure.
    assertEquals("Double slash not working " + uuri.toString(),
        "/JIGOU//KYC//INDEX.HTM", uuri.getPath());
}
/**
 * Checks that "http:boo" (a scheme followed by a relative path) is resolved
 * relative to the base rather than treated as an absolute URI.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testRelativeWithScheme() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("http://www.example.com/some/page");
    UsableURI uuri = UsableURIFactory.getInstance(base, "http:boo");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Relative with scheme not working " + uuri.toString(),
        "http://www.example.com/some/boo", uuri.toString());
}
/**
 * Resolves an absolute URI ending in "...&lt;/a" against a long base URI.
 * Nothing is asserted: the test passes as long as no exception is thrown.
 *
 * @throws URIException if either URI cannot be created
 */
public final void testBadBaseResolve() throws URIException {
    final String baseText = "http://license.joins.com/board/" +
        "etc_board_list.asp?board_name=new_main&b_type=&nPage=" +
        "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" +
        "notice&gate=02";
    final UsableURI baseUri = UsableURIFactory.getInstance(baseText);
    UsableURIFactory.getInstance(baseUri, "http://www.changeup.com/...</a");
}
/**
 * Checks that a URI with a tilde in its path passes through the factory
 * unchanged.
 *
 * @throws URIException if the URI cannot be created
 */
public final void testTilde() throws URIException {
    final String withTilde = "http://license.joins.com/~igor";
    noChangeExpected(withTilde);
}
/**
 * Checks handling of curly brackets: left alone in the query string,
 * escaped in the path, and rejected with a URIException in the host.
 *
 * @throws URIException if one of the URIs expected to be valid is rejected
 */
public final void testCurlies() throws URIException {
    // Firefox allows curlies in the query string portion of a URL only
    // (converts curlies if they are in the path portion ahead of the
    // query string).
    UsableURI uuri =
        noChangeExpected("http://license.joins.com/igor?one={curly}");
    // Expected value goes first so JUnit's failure message labels the
    // two values correctly.
    assertEquals("one={curly}", uuri.getQuery());
    assertEquals("http://license.joins.com/igor%7Bcurly%7D.html",
        UsableURIFactory.
            getInstance("http://license.joins.com/igor{curly}.html").
            toString());
    boolean exception = false;
    try {
        UsableURIFactory.getInstance("http://license.{curly}.com/igor.html");
    } catch (URIException u) {
        // Curlies in the host are expected to be rejected.
        exception = true;
    }
    assertTrue("Did not get exception.", exception);
}
/**
 * Creates a UsableURI from the given string and asserts that its string
 * form equals the input.
 *
 * @param original URI string expected to pass through unchanged
 * @return the UsableURI created from {@code original}
 * @throws URIException if the factory rejects the string
 */
protected UsableURI noChangeExpected(final String original)
        throws URIException {
    final UsableURI parsed = UsableURIFactory.getInstance(original);
    final String rendered = parsed.toString();
    assertEquals(original, rendered);
    return parsed;
}
/**
 * Checks that leading and trailing spaces/NBSPs are trimmed while spaces
 * and the NBSP inside the path are escaped as "%20".
 *
 * @throws URIException if the URI cannot be created
 */
public final void testTrimSpaceNBSP() throws URIException {
    final String uri = "   http://archive.org/DIR WITH SPACES/" +
        UsableURIFactory.NBSP + "home.html    " + UsableURIFactory.NBSP + "   ";
    final String tgtUri =
        "http://archive.org/DIR%20WITH%20SPACES/%20home.html";
    UsableURI uuri = UsableURIFactory.getInstance(uri);
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Not equal " + uuri.toString(), tgtUri, uuri.toString());
}
/**
* Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them).
* See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
* @throws URIException
*/
public final void testSpaceDoubleEncoding() throws URIException {
    // The query starts with a literal space followed by an already
    // escaped one; the result should contain "%20%20".
    final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics";
    final String encodedUri =
        "http://www.brook.edu/i.html?%20%20taxonomy=Politics";
    UsableURI uuri = UsableURIFactory.getInstance(uri, "ISO-8859-1");
    // assertEquals shows expected vs. actual on failure.
    assertEquals("Not equal " + uuri.toString(), encodedUri, uuri.toString());
}
/**
* Test for doubly-encoded sequences.
* See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
* @throws URIException
*/
public final void testDoubleEncoding() throws URIException {
    final char aeLetter = '\u00E6';
    final String rawUri = "http://archive.org/DIR WITH SPACES/home" +
        aeLetter + ".html";

    // ISO-8859-1: first pass escapes once.
    final String latin1Expected =
        "http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
    UsableURI current = UsableURIFactory.getInstance(rawUri, "ISO-8859-1");
    assertEquals("single encoding", latin1Expected, current.toString());

    // Feeding the escaped form back through the factory twice must not
    // change it.
    for (int pass = 0; pass < 2; pass++) {
        current = UsableURIFactory.getInstance(current.toString(), "ISO-8859-1");
    }
    assertEquals("double encoding", latin1Expected, current.toString());

    // Same checks with the default (utf-8) charset.
    final String utf8Expected =
        "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
    current = UsableURIFactory.getInstance(rawUri);
    assertEquals("Not equal utf8", utf8Expected, current.toString());

    for (int pass = 0; pass < 2; pass++) {
        current = UsableURIFactory.getInstance(current.toString());
    }
    assertEquals("Not equal (dbl-encoding) utf8", utf8Expected, current.toString());
}
/**
* Test for syntax errors stop page parsing.
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
* @throws URIException
*/
public final void testThreeSlashes() throws URIException {
    UsableURI goodURI = UsableURIFactory.
        getInstance("http://lcweb.loc.gov/rr/goodtwo.html");

    // Three slashes after the scheme.
    String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
    UsableURI rewrittenURI = UsableURIFactory.getInstance(uuri);
    // assertEquals shows the rewritten form next to the expected one.
    assertEquals("Not equal " + goodURI + ", " + uuri,
        goodURI.toString(), rewrittenURI.toString());

    // Four slashes after the scheme.
    uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
    rewrittenURI = UsableURIFactory.getInstance(uuri);
    assertEquals("Not equal " + goodURI + ", " + uuri,
        goodURI.toString(), rewrittenURI.toString());

    // Check https.
    goodURI = UsableURIFactory.
        getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
    uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
    rewrittenURI = UsableURIFactory.getInstance(uuri);
    assertEquals("Not equal " + goodURI + ", " + uuri,
        goodURI.toString(), rewrittenURI.toString());
}
/**
 * Checks that a URI string without a scheme is rejected with a
 * URIException.
 */
public final void testNoScheme() {
    final String schemeless = "www.loc.gov/rr/european/egw/polishex.html";
    boolean rejected;
    try {
        UsableURIFactory.getInstance(schemeless);
        rejected = false;
    } catch (URIException e) {
        // Expected exception.
        rejected = true;
    }
    assertTrue("Didn't get expected exception: " + schemeless, rejected);
}
/**
 * Checks that an absolute-path reference ("/home.html") resolved against a
 * base with a port and a deeper path yields the same URI as the expected
 * absolute form.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testRelative() throws URIException {
    UsableURI uuriTgt = UsableURIFactory.
        getInstance("http://archive.org:83/home.html");
    UsableURI uri = UsableURIFactory.
        getInstance("http://archive.org:83/one/two/three.html");
    UsableURI uuri = UsableURIFactory.
        getInstance(uri, "/home.html");
    // assertEquals shows both string forms on failure.
    assertEquals("Not equal", uuriTgt.toString(), uuri.toString());
}
/**
 * Checks that scheme-less ("//host/path") references take the scheme of the
 * base, including when the query itself contains a full URL.
 *
 * @throws URIException if a URI cannot be created
 */
public void testSchemelessRelative() throws URIException {
    final UsableURI page = UsableURIFactory.getInstance("http://www.itsnicethat.com/articles/laura-hobson");

    final UsableURI plain = UsableURIFactory.getInstance(page, "//www.facebook.com/plugins/like.php");
    final String plainText = plain.toString();
    assertEquals("schemaless relative 1", "http://www.facebook.com/plugins/like.php", plainText);

    // reported by Erin Staniland
    final UsableURI withQuery = UsableURIFactory.getInstance(page, "//www.facebook.com/plugins/like.php?href=http://www.itsnicethat.com/articles/laura-hobson");
    final String withQueryText = withQuery.toString();
    assertEquals("schemeless relative 2", "http://www.facebook.com/plugins/like.php?href=http://www.itsnicethat.com/articles/laura-hobson",
        withQueryText);
}
/**
* Test that an empty uuri does the right thing -- that we get back the
* base.
*
* @throws URIException
*/
public final void testRelativeEmpty() throws URIException {
    UsableURI uuriTgt = UsableURIFactory.
        getInstance("http://archive.org:83/one/two/three.html");
    UsableURI uri = UsableURIFactory.
        getInstance("http://archive.org:83/one/two/three.html");
    // Resolving the empty reference should give back the base.
    UsableURI uuri = UsableURIFactory.
        getInstance(uri, "");
    // assertEquals shows both string forms on failure.
    assertEquals("Empty length don't work",
        uuriTgt.toString(), uuri.toString());
}
/**
 * Checks that an absolute URI passed together with a base is returned as
 * that absolute URI, independent of the base's path.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testAbsolute() throws URIException {
    UsableURI uuriTgt = UsableURIFactory.
        getInstance("http://archive.org:83/home.html");
    UsableURI uri = UsableURIFactory.
        getInstance("http://archive.org:83/one/two/three.html");
    UsableURI uuri = UsableURIFactory.
        getInstance(uri, "http://archive.org:83/home.html");
    // assertEquals shows both string forms on failure.
    assertEquals("Not equal", uuriTgt.toString(), uuri.toString());
}
/**
* Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts).
* @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
*/
public final void testHostWithLessThan() {
    // Each of these hosts contains a character ('<', '|' or '"') and is
    // expected to be rejected.
    final String[] badHostUris = {
        "http://www.betamobile.com</A",
        "http://C|/unzipped/426/spacer.gif",
        "http://www.lycos.co.uk\"/l/b/\"",
    };
    for (String badHostUri : badHostUris) {
        checkExceptionOnIllegalDomainlabel(badHostUri);
    }
}
/**
* Test for [ 1012520 ] UURI.length() > 2k.
* @throws URIException
* @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() > 2k</a>
*/
public final void test2kURI() throws URIException {
    final String segment = "/123456789";
    final StringBuilder uri = new StringBuilder("http://a.b");
    int appended = 0;
    while (appended < 207) {
        uri.append(segment);
        appended++;
    }
    // 10 + 207 * 10 = 2080 characters. Legal.
    UsableURIFactory.getInstance(uri.toString());

    // Ten more characters make the size illegal.
    uri.append(segment);
    boolean rejected;
    try {
        UsableURIFactory.getInstance(uri.toString());
        rejected = false;
    } catch (URIException e) {
        rejected = true;
    }
    assertTrue("No expected exception complaining about long URI",
        rejected);
}
/**
 * Asserts that creating a UsableURI from the given string throws a
 * URIException.
 *
 * @param uuri URI string expected to be rejected
 */
private void checkExceptionOnIllegalDomainlabel(String uuri) {
    boolean rejected;
    try {
        UsableURIFactory.getInstance(uuri);
        rejected = false;
    } catch (URIException e) {
        // Expected exception.
        rejected = true;
    }
    assertTrue("Didn't get expected exception: " + uuri, rejected);
}
/**
* Test for doing separate DNS lookup for same host
*
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
* @throws URIException
*/
public final void testHostWithPeriod() throws URIException {
    // Same host, once with and once without a trailing dot.
    final UsableURI dotted = UsableURIFactory.
        getInstance("http://www.loc.gov./index.html");
    final UsableURI plain = UsableURIFactory.
        getInstance("http://www.loc.gov/index.html");
    final String dottedHost = dotted.getHost();
    final String plainHost = plain.getHost();
    assertEquals("Failed equating hosts with dot", dottedHost, plainHost);
}
/**
* Test for NPE in java.net.URI.encode
*
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
* @throws URIException
*/
public final void testHostEncodedChars() throws URIException {
    // NOTE(review): the tail of this literal looks like mis-encoded text
    // (replacement characters); it is kept exactly as found in the file.
    final String rawUri = "http://g.msn.co.kr/0nwkokr0/00/19??" +
        "PS=10274&NC=10009&CE=42&CP=949&HL=" +
        "���?��";
    final UsableURI parsed = UsableURIFactory.getInstance(rawUri);
    assertNotNull("Encoded chars " + rawUri, parsed);
}
/**
* Test for java.net.URI parses %20 but getHost null
*
* See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
*/
public final void testSpaceInHost() {
    // Host containing an escaped space.
    boolean escapedRejected;
    try {
        UsableURIFactory.getInstance(
            "http://www.local-regions.odpm%20.gov.uk" +
            "/lpsa/challenge/pdf/propect.pdf");
        escapedRejected = false;
    } catch (URIException e) {
        escapedRejected = true;
    }
    assertTrue("Did not fail with escaped space.", escapedRejected);

    // Host containing a literal space.
    boolean literalRejected;
    try {
        UsableURIFactory.getInstance(
            "http://www.local-regions.odpm .gov.uk" +
            "/lpsa/challenge/pdf/propect.pdf");
        literalRejected = false;
    } catch (URIException e) {
        literalRejected = true;
    }
    assertTrue("Did not fail with real space.", literalRejected);
}
/**
* Test for java.net.URI chokes on hosts_with_underscores.
*
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
* @throws URIException
*/
public final void testHostWithUnderscores() throws URIException {
    final String expectedHost = "x_underscore_underscore.2u.com.tw";
    final UsableURI parsed = UsableURIFactory.getInstance(
        "http://x_underscore_underscore.2u.com.tw/nonexistent_page.html");
    final String actualHost = parsed.getHost();
    assertEquals("Failed get of host with underscore",
        expectedHost, actualHost);
}
/**
* Two dots for igor.
*/
public final void testTwoDots() {
    // A host with two consecutive dots is expected to be rejected.
    final String doubleDotUri =
        "http://x_underscore_underscore..2u.com/nonexistent_page.html";
    boolean rejected;
    try {
        UsableURIFactory.getInstance(doubleDotUri);
        rejected = false;
    } catch (URIException e) {
        rejected = true;
    }
    assertTrue("Two dots did not throw exception", rejected);
}
/**
* Test for java.net.URI#getHost fails when leading digit.
*
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
* @throws URIException
*/
public final void testHostWithDigit() throws URIException {
    final String expectedHost = "0204chat.2u.com.tw";
    final UsableURI parsed = UsableURIFactory.
        getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
    final String actualHost = parsed.getHost();
    assertEquals("Failed get of host with digit", expectedHost, actualHost);
}
/**
* Test for Constraining java URI class.
*
* @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
*/
public final void testPort() {
    // Every entry carries a port that is expected to be rejected.
    final String[] badPortUris = {
        "http://www.tyopaikat.com:a/robots.txt",
        "http://158.144.21.3:80808/robots.txt",
        "http://pdb.rutgers.edu:81.rutgers.edu/robots.txt",
        "https://webmail.gse.harvard.edu:9100robots.txt/robots.txt",
        "https://webmail.gse.harvard.edu:0/robots.txt/robots.txt",
    };
    for (String badPortUri : badPortUris) {
        checkBadPort(badPortUri);
    }
}
/**
* Test bad port throws exception.
* @param uri URI with bad port to check.
*/
private void checkBadPort(String uri) {
    boolean rejected;
    try {
        UsableURIFactory.getInstance(uri);
        // Reaching this line means the bad port was accepted.
        rejected = false;
    } catch (URIException e) {
        rejected = true;
    }
    assertTrue("Didn't throw exception: " + uri, rejected);
}
/**
* Preserve userinfo capitalization.
* @throws URIException
*/
public final void testUserinfo() throws URIException {
    final String authority = "stack:StAcK@www.tyopaikat.com";
    final String uri = "http://" + authority + "/robots.txt";
    UsableURI uuri = UsableURIFactory.getInstance(uri);
    // Expected value goes first so JUnit's failure message labels the
    // two values correctly.
    assertEquals("Authority not equal", authority,
        uuri.getAuthority());
    /*
    String tmp = uuri.toString();
    assertTrue("URI not equal", tmp.equals(uri));
    */
}
/**
* Test user info + port
* @throws URIException
*/
public final void testUserinfoPlusPort() throws URIException {
    final String credentials = "stack:StAcK";
    final String host = "www.tyopaikat.com";
    final int portNumber = 8080;
    final String fullUri = "http://" + credentials + "@" + host + ":"
        + portNumber + "/robots.txt";
    final UsableURI parsed = UsableURIFactory.getInstance(fullUri);

    // Individual components.
    assertEquals("Host not equal", host, parsed.getHost());
    assertEquals("Userinfo Not equal", credentials, parsed.getUserinfo());
    assertEquals("Port not equal", portNumber, parsed.getPort());

    // Authority with and without the userinfo part.
    assertEquals("Authority wrong", "stack:StAcK@www.tyopaikat.com:8080",
        parsed.getAuthority());
    assertEquals("AuthorityMinusUserinfo wrong", "www.tyopaikat.com:8080",
        parsed.getAuthorityMinusUserinfo());
}
/**
 * Checks that the query-only reference "?y" resolves to
 * "http://a/b/c/d;p?y", keeping the last path segment of the base.
 *
 * @throws URIException if a URI cannot be created
 */
public final void testRFC3986RelativeChange() throws URIException {
    final UsableURI rfcBase = UsableURIFactory.getInstance("http://a/b/c/d;p?q");
    final String queryOnly = "?y";
    tryRelative(rfcBase, queryOnly, "http://a/b/c/d;p?y");
}
/**
* Tests from rfc3986
*
* <pre>
* "g:h" = "g:h"
* "g" = "http://a/b/c/g"
* "./g" = "http://a/b/c/g"
* "g/" = "http://a/b/c/g/"
* "/g" = "http://a/g"
* "//g" = "http://g"
* "?y" = "http://a/b/c/d;p?y"
* "g?y" = "http://a/b/c/g?y"
* "#s" = "http://a/b/c/d;p?q#s"
* "g#s" = "http://a/b/c/g#s"
* "g?y#s" = "http://a/b/c/g?y#s"
* ";x" = "http://a/b/c/;x"
* "g;x" = "http://a/b/c/g;x"
* "g;x?y#s" = "http://a/b/c/g;x?y#s"
* "" = "http://a/b/c/d;p?q"
* "." = "http://a/b/c/"
* "./" = "http://a/b/c/"
* ".." = "http://a/b/"
* "../" = "http://a/b/"
* "../g" = "http://a/b/g"
* "../.." = "http://a/"
* "../../" = "http://a/"
* "../../g" = "http://a/g"
* </pre>
*
* @throws URIException
*/
public final void testRFC3986Relative() throws URIException {
    final UsableURI base = UsableURIFactory.getInstance("http://a/b/c/d;p?q");
    // Each row is { relative reference, expected resolution }.
    final String[][] cases = {
        { "g:h", "g:h" },
        { "g", "http://a/b/c/g" },
        { "./g", "http://a/b/c/g" },
        { "g/", "http://a/b/c/g/" },
        { "/g", "http://a/g" },
        { "//g", "http://g" },
        { "?y", "http://a/b/c/d;p?y" },
        { "g?y", "http://a/b/c/g?y" },
        { "#s", "http://a/b/c/d;p?q#s" },
        { "g#s", "http://a/b/c/g#s" },
        { "g?y#s", "http://a/b/c/g?y#s" },
        { ";x", "http://a/b/c/;x" },
        { "g;x", "http://a/b/c/g;x" },
        { "g;x?y#s", "http://a/b/c/g;x?y#s" },
        { "", "http://a/b/c/d;p?q" },
        { ".", "http://a/b/c/" },
        { "./", "http://a/b/c/" },
        { "..", "http://a/b/" },
        { "../", "http://a/b/" },
        { "../g", "http://a/b/g" },
        { "../..", "http://a/" },
        { "../../", "http://a/" },
        { "../../g", "http://a/g" },
    };
    for (String[] row : cases) {
        tryRelative(base, row[0], row[1]);
    }
}
/**
 * Resolves {@code relative} against {@code base} and asserts that the
 * result equals the UsableURI created from {@code expected}.
 *
 * @param base base URI to resolve against
 * @param relative reference to resolve
 * @param expected string form of the expected resolution
 * @throws URIException if any of the URIs cannot be created
 */
protected void tryRelative(UsableURI base, String relative, String expected)
        throws URIException {
    final UsableURI resolved = UsableURIFactory.getInstance(base, relative);
    final String failureText = "Derelativized " + relative + " gave "
        + resolved + " not " + expected;
    final UsableURI wanted = UsableURIFactory.getInstance(expected);
    assertEquals(failureText, wanted, resolved);
}
/**
* Tests from rfc2396 with amendments to accommodate differences
* intentionally added to make our URI handling like IE's.
*
* <pre>
* g:h = g:h
* g = http://a/b/c/g
* ./g = http://a/b/c/g
* g/ = http://a/b/c/g/
* /g = http://a/g
* //g = http://g
* ?y = http://a/b/c/?y
* g?y = http://a/b/c/g?y
* #s = (current document)#s
* g#s = http://a/b/c/g#s
* g?y#s = http://a/b/c/g?y#s
* ;x = http://a/b/c/;x
* g;x = http://a/b/c/g;x
* g;x?y#s = http://a/b/c/g;x?y#s
* . = http://a/b/c/
* ./ = http://a/b/c/
* .. = http://a/b/
* ../ = http://a/b/
* ../g = http://a/b/g
* ../.. = http://a/
* ../../ = http://a/
* ../../g = http://a/g
* </pre>
*
* @throws URIException
*/
public final void testRFC2396Relative() throws URIException {
    final UsableURI base = UsableURIFactory.
        getInstance("http://a/b/c/d;p?q");
    // Relative reference -> expected resolution. Being a TreeMap, the
    // entries are visited in sorted key order.
    final TreeMap<String,String> expectations = new TreeMap<String,String>();
    expectations.put("..", "http://a/b/");
    expectations.put("../", "http://a/b/");
    expectations.put("../g", "http://a/b/g");
    expectations.put("../..", "http://a/");
    expectations.put("../../", "http://a/");
    expectations.put("../../g", "http://a/g");
    expectations.put("g#s", "http://a/b/c/g#s");
    expectations.put("g?y#s ", "http://a/b/c/g?y#s");
    expectations.put(";x", "http://a/b/c/;x");
    expectations.put("g;x", "http://a/b/c/g;x");
    expectations.put("g;x?y#s", "http://a/b/c/g;x?y#s");
    expectations.put(".", "http://a/b/c/");
    expectations.put("./", "http://a/b/c/");
    expectations.put("g", "http://a/b/c/g");
    expectations.put("./g", "http://a/b/c/g");
    expectations.put("g/", "http://a/b/c/g/");
    expectations.put("/g", "http://a/g");
    expectations.put("//g", "http://g");
    // CHANGED BY RFC3986
    // expectations.put("?y", "http://a/b/c/?y");
    expectations.put("g?y", "http://a/b/c/g?y");
    // EXTRAS beyond the RFC set.
    // TODO: That these resolve to a path of /a/g might be wrong. Perhaps
    // it should be '/g'?.
    expectations.put("/../../../../../../../../g", "http://a/g");
    expectations.put("../../../../../../../../g", "http://a/g");
    expectations.put("../G", "http://a/b/G");

    for (String relative : expectations.keySet()) {
        final String wanted = expectations.get(relative);
        final UsableURI resolved = UsableURIFactory.getInstance(base, relative);
        final boolean matches =
            resolved.equals(UsableURIFactory.getInstance(wanted));
        assertTrue("Unexpected " + relative + " " + wanted + " " + resolved,
            matches);
    }
}
/**
* A UURI should always be without a 'fragment' segment, which is
* unused and irrelevant for network fetches.
*
* See [ 970666 ] #anchor links not trimmed, and thus recrawled
*
* @throws URIException
*/
public final void testAnchors() throws URIException {
    final String withFragment = "http://www.example.com/path?query#anchor";
    final String withoutFragment = "http://www.example.com/path?query";
    final UsableURI parsed = UsableURIFactory.getInstance(withFragment);
    // The "#anchor" part must be gone from the string form.
    assertEquals("Not equal", withoutFragment, parsed.toString());
}
/**
* Ensure that URI strings beginning with a colon are treated
* the same as browsers do (as relative, rather than as absolute
* with zero-length scheme).
*
* @throws URIException
*/
public void testStartsWithColon() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("http://www.example.com/path/page");
    UsableURI uuri = UsableURIFactory.getInstance(base,":foo");
    // Expected value goes first so JUnit's failure message labels the
    // two values correctly.
    assertEquals("derelativize starsWithColon",
        "http://www.example.com/path/:foo",
        uuri.getURI());
}
/**
* Ensure that relative URIs with colons in late positions
* aren't mistakenly interpreted as absolute URIs with long,
* illegal schemes.
*
* @throws URIException
*/
public void testLateColon() throws URIException {
    UsableURI base = UsableURIFactory.getInstance("http://www.example.com/path/page");

    // Colons in both the path parameter and the query.
    UsableURI uuri1 = UsableURIFactory.getInstance(base,"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value");
    // Expected value goes first so JUnit's failure message labels the
    // two values correctly.
    assertEquals("derelativize lateColon",
        "http://www.example.com/path/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value",
        uuri1.getURI());

    // Colon in the query only.
    UsableURI uuri2 = UsableURIFactory.getInstance(base,"example.html?parameter=this:value");
    assertEquals("derelativize lateColon",
        "http://www.example.com/path/example.html?parameter=this:value",
        uuri2.getURI());
}
/**
* Ensure that stray trailing '%' characters do not prevent
* UURI instances from being created, and are reasonably
* escaped when encountered.
*
* @throws URIException
*/
public void testTrailingPercents() throws URIException {
    // Case 1: trailing '%' in an otherwise plain path.
    final String barePath = "http://www.example.com/path%";
    final UsableURI barePathUri = UsableURIFactory.getInstance(barePath);
    assertEquals("plainPath getURI", barePath, barePathUri.getURI());
    assertEquals("plainPath getEscapedURI",
        "http://www.example.com/path%", // browsers don't escape '%'
        barePathUri.getEscapedURI());

    // Case 2: trailing '%' after a valid escape in the path.
    final String escapedPath = "http://www.example.com/pa%20th%";
    final UsableURI escapedPathUri =
        UsableURIFactory.getInstance(escapedPath);
//    assertEquals("partiallyEscapedPath getURI",
//        "http://www.example.com/pa th%", // TODO: is this desirable?
////        escapedPath,
//        escapedPathUri.getURI());
    assertEquals("partiallyEscapedPath getEscapedURI",
        "http://www.example.com/pa%20th%",
        escapedPathUri.getEscapedURI());

    // Case 3: trailing '%' in the query string.
    final String bareQuery = "http://www.example.com/path?q=foo%";
    final UsableURI bareQueryUri = UsableURIFactory.getInstance(bareQuery);
//    assertEquals("plainQueryString getURI",
//        bareQuery,
//        bareQueryUri.getURI());
    assertEquals("plainQueryString getEscapedURI",
        "http://www.example.com/path?q=foo%",
        bareQueryUri.getEscapedURI());

    // Case 4: valid escape in the path plus trailing '%' in the query.
    final String escapedPathWithQuery =
        "http://www.example.com/pa%20th?q=foo%";
    final UsableURI escapedPathWithQueryUri =
        UsableURIFactory.getInstance(escapedPathWithQuery);
    assertEquals("partiallyEscapedQueryString getURI",
        "http://www.example.com/pa th?q=foo%",
        escapedPathWithQueryUri.getURI());
    assertEquals("partiallyEscapedQueryString getEscapedURI",
        "http://www.example.com/pa%20th?q=foo%",
        escapedPathWithQueryUri.getEscapedURI());
}
/**
* Ensure that stray '%' characters do not prevent
* UURI instances from being created, and are reasonably
* escaped when encountered.
*
* @throws URIException
*/
public void testStrayPercents() throws URIException {
    // Case 1: a single '%' that does not introduce a valid escape.
    final String lonePercent = "http://www.example.com/pa%th";
    final UsableURI lonePercentUri = UsableURIFactory.getInstance(lonePercent);
    assertEquals("oneStray getURI", lonePercent, lonePercentUri.getURI());
    // browsers don't escape '%', so the escaped form keeps it verbatim
    assertEquals("oneStray getEscapedURI",
            "http://www.example.com/pa%th",
            lonePercentUri.getEscapedURI());

    // Case 2: a valid %20 escape appears before the stray '%'.
    final UsableURI escapeThenStray =
            UsableURIFactory.getInstance("http://www.example.com/pa%20th%way");
    // getURI interprets escapes, so %20 comes back as a space
    assertEquals("precededByValidEscape getURI",
            "http://www.example.com/pa th%way",
            escapeThenStray.getURI());
    assertEquals("precededByValidEscape getEscapedURI",
            "http://www.example.com/pa%20th%way",
            escapeThenStray.getEscapedURI());

    // Case 3: the stray '%' appears before a valid %20 escape.
    final UsableURI strayThenEscape =
            UsableURIFactory.getInstance("http://www.example.com/pa%th%20way");
    // getURI interprets escapes, so %20 comes back as a space
    assertEquals("followedByValidEscape getURI",
            "http://www.example.com/pa%th way",
            strayThenEscape.getURI());
    assertEquals("followedByValidEscape getEscapedURI",
            "http://www.example.com/pa%th%20way",
            strayThenEscape.getEscapedURI());
}
/**
 * Reserved characters that do not need escaping should come through
 * unchanged; the only expected difference is the missing #fragment.
 *
 * @throws URIException
 */
public void testEscapingNotNecessary() throws URIException {
    final String input =
            "http://www.example.com/misc;reserved:chars@that&don't=need"
            + "+escaping$even,though!you(might)initially?think#so";
    // expect everything but the #fragment; the sole '#' in the input
    // starts the three-character "#so" suffix
    final String withoutFragment = input.substring(0, input.indexOf('#'));
    final String actual = UsableURIFactory.getInstance(input).toString();
    assertEquals("escapes unnecessary", withoutFragment, actual);
}
/**
 * Internationalized host names are expected to come back from the factory
 * in their punycode ("xn--") form.
 *
 * @throws URIException
 */
public void testIdn() throws URIException {
    // See http://www.josefsson.org/idn.php.
    // Each row is { IDN input, expected punycode result }.
    final String[][] cases = {
        // http://räksmörgås.josefßon.org/
        {"http://r\u00e4ksm\u00f6rg\u00e5s.josef\u00dfon.org/",
            "http://xn--rksmrgs-5wao1o.josefsson.org/"},
        // http://www.pølse.dk/
        {"http://www.p\u00f8lse.dk/",
            "http://www.xn--plse-gra.dk/"},
        // http://例子.測試
        {"http://\u4F8B\u5B50.\u6E2C\u8A66",
            "http://xn--fsqu00a.xn--g6w251d/"},
    };
    for (String[] row : cases) {
        final String idn = row[0];
        final String puny = row[1];
        assertEquals("encoding of " + idn, puny,
                UsableURIFactory.getInstance(idn).toString());
    }
}
/**
 * Carriage returns and line feeds scattered through the host and path are
 * expected to be absent from the resulting URI string.
 *
 * @throws URIException
 */
public void testNewLineInURL() throws URIException {
    final String withLineBreaks =
            "http://www.ar\rchive\n.org/i\n\n\r\rndex.html";
    final String actual =
            UsableURIFactory.getInstance(withLineBreaks).toString();
    assertEquals("http://www.archive.org/index.html", actual);
}
/**
 * Tabs (mixed with CR/LF) scattered through the host and path are
 * expected to be absent from the resulting URI string.
 *
 * @throws URIException
 */
public void testTabsInURL() throws URIException {
    final String withTabs = "http://www.ar\tchive\t.org/i\t\r\n\tndex.html";
    final String actual = UsableURIFactory.getInstance(withTabs).toString();
    assertEquals("http://www.archive.org/index.html", actual);
}
/**
 * Of the punctuation placed in the query string here, only the double
 * quote, '&lt;' and '&gt;' are expected to be percent-escaped.
 *
 * @throws URIException
 */
public void testQueryEscaping() throws URIException {
    final String rawQueryUri =
            "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<";
    // tests in FF1.5 indicate it only escapes " < >
    final String expected =
            "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C";
    final UsableURI parsed = UsableURIFactory.getInstance(rawQueryUri);
    assertEquals(expected, parsed.toString());
}
/**
* Check that our 'normalization' does the same as Nutch's.
* The before-and-after pairs below were taken from the nutch
* urlnormalizer-basic TestBasicURLNormalizer class (December 2006,
* Nutch 0.9-dev).
* @throws URIException
*/
public void testSameAsNutchURLFilterBasic() throws URIException {
    // Each row is { input, expected normalized string }. The expected
    // value is passed to assertEquals first so that a failure report
    // labels expected/actual correctly. Pairs that were asserted twice
    // in the earlier, hand-unrolled version of this test appear once.
    final String[][] cases = {
        // surrounding whitespace
        {" http://foo.com/ ", "http://foo.com/"},
        // check that protocol is lower cased
        {"HTTP://foo.com/", "http://foo.com/"},
        // check that host is lower cased
        {"http://Foo.Com/index.html", "http://foo.com/index.html"},
        // check that port number is normalized
        {"http://foo.com:80/index.html", "http://foo.com/index.html"},
        {"http://foo.com:81/", "http://foo.com:81/"},
        // check that null path is normalized
        {"http://foo.com", "http://foo.com/"},
        // check that references are removed
        {"http://foo.com/foo.html#ref", "http://foo.com/foo.html"},
        // Not asserted: encoding normalization, i.e.
        //   "http://foo.com/%66oo.html" -> "http://foo.com/foo.html"
        // check that unnecessary "../" are removed
        {"http://foo.com/aa/../", "http://foo.com/"},
        {"http://foo.com/aa/bb/../", "http://foo.com/aa/"},
        // Not asserted: we fail the Nutch expectation
        //   "http://foo.com/aa/.." -> "http://foo.com/aa/.."
        // Here we produce: 'http://foo.com/'.
        {"http://foo.com/aa/bb/cc/../../foo.html",
            "http://foo.com/aa/foo.html"},
        {"http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
            "http://foo.com/aa/cc/ee/foo.html"},
        {"http://foo.com/../foo.html", "http://foo.com/foo.html"},
        {"http://foo.com/../../foo.html", "http://foo.com/foo.html"},
        {"http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"},
        {"http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"},
        {"http://foo.com/aa/../bb/../foo.html/../../", "http://foo.com/"},
        {"http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"},
        {"http://foo.com/a..a/foo.html", "http://foo.com/a..a/foo.html"},
        {"http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"},
        {"http://foo.com/foo.foo/../foo.html", "http://foo.com/foo.html"},
    };
    for (String[] row : cases) {
        final String input = row[0];
        final String expected = row[1];
        assertEquals("normalization of '" + input + "'",
                expected,
                UsableURIFactory.getInstance(input).toString());
    }
}
/**
 * A scheme followed only by ":/" or "://" should be rejected by the
 * factory with a URIException.
 */
public void testHttpSchemeColonSlash() {
    final String[] schemeOnlyInputs = {"https:/", "http://"};
    for (String candidate : schemeOnlyInputs) {
        try {
            UsableURIFactory.getInstance(candidate);
            fail("Didn't throw exception when one expected");
        } catch (URIException expected) {
            // rejection is the outcome this test wants
        }
    }
}
/**
 * A bare "https:" should be rejected with a URIException both when given
 * on its own and when resolved against a valid base URI.
 */
public void testNakedHttpsSchemeColon() {
    // Stand-alone form, no base.
    try {
        UsableURIFactory.getInstance("https:");
        fail("Didn't throw exception when one expected");
    } catch (URIException expected) {
        // rejection is the outcome this test wants
    }

    // Build the base outside the expecting-failure block: if it lived
    // inside, a URIException from the base itself would be mistaken for
    // the rejection of "https:" and the test would pass for the wrong
    // reason.
    final UsableURI base;
    try {
        base = UsableURIFactory.getInstance("http://www.example.com");
    } catch (URIException e) {
        throw new IllegalStateException(
                "could not build base URI for test", e);
    }

    // Relative-to-base form.
    try {
        UsableURIFactory.getInstance(base, "https:");
        fail("Didn't throw exception when one expected");
    } catch (URIException expected) {
        // rejection is the outcome this test wants
    }
}
/**
* Test motivated by [#HER-616] The UURI class may throw
* NullPointerException in getReferencedHost()
*
* @throws URIException
*/
public void testMissingHttpColon() throws URIException {
    // "http//..." lacks the colon after the scheme, so on its own it is
    // not an absolute URI; resolved against a base it is usable.
    final String suspectUri = "http//www.test.foo";
    final UsableURI base =
            UsableURIFactory.getInstance("http://www.example.com");

    // Without a base a URIException is expected. The check is made with
    // fail() inside the try rather than an assertion in a finally block,
    // so that any unexpected exception (for example the
    // NullPointerException from HER-616) propagates unchanged instead of
    // being replaced by a generic assertion failure.
    try {
        UsableURI badUuri = UsableURIFactory.getInstance(suspectUri);
        badUuri.getReferencedHost(); // not reached
        fail("expected exception not thrown");
    } catch (URIException expected) {
        // should get relative-uri-no-base exception
    }

    // With a base, both creation and getReferencedHost() must complete
    // without throwing.
    final UsableURI goodUuri = UsableURIFactory.getInstance(base, suspectUri);
    goodUuri.getReferencedHost();
}
/**
* A UURI's string representation should be same after a
* serialization roundtrip.
*
* @throws URIException
*/
public final void testSerializationRoundtrip() throws URIException {
    // One http URI and one file URI are pushed through a
    // serialize/deserialize cycle, in that order.
    final String[] samples = {
        "http://www.example.com/path?query#anchor",
        "file://///boo_hoo/wwwroot/CMS/Images1/Banner.gif",
    };
    for (String sample : samples) {
        final UsableURI before = UsableURIFactory.getInstance(sample);
        final byte[] serialized = SerializationUtils.serialize(before);
        final UsableURI after =
                (UsableURI) SerializationUtils.deserialize(serialized);
        assertEquals("Not equal", before.toString(), after.toString());
    }
}
/**
* A UURI's string representation should be same after a
* toCustomString-getInstance roundtrip.
*
* @throws URIException
*/
public final void testToCustomStringRoundtrip() throws URIException {
    final UsableURI original = UsableURIFactory.getInstance(
            "http://www.example.com/path?query#anchor");
    // Re-parse the custom string form and compare string representations.
    final String customForm = original.toCustomString();
    final UsableURI reparsed = UsableURIFactory.getInstance(customForm);
    assertEquals("Not equal", original.toString(), reparsed.toString());
    // TODO: fix
    // The same roundtrip is not yet checked for file: URIs; see
    // [HER-1470] UURI String roundtrip
    // (UURIFactory.getInstance(uuri.toString()) results in different URI
    // for file: (and perhaps other) URIs
    // http://webteam.archive.org/jira/browse/HER-1470
    // Disabled case:
    //   uuri = UURIFactory.
    //       getInstance("file://///boo_hoo/wwwroot/CMS/Images1/Banner.gif");
    //   uuri2 = UURIFactory.getInstance(uuri.toCustomString());
    //   assertEquals("Not equal", uuri.toString(), uuri2.toString());
}
/**
* A UURI built by resolving a bare "hostname:port" string against a base
* should have the same string representation after a
* toCustomString-getInstance roundtrip.
*
* @throws URIException
*/
public final void testHostnamePortRoundtrip() throws URIException {
    final UsableURI base = UsableURIFactory.
            getInstance("http://www.example.com/path?query#anchor");
    // A bare "hostname:port" string, resolved against the base.
    final UsableURI test =
            UsableURIFactory.getInstance(base, "boom1.hostname.com:9999");
    final String custom = test.toCustomString();
    final UsableURI roundtrip = UsableURIFactory.getInstance(custom);
    // The parsed scheme and custom string are reported only if the
    // comparison fails, rather than printed to stdout on every run.
    assertEquals("Not equal (scheme:" + test.getScheme()
            + ", custom string:" + custom + ")",
            test.toString(), roundtrip.toString());
}
/**
* Test bad port throws URIException not NumberFormatException
*/
public void testExtremePort() {
    try {
        final UsableURI uuri =
                UsableURIFactory.getInstance("http://Tel.:010101010101");
        // Reaching this line means the oversized port was accepted; put
        // the resulting URI in the failure message instead of on stdout.
        fail("expected exception not thrown; got: " + uuri);
    } catch (URIException expected) {
        // expected: any other exception type (e.g. NumberFormatException)
        // is not caught here and so fails the test
    }
}
/**
* Bars ('|') in path-segments aren't encoded by FF, preferred by some
* RESTful-URI-ideas guides, so should work without error.
*
* @throws URIException
*/
public void testBarsInRelativePath() throws URIException {
    final String barredPath = "foo/bar|baz|yorple";
    final UsableURI root =
            UsableURIFactory.getInstance("http://www.example.com");
    // Neither result is inspected: the test passes as long as resolving
    // the barred path, by either route, does not throw.
    root.resolve(barredPath);
    UsableURIFactory.getInstance(root, barredPath);
}
/**
* To match IE behavior, backslashes in path-info (really, anywhere before
* the query string) are assumed to be slashes. In the query string, they
* are escaped to %5C.
*
* @throws URIException
*/
public void testBackslashes() throws URIException {
    // Backslashes appear after the scheme, in the path, and in the query.
    final String withBackslashes =
            "http:\\/www.example.com\\a/b\\c/d?q\\r\\|s/t\\v";
    final String actual =
            UsableURIFactory.getInstance(withBackslashes).toString();
    // before the '?': backslash -> '/'; after it: backslash -> %5C
    assertEquals("http://www.example.com/a/b/c/d?q%5Cr%5C|s/t%5Cv", actual);
}
}