/* UriUtilsTest
*
* $Id: ArchiveUtilsTest.java 5052 2007-04-10 02:26:52Z gojomo $
*
* Copyright (C) 2010 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.util;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* JUnit test suite for UriUtils.
*
* Several of the tests for the 'legacy' (H1 through at least 1.14.4)
* heuristics are disabled by renaming, because those heuristics have known
* failures; however, until more experience with the new heuristics is
* collected, H1 still uses them for consistency.
*
* @contributor gojomo
* @version $Id: ArchiveUtilsTest.java 5052 2007-04-10 02:26:52Z gojomo $
*/
public class UriUtilsTest extends TestCase {

    /**
     * Creates a test case that runs the named test method.
     *
     * @param testName name of the test method to run
     */
    public UriUtilsTest(final String testName) {
        super(testName);
    }

    /**
     * Runs all the tests in UriUtilsTest with the JUnit text runner.
     *
     * @param argv the command line arguments (unused)
     */
    public static void main(String[] argv) {
        junit.textui.TestRunner.run(suite());
    }

    /**
     * Builds the suite containing every <code>test*</code> method of this
     * class. Methods renamed to <code>xest*</code> are intentionally left
     * out of the suite (see the class comment regarding legacy heuristics).
     *
     * @return suite of all enabled tests in this class
     */
    public static Test suite() {
        return new TestSuite(UriUtilsTest.class);
    }

    /** Relative image URIs that should be considered likely URIs. */
    static String[] urisRelativeImages = {
        "photo.jpg",
        "./photo.jpg",
        "../photo.jpg",
        "images/photo.jpg",
        "../../images/photo.jpg"
    };

    /**
     * Check that plausible relative image URIs return true with legacy tests.
     * Disabled (renamed from <code>test...</code>) so JUnit does not run it.
     */
    public void xestLegacySimpleImageRelatives() {
        legacyTryAll(urisRelativeImages, true);
    }

    /** Check that plausible relative image URIs return true with new tests. */
    public void testNewSimpleImageRelatives() {
        tryAll(urisRelativeImages, true);
    }

    /** Absolute URIs that should be considered likely URIs. */
    static String[] urisAbsolute = {
        "http://example.com",
        "http://example.com/",
        "http://www.example.com",
        "http://www.example.com/",
        "http://www.example.com/about",
        "http://www.example.com/about/",
        "http://www.example.com/about/index.html",
        "https://example.com",
        "https://example.com/",
        "https://www.example.com",
        "https://www.example.com/",
        "https://www.example.com/about",
        "https://www.example.com/about/",
        "https://www.example.com/about/index.html",
        "ftp://example.com/public/report.pdf",
        "http://a.example.com/combiner/c?js=analytics/sOmni.js,analytics/analytics.js,analytics/zf.js,analytics/externalnielsen.js",
        "http://l.example.com/jn/util/anysize/74*74c-86400,http%3A%2F%2Fl.example.com%2Fa%2Fi%2Fus%2Fshine%2Fmoreon%2F74.upallnight.jpg",
        // TODO: other schemes? mailto?
    };

    /** Check that absolute URIs return true with legacy tests. */
    public void testLegacyAbsolutes() {
        legacyTryAll(urisAbsolute, true);
    }

    /** Check that absolute URIs return true with new tests. */
    public void testAbsolutes() {
        tryAll(urisAbsolute, true);
    }

    /** Assorted relative URIs that should be considered likely URIs. */
    protected static String[] urisRelative = new String[] {
        "default.asp?type=1",
        "\\/add\\/page?.crumb=O2.eArRHJUUWRkVHN6L0Y.&frompg=p1",
        "/wiki/Ficheiro:Wikiversity-logo.svg",
        "cssp!gelui-1/overlay",
        "/wiki/%E0%B4%B8%E0%B4%B9%E0%B4%BE%E0%B4%AF%E0%B4%82:To_Read_in_Malayalam",
        "/wiki/Wikiversity:Why_create_an_account%3F",
    };

    /** Check that assorted relative URIs return true with new tests. */
    public void testRelatives() {
        tryAll(urisRelative, true);
    }

    /** Path-absolute image URIs that should be considered likely URIs. */
    static String[] urisPathAbsoluteImages = {
        "/photo.jpg",
        "/images/photo.jpg",
    };

    /** Check that path-absolute image URIs return true with legacy tests. */
    public void testLegacySimpleImagePathAbsolutes() {
        legacyTryAll(urisPathAbsoluteImages, true);
    }

    /** Check that path-absolute image URIs return true with new tests. */
    public void testSimpleImagePathAbsolutes() {
        tryAll(urisPathAbsoluteImages, true);
    }

    /** URI-like strings risking false positives that should NOT be likely URIs. */
    static String[] notUrisNaiveFalsePositives = {
        "0.99",
        "3.14157",
        "text/javascript"
    };

    /**
     * Check that typical false-positives of the naive test are not deemed
     * URIs by the legacy tests. Disabled (renamed from <code>test...</code>)
     * so JUnit does not run it.
     */
    public void xestLegacyNaiveFalsePositives() {
        legacyTryAll(notUrisNaiveFalsePositives, false);
    }

    /** Check that typical false-positives of the naive test are not deemed URIs. */
    public void testNaiveFalsePositives() {
        tryAll(notUrisNaiveFalsePositives, false);
    }

    /** Strings that should not be considered likely URIs. */
    static String[] notUrisNaive = {
        "foo bar",
        "<script>foo=bar</script>",
        "item\t$0.99\tred",
    };

    /** Check that strings that fail the naive test are not deemed URIs by legacy tests. */
    public void testLegacyNaiveNotUris() {
        legacyTryAll(notUrisNaive, false);
    }

    /** Check that strings that fail the naive test are not deemed URIs by new tests. */
    public void testNaiveNotUris() {
        tryAll(notUrisNaive, false);
    }

    /** Strings containing quotes, brackets and similar characters that should NOT be likely URIs. */
    protected static final String[] unusualCharacterFalsePositives = new String[] {
        "),f=document.getElementsByTagName(",
        "window.location.href='/'",
        "location='http://example.com/blah/'",
        "http://example.com/intent/user?screen_name='+p.user+'",
        ").append(",
        "[\\x3cb\\x3e-\\x3c/b\\x3e]",
        "http://demo.example.net/panama.php?cgroup=ron728x90&pid=\"+pid+\"&uid=\"+uid+\"&rid=\"+rid+\"&kw=10&cx=10&bh=10",
    };

    /** Check that strings with unusual characters are not deemed URIs. */
    public void testUnusualCharacterFalsePositives() {
        tryAll(unusualCharacterFalsePositives, false);
    }

    /** MIME type strings that should NOT be likely URIs. */
    protected static final String[] mimetypesFalsePositives = new String[] {
        "text/javascript",
        "text/css",
        "application/x-shockwave-flash",
        "text/javaScript",
        "text/html",
        "application/x-www-form-urlencoded",
        "text/xml",
        "text/plain",
        "application/x-mplayer2",
        "application/json",
        "image/jpeg",
        "image/x-icon",
        "audio/mpeg",
        "image/gif",
        "audio/ogg",
        "video/quicktime",
        "audio/x-pn-realaudio-plugin",
    };

    /** Check that MIME type strings are not deemed URIs. */
    public void testMimetypesFalsePositives() {
        tryAll(mimetypesFalsePositives, false);
    }

    /** Strings starting or ending with '+' that should NOT be likely URIs. */
    protected static final String[] startsOrEndsWithPlusFalsePositives = new String[] {
        "+resp.result+",
        ";overlay.style.width=viewport_dimensions.width+",
        "+_ti;bb.src=",
    };

    /** Check that strings starting or ending with '+' are not deemed URIs. */
    public void testStartsOrEndsWithPlusFalsePositives() {
        tryAll(startsOrEndsWithPlusFalsePositives, false);
    }

    /** Strings containing a double slash that should NOT be likely URIs. */
    protected static final String[] doubleSlashFalsePositives = new String[] {
        ".//*",
        "http://example.com/monkey//foo/whatever"
    };

    /**
     * Check that the double-slash candidates are not deemed URIs.
     *
     * Previously this test passed startsOrEndsWithPlusFalsePositives by
     * mistake, so the double-slash fixture was never exercised. A failure
     * here therefore indicates a genuine disagreement between the fixture
     * and UriUtils.isLikelyUri, not a regression introduced by this test.
     */
    public void testDoubleSlashFalsePositives() {
        tryAll(doubleSlashFalsePositives, false);
    }

    /**
     * Test that all supplied candidates give the expected result, for each of
     * the 'legacy' (H1) likely-URI-tests: both the javascript-context and the
     * html-context variants are checked for every candidate.
     *
     * @param candidates String[] to test
     * @param expected desired answer
     */
    protected void legacyTryAll(String[] candidates, boolean expected) {
        for (String candidate : candidates) {
            assertEquals("javascript context: " + candidate,
                    expected,
                    UriUtils.isLikelyUriJavascriptContextLegacy(candidate));
            assertEquals("html context: " + candidate,
                    expected,
                    UriUtils.isLikelyUriHtmlContextLegacy(candidate));
        }
    }

    /**
     * Test that all supplied candidates give the expected result from
     * UriUtils.isLikelyUri (the 'new' heuristics). The candidate itself is
     * used as the assertion message so a failure names the offending string.
     *
     * @param candidates String[] to test
     * @param expected desired answer
     */
    protected void tryAll(String[] candidates, boolean expected) {
        for (String candidate : candidates) {
            assertEquals(candidate, expected, UriUtils.isLikelyUri(candidate));
        }
    }
}