/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.util; import java.net.URL; import org.junit.Assert; import org.junit.Test; /** Test class for URLUtil */ public class TestURLUtil { @Test public void testGetDomainName() throws Exception { URL url = null; url = new URL("http://lucene.apache.org/nutch"); Assert.assertEquals("apache.org", URLUtil.getDomainName(url)); url = new URL("http://en.wikipedia.org/wiki/Java_coffee"); Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url)); url = new URL("http://140.211.11.130/foundation/contributing.html"); Assert.assertEquals("140.211.11.130", URLUtil.getDomainName(url)); url = new URL("http://www.example.co.uk:8080/index.html"); Assert.assertEquals("example.co.uk", URLUtil.getDomainName(url)); url = new URL("http://com"); Assert.assertEquals("com", URLUtil.getDomainName(url)); url = new URL("http://www.example.co.uk.com"); Assert.assertEquals("uk.com", URLUtil.getDomainName(url)); // "nn" is not a tld url = new URL("http://example.com.nn"); Assert.assertEquals("nn", URLUtil.getDomainName(url)); url = new URL("http://"); Assert.assertEquals("", URLUtil.getDomainName(url)); url = new URL("http://www.edu.tr.xyz"); Assert.assertEquals("xyz", URLUtil.getDomainName(url)); url = new URL("http://www.example.c.se"); Assert.assertEquals("example.c.se", URLUtil.getDomainName(url)); // plc.co.im is listed as a domain suffix url = new URL("http://www.example.plc.co.im"); Assert.assertEquals("example.plc.co.im", URLUtil.getDomainName(url)); // 2000.hu is listed as a domain suffix url = new URL("http://www.example.2000.hu"); Assert.assertEquals("example.2000.hu", URLUtil.getDomainName(url)); // test non-ascii url = new URL("http://www.example.商業.tw"); Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url)); } @Test public void testGetDomainSuffix() throws Exception { URL url = null; url = new URL("http://lucene.apache.org/nutch"); Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain()); url = new URL("http://140.211.11.130/foundation/contributing.html"); Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.co.uk:8080/index.html"); Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain()); url = new URL("http://com"); Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain()); url = new URL("http://www.example.co.uk.com"); Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain()); // "nn" is not a tld url = new URL("http://example.com.nn"); Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://"); Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://www.edu.tr.xyz"); Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://subdomain.example.edu.tr"); Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain()); url = new URL("http://subdomain.example.presse.fr"); Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain()); url = new URL("http://subdomain.example.presse.tr"); Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain()); // plc.co.im is listed as a domain suffix url = new URL("http://www.example.plc.co.im"); Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain()); // 2000.hu is listed as a domain suffix url = new URL("http://www.example.2000.hu"); Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain()); // test non-ascii url = new URL("http://www.example.商業.tw"); Assert.assertEquals("商業.tw", URLUtil.getDomainSuffix(url).getDomain()); } @Test public void testGetHostSegments() throws Exception { URL url; String[] segments; url = new URL("http://subdomain.example.edu.tr"); segments = URLUtil.getHostSegments(url); Assert.assertEquals("subdomain", segments[0]); Assert.assertEquals("example", segments[1]); Assert.assertEquals("edu", segments[2]); Assert.assertEquals("tr", segments[3]); url = new URL("http://"); segments = URLUtil.getHostSegments(url); Assert.assertEquals(1, segments.length); Assert.assertEquals("", segments[0]); url = new URL("http://140.211.11.130/foundation/contributing.html"); segments = URLUtil.getHostSegments(url); Assert.assertEquals(1, segments.length); Assert.assertEquals("140.211.11.130", segments[0]); // test non-ascii url = new URL("http://www.example.商業.tw"); segments = URLUtil.getHostSegments(url); Assert.assertEquals("www", segments[0]); Assert.assertEquals("example", segments[1]); Assert.assertEquals("商業", segments[2]); Assert.assertEquals("tw", segments[3]); } @Test public void testChooseRepr() throws Exception { String aDotCom = "http://www.a.com"; String bDotCom = "http://www.b.com"; String aSubDotCom = "http://www.news.a.com"; String aQStr = "http://www.a.com?y=1"; String aPath = "http://www.a.com/xyz/index.html"; String aPath2 = "http://www.a.com/abc/page.html"; String aPath3 = "http://www.news.a.com/abc/page.html"; // 1) different domain them keep dest, temp or perm // a.com -> b.com* Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, true)); Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, false)); // 2) permanent and root, keep src // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aQStr, false)); Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, false)); // 3) permanent and not root and dest root, keep dest // a.com/xyz/index.html -> a.com* Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, false)); // 4) permanent and neither root keep dest // a.com/xyz/index.html -> a.com/abc/page.html* Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, false)); // 5) temp and root and dest not root keep src // *a.com -> a.com/xyz/index.html Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, true)); // 6) temp and not root and dest root keep dest // a.com/xyz/index.html -> a.com* Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, true)); // 7) temp and neither root, keep shortest, if hosts equal by path else by // hosts // a.com/xyz/index.html -> a.com/abc/page.html* // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, true)); Assert.assertEquals(aPath, URLUtil.chooseRepr(aPath, aPath3, true)); // 8) temp and both root keep shortest sub domain // *www.a.com -> www.news.a.com Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true)); } // from RFC3986 section 5.4.1 private static String baseString = "http://a/b/c/d;p?q"; private static String[][] targets = new String[][] { // unknown protocol {"g:h" , "g:h"}, { "g", "http://a/b/c/g" }, { "./g", "http://a/b/c/g" }, { "g/", "http://a/b/c/g/" }, { "/g", "http://a/g" }, { "//g", "http://g" }, { "?y", "http://a/b/c/d;p?y" }, { "g?y", "http://a/b/c/g?y" }, { "#s", "http://a/b/c/d;p?q#s" }, { "g#s", "http://a/b/c/g#s" }, { "g?y#s", "http://a/b/c/g?y#s" }, { ";x", "http://a/b/c/;x" }, { "g;x", "http://a/b/c/g;x" }, { "g;x?y#s", "http://a/b/c/g;x?y#s" }, { "", "http://a/b/c/d;p?q" }, { ".", "http://a/b/c/" }, { "./", "http://a/b/c/" }, { "..", "http://a/b/" }, { "../", "http://a/b/" }, { "../g", "http://a/b/g" }, { "../..", "http://a/" }, { "../../", "http://a/" }, { "../../g", "http://a/g" } }; @Test public void testResolveURL() throws Exception { // test NUTCH-436 URL u436 = new URL("http://a/b/c/d;p?q#f"); Assert.assertEquals("http://a/b/c/d;p?q#f", u436.toString()); URL abs = URLUtil.resolveURL(u436, "?y"); Assert.assertEquals("http://a/b/c/d;p?y", abs.toString()); // test NUTCH-566 URL u566 = new URL("http://www.fleurie.org/entreprise.asp"); abs = URLUtil.resolveURL(u566, "?id_entrep=111"); Assert.assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString()); URL base = new URL(baseString); Assert.assertEquals("base url parsing", baseString, base.toString()); for (int i = 0; i < targets.length; i++) { URL u = URLUtil.resolveURL(base, targets[i][0]); Assert.assertEquals(targets[i][1], targets[i][1], u.toString()); } } @Test public void testToUNICODE() throws Exception { Assert.assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com")); Assert.assertEquals("http://uni-tübingen.de/", URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/")); Assert .assertEquals( "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1", URLUtil .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1")); } @Test public void testToASCII() throws Exception { Assert.assertEquals("http://www.xn--evir-zoa.com", URLUtil.toASCII("http://www.çevir.com")); Assert.assertEquals("http://xn--uni-tbingen-xhb.de/", URLUtil.toASCII("http://uni-tübingen.de/")); Assert .assertEquals( "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1", URLUtil .toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); } @Test public void testFileProtocol() throws Exception { // keep one single slash NUTCH-XXX Assert.assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html")); Assert.assertEquals("file:/path/file.html", URLUtil.toUNICODE("file:/path/file.html")); } }