/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.jboss.elasticsearch.river.remote.sitemap; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.URL; import java.util.zip.GZIPOutputStream; import org.jboss.elasticsearch.river.remote.DateTimeUtils; import org.junit.Assert; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; /** * Unit test for {@link SiteMapParser} * * @author http://code.google.com/p/crawler-commons * @author Vlastimil Elias (velias at redhat dot com) */ public class SiteMapParserTest { public static final String SITEMAP_XML_INDEX = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" + "<sitemap>" + " <loc>http://www.example.com/sitemap1.xml.gz</loc>" + " <lastmod>2004-10-01T18:23:17+00:00</lastmod>" + " </sitemap>" + "<sitemap>" + " <loc>http://www.example.com/sitemap2.xml.gz</loc>" + " <lastmod>2005-01-01</lastmod>" + " </sitemap>" + " </sitemapindex>"; @Test public void parseSiteMap_Index() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); Assert.assertTrue(parser.isStrict()); String contentType = "text/xml"; byte[] content = SITEMAP_XML_INDEX.getBytes(); URL url = new URL(URL_SITEMAP_XML); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(true, asm.isIndex()); assertEquals(true, asm instanceof SiteMapIndex); SiteMapIndex smi = (SiteMapIndex) asm; assertEquals(2, smi.getSitemaps().size()); AbstractSiteMap currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap1.xml.gz")); assertNotNull(currentSiteMap); assertEquals("http://www.example.com/sitemap1.xml.gz", currentSiteMap.getUrl().toString()); assertEquals(SiteMap.convertToDate("2004-10-01T18:23:17+00:00"), currentSiteMap.getLastModified()); currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap2.xml.gz")); assertNotNull(currentSiteMap); assertEquals("http://www.example.com/sitemap2.xml.gz", currentSiteMap.getUrl().toString()); assertEquals(SiteMap.convertToDate("2005-01-01"), currentSiteMap.getLastModified()); } public static final String URL_SITEMAP_XML = "http://www.example.com/sitemap.xml"; public static final String SITEMAP_XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" + " <url>" + "<loc>http://www.example.com/</loc>" + "<lastmod>2005-01-01</lastmod>" + "<changefreq>monthly</changefreq>" + "<priority>0.8</priority>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=73&desc=vacation_new_zealand</loc>" + "<lastmod>2004-12-23</lastmod>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=74&desc=vacation_newfoundland</loc>" + "<lastmod>2004-12-23T18:00:15+00:00</lastmod>" + "<priority>0.3</priority>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=83&desc=vacation_usa</loc>" + "<lastmod>2004-11-23</lastmod>" + "</url>" + "</urlset>"; @Test public void parseSiteMap_XML() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/xml"; byte[] content = SITEMAP_XML.getBytes(); URL url = new URL(URL_SITEMAP_XML); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); Assert.assertEquals("http://www.example.com/", sm.getSiteMapUrls().iterator().next().getUrl().toString()); Assert.assertEquals("2005-01-01T00:00:00.0+0000", DateTimeUtils.formatISODateTime(sm.getSiteMapUrls().iterator().next().getLastModified())); } @Test(expected = UnknownFormatException.class) public void parseSiteMap_UnknownXml() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/xml"; byte[] content = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<unknown>" + "</unknown>").getBytes(); URL url = new URL(URL_SITEMAP_XML); parser.parseSiteMap(contentType, content, url); } @Test public void parseSiteMap_XMLGzip() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "application/x-gzip"; ByteArrayOutputStream bos = new ByteArrayOutputStream(); GZIPOutputStream gos = new GZIPOutputStream(bos); gos.write(SITEMAP_XML.getBytes()); gos.close(); URL url = new URL(URL_SITEMAP_XML + ".gz"); AbstractSiteMap asm = parser.parseSiteMap(contentType, bos.toByteArray(), url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); Assert.assertEquals("http://www.example.com/", sm.getSiteMapUrls().iterator().next().getUrl().toString()); Assert.assertEquals("2005-01-01T00:00:00.0+0000", DateTimeUtils.formatISODateTime(sm.getSiteMapUrls().iterator().next().getLastModified())); } public static final String SITEMAP_XML_NO_DECLARATIONS = "<urlset>" + " <url>" + "<loc>http://www.example.com/</loc>" + "<lastmod>2005-01-01</lastmod>" + "<changefreq>monthly</changefreq>" + "<priority>0.8</priority>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=73&desc=vacation_new_zealand</loc>" + "<lastmod>2004-12-23</lastmod>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=74&desc=vacation_newfoundland</loc>" + "<lastmod>2004-12-23T18:00:15+00:00</lastmod>" + "<priority>0.3</priority>" + "</url>" + "<url>" + "<loc>http://www.example.com/catalog?item=83&desc=vacation_usa</loc>" + "<lastmod>2004-11-23</lastmod>" + "</url>" + "</urlset>"; @Test(expected = UnknownFormatException.class) public void parseSiteMap_UnknownFormat() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/unklnown"; byte[] content = SITEMAP_XML_NO_DECLARATIONS.getBytes(); URL url = new URL(URL_SITEMAP_XML + ".aaa"); parser.parseSiteMap(contentType, content, url); } @Test public void parseSiteMap_XMLNoDeclaration() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/xml"; byte[] content = SITEMAP_XML_NO_DECLARATIONS.getBytes(); URL url = new URL(URL_SITEMAP_XML); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); } @Test public void parseSiteMap_XMLNoDeclarationNoContentype() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); byte[] content = SITEMAP_XML_NO_DECLARATIONS.getBytes(); URL url = new URL(URL_SITEMAP_XML); AbstractSiteMap asm = parser.parseSiteMap(null, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); } @Test public void parseSiteMap_TXT() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/plain"; String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11"; byte[] content = scontent.getBytes(); URL url = new URL("http://www.example.com/sitemap.txt"); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(2, sm.getSiteMapUrls().size()); } @Test(expected = UnknownFormatException.class) public void parseSiteMap_BrokenXml() throws IOException, UnknownFormatException { // This Sitemap contains badly formatted XML and can't be read SiteMapParser parser = new SiteMapParser(); String contentType = "text/xml"; String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><urlset " + "xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"><url>" + "<!-- This file is not a valid XML file --></url><url><loc>" + "http://cs.harding.edu/fmccown/sitemaps/something.html</loc>" + "</url><!-- missing opening url tag --></url></urlset>"; byte[] content = scontent.getBytes(); URL url = new URL("http://www.example.com/sitemapindex.xml"); parser.parseSiteMap(contentType, content, url); } @Test public void parseSiteMap_lenientParser() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "text/xml"; String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" + " <url>" + "<loc>http://www.example.com/</loc>" + " </url>" + "</urlset>"; // no lenient parsing means URL is not there as it is not from range byte[] content = scontent.getBytes(); URL url = new URL("http://www.example.com/subsection/sitemap.xml"); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(0, sm.getSiteMapUrls().size()); // Now try again with lenient parsing. We should get one invalid URL parser = new SiteMapParser(false); Assert.assertFalse(parser.isStrict()); asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); sm = (SiteMap) asm; assertEquals(1, sm.getSiteMapUrls().size()); assertFalse(sm.getSiteMapUrls().iterator().next().isValid()); // no lenient parsing means URL is not there as it is from another domain parser = new SiteMapParser(); asm = parser.parseSiteMap(contentType, content, new URL("http://www.example.org/sitemap.xml")); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); sm = (SiteMap) asm; assertEquals(0, sm.getSiteMapUrls().size()); parser = new SiteMapParser(false); asm = parser.parseSiteMap(contentType, content, new URL("http://www.example.org/sitemap.xml")); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); sm = (SiteMap) asm; assertEquals(1, sm.getSiteMapUrls().size()); assertFalse(sm.getSiteMapUrls().iterator().next().isValid()); } public static final String URL_SITEMAP_ATOM = "http://www.example.com/sitemap.xml"; public static final String SITEMAP_ATOM = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<feed>" + "<modified>2005-01-01</modified>" + "<entry><link href=\"http://www.example.com/\"/></entry>" + "<entry><link href=\"http://www.example.com/catalog?item=12&desc=vacation_hawaii\"/></entry>" + "<entry><link href=\"http://www.example.com/catalog?item=73&desc=vacation_new_zealand\"/></entry>" + "<entry><link href=\"http://www.example.com/catalog?item=74&desc=vacation_newfoundland\"/></entry>" + "<entry><link href=\"http://www.example.org/catalog?item=74&desc=vacation_newfoundland\"/></entry>" + "<entry><link href=\"\"/></entry>" + "<entry><link href=\"http://www.example.com/catalog?item=83&desc=vacation_usa\"/></entry>" + "</feed>"; @Test public void parseSiteMap_Atom() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "application/atom+xml"; byte[] content = SITEMAP_ATOM.getBytes(); URL url = new URL(URL_SITEMAP_ATOM); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); Assert.assertEquals("http://www.example.com/", sm.getSiteMapUrls().iterator().next().getUrl().toString()); Assert.assertEquals("2005-01-01T00:00:00.0+0000", DateTimeUtils.formatISODateTime(sm.getSiteMapUrls().iterator().next().getLastModified())); } public static final String URL_SITEMAP_RSS = "http://www.example.com/sitemap.xml"; public static final String SITEMAP_RSS = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<rss version=\"2.0\"> <channel>" + "<pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>" + "<item><link>http://www.example.com/</link></item>" + "<item><link>http://www.example.com/catalog?item=12&desc=vacation_hawaii</link></item>" + "<item><link>http://www.example.com/catalog?item=73&desc=vacation_new_zealand</link></item>" + "<item><link>http://www.example.com/catalog?item=74&desc=vacation_newfoundland</link></item>" + "<item><link>http://www.example.org/catalog?item=74&desc=vacation_newfoundland</link></item>" + "<item><link></link></item>" + "<item><link>http://www.example.com/catalog?item=83&desc=vacation_usa</link></item>" + "</channel></rss>"; @Test public void parseSiteMap_Rss() throws UnknownFormatException, IOException { SiteMapParser parser = new SiteMapParser(); String contentType = "application/rss+xml"; byte[] content = SITEMAP_RSS.getBytes(); URL url = new URL(URL_SITEMAP_RSS); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(false, asm.isIndex()); assertEquals(true, asm instanceof SiteMap); SiteMap sm = (SiteMap) asm; assertEquals(5, sm.getSiteMapUrls().size()); Assert.assertEquals("http://www.example.com/", sm.getSiteMapUrls().iterator().next().getUrl().toString()); Assert.assertEquals("2003-06-10T04:00:00.0+0000", DateTimeUtils.formatISODateTime(sm.getSiteMapUrls().iterator().next().getLastModified())); } @Test public void urlIsLegal() { Assert.assertTrue(SiteMapParser.urlIsLegal("http://aaa.cz", "http://aaa.cz")); Assert.assertTrue(SiteMapParser.urlIsLegal("http://aaa.cz", "http://aaa.cz/aok/rtr.html")); Assert.assertTrue(SiteMapParser.urlIsLegal("http://aaA.cz", "http://aaa.CZ/aok/rtR.html")); Assert.assertFalse(SiteMapParser.urlIsLegal("http://aaa.com", "http://aaa.cz/aok/rtr.html")); Assert.assertFalse(SiteMapParser.urlIsLegal("http://aaa.cz/oo", "http://aaa.cz/aok/rtr.html")); } }