package org.jboss.elasticsearch.river.remote.sitemap; import java.io.IOException; import java.net.URL; import java.util.Collection; import org.apache.commons.io.IOUtils; /** * Utility class for testing the Sitemap Parsing * * @author http://code.google.com/p/crawler-commons */ public class SiteMapTester { SiteMapParser parser = new SiteMapParser(false); private void parse(URL url, String mt, boolean recursive) throws IOException, UnknownFormatException { byte[] content = IOUtils.toByteArray(url); AbstractSiteMap sm = parser.parseSiteMap(mt, content, url); // System.out.println(sm.toString()); if (recursive && sm.isIndex()) { Collection<AbstractSiteMap> links = ((SiteMapIndex) sm).getSitemaps(); for (AbstractSiteMap asm : links) { parse(asm.getUrl(), mt, recursive); } } else if (!sm.isIndex()) { Collection<SiteMapURL> links = ((SiteMap) sm).getSiteMapUrls(); for (SiteMapURL smu : links) { System.out.println(smu.getUrl()); } } } public static void main(String[] args) throws IOException, UnknownFormatException { if (args.length < 1) { System.err.println("SiteMapTester URL_to_test [MimeType]"); } URL url = new URL(args[0]); String mt = ""; if (args.length > 1) mt = args[1]; SiteMapTester tester = new SiteMapTester(); tester.parse(url, mt, true); } }