/*
* Copyright (2007-2012) Schibsted ASA
* This file is part of Possom.
*
* Possom is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Possom is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Possom. If not, see <http://www.gnu.org/licenses/>.
*/
package no.sesat.sitemap;
import no.sesat.search.sitemap.PageProvider;
import no.sesat.search.sitemap.Page;
import org.apache.log4j.Logger;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import javax.xml.XMLConstants;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.*;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* A class for generating sitemaps. The entries are provided by implementors of
* <tt>no.sesat.search.sitemap.PageProvider</tt>. This class will generate a sitemap index file and any number of
* sitemap files required to meet the size restriction of a single sitemap file.
*
* A list of <tt>no.sesat.search.sitemap.PageProvider</tt> can be supplied at instantiation. Another option is to
* have this class load the providers available in a skin using the Java 6 ServiceLoader mechanism. The main method
* of this class expects three parameters:
*
* <ul>
* <li>The skin from which to load page providers (e.g. <tt>http://sesam.no</tt>)</li>
* <li>The directory to which you want the sitemap files to be written (e.g. <tt>/www/data/sitemaps/</tt>)</li>
* <li>The URL at which this directory can be accessed using HTTP (e.g. <tt>http://sesam.no/sitemaps/</tt>).
* This information is needed for the sitemap index file</li>
* </ul>
*
*
* @version $Id$
*/
public final class SitemapGenerator {
/** Namespace URI passed for the root element of every generated document. */
private static final String NS = "http://www.sitemaps.org/schemas/sitemap/0.9";

/**
 * Formats dates as e.g. <tt>2012-01-31T13:45:00+0100</tt>; <tt>formatDateW3c</tt> post-processes the result.
 * NOTE: <tt>SimpleDateFormat</tt> instances are not thread-safe and this one is shared by all generators.
 */
private static final SimpleDateFormat ISO_8601_DATE = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");

private static final Logger LOG = Logger.getLogger(SitemapGenerator.class);

/** The providers whose pages are written by <tt>generate()</tt>. */
private final Collection<PageProvider> providers;

/** Directory that receives the index file and all sitemap files. */
private final File location;

/** Public URL against which the sitemap file names are resolved for the index file. */
private final URI uri;

/** Factory shared by the index file and every sitemap file to create their serializing handlers. */
private final SAXTransformerFactory tf = (SAXTransformerFactory) TransformerFactory.newInstance();

/** Handler serializing SAX events into <tt>sitemap_index.xml</tt>. */
private final TransformerHandler transformerHandler;

/** Character stream behind <tt>sitemap_index.xml</tt>; closed by <tt>generate()</tt>. */
private final Writer writer;

/**
 * Creates a new generator and opens <tt>sitemap_index.xml</tt> in the destination directory for writing.
 *
 * @param providers the list of page providers to use.
 * @param dest the destination directory.
 * @param url the public url corresponding to the destination directory.
 * @throws IOException if the index file could not be opened for writing.
 */
public SitemapGenerator(final Collection<PageProvider> providers, final File dest, final URI url)
        throws IOException {
    LOG.info("Initializing...");
    this.uri = url;
    this.providers = providers;
    this.location = dest;

    // The serializer declares UTF-8 in the XML prolog, so the bytes on disk must be UTF-8 as well.
    // A FileWriter would silently use the platform default charset instead.
    this.writer = new OutputStreamWriter(
            new FileOutputStream(new File(location, "sitemap_index.xml")), "UTF-8");

    boolean ready = false;
    try {
        try {
            tf.setAttribute("indent-number", 2);
        } catch (IllegalArgumentException e) {
            // The attribute is implementation specific; indentation width is cosmetic, so carry on without it.
            LOG.warn("TransformerFactory does not recognise 'indent-number', using its default indentation", e);
        }
        this.transformerHandler = tf.newTransformerHandler();
        final Transformer serializer = transformerHandler.getTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.METHOD, "xml");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformerHandler.setResult(new StreamResult(writer));
        ready = true;
    } catch (TransformerConfigurationException e) {
        throw new RuntimeException(e);
    } finally {
        if (!ready) {
            // Construction failed: release the file handle, but let the original failure propagate.
            try {
                writer.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close at this point
            }
        }
    }
}
/**
* Creates a new instance loading any providers found in the provided skin.
*
* @param site The skin to generate a sitemap for
* @param dest The directory to write files to.
* @param url The URI at which this directory is accessible using HTTP.
*
* @throws IOException If the files can't be written.
*/
public SitemapGenerator(final URI site, final File dest, final URI url) throws IOException {
this(new ArrayList<PageProvider>(), dest, url);
// HACK add site into system properties, a la the global context
System.setProperty("no.sesat.sitemap.SitemapGenerator.site", site.toString());
for (PageProvider provider : ServiceLoader.load(PageProvider.class, getClassLoader(site))) {
LOG.info("Found " + provider.getName());
providers.add(provider);
}
}
/**
 * Command line entry point: builds a generator from the arguments and runs it.
 *
 * @param args the skin URI, the destination directory and the public URL of that directory, in that order.
 *             Any further arguments are ignored.
 * @throws IllegalArgumentException if fewer than three arguments are given.
 * @throws URISyntaxException if the first or third argument is not a valid URI.
 * @throws IOException if the files could not be written.
 * @throws SAXException if an xml error occurs.
 */
public static void main(String[] args)
        throws URISyntaxException, IOException, TransformerConfigurationException, SAXException {
    if (args == null || args.length < 3) {
        // Fail with an explanation rather than a bare ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException(
                "Usage: SitemapGenerator <skin-uri> <destination-directory> <public-url>");
    }
    new SitemapGenerator(new URI(args[0]), new File(args[1]), new URI(args[2])).generate();
}
/**
 * Generate the sitemap files.
 *
 * Every provider is drained into a <tt>SiteMap</tt>, which writes one or more sitemap files. For each of those
 * files a <tt>sitemap</tt> entry, holding its public location and the current time as <tt>lastmod</tt>, is added
 * to the index document. The index writer is closed when this method returns, whether it succeeds or fails.
 *
 * @throws IOException if the files could not be created.
 * @throws SAXException if a xml error occurs.
 */
public void generate() throws IOException, SAXException {
    int totalCount = 0;
    try {
        transformerHandler.startDocument();
        transformerHandler.startPrefixMapping("xsd", XMLConstants.W3C_XML_SCHEMA_NS_URI);
        transformerHandler.startPrefixMapping("xsi", XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI);

        // schemaLocation lives in the XMLSchema-instance namespace (the one bound to "xsi") and its value is a
        // "<namespace> <schema location>" pair.
        final AttributesImpl schemaLocation = new AttributesImpl();
        schemaLocation.addAttribute(
                XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI,
                "schemaLocation",
                "xsi:schemaLocation",
                "CDATA",
                NS + " http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd");
        transformerHandler.startElement(NS, "", "sitemapindex", schemaLocation);

        for (final PageProvider provider : providers) {
            LOG.info("Processing " + provider.getName());
            final SiteMap group = new SiteMap(provider.getName());
            try {
                for (final Page page : provider) {
                    if (null != page) {
                        group.addPage(page);
                    }
                }
            } finally {
                // Always terminate the current sitemap file, even if the provider failed half way through.
                group.finish();
                LOG.info(group.getCount() + " entries processed for " + provider.getName());
                totalCount += group.getCount();
            }
            for (final SiteMap.SiteMapFile map : group.getSiteMaps()) {
                transformerHandler.startElement("", "", "sitemap", new AttributesImpl());
                addElement("loc", uri.resolve(map.getFileName()).toString());
                addElement("lastmod", formatDateW3c((new Date())));
                transformerHandler.endElement("", "", "sitemap");
            }
        }

        transformerHandler.endElement(NS, "", "sitemapindex");
        transformerHandler.endDocument();
    } finally {
        // Release the index file handle on failure as well as on success.
        writer.close();
    }
    LOG.info("All done (" + totalCount + " entries)");
}
/**
 * Writes a simple element to the index document: start tag, optional text content, end tag.
 *
 * @param element the element name.
 * @param string the text content, or <tt>null</tt> to write an empty element.
 * @throws SAXException if the handler rejects one of the events.
 */
private void addElement(final String element, final String string) throws SAXException {
    transformerHandler.startElement("", "", element, new AttributesImpl());
    if (null != string) {
        final char[] text = string.toCharArray();
        transformerHandler.characters(text, 0, text.length);
    }
    transformerHandler.endElement("", "", element);
}
/**
 * A sitemap group represents a sitemap. It will create multiple underlying sitemap files if the number of entries
 * exceeds 25000; the files are named <tt>&lt;name&gt;_&lt;n&gt;.xml</tt> with <tt>n</tt> starting at 1.
 */
private class SiteMap {

    /** Number of entries after which the current file is finished and a new one is started. */
    private static final int MAX_ENTRIES_PER_FILE = 25000;

    private final String name;
    private final List<SiteMapFile> siteMaps;
    private SiteMapFile currentSiteMapFile;
    private int count = 0;

    /**
     * Creates the group and immediately opens its first sitemap file.
     *
     * @param name prefix of the generated file names.
     * @throws IOException if the first file cannot be opened.
     * @throws SAXException if the document start cannot be written.
     */
    public SiteMap(final String name) throws IOException, SAXException {
        this.name = name;
        this.siteMaps = new ArrayList<SiteMapFile>();
        this.currentSiteMapFile = new SiteMapFile(1);
    }

    /**
     * Adds a page to the current file, rolling over to a new file first when the current one is full.
     *
     * @param page the page to write.
     * @throws IOException if a file cannot be closed or opened.
     * @throws SAXException if the entry cannot be written.
     */
    public void addPage(final Page page) throws IOException, SAXException {
        if (currentSiteMapFile.getEntryCount() == MAX_ENTRIES_PER_FILE) {
            siteMaps.add(currentSiteMapFile);
            count += currentSiteMapFile.getEntryCount();
            currentSiteMapFile.finish();
            currentSiteMapFile = new SiteMapFile(siteMaps.size() + 1);
        }
        currentSiteMapFile.addPage(page);
    }

    /**
     * Registers and closes the file currently being written. Call once, after the last page.
     *
     * @throws IOException if the file cannot be closed.
     */
    public void finish() throws IOException {
        siteMaps.add(currentSiteMapFile);
        count += currentSiteMapFile.getEntryCount();
        currentSiteMapFile.finish();
    }

    /** @return the files registered so far; the current file is only included after <tt>finish()</tt>. */
    public List<SiteMapFile> getSiteMaps() {
        return siteMaps;
    }

    /** @return the number of entries in the registered files; complete only after <tt>finish()</tt>. */
    public int getCount() {
        return count;
    }

    /**
     * Representation of a single sitemap file.
     */
    private class SiteMapFile {

        private final String fileName;
        private final TransformerHandler transformerHandler;
        private final Writer writer;
        private int entryCount = 0;
        private boolean finished = false;

        /**
         * Opens the file in the destination directory and writes the document start and the <tt>urlset</tt>
         * start tag.
         *
         * @param count sequence number used in the file name.
         * @throws IOException if the file cannot be opened.
         * @throws SAXException if the document start cannot be written.
         */
        public SiteMapFile(int count) throws IOException, SAXException {
            this.fileName = name + '_' + count + ".xml";

            // The serializer declares UTF-8 in the XML prolog, so the bytes on disk must be UTF-8 as well.
            // A FileWriter would silently use the platform default charset instead.
            this.writer = new OutputStreamWriter(
                    new FileOutputStream(new File(location, fileName)), "UTF-8");

            boolean opened = false;
            try {
                transformerHandler = tf.newTransformerHandler();
                final Transformer serializer = transformerHandler.getTransformer();
                serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                serializer.setOutputProperty(OutputKeys.METHOD, "xml");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                transformerHandler.setResult(new StreamResult(writer));

                transformerHandler.startDocument();
                transformerHandler.startPrefixMapping("xsd", XMLConstants.W3C_XML_SCHEMA_NS_URI);
                transformerHandler.startPrefixMapping("xsi", XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI);

                // schemaLocation lives in the XMLSchema-instance namespace (the one bound to "xsi") and its
                // value is a "<namespace> <schema location>" pair.
                final AttributesImpl schemaLocation = new AttributesImpl();
                schemaLocation.addAttribute(
                        XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI,
                        "schemaLocation",
                        "xsi:schemaLocation",
                        "CDATA",
                        NS + " http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd");
                transformerHandler.startElement(NS, "", "urlset", schemaLocation);
                opened = true;
            } catch (TransformerConfigurationException e) {
                throw new RuntimeException(e);
            } finally {
                if (!opened) {
                    // Construction failed: release the file handle, but let the original failure propagate.
                    try {
                        writer.close();
                    } catch (IOException ignored) {
                        // nothing useful can be done about a failed close at this point
                    }
                }
            }
        }

        /**
         * Writes one <tt>url</tt> entry. <tt>lastmod</tt> and <tt>changefreq</tt> are only written when the
         * page supplies them, <tt>priority</tt> only when it differs from 0.5.
         *
         * @param page the page to write.
         * @throws SAXException if the entry cannot be written.
         */
        public void addPage(final Page page) throws SAXException {
            transformerHandler.startElement("", "", "url", new AttributesImpl());
            addElement("loc", page.getLocation().toString());
            if (null != page.getLastModified()) {
                addElement("lastmod", formatDateW3c(page.getLastModified()));
            }
            if (0.5 != page.getPriority()) { // 0.5 is the default and we don't want to waste tags.
                addElement("priority", Double.toString(page.getPriority()));
            }
            if (null != page.getFrequency()) {
                // Fixed locale: with a Turkish default locale "DAILY" would otherwise lower-case to "da\u0131ly".
                addElement("changefreq", page.getFrequency().name().toLowerCase(Locale.ENGLISH));
            }
            transformerHandler.endElement("", "", "url");
            entryCount++;
        }

        /** @return the name of this file, without directory. */
        public String getFileName() {
            return fileName;
        }

        /**
         * Writes a simple element to this file: start tag, optional text content, end tag.
         *
         * @param element the element name.
         * @param string the text content, or <tt>null</tt> to write an empty element.
         * @throws SAXException if the handler rejects one of the events.
         */
        private void addElement(final String element, final String string) throws SAXException {
            final AttributesImpl noAttributes = new AttributesImpl();
            transformerHandler.startElement("", "", element, noAttributes);
            if (string != null) {
                transformerHandler.characters(string.toCharArray(), 0, string.length());
            }
            transformerHandler.endElement("", "", element);
        }

        /** @return the number of entries written to this file so far. */
        public int getEntryCount() {
            return entryCount;
        }

        /**
         * Ends the document and closes the file. Subsequent calls do nothing.
         *
         * @throws IOException if the file cannot be closed.
         */
        public void finish() throws IOException {
            if (finished) {
                return;
            }
            finished = true;
            try {
                transformerHandler.endElement(NS, "", "urlset");
                transformerHandler.endDocument();
            } catch (SAXException e) {
                throw new RuntimeException(e);
            } finally {
                // Close the file even when ending the document fails.
                writer.close();
            }
        }
    }
}
/*
 * Builds a class loader over the skin's sitemap.jar, whose URL is obtained by resolving
 * "/<host>/lib/sitemap.jar" against the site URI.
 *
 * @todo This should really be using the site-spi, but SEARCH-3732 needs to be resolved first.
 */
private ClassLoader getClassLoader(final URI site) throws MalformedURLException {
    final String jarPath = "/" + site.getHost() + "/lib/sitemap.jar";
    final URL jarUrl = site.resolve(jarPath).toURL();
    LOG.info("skin's sitemap.jar at " + jarUrl);
    final URL[] classPath = {jarUrl};
    return new URLClassLoader(classPath);
}
/**
 * Formats a date with <tt>ISO_8601_DATE</tt> and inserts a colon before the last two characters, turning a
 * zone offset such as <tt>+0100</tt> into <tt>+01:00</tt>.
 *
 * @param date the date to format, must not be <tt>null</tt>.
 * @return the formatted date.
 */
private String formatDateW3c(final Date date) {
    final String iso8601Date;
    // SimpleDateFormat keeps mutable state and the instance is static, so serialise access to it.
    synchronized (ISO_8601_DATE) {
        iso8601Date = ISO_8601_DATE.format(date);
    }
    // Hmm..is it really not possible to create a w3c compliant date using java.util.Date...
    return new StringBuilder(iso8601Date).insert(iso8601Date.length() - 2, ':').toString();
}
}