/* * AbstractMETSDisseminator.java * * Version: $Revision: 3733 $ * * Date: $Date: 2009-04-24 03:52:11 +0000 (Fri, 24 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.sitemap; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; import java.util.Date; import java.util.zip.GZIPOutputStream; /** * Base class for creating sitemaps of various kinds. A sitemap consists of one * or more files which list significant URLs on a site for search engines to * efficiently crawl. Dates of modification may also be included. A sitemap * index file that links to each of the sitemap files is also generated. It is * this index file that search engines should be directed towards. * <P> * Provides most of the required functionality, subclasses need just implement a * few methods that specify the "boilerplate" and text for including URLs. * <P> * Typical usage: * <pre> * AbstractGenerator g = new FooGenerator(...); * while (...) { * g.addURL(url, date); * } * g.finish(); * </pre> * * @author Robert Tansley */ public abstract class AbstractGenerator { /** Number of files written so far */ protected int fileCount; /** Number of bytes written to current file */ protected int bytesWritten; /** Number of URLs written to current file */ protected int urlsWritten; /** Directory files are written to */ protected File outputDir; /** Current output */ protected PrintStream currentOutput; /** Size in bytes of trailing boilerplate */ private int trailingByteCount; /** * Initialize this generator to write to the given directory. This must be * called by any subclass constructor. * * @param outputDirIn * directory to write sitemap files to */ public AbstractGenerator(File outputDirIn) { fileCount = 0; outputDir = outputDirIn; trailingByteCount = getTrailingBoilerPlate().length(); currentOutput = null; } /** * Start writing a new sitemap file. * * @throws IOException * if an error occurs creating the file */ protected void startNewFile() throws IOException { String lbp = getLeadingBoilerPlate(); OutputStream fo = new FileOutputStream(new File(outputDir, getFilename(fileCount))); if (useCompression()) { fo = new GZIPOutputStream(fo); } currentOutput = new PrintStream(fo); currentOutput.print(lbp); bytesWritten = lbp.length(); urlsWritten = 0; } /** * Add the given URL to the sitemap. * * @param url * Full URL to add * @param lastMod * Date URL was last modified, or {@code null} * @throws IOException * if an error occurs writing */ public void addURL(String url, Date lastMod) throws IOException { // Kick things off if this is the first call if (currentOutput == null) { startNewFile(); } String newURLText = getURLText(url, lastMod); if (bytesWritten + newURLText.length() + trailingByteCount > getMaxSize() || urlsWritten + 1 > getMaxURLs()) { closeCurrentFile(); startNewFile(); } currentOutput.print(newURLText); bytesWritten += newURLText.length(); urlsWritten++; } /** * Finish with the current sitemap file. * * @throws IOException * if an error occurs writing */ protected void closeCurrentFile() throws IOException { currentOutput.print(getTrailingBoilerPlate()); currentOutput.close(); fileCount++; } /** * Complete writing sitemap files and write the index files. This is invoked * when all calls to {@link AbstractGenerator#addURL(String, Date)} have * been completed, and invalidates the generator. * * @return number of sitemap files written. * * @throws IOException * if an error occurs writing */ public int finish() throws IOException { closeCurrentFile(); OutputStream fo = new FileOutputStream(new File(outputDir, getIndexFilename())); if (useCompression()) { fo = new GZIPOutputStream(fo); } PrintStream out = new PrintStream(fo); writeIndex(out, fileCount); out.close(); return fileCount; } /** * Return marked-up text to be included in a sitemap about a given URL. * * @param url * URL to add information about * @param lastMod * date URL was last modified, or {@code null} if unknown or not * applicable * @return the mark-up to include */ public abstract String getURLText(String url, Date lastMod); /** * Return the boilerplate at the top of a sitemap file. * * @return The boilerplate markup. */ public abstract String getLeadingBoilerPlate(); /** * Return the boilerplate at the end of a sitemap file. * * @return The boilerplate markup. */ public abstract String getTrailingBoilerPlate(); /** * Return the maximum size in bytes that an individual sitemap file should * be. * * @return the size in bytes. */ public abstract int getMaxSize(); /** * Return the maximum number of URLs that an individual sitemap file should * contain. * * @return the maximum number of URLs. */ public abstract int getMaxURLs(); /** * Return whether the written sitemap files and index should be * GZIP-compressed. * * @return {@code true} if GZIP compression should be used, {@code false} * otherwise. */ public abstract boolean useCompression(); /** * Return the filename a sitemap at the given index should be stored at. * * @param number * index of the sitemap file (zero is first). * @return the filename to write the sitemap to. */ public abstract String getFilename(int number); /** * Get the filename the index should be written to. * * @return the filename of the index. */ public abstract String getIndexFilename(); /** * Write the index file. * * @param output * stream to write the index to * @param sitemapCount * number of sitemaps that were generated * @throws IOException * if an IO error occurs */ public abstract void writeIndex(PrintStream output, int sitemapCount) throws IOException; }