Harvest.java example

Explorer
DSpace-SVN-Deprecated-master
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.search;

import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeManager;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.LogManager;
import org.dspace.eperson.Group;
import org.dspace.handle.HandleManager;
import org.dspace.storage.rdbms.DatabaseManager;
import org.dspace.storage.rdbms.TableRow;
import org.dspace.storage.rdbms.TableRowIterator;

import java.io.Serializable;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Utility class for extracting information about items, possibly just within a
 * certain community or collection, that have been created, modified or
 * withdrawn within a particular range of dates.
 * 
 * @author Robert Tansley
 * @version $Revision$
 */
public class Harvest
{
    /** log4j logger */
    private static Logger log = Logger.getLogger(Harvest.class);

    /**
     * Obtain information about items that have been created, modified or
     * withdrawn within a given date range. You can also specify 'offset' and
     * 'limit' so that a big harvest can be split up into smaller sections.
     * <P>
     * Note that dates are passed in the standard ISO8601 format used by DSpace
     * (and OAI-PMH): either <code>yyyy-MM-dd</code> or
     * <code>yyyy-MM-dd'T'HH:mm:ss'Z'</code>.
     * <P>
     * Rows are ordered by item ID so that successive partial harvests with
     * the same criteria see the items in the same order.
     * <P>
     * Note on <code>nonAnon</code>: the anonymous-access check needs the
     * <code>Item</code> object, so it is only applied when <code>items</code>
     * is <code>true</code> (and the item could be loaded). Withdrawn items are
     * never subjected to the check when <code>withdrawn</code> is
     * <code>true</code>.
     * <P>
     * FIXME: Assumes all in_archive items have public metadata
     *
     * @param context
     *            DSpace context
     * @param scope
     *            a Collection, Community, or <code>null</code> indicating the scope is
     *            all of DSpace. Any other object type is treated like
     *            <code>null</code>.
     * @param startDate
     *            start of date range (inclusive), or <code>null</code>
     * @param endDate
     *            end of date range (inclusive), or <code>null</code>
     * @param offset
     *            for a partial harvest, the point in the overall list of
     *            matching items to start at. 0 means just start at the
     *            beginning.
     * @param limit
     *            the number of matching items to return in a partial harvest.
     *            Specify 0 to return the whole list (or the rest of the list if
     *            an offset was specified.)
     * @param items
     *            if <code>true</code> the <code>item</code> field of each
     *            <code>HarvestedItemInfo</code> object is filled out
     * @param collections
     *            if <code>true</code> the <code>collectionHandles</code>
     *            field of each returned <code>HarvestedItemInfo</code> object
     *            is filled out
     * @param withdrawn
     *            If <code>true</code>, information about withdrawn items is
     *            included
     * @param nonAnon
     *            If items without anonymous access should be included or not
     * @return List of <code>HarvestedItemInfo</code> objects
     * @throws java.sql.SQLException if a database query fails
     * @throws java.text.ParseException If the date is not in a supported format
     */
    public static List<HarvestedItemInfo> harvest(Context context, DSpaceObject scope,
            String startDate, String endDate, int offset, int limit,
            boolean items, boolean collections, boolean withdrawn,
            boolean nonAnon) throws SQLException, ParseException
    {
        // Work out once which mapping table (if any) restricts the harvest.
        boolean collectionScope = false;
        boolean communityScope = false;
        if (scope != null)
        {
            int scopeType = scope.getType();
            collectionScope = (scopeType == Constants.COLLECTION);
            communityScope = !collectionScope && (scopeType == Constants.COMMUNITY);
        }

        // Put together our query. Note there is no need for an
        // "in_archive=true" condition, we are using the existence of
        // Handles as our 'existence criterion'.
        // FIXME: I think the "DISTINCT" is redundant
        StringBuilder query = new StringBuilder(
                "SELECT DISTINCT handle.handle, handle.resource_id, item.withdrawn, item.last_modified FROM handle, item");

        // We are building a complex query that may contain a variable
        // amount of input data points. To accommodate this while still
        // providing type safety we build a list of parameters to be
        // plugged into the query at the database level. The order of the
        // list must match the order of the '?' placeholders.
        List<Serializable> parameters = new ArrayList<Serializable>();

        if (collectionScope)
        {
            query.append(", collection2item");
        }
        else if (communityScope)
        {
            query.append(", communities2item");
        }

        query.append(" WHERE handle.resource_type_id=").append(Constants.ITEM)
             .append(" AND handle.resource_id=item.item_id ");

        if (collectionScope)
        {
            query.append(" AND collection2item.collection_id= ? ")
                 .append(" AND collection2item.item_id=handle.resource_id ");
            parameters.add(Integer.valueOf(scope.getID()));
        }
        else if (communityScope)
        {
            query.append(" AND communities2item.community_id= ? ")
                 .append(" AND communities2item.item_id=handle.resource_id");
            parameters.add(Integer.valueOf(scope.getID()));
        }

        if (startDate != null)
        {
            query.append(" AND item.last_modified >= ? ");
            parameters.add(toTimestamp(startDate, false));
        }

        if (endDate != null)
        {
            /*
             * If the end date has seconds precision, e.g.:
             *
             * 2004-04-29T13:45:43Z
             *
             * we need to add 999 milliseconds to this. This is because SQL
             * TIMESTAMPs have millisecond precision, and so might have a value:
             *
             * 2004-04-29T13:45:43.952Z
             *
             * and so <= '2004-04-29T13:45:43Z' would not pick this up. Reading
             * things out of the database, TIMESTAMPs are rounded down, so the
             * above value would be read as '2004-04-29T13:45:43Z', and
             * therefore a caller would expect <= '2004-04-29T13:45:43Z' to
             * include that value.
             */
            String upperBound = endDate;
            boolean selfGenerated = false;
            if (upperBound.length() == 20)
            {
                upperBound = upperBound.substring(0, 19) + ".999Z";
                selfGenerated = true;
            }

            query.append(" AND item.last_modified <= ? ");
            parameters.add(toTimestamp(upperBound, selfGenerated));
        }

        if (!withdrawn)
        {
            // Exclude withdrawn items
            if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
            {
                query.append(" AND withdrawn=0 ");
            }
            else
            {
                // postgres uses booleans
                query.append(" AND withdrawn=false ");
            }
        }

        // Order by item ID, so that for a given harvest the order will be
        // consistent. This is so that big harvests can be broken up into
        // several smaller operations (e.g. for OAI resumption tokens.)
        query.append(" ORDER BY handle.resource_id");

        String sql = query.toString();

        // Only build the log header when it will actually be written.
        if (log.isDebugEnabled())
        {
            log.debug(LogManager.getHeader(context, "harvest SQL", sql));
        }

        Object[] parametersArray = parameters.toArray();
        TableRowIterator tri = DatabaseManager.query(context, sql, parametersArray);
        List<HarvestedItemInfo> infoObjects = new LinkedList<HarvestedItemInfo>();

        // Count of items read from the record set that match the selection criteria.
        // Note : Until 'index > offset' the records are not added to the output set.
        int index = 0;

        // Count of items added to the output set.
        int itemCounter = 0;

        try
        {
            // Process results of query into HarvestedItemInfo objects
            while (tri.hasNext() && ((limit == 0) || (itemCounter < limit)))
            {
                TableRow row = tri.next();

                HarvestedItemInfo itemInfo = new HarvestedItemInfo();
                itemInfo.context = context;
                itemInfo.handle = row.getStringColumn("handle");
                itemInfo.itemID = row.getIntColumn("resource_id");
                itemInfo.datestamp = row.getDateColumn("last_modified");
                itemInfo.withdrawn = row.getBooleanColumn("withdrawn");

                if (items)
                {
                    // Add the item reference (also needed for the access check below)
                    itemInfo.item = Item.find(context, itemInfo.itemID);
                }

                // A row matches when no anonymous filtering is requested, when
                // there is no Item object to check, when it is a withdrawn item
                // the caller asked for, or when anonymous READ is allowed.
                boolean matches = nonAnon
                        || (itemInfo.item == null)
                        || (withdrawn && itemInfo.withdrawn)
                        || anonAccessAllowed(context, itemInfo);

                if (!matches)
                {
                    continue;
                }

                index++;
                if (index <= offset)
                {
                    // Still inside the part of the list the caller asked to skip
                    continue;
                }

                if (collections)
                {
                    // Add collections data. Done only for rows that are returned,
                    // so skipped or filtered rows do not cost an extra query.
                    fillCollections(context, itemInfo);
                }

                infoObjects.add(itemInfo);
                itemCounter++;
            }
        }
        finally
        {
            // close the TableRowIterator to free up resources
            if (tri != null)
            {
                tri.close();
            }
        }

        return infoObjects;
    }

    /**
     * Get harvested item info for a single item. <code>item</code> field in
     * returned <code>HarvestedItemInfo</code> object is always filled out.
     *
     * @param context
     *            DSpace context
     * @param handle
     *            Prefix-less Handle of item
     * @param collections
     *            if <code>true</code> the <code>collectionHandles</code>
     *            field of the <code>HarvestedItemInfo</code> object is filled
     *            out
     *
     * @return <code>HarvestedItemInfo</code> object for the single item, or
     *         <code>null</code> if the Handle does not resolve to an item
     *         (unknown Handle, or a Handle of some other kind of object)
     * @throws java.sql.SQLException if a database query fails
     */
    public static HarvestedItemInfo getSingle(Context context, String handle,
            boolean collections) throws SQLException
    {
        DSpaceObject resolved = HandleManager.resolveToObject(context, handle);

        // instanceof is false for null, so this rejects both an unresolvable
        // Handle and one that resolves to something other than an Item
        // (previously the latter caused a ClassCastException).
        if (!(resolved instanceof Item))
        {
            return null;
        }

        Item i = (Item) resolved;

        // Fill out OAI info item object
        HarvestedItemInfo itemInfo = new HarvestedItemInfo();

        itemInfo.context = context;
        itemInfo.item = i;
        itemInfo.handle = handle;
        itemInfo.withdrawn = i.isWithdrawn();
        itemInfo.datestamp = i.getLastModified();
        itemInfo.itemID = i.getID();

        // Get the sets
        if (collections)
        {
            fillCollections(context, itemInfo);
        }

        return itemInfo;
    }

    /**
     * Fill out the <code>collectionHandles</code> field of the
     * HarvestedItemInfo object with the Handles of every collection that
     * <code>itemInfo.itemID</code> is mapped to in <code>collection2item</code>.
     * Any previous value of the field is replaced.
     *
     * @param context
     *            DSpace context
     * @param itemInfo
     *            HarvestedItemInfo object to fill out; its
     *            <code>itemID</code> must already be set
     * @throws java.sql.SQLException if the database query fails
     */
    private static void fillCollections(Context context,
            HarvestedItemInfo itemInfo) throws SQLException
    {
        // Get the collection Handles from DB
        TableRowIterator colRows = DatabaseManager.query(context,
                        "SELECT handle.handle FROM handle, collection2item WHERE handle.resource_type_id= ? " +
                        "AND collection2item.collection_id=handle.resource_id AND collection2item.item_id = ? ",
                        Constants.COLLECTION, itemInfo.itemID);

        try
        {
            // Collect into a typed local list first, then publish it on the
            // itemInfo object once every row has been read.
            List<String> handles = new LinkedList<String>();

            while (colRows.hasNext())
            {
                TableRow r = colRows.next();
                handles.add(r.getStringColumn("handle"));
            }

            itemInfo.collectionHandles = handles;
        }
        finally
        {
            // close the TableRowIterator to free up resources
            if (colRows != null)
            {
                colRows.close();
            }
        }
    }


    /**
     * Convert a String to a java.sql.Timestamp object. The string is
     * interpreted as UTC. The format is chosen from the string length:
     * 10 characters is <code>yyyy-MM-dd</code>, 20 characters is
     * <code>yyyy-MM-dd'T'HH:mm:ss'Z'</code>, and any other length is only
     * accepted (as <code>yyyy-MM-dd'T'HH:mm:ss.SSS'Z'</code>) when the
     * caller flags the value as self generated.
     *
     * @param t The timestamp String (must not be <code>null</code>)
     * @param selfGenerated Is this a self generated timestamp (e.g. it has .999 on the end)
     * @return The converted Timestamp
     * @throws java.text.ParseException if the string has an unsupported
     *             length or cannot be parsed with the chosen format
     */
    private static Timestamp toTimestamp(String t, boolean selfGenerated) throws ParseException
    {
        String pattern;

        // Choose the correct date format based on string length
        if (t.length() == 10)
        {
            pattern = "yyyy-MM-dd";
        }
        else if (t.length() == 20)
        {
            pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'";
        }
        else if (selfGenerated)
        {
            pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
        }
        else
        {
            // Not self generated, and not in a guessable format
            throw new ParseException("Unsupported date format (expected yyyy-MM-dd or "
                    + "yyyy-MM-dd'T'HH:mm:ss'Z'): " + t, 0);
        }

        // Pin the locale so the formatter always uses the Gregorian calendar.
        // With the JVM default locale (e.g. th_TH, which selects the Buddhist
        // calendar) the year would be interpreted in a different era.
        SimpleDateFormat df = new SimpleDateFormat(pattern, Locale.US);
        df.setTimeZone(TimeZone.getTimeZone("UTC"));

        // Parse the date
        return new Timestamp(df.parse(t).getTime());
    }

    /**
     * Decide whether the item may be read anonymously, i.e. whether the
     * groups authorized to READ it include the group with id=0.
     *
     * @param context
     *            DSpace context
     * @param itemInfo
     *            HarvestedItemInfo object whose <code>item</code> field is
     *            checked
     * @return <code>true</code> if one of the authorized groups has id 0,
     *         <code>false</code> otherwise
     * @throws java.sql.SQLException
     */
    private static boolean anonAccessAllowed(Context context, HarvestedItemInfo itemInfo) throws SQLException
    {
        Group[] readers = AuthorizeManager.getAuthorizedGroups(context, itemInfo.item, Constants.READ);

        // Scan until the id=0 group is found or the array is exhausted.
        boolean anonymousCanRead = false;
        for (int g = 0; g < readers.length && !anonymousCanRead; g++)
        {
            anonymousCanRead = (readers[g].getID() == 0);
        }

        return anonymousCanRead;
    }
}