OREIngestionCrosswalk.java example

Explorer

CORISCO2-master
- adore-djatoka-1.1-corisco-1
  - src
    - java
      - gov
        lanl
        adore
        djatoka
        DjatokaCompress.java
        DjatokaConstants.java
        DjatokaDecodeParam.java
        DjatokaEncodeParam.java
        DjatokaException.java
        DjatokaExtract.java
        DjatokaExtractProcessor.java
        ICompress.java
        IExtract.java
        io
        ExtractorFactory.java
        FormatConstants.java
        FormatFactory.java
        FormatIOException.java
        FormatWriterParams.java
        IReader.java
        IWriter.java
        reader
        DjatokaReader.java
        ImageIOReader.java
        ImageJReader.java
        PNMReader.java
        writer
        BMPWriter.java
        GIFWriter.java
        JP2Writer.java
        JPGWriter.java
        PNGWriter.java
        PNMWriter.java
        TIFWriter.java
        kdu
        KduCompressExe.java
        KduExtractExe.java
        jni
        KduCompressedSource.java
        KduExtractJNI.java
        KduExtractProcessorJNI.java
        openurl
        DjatokaImageMigrator.java
        IReferentMigrator.java
        IReferentResolver.java
        IdentifierNotFoundException.java
        OpenURLJP2Datastream.java
        OpenURLJP2KMetadata.java
        OpenURLJP2KService.java
        OpenURLJP2Ping.java
        OpenURLJP2XML.java
        OpenURLServlet.java
        ReferentManager.java
        ResolverException.java
        SimpleListResolver.java
        TileCacheManager.java
        plugin
        dspace
        DSpaceResolver.java
        rftdb
        DatabaseResolver.java
        plugin
        ExtractJPG.java
        ExtractPDF.java
        ITransformPlugIn.java
        ImageWatermark.java
        TextWatermark.java
        TransformException.java
        util
        IOUtils.java
        ImageProcessingUtils.java
        ImageRecord.java
        ImageRecordUtils.java
        JP2ImageInfo.java
        JP2Markers.java
        SourceImageFileFilter.java
        util
        AccessManager.java
        ConfigurationManager.java
        DBCPUtils.java
        DjatokaContextListener.java
        ExecuteStreamHandler.java
        HttpDate.java
        PumpStreamHandler.java
        StreamPumper.java
- dspace-1.6.2-src-release-corisco-1

/*
 * OREIngestionCrosswalk.java
 *
 * Version: $Revision: 1 $
 *
 * Date: $Date: 2007-07-30 12:26:50 -0500 (Mon, 30 Jul 2007) $
 *
 * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
 * Institute of Technology.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Hewlett-Packard Company nor the name of the
 * Massachusetts Institute of Technology nor the names of their
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

package org.dspace.content.crosswalk;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.ConnectException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.SQLException;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;

import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.FormatIdentifier;
import org.dspace.content.Item;
import org.dspace.content.MetadataSchema;
import org.dspace.content.packager.PackageDisseminator;
import org.dspace.content.packager.PackageException;
import org.dspace.content.packager.PackageParameters;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.PluginManager;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Utils;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

/**
 * ORE ingestion crosswalk
 * <p>
 * Processes an Atom-encoded ORE resource map and attempts to interpret it as a DSpace item.
 * Every <code>atom:link</code> whose <code>rel</code> is the ORE "aggregates" relation is
 * fetched over the network and stored as a new bitstream of the target item.
 *
 * @author Alexey Maslov
 * @version $Revision: 1 $
 */
public class OREIngestionCrosswalk
    implements IngestionCrosswalk
{
    /** log4j category */
    private static final Logger log = Logger.getLogger(OREIngestionCrosswalk.class);

    /* Namespaces */
    public static final Namespace ATOM_NS =
        Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom");
    private static final Namespace ORE_ATOM =
        Namespace.getNamespace("oreatom", "http://www.openarchives.org/ore/atom/");
    private static final Namespace ORE_NS =
        Namespace.getNamespace("ore", "http://www.openarchives.org/ore/terms/");
    private static final Namespace RDF_NS =
        Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
    private static final Namespace DCTERMS_NS =
        Namespace.getNamespace("dcterms", "http://purl.org/dc/terms/");
    private static final Namespace DS_NS =
        Namespace.getNamespace("ds", "http://www.dspace.org/objectModel/");

    /** Bundle used when the resource map does not name one for an aggregated resource. */
    private static final String DEFAULT_BUNDLE_NAME = "ORIGINAL";

    /**
     * Characters that {@link #URLencode(String)} copies through unchanged:
     * lower/upper case letters, digits, "mark" characters and reserved URL characters.
     */
    private static final String URL_SAFE_CHARS =
        "abcdefghijklmnopqrstuvwxyz"
        + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        + "0123456789"
        + "-_.!~*'()"
        + ";/?:@&=+$,%#";

    /**
     * Ingests a list of JDOM elements into the given object.
     * <p>
     * A single element is treated as the document root and passed on directly; several
     * elements are first placed under a synthetic <code>wrap</code> element that uses the
     * namespace of the first element. A null or empty list is handled exactly like a
     * null root element.
     *
     * @param context  DSpace context
     * @param dso      object to ingest into; must be an Item
     * @param metadata list of {@link Element}s making up the resource map
     * @throws CrosswalkException if the resource map cannot be processed
     * @throws IOException        if an aggregated resource cannot be read
     * @throws SQLException       on database errors
     * @throws AuthorizeException if the current user may not modify the item
     */
    public void ingest(Context context, DSpaceObject dso, List metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {

        // The interface hands us a raw List; its members are expected to be JDOM Elements.
        @SuppressWarnings("unchecked")
        List<Element> elements = metadata;

        // Nothing to wrap: defer to the single-element variant so that the type check
        // and the "null root" handling live in one place.
        if (elements == null || elements.isEmpty()) {
            ingest(context, dso, (Element) null);
            return;
        }

        // If this list contains only the root already, just pass it on
        if (elements.size() == 1) {
            ingest(context, dso, elements.get(0));
        }
        // Otherwise, wrap them up
        else {
            Element wrapper = new Element("wrap", elements.get(0).getNamespace());
            wrapper.addContent(elements);

            ingest(context, dso, wrapper);
        }
    }

    /**
     * Ingests an Atom-encoded ORE resource map into the given item.
     * <p>
     * The root element is detached from its current parent and placed in a fresh
     * document. For every aggregated resource link a bitstream is created in the bundle
     * named by the resource map (or in <code>ORIGINAL</code> when none is named).
     * A null root is logged and otherwise ignored.
     *
     * @param context DSpace context
     * @param dso     object to ingest into; must be an Item
     * @param root    root <code>atom:entry</code> element of the resource map, may be null
     * @throws CrosswalkObjectNotSupported if <code>dso</code> is not an Item
     * @throws CrosswalkException          if the resource map cannot be evaluated, a link has
     *                                     no <code>href</code>, or a resource cannot be retrieved
     * @throws IOException                 on other I/O failures while fetching a resource
     * @throws SQLException                on database errors
     * @throws AuthorizeException          if the current user may not modify the item
     */
    public void ingest(Context context, DSpaceObject dso, Element root) throws CrosswalkException, IOException, SQLException, AuthorizeException {

        long timeStart = System.currentTimeMillis();

        if (dso.getType() != Constants.ITEM)
        {
            throw new CrosswalkObjectNotSupported("OREIngestionCrosswalk can only crosswalk an Item.");
        }
        Item item = (Item) dso;

        if (root == null) {
            log.error("The element received by ingest was null");
            return;
        }

        // XPath expressions below are absolute, so the root needs an owning document.
        Document doc = new Document();
        doc.addContent(root.detach());

        List<Element> aggregatedResources;
        String entryId;
        try {
            XPath xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel=\"" + ORE_NS.getURI() + "aggregates" + "\"]");
            xpathLinks.addNamespace(ATOM_NS);
            @SuppressWarnings("unchecked")
            List<Element> links = xpathLinks.selectNodes(doc);
            aggregatedResources = links;

            xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel='alternate']/@href");
            xpathLinks.addNamespace(ATOM_NS);
            Attribute alternate = (Attribute) xpathLinks.selectSingleNode(doc);
            // The entry id is only used in error messages, so a missing alternate link
            // must not abort the ingest.
            entryId = (alternate != null) ? alternate.getValue() : "(no alternate link)";
        } catch (JDOMException e) {
            throw new CrosswalkException("JDOM exception occured while ingesting the ORE", e);
        }

        // Next for each resource, create a bitstream
        for (Element resource : aggregatedResources)
        {
            ingestResource(context, item, doc, resource, entryId);
        }

        log.info("OREIngest for Item " + item.getID() + " took: " + (System.currentTimeMillis() - timeStart) + "ms.");
    }

    /**
     * Fetches one aggregated resource and stores it as a bitstream of the item.
     */
    private void ingestResource(Context context, Item item, Document doc, Element resource, String entryId)
        throws CrosswalkException, IOException, SQLException, AuthorizeException
    {
        String href = resource.getAttributeValue("href");
        log.debug("ORE processing: " + href);

        // Checked before any use of href: encoding a null href would fail with a
        // NullPointerException instead of this descriptive error.
        if (href == null) {
            throw new CrosswalkException("Entry did not contain link to resource: " + entryId);
        }

        // Make sure the url string escapes all the oddball characters
        String processedURL = URLencode(href);

        String bundleName = findBundleName(doc, href, processedURL);

        // Bundle names are not unique, so we just pick the first one if there's more than one.
        Bundle[] targetBundles = item.getBundles(bundleName);
        Bundle targetBundle;

        // if there is none, create the new bundle and add it in
        if (targetBundles.length == 0) {
            targetBundle = item.createBundle(bundleName);
            item.addBundle(targetBundle);
        }
        else {
            targetBundle = targetBundles[0];
        }

        // Generate a request for the aggregated resource
        InputStream in = null;
        IOException failure = null;
        try {
            in = new URL(processedURL).openStream();
        }
        catch (FileNotFoundException fe) {
            log.error("The provided URI failed to return a resource: " + href);
            failure = fe;
        }
        catch (ConnectException fe) {
            log.error("The provided URI was invalid: " + href);
            failure = fe;
        }

        if (in == null) {
            throw new CrosswalkException("Could not retrieve bitstream: " + entryId, failure);
        }

        // ingest and update
        Bitstream newBitstream;
        try {
            newBitstream = targetBundle.createBitstream(in);
        }
        finally {
            // The stream is only handed to createBitstream; release the connection
            // whether or not the bitstream could be stored.
            closeQuietly(in);
        }

        String bsName = resource.getAttributeValue("title");
        newBitstream.setName(bsName);

        // Identify the format
        String mimeString = resource.getAttributeValue("type");
        BitstreamFormat bsFormat = BitstreamFormat.findByMIMEType(context, mimeString);
        if (bsFormat == null) {
            bsFormat = FormatIdentifier.guessFormat(context, newBitstream);
        }
        newBitstream.setFormat(bsFormat);
        newBitstream.update();

        targetBundle.addBitstream(newBitstream);
        targetBundle.update();
    }

    /**
     * Looks up the bundle name recorded for an aggregated resource.
     * <p>
     * The name is the <code>dcterms:description</code> of the first
     * <code>rdf:Description</code> whose <code>rdf:about</code> equals the encoded href
     * and whose <code>rdf:type</code> resource is the DSpace bitstream type. If any of
     * these pieces is missing, <code>ORIGINAL</code> is returned.
     */
    private String findBundleName(Document doc, String href, String encodedHref)
    {
        Element desc = null;
        try {
            XPath xpathDesc = XPath.newInstance("/atom:entry/oreatom:triples/rdf:Description[@rdf:about=\"" + encodedHref + "\"][1]");
            xpathDesc.addNamespace(ATOM_NS);
            xpathDesc.addNamespace(ORE_ATOM);
            xpathDesc.addNamespace(RDF_NS);
            desc = (Element) xpathDesc.selectSingleNode(doc);
        } catch (JDOMException e) {
            // Best effort: without a description the default bundle is used.
            log.warn("Could not look up the description of aggregated resource: " + href, e);
        }

        String bundleName = null;
        if (desc != null) {
            Element type = desc.getChild("type", RDF_NS);
            String typeResource = (type != null) ? type.getAttributeValue("resource", RDF_NS) : null;
            if ((DS_NS.getURI() + "DSpaceBitstream").equals(typeResource)) {
                bundleName = desc.getChildText("description", DCTERMS_NS);
            }
        }

        if (bundleName == null) {
            log.info("Could not obtain bundle name; using 'ORIGINAL'");
            return DEFAULT_BUNDLE_NAME;
        }

        log.debug("Setting bundle name to: " + bundleName);
        return bundleName;
    }

    /**
     * Closes a stream, logging (rather than propagating) a failure to do so.
     */
    private static void closeQuietly(InputStream in)
    {
        try {
            in.close();
        } catch (IOException e) {
            log.debug("Failed to close the aggregated resource stream", e);
        }
    }

    /**
     * Helper method to escape all characters that are not part of the canon set.
     * <p>
     * Characters listed in {@link #URL_SAFE_CHARS} are copied as they are; every other
     * character is replaced by <code>%</code> followed by the lower-case hexadecimal
     * value of its UTF-16 code unit.
     * <p>
     * NOTE(review): the hex value is neither zero-padded to two digits nor derived from
     * UTF-8 bytes, so characters below 0x10 or above 0x7F do not yield standard
     * percent-encoding. The output is deliberately left unchanged here because the same
     * string is matched against <code>rdf:about</code> values in the resource map;
     * confirm against the producer of those values before altering it.
     *
     * @param sourceString source unescaped string
     * @return the escaped string
     */
    private static String URLencode(String sourceString) {
        StringBuilder processedString = new StringBuilder(sourceString.length());
        for (int i = 0; i < sourceString.length(); i++) {
            char ch = sourceString.charAt(i);
            if (URL_SAFE_CHARS.indexOf(ch) >= 0) {
                processedString.append(ch);
            }
            else {
                processedString.append('%').append(Integer.toHexString((int) ch));
            }
        }

        return processedString.toString();
    }

}