/*
* -------------------------------------------------------------------------
* Copyright 2006 OCLC, Online Computer Library Center
* Copyright 2014
* Centre for Information Modeling - Austrian Centre for Digital Humanities
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
* -------------------------------------------------------------------------
*/
package org.emile.cirilo.oai;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.log4j.Logger;
import org.apache.xpath.XPathAPI;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* HarvesterVerb is the parent class for each of the OAI verbs.
*
* @author Jefffrey A. Young, OCLC Online Computer Library Center
*/
public abstract class HarvesterVerb {
private static Logger log = Logger.getLogger(HarvesterVerb.class);
/* Primary OAI namespaces */
public static final String SCHEMA_LOCATION_V2_0 = "http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd";
public static final String SCHEMA_LOCATION_V1_1_GET_RECORD = "http://www.openarchives.org/OAI/1.1/OAI_GetRecord http://www.openarchives.org/OAI/1.1/OAI_GetRecord.xsd";
public static final String SCHEMA_LOCATION_V1_1_IDENTIFY = "http://www.openarchives.org/OAI/1.1/OAI_Identify http://www.openarchives.org/OAI/1.1/OAI_Identify.xsd";
public static final String SCHEMA_LOCATION_V1_1_LIST_IDENTIFIERS = "http://www.openarchives.org/OAI/1.1/OAI_ListIdentifiers http://www.openarchives.org/OAI/1.1/OAI_ListIdentifiers.xsd";
public static final String SCHEMA_LOCATION_V1_1_LIST_METADATA_FORMATS = "http://www.openarchives.org/OAI/1.1/OAI_ListMetadataFormats http://www.openarchives.org/OAI/1.1/OAI_ListMetadataFormats.xsd";
public static final String SCHEMA_LOCATION_V1_1_LIST_RECORDS = "http://www.openarchives.org/OAI/1.1/OAI_ListRecords http://www.openarchives.org/OAI/1.1/OAI_ListRecords.xsd";
public static final String SCHEMA_LOCATION_V1_1_LIST_SETS = "http://www.openarchives.org/OAI/1.1/OAI_ListSets http://www.openarchives.org/OAI/1.1/OAI_ListSets.xsd";
private Document doc = null;
private String schemaLocation = null;
private String requestURL = null;
private static HashMap builderMap = new HashMap();
private static Element namespaceElement = null;
private static DocumentBuilderFactory factory = null;
private static TransformerFactory xformFactory = TransformerFactory.newInstance();
static {
try {
/* Load DOM Document */
factory = DocumentBuilderFactory
.newInstance();
factory.setNamespaceAware(true);
Thread t = Thread.currentThread();
DocumentBuilder builder = factory.newDocumentBuilder();
builderMap.put(t, builder);
DOMImplementation impl = builder.getDOMImplementation();
Document namespaceHolder = impl.createDocument(
"http://www.oclc.org/research/software/oai/harvester",
"harvester:namespaceHolder", null);
namespaceElement = namespaceHolder.getDocumentElement();
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:harvester",
"http://www.oclc.org/research/software/oai/harvester");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai20", "http://www.openarchives.org/OAI/2.0/");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_GetRecord",
"http://www.openarchives.org/OAI/1.1/OAI_GetRecord");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_Identify",
"http://www.openarchives.org/OAI/1.1/OAI_Identify");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_ListIdentifiers",
"http://www.openarchives.org/OAI/1.1/OAI_ListIdentifiers");
namespaceElement
.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_ListMetadataFormats",
"http://www.openarchives.org/OAI/1.1/OAI_ListMetadataFormats");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_ListRecords",
"http://www.openarchives.org/OAI/1.1/OAI_ListRecords");
namespaceElement.setAttributeNS("http://www.w3.org/2000/xmlns/",
"xmlns:oai11_ListSets",
"http://www.openarchives.org/OAI/1.1/OAI_ListSets");
} catch (Exception e) {
log.error(e.getLocalizedMessage(),e);
}
}
/**
* Get the OAI response as a DOM object
*
* @return the DOM for the OAI response
*/
public Document getDocument() {
return doc;
}
/**
* Get the xsi:schemaLocation for the OAI response
*
* @return the xsi:schemaLocation value
*/
public String getSchemaLocation() {
return schemaLocation;
}
/**
* Get the OAI errors
* @return a NodeList of /oai:OAI-PMH/oai:error elements
* @throws TransformerException
*/
public NodeList getErrors() throws TransformerException {
if (SCHEMA_LOCATION_V2_0.equals(getSchemaLocation())) {
return getNodeList("/oai20:OAI-PMH/oai20:error");
} else {
return null;
}
}
/**
* Get the OAI request URL for this response
* @return the OAI request URL as a String
*/
public String getRequestURL() {
return requestURL;
}
/**
* Mock object creator (for unit testing purposes)
*/
public HarvesterVerb() {
}
/**
* Performs the OAI request
*
* @param requestURL
* @throws IOException
* @throws ParserConfigurationException
* @throws SAXException
* @throws TransformerException
*/
public HarvesterVerb(String requestURL) throws IOException,
ParserConfigurationException, SAXException, TransformerException {
harvest(requestURL);
}
/**
* Preforms the OAI request
*
* @param requestURL
* @throws IOException
* @throws ParserConfigurationException
* @throws SAXException
* @throws TransformerException
*/
public void harvest(String requestURL) throws IOException,
ParserConfigurationException, SAXException, TransformerException {
this.requestURL = requestURL;
log.debug("requestURL=" + requestURL);
InputStream in = null;
URL url = new URL(requestURL);
HttpURLConnection con = null;
int responseCode = 0;
do {
con = (HttpURLConnection) url.openConnection();
con.setRequestProperty("User-Agent", "OAIHarvester/2.0");
con.setRequestProperty("Accept-Encoding",
"compress, gzip, identify");
try {
responseCode = con.getResponseCode();
log.debug("responseCode=" + responseCode);
} catch (FileNotFoundException e) {
// assume it's a 503 response
log.info(requestURL, e);
responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
}
if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
long retrySeconds = con.getHeaderFieldInt("Retry-After", -1);
if (retrySeconds == -1) {
long now = (new Date()).getTime();
long retryDate = con.getHeaderFieldDate("Retry-After", now);
retrySeconds = retryDate - now;
}
if (retrySeconds == 0) { // Apparently, it's a bad URL
throw new FileNotFoundException("Bad URL?");
}
System.err.println("Server response: Retry-After="
+ retrySeconds);
if (retrySeconds > 0) {
try {
Thread.sleep(retrySeconds * 1000);
} catch (InterruptedException ex) {
log.error(ex.getLocalizedMessage(),ex);
}
}
}
} while (responseCode == HttpURLConnection.HTTP_UNAVAILABLE);
String contentEncoding = con.getHeaderField("Content-Encoding");
log.debug("contentEncoding=" + contentEncoding);
if ("compress".equals(contentEncoding)) {
ZipInputStream zis = new ZipInputStream(con.getInputStream());
zis.getNextEntry();
in = zis;
} else if ("gzip".equals(contentEncoding)) {
in = new GZIPInputStream(con.getInputStream());
} else if ("deflate".equals(contentEncoding)) {
in = new InflaterInputStream(con.getInputStream());
} else {
in = con.getInputStream();
}
InputSource data = new InputSource(in);
Thread t = Thread.currentThread();
DocumentBuilder builder = (DocumentBuilder) builderMap.get(t);
if (builder == null) {
builder = factory.newDocumentBuilder();
builderMap.put(t, builder);
}
doc = builder.parse(data);
StringTokenizer tokenizer = new StringTokenizer(
getSingleString("/*/@xsi:schemaLocation"), " ");
StringBuffer sb = new StringBuffer();
while (tokenizer.hasMoreTokens()) {
if (sb.length() > 0)
sb.append(" ");
sb.append(tokenizer.nextToken());
}
this.schemaLocation = sb.toString();
}
/**
* Get the String value for the given XPath location in the response DOM
*
* @param xpath
* @return a String containing the value of the XPath location.
* @throws TransformerException
*/
public String getSingleString(String xpath) throws TransformerException {
return getSingleString(getDocument(), xpath);
// return XPathAPI.eval(getDocument(), xpath, namespaceElement).str();
// String str = null;
// Node node = XPathAPI.selectSingleNode(getDocument(), xpath,
// namespaceElement);
// if (node != null) {
// XObject xObject = XPathAPI.eval(node, "string()");
// str = xObject.str();
// }
// return str;
}
public String getSingleString(Node node, String xpath)
throws TransformerException {
return XPathAPI.eval(node, xpath, namespaceElement).str();
}
/**
* Get a NodeList containing the nodes in the response DOM for the specified
* xpath
* @param xpath
* @return the NodeList for the xpath into the response DOM
* @throws TransformerException
*/
public NodeList getNodeList(String xpath) throws TransformerException {
return XPathAPI.selectNodeList(getDocument(), xpath, namespaceElement);
}
public String toString() {
// Element docEl = getDocument().getDocumentElement();
// return docEl.toString();
Source input = new DOMSource(getDocument());
StringWriter sw = new StringWriter();
Result output = new StreamResult(sw);
try {
Transformer idTransformer = xformFactory.newTransformer();
idTransformer.setOutputProperty(
OutputKeys.OMIT_XML_DECLARATION, "yes");
idTransformer.transform(input, output);
return sw.toString();
} catch (TransformerException e) {
return e.getMessage();
}
}
}