package dk.statsbiblioteket.medieplatform.autonomous.iterator.fedora3;
import com.sun.jersey.api.client.Client;
import com.sun.jersey.api.client.WebResource;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.AbstractIterator;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.common.AttributeParsingEvent;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.common.DelegatingTreeIterator;
import dk.statsbiblioteket.util.xml.DOM;
import org.apache.ws.commons.util.NamespaceContextImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
* Iterator that iterates objects in a Fedora 3.x repository. It works directly on the
* REST api.
*/
public class IteratorForFedora3 extends AbstractIterator<String> {
    private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();
    private static final String OBJECTS = "/objects/";
    private static final String INFO_FEDORA = "<info:fedora/";
    private static final String FORMAT = "format";
    private static final String XML = "xml";
    private static final String DC_NAMESPACE = "http://purl.org/dc/elements/1.1/";
    private static final String DATASTREAM_PROFILE_NAMESPACE = "http://www.fedora.info/definitions/1/0/management/";
    private static final String NTRIPLES = "ntriples";
    /** Prefix marking the dc:identifier value that carries the name of the object. */
    private static final String PATH_PREFIX = "path:";

    /** Xpath used to pick out the datastream nodes from the list of datastreams */
    private final XPathExpression datastreamsXpath;
    /** Xpath used to select the dc identifier from the contents of a DC datastream */
    private final XPathExpression dcIdentifierXpath;
    /** Xpath used to pick out the checksum from a datastream profile */
    private final XPathExpression datastreamChecksumXpath;
    /** Xpath used to pick out the datastream name (dsAltID) from a datastream profile */
    private final XPathExpression datastreamNameXpath;

    private final Client client;
    private final String restUrl;
    private final FedoraTreeFilter filter;
    /** Name of this node, resolved once from the DC datastream when the iterator is constructed. */
    private final String name;
    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Constructor. Note that this performs a REST call to read the DC datastream of the object, in order
     * to resolve the name of the node.
     *
     * @param id              the fedora pid of the root object
     * @param client          the jersey client to use
     * @param restUrl         the url to Fedora. "/objects/" is appended if the url does not already end with it
     * @param filter          the fedora tree filter to know which relations and datastreams to use
     * @param dataFilePattern the data file pattern, handed on unchanged to the superclass and to the
     *                        iterators created for child objects
     */
    public IteratorForFedora3(String id, Client client, String restUrl, FedoraTreeFilter filter,
                              String dataFilePattern) {
        super(id, dataFilePattern);
        this.client = client;
        if (!restUrl.endsWith(OBJECTS)) {
            restUrl = restUrl + OBJECTS;
        }
        this.restUrl = restUrl;
        this.filter = filter;
        try {
            XPath xPath = XPATH_FACTORY.newXPath();
            NamespaceContextImpl context = new NamespaceContextImpl();
            context.startPrefixMapping("dc", DC_NAMESPACE);
            context.startPrefixMapping("dp", DATASTREAM_PROFILE_NAMESPACE);
            xPath.setNamespaceContext(context);
            datastreamsXpath = xPath.compile("//@dsid");
            dcIdentifierXpath = xPath.compile("//dc:identifier");
            datastreamChecksumXpath = xPath.compile("//dp:dsChecksum");
            datastreamNameXpath = xPath.compile("/dp:datastreamProfile/dp:dsAltID");
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Illegal XPath. This is a programming error.", e);
        }
        // Must come last: the lookup uses the client, the rest url and the compiled xpath expressions.
        this.name = getNameFromId(id);
    }

    /**
     * Parse an XML string into a namespace aware DOM document.
     * The bytes handed to the parser are always UTF-8 encoded, so the result does not depend on the
     * default charset of the platform the code happens to run on.
     *
     * @param xml the xml to parse
     *
     * @return the document, as produced by DOM.streamToDOM
     */
    private static Document parseXml(String xml) {
        return DOM.streamToDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), true);
    }

    /**
     * Evaluate an xpath expression, that selects a node set, against an XML string.
     *
     * @param expression the compiled xpath expression
     * @param xml        the xml to evaluate the expression on
     *
     * @return the selected nodes
     * @throws RuntimeException if the expression cannot be evaluated
     */
    private static NodeList selectNodes(XPathExpression expression, String xml) {
        try {
            return (NodeList) expression.evaluate(parseXml(xml), XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Invalid XPath. This is a programming error.", e);
        }
    }

    /**
     * Given an object id, get the name from dc:identifier. The name is the first dc:identifier
     * that starts with "path:", with that prefix removed.
     *
     * @param id fedora ID of object
     *
     * @return The name found, or the id if none could be found.
     */
    private String getNameFromId(String id) {
        WebResource resource = client.resource(restUrl);
        String dcContent = resource.path(id).path("/datastreams/DC/content").queryParam(FORMAT, XML).get(String.class);
        NodeList identifiers = selectNodes(dcIdentifierXpath, dcContent);
        for (int i = 0; i < identifiers.getLength(); i++) {
            String textContent = identifiers.item(i).getTextContent();
            if (textContent.startsWith(PATH_PREFIX)) {
                return textContent.substring(PATH_PREFIX.length());
            }
        }
        return id;
    }

    /**
     * Parse the list of datastreams from the datastream xml list. Removes the ones that should
     * not be used, based on the fedora tree filter
     *
     * @param datastreamXml the datastream xml list
     *
     * @return the list of datastreams
     */
    private List<String> parseDatastreamsFromXml(String datastreamXml) {
        NodeList dsidNodes = selectNodes(datastreamsXpath, datastreamXml);
        List<String> result = new ArrayList<>();
        for (int i = 0; i < dsidNodes.getLength(); i++) {
            String dsid = dsidNodes.item(i).getTextContent();
            if (filter.isAttributeDatastream(dsid)) {
                result.add(dsid);
            }
        }
        return result;
    }

    /**
     * Create an iterator for each child object of this object. Children that cannot be loaded are
     * logged and skipped. The children are returned in the order in which their relations are
     * listed in the relationships of this object.
     *
     * @return an iterator over the iterators of the child objects
     */
    @Override
    protected Iterator<DelegatingTreeIterator> initializeChildrenIterator() {
        WebResource resource = client.resource(restUrl);
        //remember to not urlEncode the id here... Stupid fedora
        String relationsShips = resource.path(id).path("relationships").queryParam(FORMAT, NTRIPLES).get(String.class);
        List<String> children = parseRelationsToList(relationsShips);
        List<DelegatingTreeIterator> result = new ArrayList<>(children.size());
        for (String child : children) {
            try {
                DelegatingTreeIterator delegate = new IteratorForFedora3(
                        child, client, restUrl, filter, getDataFilePattern());
                result.add(delegate);
            } catch (Exception e) {
                log.warn("Unable to load child {}, ignoring as if it didn't exist", child, e);
            }
        }
        // No sorting is done here. An earlier version sorted with a comparator that always returned 0,
        // which never changed the order of the list.
        return result.iterator();
    }

    /**
     * Parse the relationships of the object into a list of fedora pids. Filters out the ones that
     * should be ignored as detailed in the fedora tree filter. Lines that do not have the expected
     * form are logged on debug level and skipped.
     *
     * @param relationsShips the relationships
     *
     * @return the list of pids of the child objects.
     */
    private List<String> parseRelationsToList(String relationsShips) {
        List<String> result = new ArrayList<>();
        for (String line : relationsShips.split("\n")) {
            String[] tuple = line.split(" ");
            // The length checks ensure that the enclosing characters can be stripped below without
            // running outside the tokens.
            boolean wellFormed = tuple.length >= 3
                                 && tuple[1].length() >= 2
                                 && tuple[2].startsWith(INFO_FEDORA)
                                 && tuple[2].length() > INFO_FEDORA.length();
            if (!wellFormed) {
                log.debug("Ignoring line {}, while parsing predicates", line);
                continue;
            }
            // Strip the first and the last character of the predicate token
            String predicate = tuple[1].substring(1, tuple[1].length() - 1);
            // Strip the info:fedora prefix and the last character of the object token
            String child = tuple[2].substring(INFO_FEDORA.length(), tuple[2].length() - 1);
            if (filter.isChildRel(predicate)) {
                result.add(child);
            }
        }
        return result;
    }

    /**
     * Fetch the list of datastreams of this object, and iterate the ones accepted by the fedora tree filter.
     *
     * @return an iterator over the datastream ids
     */
    @Override
    protected Iterator<String> initilizeAttributeIterator() {
        WebResource resource = client.resource(restUrl);
        String datastreamXml = resource.path(id).path("datastreams").queryParam(FORMAT, XML).get(String.class);
        return parseDatastreamsFromXml(datastreamXml).iterator();
    }

    /**
     * construct a Attribute parsing event for a node and attributeID. Uses jersey
     * to return an inputstream to the content
     *
     * @param nodeID      the identifier of the node that the attribute resides in
     * @param attributeID the identifier of the attribute.
     *
     * @return the attribute parsing event
     */
    @Override
    protected AttributeParsingEvent makeAttributeEvent(String nodeID, String attributeID) {
        if (attributeID.equals(JerseyContentsAttributeParsingEvent.CONTENTS)) {
            return new JerseyContentsAttributeParsingEvent(
                    name + "/" + attributeID.toLowerCase(), client.resource(restUrl).path(nodeID), nodeID);
        }
        WebResource datastream = client.resource(restUrl).path(nodeID).path("/datastreams/").path(attributeID);
        String profileXml = datastream.queryParam(FORMAT, XML).get(String.class);
        Document datastreamProfile = parseXml(profileXml);
        // Deliberately not called "name", to avoid hiding the field holding the name of this node
        String datastreamName;
        String checksum;
        try {
            datastreamName = datastreamNameXpath.evaluate(datastreamProfile);
            checksum = datastreamChecksumXpath.evaluate(datastreamProfile);
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Invalid XPath. This is a programming error.", e);
        }
        return new JerseyAttributeParsingEvent(datastreamName, checksum, datastream);
    }

    /**
     * Get the name of this node, as resolved when the iterator was constructed.
     *
     * @return the name from dc:identifier, or the fedora pid if no name was found
     */
    @Override
    protected String getIdOfNode() {
        return name;
    }
}