/* * Copyright (C) 2012 Jan Pokorsky * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package cz.cas.lib.proarc.common.export; import com.yourmediashelf.fedora.client.FedoraClient; import com.yourmediashelf.fedora.client.FedoraClientException; import com.yourmediashelf.fedora.generated.foxml.DatastreamType; import com.yourmediashelf.fedora.generated.foxml.DatastreamVersionType; import com.yourmediashelf.fedora.generated.foxml.DigitalObject; import com.yourmediashelf.fedora.generated.foxml.XmlContentType; import cz.cas.lib.proarc.common.dublincore.DcStreamEditor; import cz.cas.lib.proarc.common.export.ExportResultLog.ExportResult; import cz.cas.lib.proarc.common.export.ExportResultLog.ResultError; import cz.cas.lib.proarc.common.export.ExportResultLog.ResultStatus; import cz.cas.lib.proarc.common.fedora.BinaryEditor; import cz.cas.lib.proarc.common.fedora.DigitalObjectException; import cz.cas.lib.proarc.common.fedora.FoxmlUtils; import cz.cas.lib.proarc.common.fedora.LocalStorage; import cz.cas.lib.proarc.common.fedora.LocalStorage.LocalObject; import cz.cas.lib.proarc.common.fedora.RemoteStorage; import cz.cas.lib.proarc.common.fedora.RemoteStorage.RemoteObject; import cz.cas.lib.proarc.common.fedora.SearchView; import cz.cas.lib.proarc.common.fedora.SearchView.Item; import cz.cas.lib.proarc.common.fedora.relation.RelationEditor; import cz.cas.lib.proarc.common.fedora.relation.RelationResource; import cz.cas.lib.proarc.common.fedora.relation.Relations; import cz.cas.lib.proarc.common.mods.ModsStreamEditor; import cz.cas.lib.proarc.common.object.DigitalObjectCrawler; import cz.cas.lib.proarc.common.object.DigitalObjectElement; import cz.cas.lib.proarc.common.object.DigitalObjectManager; import cz.cas.lib.proarc.oaidublincore.DcConstants; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Queue; import java.util.Set; import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * Exports digital object and transforms its data streams to Kramerius4 format. * * For now it exports FOXML in archive format. It is memory intensive but fast. * In case of OOME it should be rewritten to export FOXML in public or migrate * format and fetch each managed data streams with REST or build the whole FOXML * from scratch. * * @author Jan Pokorsky */ public final class Kramerius4Export { public static final String KRAMERIUS_RELATION_NS = "http://www.nsdl.org/ontologies/relationships#"; public static final String KRAMERIUS_RELATION_PREFIX = "kramerius"; public static final String OAI_NS = "http://www.openarchives.org/OAI/2.0/"; private RemoteStorage rstorage; private LocalStorage lstorage = new LocalStorage(); private DigitalObjectCrawler crawler; private final SearchView search; /** already exported PIDs to prevent loops */ private HashSet<String> exportedPids = new HashSet<String>(); /** PIDs scheduled for export */ private Queue<String> toExport = new LinkedList<String>(); private final Kramerius4ExportOptions options; public Kramerius4Export(RemoteStorage rstorage, Kramerius4ExportOptions options) { this.rstorage = rstorage; this.options = options; this.search = rstorage.getSearch(); this.crawler = new DigitalObjectCrawler(DigitalObjectManager.getDefault(), search); } public File export(File output, boolean hierarchy, String log, String... pids) { if (!output.exists() || !output.isDirectory()) { throw new IllegalStateException(String.valueOf(output)); } if (pids == null || pids.length == 0) { throw new IllegalArgumentException(); } ExportResultLog reslog = new ExportResultLog(); ExportResult result = new ExportResult(); result.setInputPid(pids[0]); reslog.getExports().add(result); File target = ExportUtils.createFolder(output, "k4_" + FoxmlUtils.pidAsUuid(pids[0])); HashSet<String> selectedPids = new HashSet<String>(Arrays.asList(pids)); toExport.addAll(selectedPids); try { for (String pid = toExport.poll(); pid != null; pid = toExport.poll()) { exportPid(target, hierarchy, pid); } exportParents(target, selectedPids); storeExportResult(target, log); } catch (RuntimeException ex) { result.setStatus(ResultStatus.FAILED); reslog.getExports().add(result); result.getError().add(new ResultError(null, ex)); result.setEnd(); ExportUtils.writeExportResult(target, reslog); throw ex; } result.setStatus(ResultStatus.OK); result.setEnd(); ExportUtils.writeExportResult(target, reslog); return target; } void exportPid(File output, boolean hierarchy, String pid) { try { if (exportedPids.contains(pid)) { return ; } exportedPids.add(pid); RemoteObject robject = rstorage.find(pid); FedoraClient client = robject.getClient(); DigitalObject dobj = FedoraClient.export(pid).context("archive") .format("info:fedora/fedora-system:FOXML-1.1") .execute(client).getEntity(DigitalObject.class); File foxml = ExportUtils.pidAsXmlFile(output, pid); LocalObject local = lstorage.create(foxml, dobj); RelationEditor editor = new RelationEditor(local); if (hierarchy) { List<String> children = editor.getMembers(); toExport.addAll(children); } exportDatastreams(local, editor); local.flush(); } catch (DigitalObjectException ex) { throw new IllegalStateException(pid, ex); } catch (FedoraClientException ex) { // replace with ExportException throw new IllegalStateException(pid, ex); } } /** * Exports hierarchy of parent objects. Leafs of the hierarchy are PIDs * that were selected for export. * <p/>RELS-EXT of exported parent objects contains only PIDs that are subject to export. * Other relations are excluded. * * @param output output folder * @param pids PIDs selected for export */ private void exportParents(File output, Collection<String> pids) { Map<String, Set<String>> buildPidTree = buildPidTree(pids, exportedPids); for (Entry<String, Set<String>> node : buildPidTree.entrySet()) { String pid = node.getKey(); Set<String> children = node.getValue(); exportParentPid(output, pid, children); } } void exportParentPid(File output, String pid, Collection<String> includeChildPids) { try { exportedPids.add(pid); RemoteObject robject = rstorage.find(pid); FedoraClient client = robject.getClient(); DigitalObject dobj = FedoraClient.export(pid).context("archive") .format("info:fedora/fedora-system:FOXML-1.1") .execute(client).getEntity(DigitalObject.class); File foxml = ExportUtils.pidAsXmlFile(output, pid); LocalObject local = lstorage.create(foxml, dobj); exportParentDatastreams(local, includeChildPids); local.flush(); } catch (DigitalObjectException ex) { throw new IllegalStateException(pid, ex); } catch (FedoraClientException ex) { // replace with ExportException throw new IllegalStateException(pid, ex); } } void storeExportResult(File output, String log) { for (String pid : exportedPids) { try { File foxml = ExportUtils.pidAsXmlFile(output, pid); ExportUtils.storeObjectExportResult(pid, foxml.toURI().toASCIIString(), log); } catch (DigitalObjectException ex) { throw new IllegalStateException(ex); } } } /** * Builds tree of digital objects as map of parent nodes and their children. * * @param pids PIDs that will be leafs of the tree * @param exportedPids collection of already exported PIDs * @return {@code Map<parent-PID, Set<child-PID>>} */ private Map<String, Set<String>> buildPidTree(Collection<String> pids, Collection<String> exportedPids) { // P1/R1/C1 // P1/R1/C2 // P1/R3/C3 // pids={C1, C2, C3} // Map<PID, Set<PID>> tree P1={R1, R3}, R1={C1, C2}, R3={C3} Map<String, Set<String>> pidTree = new HashMap<String, Set<String>>(); for (String pid : pids) { try { fillPidTree(pid, pidTree); } catch (DigitalObjectException ex) { throw new IllegalStateException(pid, ex); } } return pidTree; } private void fillPidTree(String selectedPid, Map<String, Set<String>> pidTree) throws DigitalObjectException { List<DigitalObjectElement> reversePath = crawler.getReversePath(selectedPid); reversePath.add(crawler.getEntry(selectedPid)); Set<String> lastChildren = null; for (Iterator<DigitalObjectElement> it = reversePath.iterator(); it.hasNext();) { DigitalObjectElement elm = it.next(); if (lastChildren != null) { lastChildren.add(elm.getPid()); } if (it.hasNext() && !exportedPids.contains(elm.getPid())) { lastChildren = pidTree.get(elm.getPid()); if (lastChildren == null) { lastChildren = new HashSet<String>(); pidTree.put(elm.getPid(), lastChildren); } } } } private void exportDatastreams(LocalObject local, RelationEditor editor) { DigitalObject dobj = local.getDigitalObject(); // XXX replace DS only for other than image/* MIMEs? DatastreamType fullDs = FoxmlUtils.findDatastream(dobj, BinaryEditor.FULL_ID); DatastreamType rawDs = fullDs != null ? null : FoxmlUtils.findDatastream(dobj, BinaryEditor.RAW_ID); for (Iterator<DatastreamType> it = dobj.getDatastream().iterator(); it.hasNext();) { DatastreamType datastream = it.next(); if (options.getExcludeDatastreams().contains(datastream.getID())) { // use RAW if FULL is not available if (rawDs != datastream ) { it.remove(); continue; } } excludeVersions(datastream); renameDatastream(datastream); processDublinCore(datastream); processMods(datastream); processRelsExt(dobj.getPID(), datastream, editor, null); } } private void exportParentDatastreams(LocalObject local, Collection<String> includeChildPids) { DigitalObject dobj = local.getDigitalObject(); RelationEditor editor = new RelationEditor(local); for (Iterator<DatastreamType> it = dobj.getDatastream().iterator(); it.hasNext();) { DatastreamType datastream = it.next(); if (options.getExcludeDatastreams().contains(datastream.getID())) { it.remove(); continue; } excludeVersions(datastream); renameDatastream(datastream); processDublinCore(datastream); processMods(datastream); processRelsExt(dobj.getPID(), datastream, editor, includeChildPids); } } private void excludeVersions(DatastreamType datastream) { List<DatastreamVersionType> versions = datastream.getDatastreamVersion(); final int size = versions.size(); if (size > 1) { DatastreamVersionType actual = versions.get(size - 1); versions.retainAll(Collections.singletonList(actual)); } datastream.setVERSIONABLE(false); } private void renameDatastream(DatastreamType datastream) { String id = datastream.getID(); String newId = options.getDsIdMap().get(id); if (newId != null) { datastream.setID(newId); for (DatastreamVersionType version : datastream.getDatastreamVersion()) { String versionId = version.getID(); String newVersionId = versionId.replace(id, newId); version.setID(newVersionId); } } } private void processDublinCore(DatastreamType datastream) { if (!DcStreamEditor.DATASTREAM_ID.equals(datastream.getID())) { return ; } DatastreamVersionType version = datastream.getDatastreamVersion().get(0); XmlContentType xmlContent = version.getXmlContent(); Element dcElm = xmlContent.getAny().get(0); FoxmlUtils.fixFoxmlDc(dcElm); // add policy String policy = options.getPolicy(); if (policy != null) { Element elmRights = dcElm.getOwnerDocument().createElementNS( DcConstants.NS_PURL, DcConstants.PREFIX_NS_PURL + ':' + DcConstants.RIGHTS); elmRights.setTextContent(policy); dcElm.appendChild(elmRights); } // map proarc/K4 models NodeList typeNodes = dcElm.getElementsByTagNameNS(DcConstants.NS_PURL, DcConstants.TYPE); for (int i = 0; i < typeNodes.getLength(); i++) { Element typeElm = (Element) typeNodes.item(i); String type = typeElm.getTextContent(); String k4ModelId = options.getModelMap().get(type); if (k4ModelId != null) { typeElm.setTextContent(k4ModelId); } } } private void processMods(DatastreamType datastream) { if (!ModsStreamEditor.DATASTREAM_ID.equals(datastream.getID())) { return ; } DatastreamVersionType version = datastream.getDatastreamVersion().get(0); XmlContentType xmlContent = version.getXmlContent(); Element mods = xmlContent.getAny().get(0); removeNils(mods); wrapModsInCollection(xmlContent); } /** * Removes all subelements with xsi:nil attribute as they are worthless. * * JAXB optimizes namespace declarations and moves them to common parent elements * but Fedora ingest ignores it. Then some ingested datastreams may be broken * as they miss optimized namespace declarations (xsi in this case). */ public static void removeNils(Element elm) { NodeList children = elm.getChildNodes(); for (int i = children.getLength() - 1; i >= 0; i--) { Node item = children.item(i); if (item.getNodeType() == Node.ELEMENT_NODE) { Element itemElm = (Element) item; if (itemElm.hasAttributeNS(XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI, "nil")) { itemElm.getParentNode().removeChild(item); } else { removeNils(itemElm); } } } } /** * Wraps mods root element in modsCollection element like Kramerius expects it. */ private static void wrapModsInCollection(XmlContentType xmlContent) { Element mods = xmlContent.getAny().get(0); if ("modsCollection".equals(mods.getLocalName())) { return ; } Element modsCollection = mods.getOwnerDocument().createElementNS( ModsStreamEditor.DATASTREAM_FORMAT_URI, "mods:modsCollection"); modsCollection.appendChild(mods); xmlContent.getAny().clear(); xmlContent.getAny().add(modsCollection); } private void processRelsExt(String pid, DatastreamType datastream, RelationEditor editor, Collection<String> includePids ) { if (!RelationEditor.DATASTREAM_ID.equals(datastream.getID())) { return ; } try { List<Item> childDescriptors = search.findChildren(pid); transformRelation2Kramerius(pid, editor, childDescriptors, includePids); } catch (DigitalObjectException ex) { throw new IllegalStateException(ex); } catch (FedoraClientException ex) { throw new IllegalStateException(ex); } catch (IOException ex) { throw new IllegalStateException(ex); } DatastreamVersionType version = datastream.getDatastreamVersion().get(0); XmlContentType xmlContent = version.getXmlContent(); Element get = xmlContent.getAny().get(0); // optimize XML namespace declaration get.setAttributeNS(XMLConstants.XMLNS_ATTRIBUTE_NS_URI, XMLConstants.XMLNS_ATTRIBUTE + ":" + KRAMERIUS_RELATION_PREFIX, KRAMERIUS_RELATION_NS); } private void transformRelation2Kramerius( String pid, RelationEditor editor, List<Item> childDescriptors, Collection<String> includePids ) throws DigitalObjectException { List<String> children = editor.getMembers(); try { DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance(); dfactory.setNamespaceAware(true); Document doc = dfactory.newDocumentBuilder().newDocument(); List<Element> relations = editor.getRelations(); setOaiId(pid, relations, doc); setImportFile(editor, relations, doc); setPolicy(options.getPolicy(), relations, doc); editor.setDevice(null); editor.setExportResult(null); editor.setOwners(Collections.<String>emptyList()); editor.setMembership(Collections.<String>emptyList()); String modelId = editor.getModel(); String k4ModelId = options.getModelMap().get(modelId); k4ModelId = k4ModelId == null ? modelId : k4ModelId; editor.setModel(k4ModelId); editor.setMembers(Collections.<String>emptyList()); for (String childPid : children) { Item desc = remove(childPid, childDescriptors); if (desc == null) { throw new IllegalStateException("Child " + childPid + " of " + pid + " not found in resource index!"); } if (includePids != null && !includePids.contains(childPid)) { continue; } String krelation = options.getRelationMap().get(desc.getModel()); if (krelation == null) { throw new IllegalStateException(String.format( "Cannot map to Kramerius relation! Child: %s, model: %s, parent: %s ", childPid, desc.getModel(), pid)); } Element elm = doc.createElementNS(KRAMERIUS_RELATION_NS, KRAMERIUS_RELATION_PREFIX + ":" + krelation); elm.setAttributeNS(Relations.RDF_NS, "rdf:resource", RelationResource.fromPid(childPid).getResource()); relations.add(elm); } editor.setRelations(relations); editor.write(editor.getLastModified(), null); } catch (ParserConfigurationException ex) { throw new IllegalStateException(ex); } } private static Item remove(String pid, List<Item> childDescriptors) { for (Iterator<Item> it = childDescriptors.iterator(); it.hasNext();) { Item desc = it.next(); if (pid.equals(desc.getPid())) { it.remove(); return desc; } } return null; } /** * Sets OAI ID as relation if necessary. * @param pid PID to use as OAI ID * @param relations list of existing relations * @param doc DOM */ private static void setOaiId(String pid, List<Element> relations, Document doc) { for (Element relation : relations) { if (OAI_NS.equals(relation.getNamespaceURI()) && "itemID".equals(relation.getLocalName())) { return ; } } Element elm = doc.createElementNS(OAI_NS, "oai:itemID"); elm.setTextContent(pid); relations.add(elm); } /** * Sets kramerius:policy as relation if necessary. * @param policy access policy * @param relations list of existing relations * @param doc DOM */ private static void setPolicy(String policy, List<Element> relations, Document doc) { if (policy != null) { Element elmPolicy = doc.createElementNS(KRAMERIUS_RELATION_NS, KRAMERIUS_RELATION_PREFIX + ":policy"); elmPolicy.setTextContent(policy); relations.add(elmPolicy); } } /** * Replaces proarc-rels:importFile with kramerius:file. */ private void setImportFile(RelationEditor editor, List<Element> relations, Document doc) throws DigitalObjectException { String importFile = editor.getImportFile(); if (importFile != null) { editor.setImportFile(null); Element elm = doc.createElementNS(KRAMERIUS_RELATION_NS, KRAMERIUS_RELATION_PREFIX + ":file"); elm.setTextContent(importFile); relations.add(0, elm); } } }