IteratorForFedora3.java example

Explorer

newspaper-batch-event-framework-master
- item-event-framework
  - autonomous-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        AutonomousComponent.java
        AutonomousComponentUtils.java
        AutonomousWorker.java
        CallResult.java
        ConcurrencyConnectionStateListener.java
        CouldNotGetLockException.java
        LockingException.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        AutonomousComponentTest.java
        TestingComponent.java
  - item-event-framework-common
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        AbstractRunnableComponent.java
        CommunicationException.java
        ConfigConstants.java
        DomsItemFactory.java
        Event.java
        EventStorer.java
        EventTrigger.java
        InitialisationException.java
        Item.java
        ItemFactory.java
        NotFoundException.java
        ResultCollector.java
        RunnableComponent.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        ResultCollectorTest.java
  - sboi-doms-autonomous-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        SBOIDomsAutonomousComponentUtils.java
  - sboi-doms-event-framework
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        DomsEventStorage.java
        DomsEventStorageFactory.java
        PremisManipulator.java
        PremisManipulatorFactory.java
        SBOIEventIndex.java
        SolrJConnector.java
        SolrProxyIterator.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        PremisManipulatorTest.java
        SBOIEventIT.java
        SBOIEventIndexTest.java
        SolrProxyIteratorTest.java
- newspaper-batch-event-framework
  - batch-iterator
    - src
      - main
        java
        dk
        statsbiblioteket
        newspaper
        treenode
        NodeType.java
        TreeNode.java
        TreeNodeState.java
        TreeNodeStateWithChildren.java
        TreeNodeWithChildren.java
      - test
        java
        dk
        statsbiblioteket
        newspaper
        treenode
        TestEventHelper.java
        TreeNodeStateTest.java
        TreeNodeStateWithChildrenTest.java
        TreeNodesStructurePrint.java
  - hadoop-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        hadoop
        AbstractHadoopRunnableComponent.java
  - hadoop-helpers
    - hadoop-doms-integration
      - src
        main
        java
        dk
        statsbiblioteket
        medieplatform
        hadoop
        AbstractDomsReducer.java
        DomsSaverReducer.java
        Utils.java
        test
        java
        dk
        statsbiblioteket
        medieplatform
        hadoop
        DomsSaverReducerTest.java
    - hadoop-file-converters
      - src
        main
        java
        dk
        statsbiblioteket
        medieplatform
        hadoop
        ConvertMapper.java
        WrapperMapper.java
        test
        java
        dk
        statsbiblioteket
        medieplatform
        hadoop
        ConvertMapperTest.java
  - newspaper-autonomous-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        NewspaperBatchAutonomousComponentUtils.java
        NewspaperDomsEventStorage.java
        NewspaperDomsEventStorageFactory.java
        NewspaperSBOIEventStorage.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        AbstractFedoraMockup.java
        FedoraMockupBatchNoRoundTripObject.java
        FedoraMockupEmpty.java
        NewspaperDomsEventStorageIntegrationTest.java
        NewspaperDomsEventStorageTest.java
        NewspaperSBOIEventStorageTest.java
  - newspaper-batch-event-framework-common
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        Batch.java
        BatchItemFactory.java
        EventAccessor.java
  - sample-autonomous-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        SampleComponent.java
        SampleRunnableComponent.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        MockupIteratorSuper.java
        SampleRunnableComponentTest.java
  - tree-processor
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        iterator
        AbstractIterator.java
        common
        AttributeParsingEvent.java
        DataFileNodeBeginsParsingEvent.java
        DataFileNodeEndsParsingEvent.java
        DelegatingTreeIterator.java
        InMemoryAttributeParsingEvent.java
        NodeBeginsParsingEvent.java
        NodeEndParsingEvent.java
        ParsingEvent.java
        ParsingEventType.java
        TreeIterator.java
        eventhandlers
        ConsoleLogger.java
        DefaultTreeEventHandler.java
        EventHandlerFactory.java
        EventRunner.java
        InjectingTreeEventHandler.java
        MultiThreadedEventRunner.java
        TreeEventHandler.java
        fedora3
        ConfigurableFilter.java
        FedoraTreeFilter.java
        IteratorForFedora3.java
        JerseyAttributeParsingEvent.java
        JerseyContentsAttributeParsingEvent.java
        YesFilter.java
        filesystem
        FileAttributeParsingEvent.java
        SimpleIteratorForFilesystems.java
        transforming
        CommonTransformingIterator.java
        DatafileIterator.java
        TransformingIteratorForFileSystems.java
        VirtualIteratorForFileSystems.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        AbstractTests.java
        iterator
        eventhandlers
        EventRunnerTest.java
        MultiThreadedEventRunnerTest.java
        fedora3
        IteratorForFedora3Test.java
        IteratorForFedora3TestWireMocked.java
        JerseyContentsAttributeParsingEventTest.java
        JerseyContentsAttributeParsingEventTestWireMocked.java
        filesystem
        InvalidDirTest.java
        IteratorForFileSystemsTestTestdata1.java
        SimpleIteratorForFilesystemsTest.java
        TransformingIteratorForFileSystemsTest.java
        TransformingIteratorForFileSystemsTestTestdata1.java
  - tree-processor-runnable-component
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        TreeProcessorAbstractRunnableComponent.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        TestingRunnableComponent.java
        TreeProcessorAbstractRunnableComponentTest.java
- process-monitor
  - process-monitor-datasource
    - process-monitor-datasource-interfaces
      - src
        main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        DataSource.java
        NotWorkingProperlyException.java
    - process-monitor-datasource-mockup
      - src
        main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        DataSourceMockup.java
        test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        DataSourceMockupTest.java
    - process-monitor-datasource-tck
      - src
        main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        TCKTestSuite.java
  - sboi-datasource
    - src
      - main
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        SBOIDatasource.java
        SBOIDatasourceConfiguration.java
      - test
        java
        dk
        statsbiblioteket
        medieplatform
        autonomous
        processmonitor
        datasources
        SBOIDatasourceTest.java

package dk.statsbiblioteket.medieplatform.autonomous.iterator.fedora3;

import com.sun.jersey.api.client.Client;
import com.sun.jersey.api.client.WebResource;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.AbstractIterator;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.common.AttributeParsingEvent;
import dk.statsbiblioteket.medieplatform.autonomous.iterator.common.DelegatingTreeIterator;
import dk.statsbiblioteket.util.xml.DOM;
import org.apache.ws.commons.util.NamespaceContextImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * Iterator that iterates objects in a Fedora 3.x repository. It works directly on the
 * REST api.
 */
public class IteratorForFedora3 extends AbstractIterator<String> {
    private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();
    private static final String OBJECTS = "/objects/";
    private static final String INFO_FEDORA = "<info:fedora/";
    private static final String FORMAT = "format";
    private static final String XML = "xml";
    private static final String DC_NAMESPACE = "http://purl.org/dc/elements/1.1/";
    private static final String DATASTREAM_PROFILE_NAMESPACE = "http://www.fedora.info/definitions/1/0/management/";
    private static final String NTRIPLES = "ntriples";
    /** xpath used to pick out the datastream nodes from the list of datastreams */
    private final XPathExpression datastreamsXpath;
    /** Xpath used to select the dc identifier from the contents of a DC datastream */
    private final XPathExpression dcIdentifierXpath;
    /** Xpath used to pick out the checksum from a datastream profile */
    private final XPathExpression datastreamChecksumXpath;
    /** Xpath used to pick out the datastream name from a datastream profile */
    private final XPathExpression datastreamNameXpath;
    private final Client client;
    private final String restUrl;
    private final FedoraTreeFilter filter;
    private final String name;
    private final Logger log = LoggerFactory.getLogger(getClass());


    /**
     * Constructor.
     *
     * @param id      the fedora pid of the root object
     * @param client  the jersey client to use
     * @param restUrl the url to Fedora
     * @param filter  the fedora tree filter to know which relations and datastreams to use
     */
    public IteratorForFedora3(String id, Client client, String restUrl, FedoraTreeFilter filter,
                              String dataFilePattern) {
        super(id, dataFilePattern);
        this.client = client;
        if (!restUrl.endsWith(OBJECTS)) {
            restUrl = restUrl + OBJECTS;
        }
        this.restUrl = restUrl;
        this.filter = filter;
        try {
            XPath xPath = XPATH_FACTORY.newXPath();
            NamespaceContextImpl context = new NamespaceContextImpl();
            context.startPrefixMapping("dc", DC_NAMESPACE);
            context.startPrefixMapping("dp", DATASTREAM_PROFILE_NAMESPACE);
            xPath.setNamespaceContext(context);
            datastreamsXpath = xPath.compile("//@dsid");
            dcIdentifierXpath = xPath.compile("//dc:identifier");
            datastreamChecksumXpath = xPath.compile("//dp:dsChecksum");
            datastreamNameXpath = xPath.compile("/dp:datastreamProfile/dp:dsAltID");

        } catch (XPathExpressionException e) {
            throw new RuntimeException("Illegal XPath. This is a programming error.", e);
        }
        this.name = getNameFromId(id);
    }

    /**
     * Given an object id, get the name from dc:identifier
     *
     * @param id fedora ID of object
     *
     * @return The name found, or the id if none could be found.
     */
    private String getNameFromId(String id) {
        WebResource resource = client.resource(restUrl);
        String dcContent = resource.path(id).path("/datastreams/DC/content").queryParam(FORMAT, XML).get(String.class);
        NodeList nodeList;
        try {
            nodeList = (NodeList) dcIdentifierXpath.evaluate(
                    DOM.streamToDOM(new ByteArrayInputStream(dcContent.getBytes()), true), XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Invalid XPath. This is a programming error.", e);
        }
        for (int i = 0; i < nodeList.getLength(); i++) {
            String textContent = nodeList.item(i).getTextContent();
            if (textContent.startsWith("path:")) {
                return textContent.substring("path:".length());
            }
        }
        return id;
    }

    /**
     * Parse the list of datastreams from the datastream xml list. Removes the ones that should
     * not be used, based on the fedora tree filter
     *
     * @param datastreamXml the datastream xml list
     *
     * @return the list of datastreams
     */
    private List<String> parseDatastreamsFromXml(String datastreamXml) {
        NodeList nodeList;
        try {
            nodeList = (NodeList) datastreamsXpath.evaluate(
                    DOM.streamToDOM(new ByteArrayInputStream(datastreamXml.getBytes()), true), XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            throw new RuntimeException("Invalid XPath. This is a programming error.", e);
        }
        ArrayList<String> result = new ArrayList<>();
        for (int i = 0; i < nodeList.getLength(); i++) {
            String dsid = nodeList.item(i).getTextContent();
            if (filter.isAttributeDatastream(dsid)) {
                result.add(dsid);
            }

        }
        return result;
    }

    @Override
    protected Iterator<DelegatingTreeIterator> initializeChildrenIterator() {
        WebResource resource = client.resource(restUrl);
        //remember to not urlEncode the id here... Stupid fedora
        String relationsShips = resource.path(id).path("relationships").queryParam(FORMAT, NTRIPLES).get(String.class);
        List<String> children = parseRelationsToList(relationsShips);
        List<DelegatingTreeIterator> result = new ArrayList<>(children.size());
        for (String child : children) {
            try {
                DelegatingTreeIterator delegate = new IteratorForFedora3(
                        child, client, restUrl, filter, getDataFilePattern());
                result.add(delegate);
            } catch (Exception e) {
                log.warn("Unable to load child {}, ignoring as if it didn't exist", child, e);
            }
        }
        Collections.sort(
                result, new Comparator<DelegatingTreeIterator>() {
            @Override
            public int compare(DelegatingTreeIterator o1, DelegatingTreeIterator o2) {

                return 0;  //To change body of implemented methods use File | Settings | File Templates.
            }
        });
        return result.iterator();
    }

    /**
     * Parse the relationships of the object into a list of fedora pids. Filters out the ones that
     * should be ignored as detailed in the fedora tree filter
     *
     * @param relationsShips the relationships
     *
     * @return the list of pids of the child objects.
     */
    private List<String> parseRelationsToList(String relationsShips) {
        ArrayList<String> result = new ArrayList<>();
        for (String line : relationsShips.split("\n")) {
            String[] tuple = line.split(" ");
            if (tuple.length >= 3 && tuple[2].startsWith(INFO_FEDORA)) {
                String predicate = tuple[1].substring(1, tuple[1].length() - 1);
                String child = tuple[2].substring(INFO_FEDORA.length(), tuple[2].length() - 1);
                if (filter.isChildRel(predicate)) {
                    result.add(child);
                }
            } else {
                log.debug("Ignoring line {}, while parsing predicates", line);
            }
        }
        return result;
    }

    @Override
    protected Iterator<String> initilizeAttributeIterator() {
        WebResource resource = client.resource(restUrl);
        String datastreamXml = resource.path(id).path("datastreams").queryParam(FORMAT, XML).get(String.class);

        return parseDatastreamsFromXml(datastreamXml).iterator();
    }

    /**
     * construct a Attribute parsing event for a node and attributeID. Uses jersey
     * to return an inputstream to the content
     *
     * @param nodeID      the identifier of the node that the attribute resides in
     * @param attributeID the identifier of the attribute.
     *
     * @return the attribute parsing event
     */
    @Override
    protected AttributeParsingEvent makeAttributeEvent(String nodeID, String attributeID) {
        if (attributeID.equals(JerseyContentsAttributeParsingEvent.CONTENTS)) {
            return new JerseyContentsAttributeParsingEvent(
                    name + "/" + attributeID.toLowerCase(), client.resource(restUrl).path(nodeID), nodeID);
        } else {
            String response = client.resource(restUrl)
                                    .path(nodeID)
                                    .path("/datastreams/")
                                    .path(attributeID)
                                    .queryParam(FORMAT, XML)
                                    .get(String.class);
            Document datastreamProfile = DOM.streamToDOM(new ByteArrayInputStream(response.getBytes()), true);


            String name = null;
            String checksum = null;
            try {
                name = datastreamNameXpath.evaluate(datastreamProfile);
                checksum = datastreamChecksumXpath.evaluate(datastreamProfile);
            } catch (XPathExpressionException e) {
                throw new RuntimeException("Invalid XPath. This is a programming error.", e);
            }
            return new JerseyAttributeParsingEvent(
                    name, checksum, client.resource(restUrl).path(nodeID).path("/datastreams/").path(attributeID));
        }
    }


    @Override
    protected String getIdOfNode() {
        return name;
    }
}