package org.xbib.tools; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.common.settings.Settings; import org.xbib.elasticsearch.helper.client.LongAdderIngestMetric; import org.xbib.oai.OAIConstants; import org.xbib.oai.OAIDateResolution; import org.xbib.oai.client.OAIClient; import org.xbib.oai.client.OAIClientFactory; import org.xbib.oai.client.listrecords.ListRecordsListener; import org.xbib.oai.client.listrecords.ListRecordsRequest; import org.xbib.oai.rdf.RdfResourceHandler; import org.xbib.oai.xml.SimpleMetadataHandler; import org.xbib.iri.namespace.IRINamespaceContext; import org.xbib.rdf.RdfContentBuilder; import org.xbib.rdf.RdfContentParams; import org.xbib.rdf.content.RouteRdfXContentParams; import org.xbib.rdf.io.ntriple.NTripleContentParams; import org.xbib.util.DateUtil; import org.xbib.util.URIUtil; import org.xbib.util.concurrent.URIWorkerRequest; import org.xbib.util.concurrent.Worker; import org.xbib.util.concurrent.WorkerProvider; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import java.io.IOException; import java.io.StringWriter; import java.net.URI; import java.util.Date; import java.util.Map; import static org.xbib.rdf.content.RdfXContentFactory.routeRdfXContentBuilder; /** * Harvest from OAI and feed to Elasticsearch */ public class OAIFeeder extends TimewindowFeeder { private final static Logger logger = LogManager.getLogger(OAIFeeder.class); private static String index; private static String concreteIndex; protected void setIndex(String index) { this.index = index; } protected String getIndex() { return index; } protected void setConcreteIndex(String concreteIndex) { this.concreteIndex = concreteIndex; } protected String getConcreteIndex() { return concreteIndex; } @Override protected WorkerProvider<Worker<URIWorkerRequest>> provider() { return OAIFeeder::new; } @Override protected void prepareSink() throws IOException { if (ingest == null) { ingest = createIngest(); Integer maxbulkactions = settings.getAsInt("maxbulkactions", 1000); Integer maxconcurrentbulkrequests = settings.getAsInt("maxconcurrentbulkrequests", Runtime.getRuntime().availableProcessors()); ingest.maxActionsPerRequest(maxbulkactions) .maxConcurrentRequests(maxconcurrentbulkrequests); ingest.init(Settings.settingsBuilder() .put("cluster.name", settings.get("elasticsearch.cluster")) .put("host", settings.get("elasticsearch.host")) .put("port", settings.getAsInt("elasticsearch.port", 9300)) .put("sniff", settings.getAsBoolean("elasticsearch.sniff", false)) .put("autodiscover", settings.getAsBoolean("elasticsearch.autodiscover", false)) .build(), new LongAdderIngestMetric()); } super.prepareSink(); } @Override protected void process(URI uri) throws Exception { Map<String, String> params = URIUtil.parseQueryString(uri); String server = uri.toString(); String verb = params.get("verb"); String metadataPrefix = params.get("metadataPrefix"); String set = params.get("set"); Date from = DateUtil.parseDateISO(params.get("from")); Date until = DateUtil.parseDateISO(params.get("until")); final OAIClient client = OAIClientFactory.newClient(server); client.setTimeout(settings.getAsInt("timeout", 60000)); if (!verb.equals(OAIConstants.LIST_RECORDS)) { logger.warn("no verb {}, returning", OAIConstants.LIST_RECORDS); return; } ListRecordsRequest request = client.newListRecordsRequest() .setMetadataPrefix(metadataPrefix) .setSet(set) .setFrom(from, OAIDateResolution.DAY) .setUntil(until, OAIDateResolution.DAY); do { try { request.addHandler(newMetadataHandler()); ListRecordsListener listener = new ListRecordsListener(request); logger.info("OAI request: {}", request); request.prepare().execute(listener).waitFor(); if (listener.getResponse() != null) { logger.debug("got OAI response"); StringWriter w = new StringWriter(); listener.getResponse().to(w); logger.debug("{}", w); request = client.resume(request, listener.getResumptionToken()); } else { logger.debug("no valid OAI response"); } } catch (IOException e) { logger.error(e.getMessage(), e); request = null; } } while (request != null); client.close(); } protected RdfResourceHandler rdfResourceHandler() { RdfContentParams params = NTripleContentParams.DEFAULT_PARAMS; return new RdfResourceHandler(params); } protected SimpleMetadataHandler newMetadataHandler() { return new OAISimpleMetadataHandler(); } protected String map(String id, String content) throws IOException { return content; } @Override public void newRequest(Worker worker, URIWorkerRequest request) { try { URI uri = request.get(); logger.info("processing URI {}", uri); process(uri); } catch (Throwable ex) { logger.error(request.get() + ": error while processing input: " + ex.getMessage(), ex); } } public class OAISimpleMetadataHandler extends SimpleMetadataHandler { private final IRINamespaceContext namespaceContext; private RdfResourceHandler handler; public OAISimpleMetadataHandler() { namespaceContext = IRINamespaceContext.newInstance(); namespaceContext.addNamespace("", "http://www.openarchives.org/OAI/2.0/oai_dc/"); namespaceContext.addNamespace("dc", "http://purl.org/dc/elements/1.1/"); } @Override public void startDocument() throws SAXException { this.handler = rdfResourceHandler(); handler.setDefaultNamespace("", "http://www.openarchives.org/OAI/2.0/oai_dc/"); handler.startDocument(); } @Override public void endDocument() throws SAXException { handler.endDocument(); try { RouteRdfXContentParams params = new RouteRdfXContentParams(namespaceContext, getConcreteIndex(), getType()); params.setHandler((content, p) -> { content = map(getHeader().getIdentifier(), content); if (settings.getAsBoolean("mock", false)) { logger.info("{}", content); } else { ingest.index(p.getIndex(), p.getType(), getHeader().getIdentifier(), content); } }); RdfContentBuilder builder = routeRdfXContentBuilder(params); builder.receive(handler.getResource()); } catch (IOException e) { logger.error(e.getMessage(), e); throw new SAXException(e); } } @Override public void startPrefixMapping(String string, String string1) throws SAXException { handler.startPrefixMapping(string, string1); } @Override public void endPrefixMapping(String string) throws SAXException { handler.endPrefixMapping(string); } @Override public void startElement(String ns, String localname, String string2, Attributes atrbts) throws SAXException { handler.startElement(ns, localname, string2, atrbts); } @Override public void endElement(String ns, String localname, String string2) throws SAXException { handler.endElement(ns, localname, string2); } @Override public void characters(char[] chars, int i, int i1) throws SAXException { handler.characters(chars, i, i1); } } }