/* Copyright 2015 hbz, Pascal Christoph.
* Licensed under the Eclipse Public License 1.0 */
package org.lobid.lodmill;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.FileUtils;
import org.culturegraph.mf.morph.Metamorph;
import org.culturegraph.mf.stream.converter.xml.XmlDecoder;
import org.culturegraph.mf.stream.source.FileOpener;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.query.MatchAllQueryBuilder;
import org.elasticsearch.node.Node;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import com.github.jsonldjava.core.JsonLdError;
import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.jena.JenaTripleCallback;
import com.github.jsonldjava.utils.JSONUtils;
import com.hp.hpl.jena.rdf.model.Model;
/**
* Transform hbz01 Aleph Mab XML catalog data into lobid elasticsearch JSON-LD.
* Query the index and test the data by transforming the data into ntriples
* (which is great to make diffs).
*
* @author Pascal Christoph (dr0i)
*
*/
@SuppressWarnings("javadoc")
public final class MabXml2ElasticsearchLobidTest {
private static Node node;
protected static Client client;
private static final String LOBID_RESOURCES = "lobid-resources";
private static final String N_TRIPLE = "N-TRIPLE";
private static final String TEST_FILENAME = "hbz01.es.nt";
@BeforeClass
public static void setup() {
node = nodeBuilder().local(true)
.settings(ImmutableSettings.settingsBuilder()
.put("index.number_of_replicas", "0")
.put("index.number_of_shards", "1").build())
.node();
client = node.client();
client.admin().indices().prepareDelete("_all").execute().actionGet();
client.admin().cluster().prepareHealth().setWaitForYellowStatus().execute()
.actionGet();
}
@SuppressWarnings("static-method")
@Test
public void testFlow() throws URISyntaxException {
buildAndExecuteFlow();
String ntriples = getElasticsearchDocumentsAsNtriples();
File testFile = new File(TEST_FILENAME);
try {
FileUtils.writeStringToFile(testFile, ntriples, false);
} catch (IOException e) {
e.printStackTrace();
}
AbstractIngestTests.compareFilesDefaultingBNodes(testFile,
new File(Thread.currentThread().getContextClassLoader()
.getResource(TEST_FILENAME).toURI()));
testFile.deleteOnExit();
}
private static void buildAndExecuteFlow() {
final FileOpener opener = new FileOpener();
opener.setCompression("BZIP2");
final Triples2RdfModel triple2model = new Triples2RdfModel();
triple2model.setInput(N_TRIPLE);
opener.setReceiver(new TarReader()).setReceiver(new XmlDecoder())
.setReceiver(new MabXmlHandler())
.setReceiver(
new Metamorph("src/main/resources/morph-hbz01-to-lobid.xml"))
.setReceiver(new PipeEncodeTriples()).setReceiver(triple2model)
.setReceiver(new RdfModel2ElasticsearchJsonLd())
.setReceiver(getElasticsearchIndexer());
opener.process(
new File("src/test/resources/hbz01XmlClobs.tar.bz2").getAbsolutePath());
opener.closeStream();
}
private static ElasticsearchIndexer getElasticsearchIndexer() {
ElasticsearchIndexer esIndexer = new ElasticsearchIndexer();
esIndexer.setElasticsearchClient(client);
esIndexer.setIndexName(LOBID_RESOURCES);
esIndexer.setIndexAliasSuffix("");
esIndexer.setUpdateNewestIndex(false);
esIndexer.onSetReceiver();
return esIndexer;
}
private static String getElasticsearchDocumentsAsNtriples() {
SearchResponse actionGet = client.prepareSearch(LOBID_RESOURCES)
.setQuery(new MatchAllQueryBuilder()).setFrom(0).setSize(10000)
.execute().actionGet();
return Arrays.asList(actionGet.getHits().getHits()).parallelStream()
.flatMap(hit -> Stream.of(toRdf(hit.getSourceAsString())))
.collect(Collectors.joining());
}
private static String toRdf(final String jsonLd) {
try {
final Object jsonObject = JSONUtils.fromString(jsonLd);
final JenaTripleCallback callback = new JenaTripleCallback();
final Model model = (Model) JsonLdProcessor.toRDF(jsonObject, callback);
final StringWriter writer = new StringWriter();
model.write(writer, N_TRIPLE);
return writer.toString();
} catch (IOException | JsonLdError e) {
e.printStackTrace();
}
return null;
}
@AfterClass
public static void down() {
client.admin().indices().prepareDelete("_all").execute().actionGet();
node.close();
}
}