/* Copyright 2015 hbz, Pascal Christoph.
 * Licensed under the Eclipse Public License 1.0 */

package org.lobid.lodmill.run;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;

import org.culturegraph.mf.morph.Metamorph;
import org.culturegraph.mf.stream.converter.xml.XmlDecoder;
import org.culturegraph.mf.stream.pipe.BatchLogger;
import org.culturegraph.mf.stream.pipe.ObjectBatchLogger;
import org.culturegraph.mf.stream.pipe.StreamTee;
import org.culturegraph.mf.stream.source.FileOpener;
import org.culturegraph.mf.stream.source.TarReader;
import org.lobid.lodmill.ElasticsearchIndexer;
import org.lobid.lodmill.MabXmlHandler;
import org.lobid.lodmill.PipeEncodeTriples;
import org.lobid.lodmill.RdfModel2ElasticsearchJsonLd;
import org.lobid.lodmill.Stats;
import org.lobid.lodmill.Triples2RdfModel;

/**
 * Transform hbz01 Aleph Mab XML catalog data into lobid elasticsearch ready
 * JSON-LD and index that into elasticsearch.
 *
 * @author Pascal Christoph (dr0i)
 *
 */
@SuppressWarnings("javadoc")
public final class MabXml2lobidJsonEs {

	/** Number of command line arguments that {@link #main(String...)} requires. */
	private static final int EXPECTED_ARGS = 6;

	/**
	 * Format template naming the six arguments. Each {@code %s} is filled either
	 * with a separator (usage message) or with the actual value (summary).
	 */
	private static final String USAGE =
			"<input path>%s<index name>%s<index alias suffix>%s<node>%s<cluster>%s"
					+ "<'update' (will take latest index), 'exact' (will take ->'index name' even when no timestamp is suffixed) , else create new index with actual timestamp>%s";

	/**
	 * Reads a (optionally bzip2/gzip compressed) tar archive of Mab XML records,
	 * runs it through the morph {@code morph-hbz01-to-lobid.xml} and feeds the
	 * result into the Elasticsearch indexer.
	 *
	 * <p>
	 * Exactly six arguments are required. If a different number is given, the
	 * usage message is written to stderr and the JVM exits with status -1.
	 *
	 * @param args
	 *          <ol>
	 *          <li>input path</li>
	 *          <li>index name</li>
	 *          <li>index alias suffix</li>
	 *          <li>node (passed to the indexer as hostname)</li>
	 *          <li>cluster name</li>
	 *          <li>mode: {@code update} to use the latest index, {@code exact} to
	 *          use the index name as given, anything else to create a new index
	 *          with the current timestamp appended</li>
	 *          </ol>
	 */
	public static void main(String... args) {
		// Validate first: every args[i] access below would otherwise throw an
		// ArrayIndexOutOfBoundsException before the usage text could be shown.
		if (args.length != EXPECTED_ARGS) {
			System.err.println("Usage: MabXml2lobidJsonEs"
					+ String.format(USAGE, " ", " ", " ", " ", " ", " "));
			System.exit(-1);
		}
		String inputPath = args[0];
		String mode = args[5];
		String indexName = resolveIndexName(args[1], mode);
		String indexAliasSuffix = args[2];
		String node = args[3];
		String cluster = args[4];
		boolean update = mode.equalsIgnoreCase("update");
		// One "label: value" pair per line.
		System.out.println("It is specified:\n" + String.format(USAGE,
				": " + inputPath + "\n", ": " + indexName + "\n",
				": " + indexAliasSuffix + "\n", ": " + node + "\n",
				": " + cluster + "\n", ": " + update));

		// hbz catalog transformation
		final FileOpener opener = new FileOpener();
		// The compression is derived from the file name suffix; for any other
		// suffix the opener keeps whatever compression setting it starts with.
		final String lowerCasePath = inputPath.toLowerCase();
		if (lowerCasePath.endsWith("bz2")) {
			opener.setCompression("BZIP2");
		} else if (lowerCasePath.endsWith("gz")) {
			opener.setCompression("GZIP");
		}
		final Triples2RdfModel triple2model = new Triples2RdfModel();
		triple2model.setInput("N-TRIPLE");

		ElasticsearchIndexer esIndexer = new ElasticsearchIndexer();
		esIndexer.setClustername(cluster);
		esIndexer.setHostname(node);
		esIndexer.setIndexName(indexName);
		esIndexer.setIndexAliasSuffix(indexAliasSuffix);
		esIndexer.setUpdateNewestIndex(update);
		// NOTE(review): invoked explicitly after configuration; presumably this
		// initialises the indexer - confirm against ElasticsearchIndexer.
		esIndexer.onSetReceiver();

		BatchLogger batchLogger = new BatchLogger();
		batchLogger.setBatchSize(100000);
		ObjectBatchLogger<HashMap<String, String>> objectBatchLogger =
				new ObjectBatchLogger<>();
		objectBatchLogger.setBatchSize(500000);

		// The morph output is sent to two receivers: the statistics collector
		// and the chain that ends in the indexer.
		StreamTee streamTee = new StreamTee();
		final Stats stats = new Stats();
		streamTee.addReceiver(stats);
		streamTee.addReceiver(batchLogger);
		batchLogger.setReceiver(new PipeEncodeTriples()).setReceiver(triple2model)
				.setReceiver(new RdfModel2ElasticsearchJsonLd())
				.setReceiver(objectBatchLogger).setReceiver(esIndexer);

		opener.setReceiver(new TarReader()).setReceiver(new XmlDecoder())
				.setReceiver(new MabXmlHandler())
				.setReceiver(
						new Metamorph("src/main/resources/morph-hbz01-to-lobid.xml"))
				.setReceiver(streamTee);
		opener.process(new File(inputPath).getAbsolutePath());
		opener.closeStream();
	}

	/**
	 * Decides which index name is handed to the indexer.
	 *
	 * @param indexName the index name as given on the command line
	 * @param mode the mode argument as given on the command line
	 * @return {@code indexName} unchanged if it contains {@code -20} or if
	 *         {@code mode} equals {@code exact} ignoring case; otherwise
	 *         {@code indexName} followed by {@code -} and the current time
	 *         formatted as {@code yyyyMMdd-HHmmss}
	 */
	private static String resolveIndexName(String indexName, String mode) {
		if (indexName.matches(".*-20.*") || mode.equalsIgnoreCase("exact")) {
			return indexName;
		}
		String date = new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date());
		return indexName + "-" + date;
	}
}