package eu.ehri.project.indexing; import com.fasterxml.jackson.databind.JsonNode; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import eu.ehri.project.indexing.converter.Converter; import eu.ehri.project.indexing.converter.impl.JsonConverter; import eu.ehri.project.indexing.converter.impl.NoopConverter; import eu.ehri.project.indexing.index.Index; import eu.ehri.project.indexing.index.impl.SolrIndex; import eu.ehri.project.indexing.sink.Sink; import eu.ehri.project.indexing.sink.impl.CallbackSink; import eu.ehri.project.indexing.sink.impl.IndexJsonSink; import eu.ehri.project.indexing.sink.impl.OutputStreamJsonSink; import eu.ehri.project.indexing.source.Source; import eu.ehri.project.indexing.source.impl.FileJsonSource; import eu.ehri.project.indexing.source.impl.InputStreamJsonSource; import eu.ehri.project.indexing.source.impl.WebJsonSource; import eu.ehri.project.indexing.utils.Stats; import org.apache.commons.cli.*; import javax.ws.rs.core.UriBuilder; import java.io.IOException; import java.net.URI; import java.util.List; import java.util.Properties; /** * Pull data from the EHRI REST API and index it in Solr. * <p/> * Designed allow very flexible input/output options without * incurring excessive complexity in the main logic. Orchestrates * a source, a converter, and one or more sink objects to get some JSON * data, convert it to another format, and put it somewhere. */ public class IndexHelper { // Default service end points. // TODO: Store these in a properties file? public static final String DEFAULT_SOLR_URL = "http://localhost:8983/solr/portal"; public static final String DEFAULT_EHRI_URL = "http://localhost:7474/ehri"; enum ErrCodes { BAD_SOURCE_ERR(3), BAD_SINK_ERR(4), BAD_CONVERSION_ERR(5), BAD_STATE_ERR(6), INDEX_ERR(7); private final int code; ErrCodes(int code) { this.code = code; } public int getCode() { return code; } } /** * Turn a list of specs into a set of EHRI web service URLs to download * JSON data from. * <p/> * Specs can be: * <ul> * <li>An item class name, e.g. "DocumentaryUnit"</li> * <li>Item ids prefixed with an "@"</li> * <li>An item type <i>and</i> ID, which denotes downloading * the contents of that item, e.g. it's child items, and * their descendants.</li> * </ul> * * @param serviceUrl The base REST URL * @param specs A list of specs * @return A list of URLs */ public static List<URI> urlsFromSpecs(String serviceUrl, String... specs) { List<URI> urls = Lists.newArrayList(); List<String> ids = Lists.newArrayList(); for (String spec : specs) { // Item type and id - denotes fetching child items (?) if (spec.contains("|")) { Iterable<String> split = Splitter.on("|").limit(2).split(spec); String type = Iterables.get(split, 0); String id = Iterables.get(split, 1); URI url = UriBuilder.fromPath(serviceUrl) .segment("classes") .segment(type).segment(id).segment("list") .queryParam("limit", -1) .queryParam("all", true).build(); urls.add(url); } else if (spec.startsWith("@")) { ids.add(spec.substring(1)); } else { URI url = UriBuilder.fromPath(serviceUrl) .segment("classes") .segment(spec) .queryParam("limit", -1).build(); urls.add(url); } } // Unlike types or children, multiple ids are done in one request. if (!ids.isEmpty()) { UriBuilder idBuilder = UriBuilder.fromPath(serviceUrl).segment("entities"); for (String id : ids) { idBuilder = idBuilder.queryParam("id", id); } urls.add(idBuilder.queryParam("limit", -1).build()); } return urls; } @SuppressWarnings("static-access") public static void main(String[] args) throws IOException, ParseException { // Long opts final String PRINT = "print"; final String PRETTY = "pretty"; final String CLEAR_ALL = "clear-all"; final String CLEAR_KEY_VALUE = "clear-key-value"; final String CLEAR_ID = "clear-id"; final String CLEAR_TYPE = "clear-type"; final String FILE = "file"; final String REST_URL = "rest"; final String HEADERS = "H"; final String SOLR_URL = "solr"; final String INDEX = "index"; final String NO_CONVERT = "noconvert"; final String VERBOSE = "verbose"; final String VERSION = "version"; final String STATS = "stats"; final String HELP = "help"; Options options = new Options(); options.addOption("p", "print", false, "Print converted JSON to stdout. The default action in the omission of --index."); options.addOption("D", CLEAR_ALL, false, "Clear entire index first (use with caution.)"); options.addOption(Option.builder("K").longOpt(CLEAR_KEY_VALUE) .argName("key=value") .numberOfArgs(2) .valueSeparator() .desc("Clear items with a given key=value pair. Can be used multiple times.") .build()); options.addOption("c", CLEAR_ID, true, "Clear an individual id. Can be used multiple times."); options.addOption("C", CLEAR_TYPE, true, "Clear an item type. Can be used multiple times."); options.addOption("P", PRETTY, false, "Pretty print out JSON given by --print (implies --print)."); options.addOption("s", SOLR_URL, true, "Base URL for Solr service (minus the action segment.)"); options.addOption("f", FILE, true, "Read input from a file instead of the REST service. Use '-' for stdin."); options.addOption("r", REST_URL, true, "Base URL for EHRI REST service."); options.addOption(Option.builder(HEADERS) .argName("header=value") .numberOfArgs(2) .valueSeparator() .desc("Set a header for the REST service.") .build()); options.addOption("i", INDEX, false, "Index the data. This is NOT the default for safety reasons."); options.addOption("n", NO_CONVERT, false, "Don't convert data to index format."); options.addOption("v", VERBOSE, false, "Print individual item ids to show progress."); options.addOption(Option.builder().longOpt(VERSION) .desc("Print the version number and exit.") .build()); options.addOption("S", STATS, false, "Print indexing stats."); options.addOption("h", HELP, false, "Print this message."); CommandLineParser parser = new DefaultParser(); CommandLine cmd = parser.parse(options, args); final String toolName = IndexHelper.class.getPackage().getImplementationTitle(); final String toolVersion = IndexHelper.class.getPackage().getImplementationVersion(); if (cmd.hasOption(VERSION)) { System.out.println(toolName + " " + toolVersion); System.exit(0); } String usage = toolName + " [OPTIONS] <spec> ... <specN>"; String help = "\n" + "Each <spec> should consist of:\n" + " * an item type (all items of that type)\n" + " * an item id prefixed with '@' (individual items)\n" + " * a type|id (bar separated - all children of an item)\n\n\n" + "The default URIs for Solr and the REST service are:\n" + " * " + DEFAULT_EHRI_URL + "\n" + " * " + DEFAULT_SOLR_URL + "\n\n"; if (cmd.hasOption(HELP)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(usage, null, options, help); System.exit(1); } String ehriUrl = cmd.getOptionValue(REST_URL, DEFAULT_EHRI_URL); String solrUrl = cmd.getOptionValue(SOLR_URL, DEFAULT_SOLR_URL); Properties restHeaders = cmd.getOptionProperties(HEADERS); Pipeline.Builder<JsonNode, JsonNode> builder = new Pipeline.Builder<>(); // Initialize the index... Index index = new SolrIndex(solrUrl); // Determine if we're printing the data... if (!cmd.hasOption(INDEX) || cmd.hasOption(PRINT) || cmd.hasOption(PRETTY)) { builder.addSink(new OutputStreamJsonSink(System.out, cmd.hasOption(PRETTY))); } // Determine if we need to actually index the data... if (cmd.hasOption(INDEX)) { builder.addSink(new IndexJsonSink(index, new IndexJsonSink.EventHandler() { @Override public void handleEvent(Object event) { System.err.println(event); } })); } // Determine if we want to convert the data or print the incoming // JSON as-is... builder.addConverter(cmd.hasOption(NO_CONVERT) ? new NoopConverter<JsonNode>() : new JsonConverter()); // See if we want to print stats... if so create a callback sink // to count the individual items and optionally print them... if (cmd.hasOption(VERBOSE) || cmd.hasOption(STATS)) { final Stats stats = new Stats(); final boolean printStats = cmd.hasOption(STATS); final boolean printItems = cmd.hasOption(VERBOSE); CallbackSink.Callback<JsonNode> cb = new CallbackSink.Callback<JsonNode>() { @Override public void call(JsonNode jsonNode) { stats.incrementCount(); if (printItems) { System.err.println(jsonNode.path("type").asText() + " -> " + jsonNode.path("id").asText()); } } @Override public void finish() { if (printStats) { stats.printReport(System.err); } } }; builder.addSink(new CallbackSink<>(cb)); } // Determine the source, either stdin, a file, or the rest service. if (cmd.hasOption(FILE)) { for (String fileName : cmd.getOptionValues(FILE)) { if (fileName.trim().equals("-")) { builder.addSource(new InputStreamJsonSource(System.in)); } else { builder.addSource(new FileJsonSource(fileName)); } } } // Parse the command line specs... for (URI uri : urlsFromSpecs(ehriUrl, cmd.getArgs())) { builder.addSource(new WebJsonSource(uri, restHeaders)); } try { // Check if we need to clear anything in index... do this if we're NOT indexing. boolean commitOnDelete = !cmd.hasOption(INDEX); if (cmd.hasOption(CLEAR_ALL)) { index.deleteAll(commitOnDelete); } else { if (cmd.hasOption(CLEAR_ID)) { String[] ids = cmd.getOptionValues(CLEAR_ID); index.deleteItems(Lists.newArrayList(ids), commitOnDelete); } if (cmd.hasOption(CLEAR_TYPE)) { String[] types = cmd.getOptionValues(CLEAR_TYPE); index.deleteTypes(Lists.newArrayList(types), commitOnDelete); } if (cmd.hasOption(CLEAR_KEY_VALUE)) { Properties kvs = cmd.getOptionProperties(CLEAR_KEY_VALUE); for (String key : kvs.stringPropertyNames()) { index.deleteByFieldValue(key, kvs.getProperty(key), commitOnDelete); } } } // Now do the main indexing tasks builder.build().run(); } catch (Source.SourceException e) { System.err.println(e.getMessage()); System.exit(ErrCodes.BAD_SOURCE_ERR.getCode()); } catch (Converter.ConverterException e) { System.err.println(e.getMessage()); System.exit(ErrCodes.BAD_CONVERSION_ERR.getCode()); } catch (Sink.SinkException e) { System.err.println(e.getMessage()); System.exit(ErrCodes.BAD_SINK_ERR.getCode()); } catch (Index.IndexException e) { System.err.println(e.getMessage()); System.exit(ErrCodes.INDEX_ERR.getCode()); } catch (IllegalStateException e) { System.err.println(e.getMessage()); System.exit(ErrCodes.BAD_STATE_ERR.getCode()); } } }