IndexLOV.java example

package org.deri.vocidex.cli;

import java.util.Iterator;

import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.deri.vocidex.JSONHelper;
import org.deri.vocidex.SPARQLRunner;
import org.deri.vocidex.VocidexDocument;
import org.deri.vocidex.VocidexException;
import org.deri.vocidex.VocidexIndex;
import org.deri.vocidex.describers.LOVTermMetricsDescriber;
import org.deri.vocidex.extract.LOVExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import arq.cmdline.CmdGeneral;

import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.rdf.model.ResourceFactory;
import com.hp.hpl.jena.shared.NotFoundException;

/**
 * A command line tool that indexes an LOV dump, adding all vocabularies
 * and their terms to the index. Uses {@link LOVExtractor}.
 * 
 * @author Richard Cyganiak
 */
public class IndexLOV extends CmdGeneral {
	private final static Logger log = LoggerFactory.getLogger(IndexLOV.class);
	
	public static void main(String... args) {
		new IndexLOV(args).mainRun();
	}

	private String clusterName;
	private String hostName;
	private String indexName;
	private String lovDumpFile;
	
	public IndexLOV(String[] args) {
		super(args);
		getUsage().startCategory("Arguments");
		getUsage().addUsage("clusterName", "ElasticSearch cluster name (e.g., elasticsearch)");
		getUsage().addUsage("hostname", "ElasticSearch hostname (e.g., localhost)");
		getUsage().addUsage("indexName", "Target ElasticSearch index (e.g., lov)");
		getUsage().addUsage("lov.nq", "Filename or URL of the LOV N-Quads dump");
	}
	
	@Override
    protected String getCommandName() {
		return "index-lov";
	}
	
	@Override
	protected String getSummary() {
		return getCommandName() + " clusterName hostname indexName lov.nq";
	}

	@Override
	protected void processModulesAndArgs() {
		if (getPositional().size() < 4) {
			doHelp();
		}
		clusterName = getPositionalArg(0);
		hostName = getPositionalArg(1);
		indexName = getPositionalArg(2);
		lovDumpFile = getPositionalArg(3);
	}

	@Override
	protected void exec() {
		try {
			log.info("Loading LOV dump: " + lovDumpFile);
			Dataset dataset = RDFDataMgr.loadDataset(lovDumpFile, Lang.NQUADS);
			long graphCount = 1;
			long tripleCount = dataset.getDefaultModel().size();
			Iterator<String> it = dataset.listNames();
			while (it.hasNext()) {
				graphCount++;
				tripleCount += dataset.getNamedModel(it.next()).size();
			}
			log.info("Read " + tripleCount + " triples in " + graphCount + " graphs");

			VocidexIndex index = new VocidexIndex(clusterName, hostName, indexName);
			try {
				if (!index.exists()) {
					throw new VocidexException("Index '" + indexName + "' does not exist on the cluster. Create the index first!");
				}
				LOVExtractor lovTransformer = new LOVExtractor(dataset);
				for (VocidexDocument document: lovTransformer) {
					log.info("Indexing " + document.getId());
					String resultId = index.addDocument(document);
					log.debug("Added new " + document.getType() + ", id " + resultId);
				}
				log.info("Done!");
			} finally {
				index.close();
			}
		} catch (NotFoundException ex) {
			cmdError("Not found: " + ex.getMessage());
		} catch (VocidexException ex) {
			cmdError(ex.getMessage());
		}
	}
}