// AddVocabulary.java example

package org.deri.vocidex.cli;

import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.system.StreamRDFBase;
import org.deri.vocidex.SPARQLRunner;
import org.deri.vocidex.VocidexDocument;
import org.deri.vocidex.VocidexException;
import org.deri.vocidex.VocidexIndex;
import org.deri.vocidex.extract.VocabularyTermExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import arq.cmdline.ArgDecl;
import arq.cmdline.CmdGeneral;

import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.shared.NotFoundException;
import com.hp.hpl.jena.sparql.core.Quad;

/**
 * A command line utility that adds a single RDFS/OWL file to the index.
 *
 * <p>Usage: {@code add-vocabulary [--prefix prefix] clusterName hostname indexName input.rdf}
 *
 * <p>The input is parsed into an in-memory model, terms are extracted from it
 * with a {@link VocabularyTermExtractor}, and each resulting
 * {@link VocidexDocument} is added to the named index. The index must already
 * exist; this tool does not create it.
 *
 * @author Richard Cyganiak
 */
public class AddVocabulary extends CmdGeneral {
	private static final Logger log = LoggerFactory.getLogger(AddVocabulary.class);

	/** Exact number of positional arguments: clusterName, hostname, indexName, input.rdf. */
	private static final int EXPECTED_POSITIONAL_ARGS = 4;

	/**
	 * Command line entry point.
	 *
	 * @param args the raw command line arguments
	 */
	public static void main(String... args) {
		new AddVocabulary(args).mainRun();
	}

	/** Declaration of the optional {@code --prefix} option, which takes a value. */
	private final ArgDecl prefixArg = new ArgDecl(true, "prefix");

	private String clusterName;
	private String hostName;
	private String indexName;
	private String inFile;
	/** Value of {@code --prefix}; stays {@code null} when the option is not given. */
	private String prefix = null;

	/**
	 * Creates the command and registers its usage text and the
	 * {@code --prefix} option.
	 *
	 * @param args the raw command line arguments
	 */
	public AddVocabulary(String[] args) {
		super(args);
		getUsage().startCategory("Arguments");
		getUsage().addUsage("clusterName", "ElasticSearch cluster name (e.g., elasticsearch)");
		getUsage().addUsage("hostname", "ElasticSearch hostname (e.g., localhost)");
		getUsage().addUsage("indexName", "ElasticSearch target index name (e.g., vocabs)");
		getUsage().addUsage("input.rdf", "RDFS/OWL file or URL to be indexed; many RDF formats supported");
		getUsage().startCategory("Options");
		add(prefixArg, "--prefix prefix", "Set prefix to be used for this vocabulary");
	}

	@Override
	protected String getCommandName() {
		return "add-vocabulary";
	}

	@Override
	protected String getSummary() {
		return getCommandName() + " clusterName hostname indexName input.rdf";
	}

	/**
	 * Reads the four positional arguments and the optional prefix into fields.
	 * Shows the help text when the number of positional arguments is not
	 * exactly four.
	 */
	@Override
	protected void processModulesAndArgs() {
		if (getPositional().size() != EXPECTED_POSITIONAL_ARGS) {
			// NOTE(review): the code below relies on doHelp() not returning
			// normally (presumably it prints usage and terminates) -- confirm
			// against the CmdGeneral implementation in use.
			doHelp();
		}
		clusterName = getPositionalArg(0);
		hostName = getPositionalArg(1);
		indexName = getPositionalArg(2);
		inFile = getPositionalArg(3);
		if (hasArg(prefixArg)) {
			prefix = getArg(prefixArg).getValue();
		}
	}

	/**
	 * Loads the input file and indexes every document extracted from it.
	 * A {@link NotFoundException} or {@link VocidexException} is reported
	 * through {@code cmdError} rather than propagated.
	 */
	@Override
	protected void exec() {
		try {
			Model model = loadModel();
			indexVocabulary(model);
		} catch (NotFoundException ex) {
			// NOTE(review): cmdError is inherited; presumably it prints the
			// message and terminates the command -- confirm.
			cmdError("Not found: " + ex.getMessage());
		} catch (VocidexException ex) {
			cmdError(ex.getMessage());
		}
	}

	/**
	 * Parses {@link #inFile} into a fresh in-memory model.
	 *
	 * @return the model holding all parsed triples
	 */
	private Model loadModel() {
		log.info("Loading RDF file: {}", inFile);
		final Model model = ModelFactory.createDefaultModel();
		// Ignore the fourth element in quad-based formats, so that quads end
		// up in the same graph as plain triples.
		RDFDataMgr.parse(new StreamRDFBase() {
			@Override
			public void triple(Triple triple) {
				model.getGraph().add(triple);
			}
			@Override
			public void quad(Quad quad) {
				model.getGraph().add(quad.asTriple());
			}
		}, inFile);
		log.info("Read {} triples", model.size());
		return model;
	}

	/**
	 * Adds every document extracted from the model to the target index and
	 * closes the index connection afterwards, whether or not indexing succeeded.
	 *
	 * @param model the parsed vocabulary
	 * @throws VocidexException if the target index does not exist, or if
	 *         raised by the extraction or indexing calls
	 */
	private void indexVocabulary(Model model) throws VocidexException {
		VocidexIndex index = new VocidexIndex(clusterName, hostName, indexName);
		try {
			if (!index.exists()) {
				throw new VocidexException("Index '" + indexName + "' does not exist on the cluster. Create the index first!");
			}
			for (VocidexDocument document : new VocabularyTermExtractor(new SPARQLRunner(model), prefix)) {
				log.info("Indexing {}", document.getId());
				String resultId = index.addDocument(document);
				// Parameterized form avoids building the message when debug is off.
				log.debug("Added new {}, id {}", document.getType(), resultId);
			}
			log.info("Done!");
		} finally {
			index.close();
		}
	}
}