package org.deri.vocidex.cli;
import java.util.Iterator;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.deri.vocidex.JSONHelper;
import org.deri.vocidex.SPARQLRunner;
import org.deri.vocidex.VocidexDocument;
import org.deri.vocidex.VocidexException;
import org.deri.vocidex.VocidexIndex;
import org.deri.vocidex.describers.LOVTermMetricsDescriber;
import org.deri.vocidex.extract.LOVExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import arq.cmdline.CmdGeneral;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.rdf.model.ResourceFactory;
import com.hp.hpl.jena.shared.NotFoundException;
/**
* A command line tool that indexes an LOV dump, adding all vocabularies
* and their terms to the index. Uses {@link LOVExtractor}.
*
* @author Richard Cyganiak
*/
public class IndexLOV extends CmdGeneral {
	private final static Logger log = LoggerFactory.getLogger(IndexLOV.class);

	public static void main(String... args) {
		new IndexLOV(args).mainRun();
	}

	// Positional command-line arguments, populated in processModulesAndArgs().
	private String clusterName;
	private String hostName;
	private String indexName;
	private String lovDumpFile;

	/**
	 * Registers the usage/help text for the four positional arguments.
	 *
	 * @param args raw command-line arguments, handed to {@link CmdGeneral}
	 */
	public IndexLOV(String[] args) {
		super(args);
		getUsage().startCategory("Arguments");
		getUsage().addUsage("clusterName", "ElasticSearch cluster name (e.g., elasticsearch)");
		getUsage().addUsage("hostname", "ElasticSearch hostname (e.g., localhost)");
		getUsage().addUsage("indexName", "Target ElasticSearch index (e.g., lov)");
		getUsage().addUsage("lov.nq", "Filename or URL of the LOV N-Quads dump");
	}

	/** @return the command name shown in usage messages */
	@Override
	protected String getCommandName() {
		return "index-lov";
	}

	/** @return the one-line usage summary */
	@Override
	protected String getSummary() {
		return getCommandName() + " clusterName hostname indexName lov.nq";
	}

	/**
	 * Binds the four required positional arguments to fields, showing
	 * help (and exiting) if fewer than four were supplied.
	 */
	@Override
	protected void processModulesAndArgs() {
		if (getPositional().size() < 4) {
			doHelp();
		}
		clusterName = getPositionalArg(0);
		hostName = getPositionalArg(1);
		indexName = getPositionalArg(2);
		lovDumpFile = getPositionalArg(3);
	}

	/**
	 * Loads the LOV N-Quads dump into a dataset, then extracts and indexes
	 * every document produced by {@link LOVExtractor} into the target
	 * ElasticSearch index. The index must already exist; otherwise the
	 * command aborts with an error.
	 */
	@Override
	protected void exec() {
		try {
			log.info("Loading LOV dump: {}", lovDumpFile);
			Dataset dataset = RDFDataMgr.loadDataset(lovDumpFile, Lang.NQUADS);
			// Start at 1 to account for the default graph; named graphs are added below.
			long graphCount = 1;
			long tripleCount = dataset.getDefaultModel().size();
			Iterator<String> it = dataset.listNames();
			while (it.hasNext()) {
				graphCount++;
				tripleCount += dataset.getNamedModel(it.next()).size();
			}
			log.info("Read {} triples in {} graphs", tripleCount, graphCount);
			VocidexIndex index = new VocidexIndex(clusterName, hostName, indexName);
			try {
				if (!index.exists()) {
					throw new VocidexException("Index '" + indexName + "' does not exist on the cluster. Create the index first!");
				}
				LOVExtractor lovTransformer = new LOVExtractor(dataset);
				for (VocidexDocument document: lovTransformer) {
					log.info("Indexing {}", document.getId());
					String resultId = index.addDocument(document);
					log.debug("Added new {}, id {}", document.getType(), resultId);
				}
				log.info("Done!");
			} finally {
				// Always release the ElasticSearch client connection.
				index.close();
			}
		} catch (NotFoundException ex) {
			cmdError("Not found: " + ex.getMessage());
		} catch (VocidexException ex) {
			cmdError(ex.getMessage());
		}
	}
}