package org.deri.vocidex.cli; import org.apache.jena.riot.RDFDataMgr; import org.apache.jena.riot.system.StreamRDFBase; import org.deri.vocidex.SPARQLRunner; import org.deri.vocidex.VocidexDocument; import org.deri.vocidex.VocidexException; import org.deri.vocidex.VocidexIndex; import org.deri.vocidex.extract.VocabularyTermExtractor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import arq.cmdline.ArgDecl; import arq.cmdline.CmdGeneral; import com.hp.hpl.jena.graph.Triple; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.shared.NotFoundException; import com.hp.hpl.jena.sparql.core.Quad; /** * A command line utility that adds a single RDFS/OWL file to the index. * * @author Richard Cyganiak */ public class AddVocabulary extends CmdGeneral { private final static Logger log = LoggerFactory.getLogger(AddVocabulary.class); public static void main(String... args) { new AddVocabulary(args).mainRun(); } private final ArgDecl prefixArg = new ArgDecl(true, "prefix"); private String clusterName; private String hostName; private String indexName; private String inFile; private String prefix = null; public AddVocabulary(String[] args) { super(args); getUsage().startCategory("Arguments"); getUsage().addUsage("clusterName", "ElasticSearch cluster name (e.g., elasticsearch)"); getUsage().addUsage("hostname", "ElasticSearch hostname (e.g., localhost)"); getUsage().addUsage("indexName", "ElasticSearch target index name (e.g., vocabs)"); getUsage().addUsage("input.rdf", "RDFS/OWL file or URL to be indexed; many RDF formats supported"); getUsage().startCategory("Options"); add(prefixArg, "--prefix prefix", "Set prefix to be used for this vocabulary"); } @Override protected String getCommandName() { return "add-vocabulary"; } @Override protected String getSummary() { return getCommandName() + " clusterName hostname indexName input.rdf"; } @Override protected void processModulesAndArgs() { if (getPositional().size() < 4 || getPositional().size() > 4) { doHelp(); } clusterName = getPositionalArg(0); hostName = getPositionalArg(1); indexName = getPositionalArg(2); inFile = getPositionalArg(3); if (hasArg(prefixArg)) { prefix = getArg(prefixArg).getValue(); } } @Override protected void exec() { try { log.info("Loading RDF file: " + inFile); final Model model = ModelFactory.createDefaultModel(); // Ignore the fourth element in quad-based formats. There has to be a simpler way of doing this!? RDFDataMgr.parse(new StreamRDFBase() { @Override public void triple(Triple triple) { model.getGraph().add(triple); } @Override public void quad(Quad quad) { model.getGraph().add(quad.asTriple()); } }, inFile); log.info("Read " + model.size() + " triples"); VocidexIndex index = new VocidexIndex(clusterName, hostName, indexName); try { if (!index.exists()) { throw new VocidexException("Index '" + indexName + "' does not exist on the cluster. Create the index first!"); } for (VocidexDocument document: new VocabularyTermExtractor(new SPARQLRunner(model), prefix)) { log.info("Indexing " + document.getId()); String resultId = index.addDocument(document); log.debug("Added new " + document.getType() + ", id " + resultId); } log.info("Done!"); } finally { index.close(); } } catch (NotFoundException ex) { cmdError("Not found: " + ex.getMessage()); } catch (VocidexException ex) { cmdError(ex.getMessage()); } } }