/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package jena ;
import java.util.HashSet ;
import java.util.Iterator ;
import java.util.Set ;
import org.apache.jena.graph.Node ;
import org.apache.jena.query.Dataset ;
import org.apache.jena.query.text.* ;
import org.apache.jena.sparql.core.Quad ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import jena.cmd.ArgDecl ;
import jena.cmd.CmdException ;
import arq.cmdline.CmdARQ ;
/**
* Text indexer application that will read a dataset and index its triples in
* its text index.
*/
public class textindexer extends CmdARQ {

    private static final Logger log = LoggerFactory.getLogger(textindexer.class) ;

    /** Value-carrying argument for the assembler description file, declared with the names "desc" and "dataset". */
    public static final ArgDecl assemblerDescDecl = new ArgDecl(ArgDecl.HasValue, "desc", "dataset") ;

    /** Text dataset built from the assembler description; set in {@link #processModulesAndArgs()}. */
    protected DatasetGraphText dataset = null ;
    /** Text index obtained from {@link #dataset}; set in {@link #processModulesAndArgs()}. */
    protected TextIndex        textIndex = null ;
    /** Entity definition obtained from {@link #textIndex}; used to pick properties and build entities. */
    protected EntityDefinition entityDefinition ;
    /** Counts indexed entities and periodically logs throughput. */
    protected ProgressMonitor  progressMonitor ;

    /**
     * Command line entry point: builds the command and calls {@code mainRun()}.
     *
     * @param argv command line arguments
     */
    static public void main(String... argv) {
        new textindexer(argv).mainRun() ;
    }

    /**
     * Alternative entry point (intended for tests, per its name): builds the
     * command and calls {@code mainMethod()} instead of {@code mainRun()}.
     *
     * @param argv command line arguments
     */
    static public void testMain(String... argv) {
        new textindexer(argv).mainMethod() ;
    }

    /**
     * Registers the assembler description argument and creates the progress monitor.
     *
     * @param argv command line arguments
     */
    protected textindexer(String[] argv) {
        super(argv) ;
        super.add(assemblerDescDecl, "--desc=", "Assembler description file") ;
        progressMonitor = new ProgressMonitor("properties indexed") ;
    }

    /**
     * Resolves the assembler file from either the {@code --desc} argument or a
     * single positional argument, builds the dataset from it and captures the
     * text index and its entity definition.
     *
     * @throws CmdException if no description, or more than one, is given, if no
     *                      dataset can be built from it, or if the dataset has
     *                      no text index
     */
    @Override
    protected void processModulesAndArgs() {
        super.processModulesAndArgs() ;
        // Two forms : with and without arg.
        // Maximises similarity with other tools.
        boolean viaDescArg = super.contains(assemblerDescDecl) ;

        if ( !viaDescArg && getNumPositional() == 0 )
            throw new CmdException("No assembler description given") ;

        String file ;
        if ( viaDescArg ) {
            if ( getValues(assemblerDescDecl).size() != 1 )
                throw new CmdException("Multiple assembler descriptions given via --desc") ;
            if ( getPositional().size() != 0 )
                throw new CmdException("Additional assembler descriptions given") ;
            file = getValue(assemblerDescDecl) ;
        } else {
            if ( getNumPositional() != 1 )
                throw new CmdException("Multiple assembler descriptions given as positional arguments") ;
            file = getPositionalArg(0) ;
        }

        if ( file == null )
            throw new CmdException("No dataset specified") ;

        // Assumes a single text dataset description in the assembler file.
        Dataset ds = TextDatasetFactory.create(file) ;
        if ( ds == null )
            throw new CmdException("No dataset description found") ;

        // get index.
        dataset = (DatasetGraphText)(ds.asDatasetGraph()) ;
        textIndex = dataset.getTextIndex() ;
        if ( textIndex == null )
            throw new CmdException("Dataset has no text index") ;
        entityDefinition = textIndex.getDocDef() ;
    }

    /** @return the one-line usage summary: command name followed by {@code assemblerFile} */
    @Override
    protected String getSummary() {
        return getCommandName() + " assemblerFile" ;
    }

    /**
     * Indexes every quad whose predicate is one of the indexed properties,
     * then commits and closes the index and the dataset.
     */
    @Override
    protected void exec() {
        Set<Node> properties = getIndexedProperties() ;

        // there are various strategies possible here
        // what is implemented is a first cut simple approach
        // currently - for each indexed property
        // list and index triples with that property
        // that way only process triples that will be indexed
        // but each entity may be updated several times
        for ( Node property : properties ) {
            Iterator<Quad> quadIter = dataset.find(Node.ANY, Node.ANY, property, Node.ANY) ;
            while ( quadIter.hasNext() ) {
                Quad quad = quadIter.next() ;
                if ( Quad.isDefaultGraph(quad.getGraph()) ) {
                    // Need to use urn:x-arq:DefaultGraphNode for text indexing (JENA-1133)
                    quad = Quad.create(Quad.defaultGraphNodeGenerated,
                                       quad.getSubject(), quad.getPredicate(), quad.getObject()) ;
                }
                Entity entity = TextQueryFuncs.entityFromQuad(entityDefinition, quad) ;
                // A null entity means there is nothing to add for this quad.
                if ( entity != null ) {
                    textIndex.addEntity(entity) ;
                    progressMonitor.progressByOne() ;
                }
            }
        }

        textIndex.commit() ;
        textIndex.close() ;
        dataset.close() ;
        progressMonitor.close() ;
    }

    /**
     * Collects the predicates of every field in the entity definition.
     *
     * @return the set of predicates to index (duplicates removed)
     */
    private Set<Node> getIndexedProperties() {
        Set<Node> result = new HashSet<>() ;
        for ( String field : entityDefinition.fields() ) {
            for ( Node predicate : entityDefinition.getPredicates(field) )
                result.add(predicate) ;
        }
        return result ;
    }

    // TDBLoader has a similar progress monitor
    // Not used here to avoid making ARQ dependent on TDB
    // So potential to rationalise and put progress monitor in a common
    // utility class
    /**
     * Counts units of progress and logs the rate for the current reporting
     * interval and overall, at most once per {@link #reportingInterval}.
     */
    private static class ProgressMonitor {
        String progressMessage ;
        long   startTime ;
        long   progressCount ;
        long   intervalStartTime ;
        long   progressAtStartOfInterval ;
        long   reportingInterval = 10000 ; // milliseconds

        /**
         * @param progressMessage label appended to the counts in log messages
         */
        ProgressMonitor(String progressMessage) {
            this.progressMessage = progressMessage ;
            start() ; // in case start not called
        }

        /** Resets the counter and the overall and interval clocks. */
        void start() {
            startTime = System.currentTimeMillis() ;
            progressCount = 0L ;
            startInterval() ;
        }

        /** Begins a new reporting interval at the current time and count. */
        private void startInterval() {
            intervalStartTime = System.currentTimeMillis() ;
            progressAtStartOfInterval = progressCount ;
        }

        /** Records one unit of progress and logs a report if the interval has elapsed. */
        void progressByOne() {
            progressCount++ ;
            long now = System.currentTimeMillis() ;
            if ( reportDue(now) ) {
                report(now) ;
                startInterval() ;
            }
        }

        /**
         * @param now current time in milliseconds
         * @return true if at least {@link #reportingInterval} ms have passed since the interval started
         */
        boolean reportDue(long now) {
            return now - intervalStartTime >= reportingInterval ;
        }

        /**
         * Logs the total count with the per-second rate for this interval and overall.
         *
         * @param now current time in milliseconds
         */
        private void report(long now) {
            long progressThisInterval = progressCount - progressAtStartOfInterval ;
            // Clamp to one second so a sub-second duration cannot cause division by zero.
            long intervalSeconds = Math.max((now - intervalStartTime) / 1000, 1) ;
            long overallSeconds = Math.max((now - startTime) / 1000, 1) ;
            String message = progressCount + " (" + progressThisInterval / intervalSeconds + " per second) "
                             + progressMessage + " (" + progressCount / overallSeconds
                             + " per second overall)" ;
            log.info(message) ;
        }

        /** Logs the final count and the overall per-second rate. */
        void close() {
            long overallDuration = System.currentTimeMillis() - startTime ;
            String message = progressCount + " (" + progressCount / Math.max(overallDuration / 1000, 1)
                             + " per second) " + progressMessage ;
            log.info(message) ;
        }
    }
}