/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.entityhub.indexing.source.jenatdb;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.entityhub.indexing.core.EntityDataIterable;
import org.apache.stanbol.entityhub.indexing.core.EntityIterator;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.sparql.core.DatasetGraph;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;

/**
 * {@link EntityIterator} over the subjects of all triples in the default
 * graph of the configured dataset where<ul>
 * <li> the property is equal to the configured {@link #PARAM_FIELD} value
 *      ({@link #DEFAULT_FIELD} if not configured) and
 * <li> the value is equal to one of the configured {@link #PARAM_VALUES}
 *      values (separated by ';' when parsed as a single String).
 * </ul>
 * The configured values are processed one after the other in the order they
 * were configured: first all matches for value1, then all for value2, ...
 * A subject matching several configured values is returned once per match.
 * <p>
 * <b>NOTE:</b> This implementation does NOT support Wildcard values, because
 * in such cases it is much faster to use the {@link RdfIndexingSource} as
 * {@link EntityDataIterable} for indexing!
 *
 * @author Rupert Westenthaler
 */
public class ResourceFilterIterator implements EntityIterator{

    private static final Logger log = LoggerFactory.getLogger(ResourceFilterIterator.class);

    /** Configuration key of the property used for filtering. */
    public static final String PARAM_FIELD = "field";
    /** Configuration key of the accepted value(s) for the {@link #PARAM_FIELD}. */
    public static final String PARAM_VALUES = "values";
    /** The field used if {@link #PARAM_FIELD} is missing or empty. */
    public static final String DEFAULT_FIELD = "rdf:type";

    /** Message used whenever a wildcard is configured as value. */
    private static final String WILDCARD_NOT_SUPPORTED = "Wildcard is NOT supported as "
            + "directly using EntityDataIterable with the Jena TDB will "
            + "provide much better performance (change configuration to use "
            + "the RdfIndexingSource as EntityDataIterable)!";

    /** The property (predicate) of the triple pattern */
    private Node field;
    /** The configured values (objects) of the triple pattern, in configured order */
    private Collection<Node> values;
    /**
     * The RDF data
     */
    private DatasetGraph indexingDataset;
    /**
     * The Iterator over the triples matching the current value (or
     * <code>null</code> if not yet initialised or already exhausted)
     */
    private ExtendedIterator<Triple> iterator;
    /**
     * Iterator over the configured {@link #values}
     */
    private Iterator<Node> valueIterator;
    /**
     * The IndexingConfiguration
     */
    private IndexingConfig indexingConfig;

    private NamespacePrefixService nsPrefixService;

    /**
     * Reads the {@link IndexingConfig}, the dataset, the {@link #PARAM_FIELD}
     * and the {@link #PARAM_VALUES} from the parsed configuration.
     *
     * @param config the configuration. MUST contain the {@link IndexingConfig}
     * under {@link IndexingConfig#KEY_INDEXING_CONFIG} and a
     * {@link #PARAM_VALUES} entry of type String (values separated by ';')
     * or String[].
     * @throws IllegalArgumentException if the {@link IndexingConfig} is
     * missing, {@link #PARAM_VALUES} is missing, empty, of an unsupported
     * type, contains an empty value or a wildcard
     */
    @Override
    public void setConfiguration(Map<String,Object> config) {
        indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
        if(indexingConfig == null){
            // fail before the dataset is opened, instead of a NPE afterwards
            throw new IllegalArgumentException("The parsed configuration does not contain the "
                    + "IndexingConfig (key: " + IndexingConfig.KEY_INDEXING_CONFIG + ")!");
        }
        this.indexingDataset = Utils.getTDBDataset(config);
        nsPrefixService = indexingConfig.getNamespacePrefixService();

        Object value = config.get(PARAM_FIELD);
        if(value == null || value.toString().isEmpty()){
            this.field = Node.createURI(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, DEFAULT_FIELD));
            log.info("Using default Field {}",field);
        } else {
            this.field = Node.createURI(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, value.toString()));
            log.info("configured Field: {}",field);
        }

        value = config.get(PARAM_VALUES);
        if(value == null || value.toString().isEmpty()){
            // NOTE: do not suggest '*' here, as wildcards are rejected below
            throw new IllegalArgumentException("Missing required Parameter " + PARAM_VALUES
                    + ". Expected one or more values (separated by ';')");
        } else if(value instanceof String){
            String stringValue = value.toString().trim();
            if(stringValue.startsWith("*")){
                throw new IllegalArgumentException(WILDCARD_NOT_SUPPORTED);
            }
            parseFieldValues(stringValue.split(";"));
        } else if (value instanceof String[]){
            parseFieldValues((String[])value);
        } else {
            throw new IllegalArgumentException("Type of parameter " + PARAM_VALUES + '=' + value
                    + " (type:" + value.getClass() + ") is not supported MUST be String or String[]!");
        }
        valueIterator = this.values.iterator();
    }

    /**
     * Converts the parsed values to {@link Node}s and stores them, in the
     * parsed order and without duplicates, in {@link #values}.
     * <code>null</code> elements are ignored.
     *
     * @param stringValues the configured values (URIs or prefix:localName)
     * @throws IllegalArgumentException if no value is parsed, a value is
     * empty (after trimming) or a value is the wildcard '*'
     */
    private void parseFieldValues(String...stringValues) {
        if(stringValues == null || stringValues.length < 1){
            throw new IllegalArgumentException("Parameter "+PARAM_VALUES+" does not contain a field value!");
        }
        // LinkedHashSet: removes duplicates but keeps the configured order
        Set<Node> parsedValues = new LinkedHashSet<Node>();
        for(String fieldValue : stringValues){
            if(fieldValue == null){
                continue; // check BEFORE trim() to avoid a NPE on null elements
            }
            String trimmed = fieldValue.trim();
            if(trimmed.isEmpty()){
                throw new IllegalArgumentException("no parsed value (separated by ';') MUST BE an empty String");
            }
            if(trimmed.equals("*")){
                throw new IllegalArgumentException(WILDCARD_NOT_SUPPORTED);
            }
            parsedValues.add(Node.createURI(NamespaceMappingUtils.getConfiguredUri(
                nsPrefixService, trimmed)));
        }
        if(parsedValues.isEmpty()){
            throw new IllegalArgumentException("Parameter "+PARAM_VALUES
                + " does not contain a valid field value (values = "
                + Arrays.toString(stringValues)+")!");
        }
        this.values = parsedValues;
    }

    @Override
    public boolean needsInitialisation() {
        // Nope no initialisation needed
        return false;
    }

    @Override
    public void initialise() {
    }

    /**
     * Closes the currently open triple iterator (if any) and the dataset
     * (if one was configured).
     */
    @Override
    public void close() {
        if(iterator != null){
            iterator.close();
            iterator = null;
        }
        if(indexingDataset != null){
            indexingDataset.close();
        }
    }

    /**
     * Checks if there is a further matching triple. Advances to the next
     * configured value as soon as all triples of the current one are
     * consumed. Can be called repeatedly, also after the iteration has
     * finished.
     *
     * @return <code>true</code> if {@link #next()} will return an entity,
     * <code>false</code> if all configured values are processed
     */
    @Override
    public boolean hasNext() {
        while(true){
            if(iterator != null){
                if(iterator.hasNext()){
                    return true;
                }
                // current value is done -> release its iterator
                iterator.close();
                iterator = null;
            }
            if(!valueIterator.hasNext()){
                return false; //iterated over all elements of all configured values
            }
            Node value = valueIterator.next();
            log.info("Iterator over Entities field: '{}' value '{}'",field,value);
            iterator = indexingDataset.getDefaultGraph().find(null,field,value);
        }
    }

    /**
     * Returns the subject of the next matching triple.
     *
     * @return the entity: the String form of the subject, with
     * <code>null</code> as score
     * @throws NoSuchElementException if there are no more entities
     */
    @Override
    public EntityScore next() {
        // also positions the iterator if the caller did not call hasNext()
        if(!hasNext()){
            throw new NoSuchElementException("No more Entities for field '"
                    + field + "' and the configured values");
        }
        return new EntityScore(iterator.next().getSubject().toString(),null);
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("Removal of Entities is not allowed!");
    }

}