/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 9, 2011
*/
package com.bigdata.rdf.sparql.ast.eval;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.Var;
import com.bigdata.btree.IIndex;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.lexicon.ITextIndexer;
import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;
import com.bigdata.rdf.sparql.ast.ConstantNode;
import com.bigdata.rdf.sparql.ast.GroupNodeBase;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.TermNode;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.service.BigdataNativeServiceOptions;
import com.bigdata.rdf.sparql.ast.service.BigdataServiceCall;
import com.bigdata.rdf.sparql.ast.service.IServiceOptions;
import com.bigdata.rdf.sparql.ast.service.ServiceCallCreateParams;
import com.bigdata.rdf.sparql.ast.service.ServiceFactory;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.BD;
import com.bigdata.rdf.store.BDS;
import com.bigdata.relation.accesspath.EmptyCloseableIterator;
import com.bigdata.relation.accesspath.ThickCloseableIterator;
import com.bigdata.search.Hiterator;
import com.bigdata.search.IHit;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* A factory for a "search in search" service.
* It accepts a group that have a single triple pattern in it:
*
* service bd:searchInSearch {
* ?s bd:searchInSearch "search" .
* }
*
* This service will then use the full text index to filter out incoming
* bindings for ?s that do not link to a Literal that is found via the full
* text index with the supplied search string. If there are no incoming
* bindings (or none that have ?s bound), this service will produce no output.
*/
public class SearchInSearchServiceFactory extends AbstractServiceFactoryBase {
private static final Logger log = Logger
.getLogger(SearchInSearchServiceFactory.class);
/*
* Note: This could extend the base class to allow for search service
* configuration options.
*/
private final BigdataNativeServiceOptions serviceOptions;
public SearchInSearchServiceFactory() {
serviceOptions = new BigdataNativeServiceOptions();
// serviceOptions.setRunFirst(true);
}
@Override
public BigdataNativeServiceOptions getServiceOptions() {
return serviceOptions;
}
public BigdataServiceCall create(final ServiceCallCreateParams params) {
if (params == null)
throw new IllegalArgumentException();
final AbstractTripleStore store = params.getTripleStore();
if (store == null)
throw new IllegalArgumentException();
final ServiceNode serviceNode = params.getServiceNode();
if (serviceNode == null)
throw new IllegalArgumentException();
/*
* Validate the search predicates for a given search variable.
*/
final Map<IVariable<?>, Map<URI, StatementPatternNode>> map = verifyGraphPattern(
store, serviceNode.getGraphPattern());
if (map == null)
throw new RuntimeException("Not a search request.");
if (map.size() != 1)
throw new RuntimeException(
"Multiple search requests may not be combined.");
final Map.Entry<IVariable<?>, Map<URI, StatementPatternNode>> e = map
.entrySet().iterator().next();
final IVariable<?> searchVar = e.getKey();
final Map<URI, StatementPatternNode> statementPatterns = e.getValue();
validateSearch(searchVar, statementPatterns);
/*
* Create and return the ServiceCall object which will execute this
* query.
*/
return new SearchCall(store, searchVar, statementPatterns,
getServiceOptions());
}
/**
* Validate the search request. This looks for search magic predicates and
* returns them all. It is an error if anything else is found in the group.
* All such search patterns are reported back by this method, but the
* service can only be invoked for one a single search variable at a time.
* The caller will detect both the absence of any search and the presence of
* more than one search and throw an exception.
*/
private Map<IVariable<?>, Map<URI, StatementPatternNode>> verifyGraphPattern(
final AbstractTripleStore database,
final GroupNodeBase<IGroupMemberNode> group) {
// lazily allocate iff we find some search predicates in this group.
Map<IVariable<?>, Map<URI, StatementPatternNode>> tmp = null;
final int arity = group.arity();
for (int i = 0; i < arity; i++) {
final BOp child = group.get(i);
if (child instanceof GroupNodeBase<?>) {
throw new RuntimeException("Nested groups are not allowed.");
}
if (child instanceof StatementPatternNode) {
final StatementPatternNode sp = (StatementPatternNode) child;
final TermNode p = sp.p();
if (!p.isConstant())
throw new RuntimeException("Expecting search predicate: "
+ sp);
final URI uri = (URI) ((ConstantNode) p).getValue();
if (!uri.stringValue().startsWith(BDS.NAMESPACE))
throw new RuntimeException("Expecting search predicate: "
+ sp);
/*
* Some search predicate.
*/
if (!ASTSearchOptimizer.searchUris.contains(uri) &&
!BDS.SEARCH_IN_SEARCH.equals(uri)) {
throw new RuntimeException("Unknown search predicate: "
+ uri);
}
final TermNode s = sp.s();
if (!s.isVariable())
throw new RuntimeException(
"Subject of search predicate is constant: " + sp);
final IVariable<?> searchVar = ((VarNode) s)
.getValueExpression();
// Lazily allocate map.
if (tmp == null) {
tmp = new LinkedHashMap<IVariable<?>, Map<URI, StatementPatternNode>>();
}
// Lazily allocate set for that searchVar.
Map<URI, StatementPatternNode> statementPatterns = tmp
.get(searchVar);
if (statementPatterns == null) {
tmp.put(searchVar,
statementPatterns = new LinkedHashMap<URI, StatementPatternNode>());
}
// Add search predicate to set for that searchVar.
statementPatterns.put(uri, sp);
}
}
return tmp;
}
/**
* Validate the search. There must be exactly one {@link BD#SEARCH}
* predicate. There should not be duplicates of any of the search predicates
* for a given searchVar.
*/
private void validateSearch(final IVariable<?> searchVar,
final Map<URI, StatementPatternNode> statementPatterns) {
final Set<URI> uris = new LinkedHashSet<URI>();
for(StatementPatternNode sp : statementPatterns.values()) {
final URI uri = (URI)(sp.p()).getValue();
if (!uris.add(uri))
throw new RuntimeException(
"Search predicate appears multiple times for same search variable: predicate="
+ uri + ", searchVar=" + searchVar);
if (uri.equals(BDS.SEARCH_IN_SEARCH)) {
assertObjectIsLiteral(sp);
} else if (uri.equals(BDS.RELEVANCE) || uri.equals(BDS.RANK)) {
assertObjectIsVariable(sp);
} else if(uri.equals(BDS.MIN_RANK) || uri.equals(BDS.MAX_RANK)) {
assertObjectIsLiteral(sp);
} else if (uri.equals(BDS.MIN_RELEVANCE) || uri.equals(BDS.MAX_RELEVANCE)) {
assertObjectIsLiteral(sp);
} else if(uri.equals(BDS.MATCH_ALL_TERMS)) {
assertObjectIsLiteral(sp);
} else if(uri.equals(BDS.MATCH_EXACT)) {
assertObjectIsLiteral(sp);
} else if(uri.equals(BDS.SEARCH_TIMEOUT)) {
assertObjectIsLiteral(sp);
} else if(uri.equals(BDS.MATCH_REGEX)) {
// a variable for the object is equivalent to regex = null
// assertObjectIsLiteral(sp);
} else {
throw new AssertionError("Unverified search predicate: " + sp);
}
}
if (!uris.contains(BDS.SEARCH_IN_SEARCH)) {
throw new RuntimeException("Required search predicate not found: "
+ BDS.SUBJECT_SEARCH + " for searchVar=" + searchVar);
}
}
private void assertObjectIsLiteral(final StatementPatternNode sp) {
final TermNode o = sp.o();
if (!o.isConstant()
|| !(((ConstantNode) o).getValue() instanceof Literal)) {
throw new IllegalArgumentException("Object is not literal: " + sp);
}
}
private void assertObjectIsVariable(final StatementPatternNode sp) {
final TermNode o = sp.o();
if (!o.isVariable()) {
throw new IllegalArgumentException("Object must be variable: " + sp);
}
}
/**
*
* Note: This has the {@link AbstractTripleStore} reference attached. This
* is not a {@link Serializable} object. It MUST run on the query
* controller.
*/
private static class SearchCall implements BigdataServiceCall {
private final AbstractTripleStore store;
private final IIndex osp;
private final IServiceOptions serviceOptions;
private final Literal query;
private final IVariable<?>[] vars;
private final Literal minRank;
private final Literal maxRank;
private final Literal minRelevance;
private final Literal maxRelevance;
private final boolean matchAllTerms;
private final boolean matchExact;
private final Literal searchTimeout;
private final Literal matchRegex;
public SearchCall(
final AbstractTripleStore store,
final IVariable<?> searchVar,
final Map<URI, StatementPatternNode> statementPatterns,
final IServiceOptions serviceOptions) {
if(store == null)
throw new IllegalArgumentException();
if(searchVar == null)
throw new IllegalArgumentException();
if(statementPatterns == null)
throw new IllegalArgumentException();
if(serviceOptions == null)
throw new IllegalArgumentException();
this.store = store;
this.osp = store.getSPORelation().getIndex(SPOKeyOrder.OSP);
this.serviceOptions = serviceOptions;
/*
* Unpack the "search" magic predicate:
*
* [?searchVar bd:search objValue]
*/
final StatementPatternNode sp = statementPatterns.get(BDS.SEARCH_IN_SEARCH);
query = (Literal) sp.o().getValue();
/*
* Unpack the search service request parameters.
*/
IVariable<?> relVar = null;
IVariable<?> rankVar = null;
Literal minRank = null;
Literal maxRank = null;
Literal minRelevance = null;
Literal maxRelevance = null;
boolean matchAllTerms = false;
boolean matchExact = false;
Literal searchTimeout = null;
Literal matchRegex = null;
for (StatementPatternNode meta : statementPatterns.values()) {
final URI p = (URI) meta.p().getValue();
final Literal oVal = meta.o().isConstant() ? (Literal) meta.o()
.getValue() : null;
final IVariable<?> oVar = meta.o().isVariable() ? (IVariable<?>) meta
.o().getValueExpression() : null;
if (BDS.RELEVANCE.equals(p)) {
relVar = oVar;
} else if (BDS.RANK.equals(p)) {
rankVar = oVar;
} else if (BDS.MIN_RANK.equals(p)) {
minRank = (Literal) oVal;
} else if (BDS.MAX_RANK.equals(p)) {
maxRank = (Literal) oVal;
} else if (BDS.MIN_RELEVANCE.equals(p)) {
minRelevance = (Literal) oVal;
} else if (BDS.MAX_RELEVANCE.equals(p)) {
maxRelevance = (Literal) oVal;
} else if (BDS.MATCH_ALL_TERMS.equals(p)) {
matchAllTerms = ((Literal) oVal).booleanValue();
} else if (BDS.MATCH_EXACT.equals(p)) {
matchExact = ((Literal) oVal).booleanValue();
} else if (BDS.SEARCH_TIMEOUT.equals(p)) {
searchTimeout = (Literal) oVal;
} else if (BDS.MATCH_REGEX.equals(p)) {
matchRegex = (Literal) oVal;
}
}
this.vars = new IVariable[] {//
searchVar,//
relVar == null ? Var.var() : relVar,// must be non-null.
rankVar == null ? Var.var() : rankVar // must be non-null.
};
this.minRank = minRank;
this.maxRank = maxRank;
this.minRelevance = minRelevance;
this.maxRelevance = maxRelevance;
this.matchAllTerms = matchAllTerms;
this.matchExact = matchExact;
this.searchTimeout = searchTimeout;
this.matchRegex = matchRegex;
}
@SuppressWarnings({ "rawtypes", "unchecked" })
private Hiterator<IHit<?>> getHiterator() {
// final IValueCentricTextIndexer<IHit> textIndex = (IValueCentricTextIndexer) store
// .getLexiconRelation().getSearchEngine();
final ITextIndexer<IHit> textIndex = (ITextIndexer)
store.getLexiconRelation().getSearchEngine();
if (textIndex == null)
throw new UnsupportedOperationException("No free text index?");
String s = query.getLabel();
final boolean prefixMatch;
if (s.indexOf('*') >= 0) {
prefixMatch = true;
s = s.replaceAll("\\*", "");
} else {
prefixMatch = false;
}
return (Hiterator) textIndex.search(new FullTextQuery(
s,//
query.getLanguage(),//
prefixMatch,//
matchRegex == null ? null : matchRegex.stringValue(),
matchAllTerms,
matchExact,
minRelevance == null ? BDS.DEFAULT_MIN_RELEVANCE : minRelevance.doubleValue()/* minCosine */,
maxRelevance == null ? BDS.DEFAULT_MAX_RELEVANCE : maxRelevance.doubleValue()/* maxCosine */,
minRank == null ? BDS.DEFAULT_MIN_RANK/*1*/ : minRank.intValue()/* minRank */,
maxRank == null ? BDS.DEFAULT_MAX_RANK/*Integer.MAX_VALUE*/ : maxRank.intValue()/* maxRank */,
searchTimeout == null ? BDS.DEFAULT_TIMEOUT/*0L*/ : searchTimeout.longValue()/* timeout */,
TimeUnit.MILLISECONDS
));
}
private static final ConcurrentWeakValueCacheWithTimeout<String, Set<IV>> cache =
new ConcurrentWeakValueCacheWithTimeout<String, Set<IV>>(
10, 1000*60);
private Set<IV> getSubjects() {
final String s = query.getLabel();
if (cache.containsKey(s)) {
return cache.get(s);
}
if (log.isInfoEnabled()) {
log.info("entering full text search...");
}
// query the full text index
final Hiterator<IHit<?>> src = getHiterator();
if (log.isInfoEnabled()) {
log.info("done with full text search.");
}
if (log.isInfoEnabled()) {
log.info("starting subject collection...");
}
final Set<IV> subjects = new LinkedHashSet<IV>();
while (src.hasNext()) {
final IV o = (IV) src.next().getDocId();
final Iterator<ISPO> it =
store.getAccessPath((IV)null, (IV)null, o).iterator();
while (it.hasNext()) {
subjects.add(it.next().s());
}
}
if (log.isInfoEnabled()) {
log.info("done with subject collection: " + subjects.size());
}
cache.put(s, subjects);
return subjects;
}
/**
* {@inheritDoc}
*
* Iterate the incoming binding set. If it does not contain a binding
* for the searchVar, prune it. Then iterate the full text search
* results. For each result (O binding), test the incoming binding sets
* to see if there is a link between the binding for the searchVar and
* the O. If there is, add the binding set to the output and remove it
* from the set to be tested against subsequent O bindings.
*/
@Override
public ICloseableIterator<IBindingSet> call(
final IBindingSet[] bindingsClause) {
if (log.isInfoEnabled()) {
log.info(bindingsClause.length);
log.info(Arrays.toString(bindingsClause));
}
final IVariable<?> searchVar = vars[0];
// final IBindingSet[] tmp = new IBindingSet[bindingsClause.length];
// System.arraycopy(bindingsClause, 0, tmp, 0, bindingsClause.length);
// final boolean[] tmp = new boolean[bindingsClause.length];
boolean foundOne = false;
/*
* We are filtering out incoming binding sets that don't have a
* binding for the search var
*/
for (int i = 0; i < bindingsClause.length; i++) {
final IBindingSet bs = bindingsClause[i];
if (bs.isBound(searchVar)) {
// we need to test this binding set
// tmp[i] = true;
// we have at least one binding set to test
foundOne = true;
}
}
// filtered everything out
if (!foundOne) {
return new EmptyCloseableIterator<IBindingSet>();
}
final IBindingSet[] out = new IBindingSet[bindingsClause.length];
int numAccepted = 0;
// if (log.isInfoEnabled()) {
// log.info("entering full text search...");
// }
//
// // query the full text index
// final Hiterator<IHit<?>> src = getHiterator();
//
// if (log.isInfoEnabled()) {
// log.info("done with full text search.");
// }
//
// while (src.hasNext()) {
//
// final IV o = (IV) src.next().getDocId();
//
// for (int i = 0; i < bindingsClause.length; i++) {
//
// /*
// * The binding set has either been filtered out or already
// * accepted.
// */
// if (!tmp[i])
// continue;
//
// /*
// * We know it's bound. If it weren't it would have been
// * filtered out above.
// */
// final IV s = (IV) bindingsClause[i].get(searchVar).get();
//
// final IKeyBuilder kb = KeyBuilder.newInstance();
// o.encode(kb);
// s.encode(kb);
//
// final byte[] fromKey = kb.getKey();
// final byte[] toKey = SuccessorUtil.successor(fromKey.clone());
//
// if (log.isInfoEnabled()) {
// log.info("starting range count...");
// }
//
// final long rangeCount = osp.rangeCount(fromKey, toKey);
//
// if (log.isInfoEnabled()) {
// log.info("done with range count: " + rangeCount);
// }
//
// /*
// * Test the OSP index to see if we have a link.
// */
//// if (!store.getAccessPath(s, null, o).isEmpty()) {
// if (rangeCount > 0) {
//
// // add the binding set to the output
// out[numAccepted++] = bindingsClause[i];
//
// // don't need to test this binding set again
// tmp[i] = false;
//
// }
//
// }
//
// }
final Set<IV> subjects = getSubjects();
for (int i = 0; i < bindingsClause.length; i++) {
/*
* We know it's bound. If it weren't it would have been
* filtered out above.
*/
final IV s = (IV) bindingsClause[i].get(searchVar).get();
if (subjects.contains(s)) {
// add the binding set to the output
out[numAccepted++] = bindingsClause[i];
}
}
if (log.isInfoEnabled()) {
log.info("finished search in search.");
}
return new ThickCloseableIterator<IBindingSet>(out, numAccepted);
}
@Override
public IServiceOptions getServiceOptions() {
return serviceOptions;
}
}
}