/* Copyright (2005-2012) Schibsted ASA
* This file is part of Possom.
*
* Possom is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Possom is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Possom. If not, see <http://www.gnu.org/licenses/>.
*/
package no.sesat.search.query.token;
import com.opensymphony.oscache.base.NeedsRefreshException;
import com.opensymphony.oscache.general.GeneralCacheAdministrator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import no.sesat.search.query.token.AbstractEvaluatorFactory.Context;
import org.apache.log4j.Level;
import static no.sesat.search.query.parser.AbstractQueryParser.SKIP_REGEX;
import static no.sesat.search.query.parser.AbstractQueryParser.OPERATOR_REGEX;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/**
*
* @version $Id$
*/
public final class SolrTokenEvaluator implements TokenEvaluator {

    // Constants -----------------------------------------------------

    private static final Logger LOG = Logger.getLogger(SolrTokenEvaluator.class);
    private static final Logger DUMP = Logger.getLogger("no.sesat.search.Dump");

    /** General properties to regular expressions configured. **/
    private static final int REG_EXP_OPTIONS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    // use the lowercase version of TokenPredicate.EXACT_PREFIX
    private static final String EXACT_PREFIX = TokenPredicate.EXACT_PREFIX.toLowerCase();

    /** Shared cache from cleaned query string to its immutable analysis result. */
    private static final GeneralCacheAdministrator CACHE_QUERY = new GeneralCacheAdministrator();

    /** Seconds a cached analysis result stays fresh before NeedsRefreshException forces a re-query. */
    private static final int REFRESH_PERIOD = 60;

    // smaller than usual as each entry can contain up to 600 values!
    private static final int CACHE_QUERY_CAPACITY = 100;

    /** Rows requested in the first solr request. Any remaining hits are fetched in one follow-up request. */
    private static final int INITIAL_ROWS_TO_FETCH = 25;

    private static final String ERR_QUERY_FAILED = "Querying Solr failed on ";

    // Attributes ----------------------------------------------------

    private final Context context;
    private final SolrEvaluatorFactory factory;

    /** Immutable map from list name to matches found in the query.
     * Empty when the factory is not responsible for any token predicate. */
    private final Map<String, List<TokenMatch>> analysisResult;

    // Static --------------------------------------------------------

    static {
        CACHE_QUERY.setCacheCapacity(CACHE_QUERY_CAPACITY);
    }

    // Constructors --------------------------------------------------

    /** Only possible constructor.
     *
     * @param cxt the required context
     * @param factory the factory constructing this.
     * @throws EvaluationException if the evaluation, request to the solr index, fails.
     */
    SolrTokenEvaluator(final Context cxt, final SolrEvaluatorFactory factory) throws EvaluationException {

        context = cxt;
        this.factory = factory;

        // the responsible() method checks if we are responsible for any tokenPredicate at all.
        // if we are not then the restful request in query(..) is a waste of time and resource.
        analysisResult = factory.responsible()
                ? query(cleanString(context.getQueryString()))
                : Collections.<String, List<TokenMatch>>emptyMap();
    }

    // Public --------------------------------------------------------

    /** Evaluate whether the token predicate holds for the given term (or, when term is null,
     * whether any match exists at all in one of the predicate's lists).
     *
     * @param token the predicate to evaluate.
     * @param term the term to check. when null any match in a configured list suffices.
     * @param query the original query (unused here; matches the TokenEvaluator signature).
     * @return true when a match in one of the predicate's lists covers the whole term.
     */
    public boolean evaluateToken(final TokenPredicate token, final String term, final String query) {

        boolean evaluation = false;

        if (!analysisResult.isEmpty()) {

            final String[] listnames = factory.getListNames(token);
            if (null != listnames) {

                for (int i = 0; !evaluation && i < listnames.length; ++i) {

                    final String listname = listnames[i];
                    if (analysisResult.containsKey(listname)) {

                        if (term == null) {
                            evaluation = true;

                        } else {
                            // HACK since DefaultOperatorClause wraps its children in parenthesis
                            final String hackTerm = cleanString(term.replaceAll("\\(|\\)", ""));

                            // the match must cover the whole term, not just a substring of it
                            for (TokenMatch occurance : analysisResult.get(listname)) {
                                final Matcher m = occurance.getMatcher(hackTerm);
                                evaluation = m.find() && m.start() == 0 && m.end() == hackTerm.length();
                                if (evaluation) {
                                    break;
                                }
                            }
                        }
                    }
                }
            } else {
                LOG.info(context.getSite() + " does not define lists behind the token predicate " + token);
            }
        }
        return evaluation;
    }

    /**
     * get all match values and values for given list .
     *
     * @param token the predicate whose lists to look in.
     * @param term the term each match must cover completely. NOTE(review): unlike
     *     evaluateToken(..) a null term throws NullPointerException here — confirm callers never pass null.
     * @return an unmodifiable set of matching values. never null.
     */
    public Set<String> getMatchValues(final TokenPredicate token, final String term) {

        final Set<String> values;

        if (!analysisResult.isEmpty()) {
            values = new HashSet<String>();

            final String[] listnames = factory.getListNames(token);
            if (null != listnames) {

                for (int i = 0; i < listnames.length; i++) {

                    final String listname = listnames[i];
                    if (analysisResult.containsKey(listname)) {

                        // HACK since DefaultOperatorClause wraps its children in parenthesis
                        final String hackTerm = cleanString(term.replaceAll("\\(|\\)", ""));

                        // collect the value of every match covering the whole term
                        for (TokenMatch occurance : analysisResult.get(listname)) {
                            final Matcher m = occurance.getMatcher(hackTerm);
                            if (m.find() && m.start() == 0 && m.end() == hackTerm.length()) {
                                values.add(occurance.getValue());
                            }
                        }
                    }
                }
            }
        } else {
            values = Collections.<String>emptySet();
        }

        return Collections.unmodifiableSet(values);
    }

    /** True when the predicate is an exact-match variant, ie its name carries the EXACT_PREFIX.
     *
     * @param predicate the predicate to inspect.
     * @return true for exact-prefixed predicates.
     */
    public boolean isQueryDependant(TokenPredicate predicate) {
        // EXACT_PREFIX is the lowercased TokenPredicate.EXACT_PREFIX, so upper-case it back
        // to compare against the (uppercase) predicate name.
        return predicate.name().startsWith(EXACT_PREFIX.toUpperCase());
    }

    // Z implementation ----------------------------------------------

    // Y overrides ---------------------------------------------------

    // Package protected ---------------------------------------------

    // Protected -----------------------------------------------------

    // Private -------------------------------------------------------

    /**
     * Search solr and find out if the given tokens are company, firstname, lastname etc
     *
     * @param query the cleaned query string. null or empty gives an empty result without any request.
     * @return immutable map from list name to the matches found. never null.
     * @throws EvaluationException if the request to the solr index fails.
     */
    @SuppressWarnings("unchecked")
    private Map<String, List<TokenMatch>> query(final String query) throws EvaluationException {

        LOG.trace("queryFast( " + query + " )");
        Map<String, List<TokenMatch>> result = null;

        if (query != null && 0 < query.length()) {

            try {
                result = (Map<String, List<TokenMatch>>) CACHE_QUERY.getFromCache(query, REFRESH_PERIOD);

            } catch (NeedsRefreshException nre) {

                // cache miss or stale entry: query solr and rebuild the entry.
                boolean updatedCache = false;
                result = new HashMap<String, List<TokenMatch>>();

                try {
                    final String token = query.replaceAll("\"", "");

                    // set up query
                    final SolrQuery solrQuery = new SolrQuery()
                            .setQuery("list_entry_shingle:\"" + token + "\"")
                            .setRows(INITIAL_ROWS_TO_FETCH);

                    // when the root logger is set to DEBUG do not limit connection times
                    if (Logger.getRootLogger().getLevel().isGreaterOrEqual(Level.INFO)) {
                        // default timeout is half second. TODO make configuration.
                        solrQuery.setTimeAllowed(500);
                    }

                    // query for hits
                    QueryResponse response = factory.getSolrServer().query(solrQuery);
                    final int numberOfHits = (int) response.getResults().getNumFound();

                    boolean more = false;
                    do {
                        DUMP.info(solrQuery.toString());

                        final SolrDocumentList docs = response.getResults();

                        // iterate through docs
                        for (SolrDocument doc : docs) {

                            final String name = (String) doc.getFieldValue("list_name");
                            final String exactname = EXACT_PREFIX + name;

                            // remove words made solely of characters that the parser considers whitespace
                            final String hit = ((String) doc.getFieldValue("list_entry"))
                                    .replaceAll("\\b" + SKIP_REGEX + "+\\b", " ");

                            final String synonym = (String) doc.getFieldValue("list_entry_synonym");

                            if (factory.usesListName(name, exactname)) {
                                addMatch(name, hit, synonym, query, result);

                                // a hit equal to the whole query also feeds the exact-variant list
                                if (hit.equalsIgnoreCase(query.trim())) {
                                    addMatch(exactname, hit, synonym, query, result);
                                }
                            }
                        }

                        // first round only: if the initial request didn't cover every hit,
                        // fetch the remainder in one follow-up request.
                        int rest = numberOfHits - INITIAL_ROWS_TO_FETCH;
                        if (!more && rest > 0) {
                            more = true;
                            solrQuery.setStart(INITIAL_ROWS_TO_FETCH);
                            solrQuery.setRows(rest);
                            // query
                            response = factory.getSolrServer().query(solrQuery);
                        } else {
                            more = false;
                        }
                    } while (more);

                    result = Collections.<String, List<TokenMatch>>unmodifiableMap(result);
                    CACHE_QUERY.putInCache(query, result);
                    updatedCache = true;

                } catch (SolrServerException ex) {
                    LOG.error(ex.getMessage(), ex);
                    // report the query that failed
                    // (previously reported a never-assigned, always-null url variable)
                    throw new EvaluationException(ERR_QUERY_FAILED + query, ex);

                } finally {
                    // oscache contract: a getFromCache that threw NeedsRefreshException must be
                    // paired with either putInCache or cancelUpdate, else the entry stays locked.
                    if (!updatedCache) {
                        CACHE_QUERY.cancelUpdate(query);
                    }
                }
            }

        } else {
            result = Collections.<String, List<TokenMatch>>emptyMap();
        }

        return result;
    }

    /** Add a TokenMatch to the result map for every whole-word occurrence of match in the query.
     *
     * @param name the list name the match belongs to, used as the result key.
     * @param match the list entry that was hit.
     * @param value the synonym/value carried by the list entry. may be null.
     * @param query the cleaned query being analysed.
     * @param result the map to accumulate matches into.
     */
    private static void addMatch(
            final String name,
            final String match,
            final String value,
            final String query,
            final Map<String, List<TokenMatch>> result) {

        final String expr = "\\b" + match + "\\b";
        final Pattern pattern = Pattern.compile(expr, REG_EXP_OPTIONS);

        // remove words made solely of characters that the parser considers whitespace
        final String qNew = query.replaceAll("\\b" + SKIP_REGEX + "+\\b", " ");

        final Matcher m = pattern.matcher(qNew);

        while (m.find()) {
            final TokenMatch tknMatch = TokenMatch.instanceOf(name, match, value);

            if (!result.containsKey(name)) {
                result.put(name, new ArrayList<TokenMatch>());
            }
            result.get(name).add(tknMatch);

            // runaway guard: an entry matching this often hints at a degenerate pattern
            if (result.get(name).size() % 100 == 0) {
                LOG.warn("Pattern: " + pattern.pattern()
                        + " name: " + name
                        + " query: " + query
                        + " match: " + match
                        + " query2: " + qNew);
            }
        }
    }

    /** Lowercase the string, strip SKIP and operator characters, and normalise whitespace.
     *
     * @param string the raw string.
     * @return the cleaned string.
     */
    private String cleanString(final String string) {

        // Strip out SKIP characters we are not interested in.
        // Also remove any operator characters. (SEARCH-3883 & SEARCH-3967)
        return string.toLowerCase()
                .replaceAll(" ", "xxKEEPWSxx") // Hack to keep spaces. multiple spaces always normalised.
                .replaceAll(SKIP_REGEX, " ")
                .replaceAll("xxKEEPWSxx", " ") // Hack to keep spaces.
                .replaceAll(OPERATOR_REGEX, " ")
                .replaceAll(" +", " "); // normalise

    }

    // Inner classes -------------------------------------------------

}