/* Copyright (2005-2012) Schibsted ASA
* This file is part of Possom.
*
* Possom is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Possom is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Possom. If not, see <http://www.gnu.org/licenses/>.
*/
package no.sesat.search.query.token;
import com.opensymphony.oscache.base.NeedsRefreshException;
import com.opensymphony.oscache.general.GeneralCacheAdministrator;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import no.sesat.commons.ioc.ContextWrapper;
import no.sesat.search.site.config.SiteConfiguration;
import no.sesat.search.site.config.DocumentLoader;
import no.sesat.search.http.HTTPClient;
import no.sesat.search.query.token.AbstractEvaluatorFactory.Context;
import static no.sesat.search.query.parser.AbstractQueryParser.SKIP_REGEX;
import static no.sesat.search.query.parser.AbstractQueryParser.OPERATOR_REGEX;
import no.sesat.search.site.Site;
import no.sesat.search.site.SiteContext;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* TokenEvaluator that queries a FAST query-matching server to discover which token
* predicates the query's terms match. The token predicate to list-name mappings are
* read from {@link #VERYFAST_EVALUATOR_XMLFILE} found in the skin.
*
* @todo make CGI_PATH easily configurable, and make the cache settings configurable.
*
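* An illustrative {@code VeryFastEvaluators.xml} sketch (hypothetical root element, token
* and list names; {@code list-name} may hold a comma-separated list, and each name is
* expanded to {@code FastQT_<name>QM} before lookup):
* <pre>
* &lt;lists&gt;
*     &lt;list token="COMPANYENRICHMENT" list-name="companyname"/&gt;
*     &lt;list token="GEOLOCAL" list-name="geolocal,geostreet"/&gt;
* &lt;/lists&gt;
* </pre>
*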
* @version $Id$
*/
public final class VeryFastTokenEvaluator implements TokenEvaluator {
// Constants -----------------------------------------------------
private static final Logger LOG = Logger.getLogger(VeryFastTokenEvaluator.class);
private static final String ERR_FAILED_INITIALISATION = "Failed reading configuration files";
private static final String ERR_QUERY_FAILED = "Querying the fast list failed on ";
private static final String ERR_PARSE_FAILED = "XML parsing of fast list response failed on ";
/** The configuration file from the skin that specifies token predicate to list mappings. **/
public static final String VERYFAST_EVALUATOR_XMLFILE = "VeryFastEvaluators.xml";
private static final String TOKEN_HOST_PROPERTY = "tokenevaluator.host";
private static final String TOKEN_PORT_PROPERTY = "tokenevaluator.port";
private static final String LIST_PREFIX = "FastQT_";
private static final String LIST_SUFFIX = "QM";
// use the lowercase version of TokenPredicate.EXACT_PREFIX
private static final String EXACT_PREFIX = TokenPredicate.EXACT_PREFIX.toLowerCase();
private static final String CGI_PATH = "/cgi-bin/xsearch?sources=alone&qtpipeline=lookupword&query=";
private static final String ERR_FAILED_TO_ENCODE = "Failed to encode query string: ";
/** General properties to regular expressions configured. **/
private static final int REG_EXP_OPTIONS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
// @todo this will leak when sites are redeployed without Possom being restarted.
// @todo move deserialisation & this map to FastQueryMatchingEvaluatorFactory
private static final Map<Site,Map<TokenPredicate,String[]>> LIST_NAMES
= new HashMap<Site,Map<TokenPredicate,String[]>>();
private static final ReentrantReadWriteLock LIST_NAMES_LOCK = new ReentrantReadWriteLock();
private static final GeneralCacheAdministrator CACHE_QUERY = new GeneralCacheAdministrator();
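// period, in seconds, before a cached query result needs refreshing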
private static final int REFRESH_PERIOD = 60;
// smaller than usual as each entry can contain up to 600 values!
private static final int CACHE_QUERY_CAPACITY = 100;
// Attributes ----------------------------------------------------
private final Context context;
private final Site site;
private final Map<String, List<TokenMatch>> analysisResult;
// Static --------------------------------------------------------
static{
CACHE_QUERY.setCacheCapacity(CACHE_QUERY_CAPACITY);
}
// Constructors -------------------------------------------------
/** Only possible constructor.
*
* @param cxt the required context
* @throws EvaluationException if the evaluation, that is the request to the fast query matching index, fails.
**/
VeryFastTokenEvaluator(final Context cxt) throws EvaluationException{
// pre-condition check
context = cxt;
site = cxt.getSite();
init();
// the responsible() method checks if we are responsible for any tokenPredicate at all.
// if we are not, then the restful request in queryFast(..) is a waste of time and resources.
analysisResult = responsible()
? queryFast(cleanString(context.getQueryString()))
: Collections.<String, List<TokenMatch>>emptyMap();
}
// Public --------------------------------------------------------
@Override
public boolean evaluateToken(final TokenPredicate token, final String term, final String query) {
boolean evaluation = false;
if(!analysisResult.isEmpty()){
final String[] listnames = getListNames(token);
if(null != listnames){
for(int i = 0; !evaluation && i < listnames.length; ++i){
final String listname = listnames[i];
if (analysisResult.containsKey(listname)) {
if (term == null) {
evaluation = true;
} else {
// HACK since DefaultOperatorClause wraps its children in parentheses
final String hackTerm = cleanString(term.replaceAll("\\(|\\)",""));
for (TokenMatch occurrence : analysisResult.get(listname)) {
final Matcher m = occurrence.getMatcher(hackTerm);
evaluation = m.find() && m.start() == 0 && m.end() == hackTerm.length();
if (evaluation) {
break;
}
}
}
}
}
}else{
LOG.info(site + " does not define lists behind the token predicate " + token);
}
}
return evaluation;
}
@Override
public Set<String> getMatchValues(final TokenPredicate token, final String term) {
final Set<String> values;
if(!analysisResult.isEmpty()){
values = new HashSet<String>();
final String[] listnames = getListNames(token);
if(null != listnames){
for(int i = 0; i < listnames.length; i++){
final String listname = listnames[i];
if (analysisResult.containsKey(listname)) {
// HACK since DefaultOperatorClause wraps its children in parentheses
final String hackTerm = cleanString(term.replaceAll("\\(|\\)",""));
for (TokenMatch occurrence : analysisResult.get(listname)) {
final Matcher m = occurrence.getMatcher(hackTerm);
if (m.find() && m.start() == 0 && m.end() == hackTerm.length()) {
values.add(occurrence.getValue());
}
}
}
}
}
}else{
values = Collections.<String>emptySet();
}
return Collections.unmodifiableSet(values);
}
@Override
public boolean isQueryDependant(final TokenPredicate predicate) {
return predicate.name().startsWith(EXACT_PREFIX.toUpperCase());
}
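/** Whether any site in the hierarchy maps the given token predicate to fast lists.
*
* @param predicate the token predicate to look up
* @return true if list names are configured for the predicate
**/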
public boolean isResponsibleFor(final TokenPredicate predicate){
return null != getListNames(predicate);
}
// Package protected ---------------------------------------------
// Protected -----------------------------------------------------
// Private -------------------------------------------------------
private void init() {
try {
initImpl(context);
} catch (ParserConfigurationException ex) {
LOG.error(ERR_FAILED_INITIALISATION, ex);
}
}
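/** Load and parse VERYFAST_EVALUATOR_XMLFILE for the given site, recursively ensuring
* the parent site's configuration is initialised first.
*
* @param cxt the context identifying the site to initialise
* @return true if any site in the hierarchy configures at least one list, ie this evaluator is in use
* @throws ParserConfigurationException if the xml document builder cannot be created
**/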
static boolean initImpl(final Context cxt) throws ParserConfigurationException {
final Site site = cxt.getSite();
final Site parent = site.getParent();
final boolean parentUninitialised;
try{
LIST_NAMES_LOCK.readLock().lock();
// check whether the parent site's configuration has been initialised
parentUninitialised = (null != parent && null == LIST_NAMES.get(parent));
}finally{
LIST_NAMES_LOCK.readLock().unlock();
}
if(parentUninitialised){
initImpl(ContextWrapper.wrap(
AbstractEvaluatorFactory.Context.class,
new SiteContext(){
@Override
public Site getSite(){
return parent;
}
},
cxt
));
}
if(null == LIST_NAMES.get(site)){
try{
LIST_NAMES_LOCK.writeLock().lock();
// create map entry for this site
LIST_NAMES.put(site, new HashMap<TokenPredicate,String[]>());
// initialise this site's configuration
final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
final DocumentBuilder builder = factory.newDocumentBuilder();
final DocumentLoader loader = cxt.newDocumentLoader(cxt, VERYFAST_EVALUATOR_XMLFILE, builder);
loader.abut();
LOG.info("Parsing " + VERYFAST_EVALUATOR_XMLFILE + " started");
final Map<TokenPredicate,String[]> listNames = LIST_NAMES.get(site);
final Document doc = loader.getDocument();
if(null != doc && null != doc.getDocumentElement()){
final Element root = doc.getDocumentElement();
final NodeList lists = root.getElementsByTagName("list");
for (int i = 0; i < lists.getLength(); ++i) {
final Element list = (Element) lists.item(i);
final String tokenName = list.getAttribute("token");
LOG.info(" ->list@token: " + tokenName);
TokenPredicate token;
try{
token = TokenPredicateUtility.getTokenPredicate(tokenName);
}catch(IllegalArgumentException iae){
LOG.debug(tokenName + " does not exist. Will create it. Underlying exception was " + iae);
token = TokenPredicateUtility.createAnonymousTokenPredicate(tokenName);
}
final String[] listNameArr = list.getAttribute("list-name").split(",");
LOG.info(" ->lists: " + list.getAttribute("list-name"));
// update each listname to the format the fast query matching servers use
if(null != listNameArr){
for(int j = 0; j < listNameArr.length; ++j){
listNameArr[j] = LIST_PREFIX + listNameArr[j] + LIST_SUFFIX;
}
// sort (so usesListName(..) can binary search) and put the listnames in
Arrays.sort(listNameArr, null);
listNames.put(token, listNameArr);
}
}
}
LOG.info("Parsing " + VERYFAST_EVALUATOR_XMLFILE + " finished");
}finally{
LIST_NAMES_LOCK.writeLock().unlock();
}
}
try{
LIST_NAMES_LOCK.readLock().lock();
Site s = site;
boolean evaluatorUsedAnywhere = false;
while(!evaluatorUsedAnywhere && null != s){
evaluatorUsedAnywhere |= 0 < LIST_NAMES.get(s).values().size();
if(!evaluatorUsedAnywhere){
// prepare to go to parent
s = s.getParent();
}
}
return evaluatorUsedAnywhere;
}finally{
LIST_NAMES_LOCK.readLock().unlock();
}
}
/**
* Query the fast query matching server to find out which lists (company, firstname,
* lastname, etc) the given tokens appear in.
*
* @param query the query string to analyse
* @return map from list name to the matches found in that list
* @throws EvaluationException if the request to the query matching server fails
*/
@SuppressWarnings("unchecked")
private Map<String, List<TokenMatch>> queryFast(final String query) throws EvaluationException{
LOG.trace("queryFast( " + query + " )");
Map<String, List<TokenMatch>> result = null;
if (query != null && 0 < query.length()) {
try{
result = (Map<String, List<TokenMatch>>) CACHE_QUERY.getFromCache(query, REFRESH_PERIOD);
} catch (NeedsRefreshException nre) {
boolean updatedCache = false;
result = new HashMap<String,List<TokenMatch>>();
String url = null;
try {
final Properties props = SiteConfiguration.instanceOf(
ContextWrapper.wrap(SiteConfiguration.Context.class,context)).getProperties();
final String host = props.getProperty(TOKEN_HOST_PROPERTY);
final int port = Integer.parseInt(props.getProperty(TOKEN_PORT_PROPERTY));
if(0 < port){
final HTTPClient httpClient = HTTPClient.instance(host, port);
final String token = URLEncoder.encode(query.replaceAll("\"", ""), "utf-8");
url = CGI_PATH + token;
final Document doc = httpClient.getXmlDocument(url);
NodeList l = doc.getElementsByTagName("QUERYTRANSFORMS");
final Element e = (Element) l.item(0);
l = e.getElementsByTagName("QUERYTRANSFORM");
for (int i = 0; i < l.getLength(); ++i) {
final Element trans = (Element) l.item(i);
final String name = trans.getAttribute("NAME");
final String custom = trans.getAttribute("CUSTOM");
final String exactname = 0 <= name.indexOf(LIST_PREFIX) && 0 < name.indexOf(LIST_SUFFIX)
? LIST_PREFIX + EXACT_PREFIX
+ name.substring(name.indexOf('_') + 1, name.indexOf("QM"))
+ LIST_SUFFIX
: null;
if(custom.matches(".+->.*") && usesListName(name, exactname)){
final String match = (custom.indexOf("->") >0
? custom.substring(0, custom.indexOf("->"))
: custom)
// remove words made solely of characters that the parser considers whitespace
.replaceAll("\\b" + SKIP_REGEX + "+\\b", " ");
final String value = custom.indexOf("->") > 0
? custom.substring(custom.indexOf("->") + 2)
: null;
addMatch(name, match, value, query, result);
if (match.equalsIgnoreCase(query.trim())) {
addMatch(exactname, match, value, query, result);
}
}
}
}
result = Collections.unmodifiableMap(result);
CACHE_QUERY.putInCache(query, result);
updatedCache = true;
} catch (UnsupportedEncodingException ignore) {
LOG.warn(ERR_FAILED_TO_ENCODE + query);
result = (Map<String, List<TokenMatch>>)nre.getCacheContent();
} catch (IOException e1) {
LOG.error(ERR_QUERY_FAILED + url, e1);
result = (Map<String, List<TokenMatch>>)nre.getCacheContent();
throw new EvaluationException(ERR_QUERY_FAILED + url, e1);
} catch (SAXException e1) {
LOG.error(ERR_PARSE_FAILED + url, e1);
result = (Map<String, List<TokenMatch>>)nre.getCacheContent();
throw new EvaluationException(ERR_PARSE_FAILED + url, e1);
}finally{
if(!updatedCache){
CACHE_QUERY.cancelUpdate(query);
}
}
}
} else {
result = Collections.<String, List<TokenMatch>>emptyMap();
}
return result;
}
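/** Add a TokenMatch, against the given list name, into result for each whole-word
* occurrence of match found in the query.
*
* @param name the fast list name
* @param match the matched word(s) reported by the query matching server
* @param value the optional value behind the "->" in the list entry, or null
* @param query the query being analysed
* @param result the map of list names to matches being built
**/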
private static void addMatch(
final String name,
final String match,
final String value,
final String query,
final Map<String, List<TokenMatch>> result) {
final String expr = "\\b" + match + "\\b";
final Pattern pattern = Pattern.compile(expr, REG_EXP_OPTIONS);
// remove words made solely of characters that the parser considers whitespace
final String qNew = query.replaceAll("\\b" + SKIP_REGEX + "+\\b", " ");
final Matcher m = pattern.matcher(qNew);
while (m.find()) {
final TokenMatch tknMatch = TokenMatch.instanceOf(name, match, value);
if (!result.containsKey(name)) {
result.put(name, new ArrayList<TokenMatch>());
}
result.get(name).add(tknMatch);
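// log a diagnostic every 100th hit; this many matches against one list suggests a pathological pattern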
if (result.get(name).size() % 100 == 0) {
LOG.warn("Pattern: " + pattern.pattern()
+ " name: " + name
+ " query: " + query
+ " match: " + match
+ " query2: " + qNew);
}
}
}
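/** Whether the given list name, or its exact-match variant, is configured against any
* token predicate for this site or any of its ancestors.
*
* @param listname the fast list name to look for
* @param exactname the exact-match variant of the list name, may be null
* @return true if either name is in use
**/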
private boolean usesListName(final String listname, final String exactname){
boolean uses = false;
try{
LIST_NAMES_LOCK.readLock().lock();
Site site = this.site;
while(!uses && null != site){
// find listnames used for this token predicate
for(String[] listnames : LIST_NAMES.get(site).values()){
uses |= 0 <= Arrays.binarySearch(listnames, listname, null);
uses |= null != exactname && 0 <= Arrays.binarySearch(listnames, exactname, null);
if(uses){ break; }
}
// prepare to go to parent
site = site.getParent();
}
}finally{
LIST_NAMES_LOCK.readLock().unlock();
}
return uses;
}
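/** Find the list names configured against the given token predicate, searching from
* this site up through its ancestors.
*
* @param token the token predicate
* @return the configured list names, or null if no site in the hierarchy defines any
**/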
private String[] getListNames(final TokenPredicate token){
String[] listNames = null;
try{
LIST_NAMES_LOCK.readLock().lock();
Site site = this.site;
while(null == listNames && null != site){
// find listnames used for this token predicate
listNames = LIST_NAMES.get(site).get(token);
// prepare to go to parent
site = site.getParent();
}
}finally{
LIST_NAMES_LOCK.readLock().unlock();
}
return listNames;
}
private String cleanString(final String string){
// Strip out SKIP characters we are not interested in.
// Also remove any operator characters. (SEARCH-3883 & SEARCH-3967)
return string.toLowerCase()
.replaceAll(" ", "xxKEEPWSxx") // Hack to keep spaces. multiple spaces always normalised.
.replaceAll(SKIP_REGEX, " ")
.replaceAll("xxKEEPWSxx", " ") // Hack to keep spaces.
.replaceAll(OPERATOR_REGEX, " ")
.replaceAll(" +", " "); // normalise
}
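/** Whether this site, or any of its ancestors, configures lists for this evaluator at
* all, making the restful request to the query matching server worthwhile.
*
* @return true if some token predicate to list-name mapping exists
**/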
private boolean responsible(){
boolean responsible = false;
try{
LIST_NAMES_LOCK.readLock().lock();
Site loopSite = this.site;
while(null != loopSite){
// check whether this site defines any listnames at all
responsible = !LIST_NAMES.get(loopSite).isEmpty();
if(responsible){ break; }
// prepare to go to parent
loopSite = loopSite.getParent();
}
}finally{
LIST_NAMES_LOCK.readLock().unlock();
}
return responsible;
}
// Inner classes -------------------------------------------------
}