package org.apache.lucene.queryparser.flexible.aqp.processors; import java.io.IOException; import java.io.StringReader; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.queryparser.flexible.aqp.ADSEscapeQuerySyntaxImpl; import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode; import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQProcessor.OriginalInput; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler; import org.apache.lucene.queryparser.flexible.core.nodes.BoostQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; import org.apache.solr.common.params.DisMaxParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.util.SolrPluginUtils; /** * This analyzer is a WORKAROUND for the unfieded searches - edismax * does not know how to handle multi-token synonyms. And this problem * will be 'fully' solved only after I rewrite the whole edismax * into aqp (and I don't want to mess with that right now) * * But it should be done, because we are running analysis just * to discover a multitoken synonym (ie. synonym spanning over * several tokens). We will remember it and make the parser to * build a new AST * * <pre> * AqpFunctionQueryNode("some hubble space telescope") * </pre> * * becomes: * * <pre> * OR * | * ------------------------------------------ * / \ * FieldQN AqpFunctionQueryNode("some hubble space telescope") * | * ------------- * / \ * title 'hubble space telescope' * </pre> * * This workaround is necessary only for PHRASE queries (because in ADS * we index even multi-token as single words, so everything work as expected * even with edismax) * * To activate it, you MUST set: * aqp.unfielded.phrase.edismax.synonym.workaround = true * qf = list of edismax fields */ public class AqpAdsabsExtractMultisynonymsProcessor extends QueryNodeProcessorImpl { private CharTermAttribute termAtt; private float boostCorrection = 0.9f; public AqpAdsabsExtractMultisynonymsProcessor() { // empty } @Override public QueryNode process(QueryNode queryTree) throws QueryNodeException { QueryConfigHandler config = this.getQueryConfigHandler(); if (config.has(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST) && config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST) .getRequest() != null) { Map<String, String> args = getQueryConfigHandler().get( AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER); if (!args.containsKey("aqp.unfielded.phrase.edismax.synonym.workaround")) { return queryTree; } if (args.containsKey("aqp.unfielded.phrase.edismax.synonym.workaround.boost.correction")) { boostCorrection = Float.parseFloat(args.get("aqp.unfielded.phrase.edismax.synonym.workaround.boost.correction")); } return super.process(queryTree); } return queryTree; } @Override protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException { if (node instanceof AqpFunctionQueryNode && ((AqpFunctionQueryNode) node).getName().contains("edismax")) { OriginalInput oi = ((AqpFunctionQueryNode) node).getOriginalInput(); if (oi == null) return node; String subQuery = oi.value; subQuery = ADSEscapeQuerySyntaxImpl.discardEscapeChar(subQuery).toString(); if (!subQuery.contains("\"")) { return node; // not necessary for normal queries } QueryConfigHandler config = this.getQueryConfigHandler(); SolrQueryRequest req = config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST) .getRequest(); SolrParams params = req.getParams(); String qf = params.get(DisMaxParams.QF); if (qf == null) { return node; } List<QueryNode> lst = new ArrayList<QueryNode>(); Map<String,Float> parsedQf = SolrPluginUtils.parseFieldBoosts(qf); for (Entry<String, Float> entry: parsedQf.entrySet()) { String field = entry.getKey(); if (field.charAt(0) == '_') // special fields continue; Float boost = entry.getValue(); ArrayList<String> result = extractMultiSpanTokens(field, subQuery); if (result != null && result.size() > 0) { for (String val: result) { lst.add(new BoostQueryNode( new AqpNonAnalyzedQueryNode(new FieldQueryNode(field, val, -1, -1)), boost!=null ? new BigDecimal(boost*boostCorrection).setScale(3, RoundingMode.HALF_EVEN).floatValue() : 1.0f)); } } } if (lst.size() >0 ) { lst.add(node); return new AqpOrQueryNode(lst); } } return node; } private ArrayList<String> extractMultiSpanTokens(CharSequence field, String value) throws QueryNodeException { QueryConfigHandler config = this.getQueryConfigHandler(); Locale locale = getQueryConfigHandler().get(ConfigurationKeys.LOCALE); if (locale == null) { locale = Locale.getDefault(); } Analyzer analyzer = config.get(StandardQueryConfigHandler.ConfigurationKeys.ANALYZER); ArrayList<String> out = new ArrayList<String>(); TokenStream source = null; try { source = analyzer.tokenStream(field.toString(), new StringReader(value)); source.reset(); } catch (IOException e1) { return null; } int start = -1; int end = -1; int lastTokenStart = -1; int lastTokenEnd = -1; boolean isMultiSpan = false; OffsetAttribute offsetAtt = null; try { while (source.incrementToken()) { if (source.hasAttribute(OffsetAttribute.class)) { offsetAtt = source.getAttribute(OffsetAttribute.class); } else { break; } start = offsetAtt.startOffset(); end = offsetAtt.endOffset(); termAtt = source.getAttribute(CharTermAttribute.class); if (start == lastTokenStart && end > lastTokenEnd && isMultiSpan) { out.add(termAtt.toString()); continue; } if (start <= lastTokenEnd && end > lastTokenEnd) { isMultiSpan = true; out.add(termAtt.toString()); continue; } else { isMultiSpan = false; } lastTokenStart = start; lastTokenEnd = end; } } catch (IOException e) { // pass } return out; } @Override protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException { return node; } @Override protected List<QueryNode> setChildrenOrder(List<QueryNode> children) throws QueryNodeException { return children; } }