package org.apache.lucene.queryparser.flexible.aqp.processors; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Locale; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.LegacyNumericTokenStream.LegacyNumericTermAttribute; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpDefopQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler; import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages; import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.FieldableNode; import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.ModifierQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode; import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl; import org.apache.lucene.queryparser.flexible.messages.MessageImpl; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; /* * A first incarnation of the code that was able to deal with * multi-token synonym replacements. This class was superseded * by: AqpAdsabsPostAnalysisProcessor * * This processor will extract all synonyms from the "multi token stream" * it will join the synonyms with OR's and keep other tokens in place * ie. it will create a new tree: * * <pre> * hubble space telescope goes home * | * ---------------- * / \ \ * (hubble space telescope | HST) goes home * * </pre> * * to use it, your pipeline must look like: * * <pre> * add(new AqpDEFOPMarkPlainNodes(false)); * .... * add(new AqpMultiWordProcessor()); * </pre> */ public class AqpMultiWordProcessor extends QueryNodeProcessorImpl { private CachingTokenFilter buffer; private CharTermAttribute termAtt; private LegacyNumericTermAttribute numAtt; private TypeAttribute typeAtt; private PositionIncrementAttribute posAtt; private OffsetAttribute offsetAtt; private ArrayList<int[]> multiTokenSynonymSpans = new ArrayList<int[]>(); public AqpMultiWordProcessor() { // empty } @Override public QueryNode process(QueryNode queryTree) throws QueryNodeException { QueryConfigHandler config = this.getQueryConfigHandler(); if (config.has(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST) && config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST) .getRequest() != null) { return super.process(queryTree); } return queryTree; } @Override protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException { if (node instanceof AqpDefopQueryNode) { LinkedList<QueryNode> newChildren = new LinkedList<QueryNode>(); List<QueryNode> children = node.getChildren(); String multiToken; for (int i=0;i<children.size();i++) { QueryNode child = children.get(i); QueryNode terminalNode = getTerminalNode(child); multiToken = (String) terminalNode.getTag(AqpDEFOPMarkPlainNodes.PLAIN_TOKEN_CONCATENATED); if (multiToken != null && analyzeMultiToken(((FieldableNode) terminalNode).getField(), multiToken) > 0) { i = expandWithSynonyms(children, newChildren, terminalNode, i, ((ModifierQueryNode) child).getModifier()); } else { newChildren.add(child); } } node.set(newChildren); } return node; } private int expandWithSynonyms(List<QueryNode> children, LinkedList<QueryNode> newChildren, QueryNode terminalNode, int i, ModifierQueryNode.Modifier modifier) { FieldableNode fieldNode = (FieldableNode) terminalNode; int startingPosition = ((FieldQueryNode) fieldNode).getBegin(); int maxOffset = ((String) terminalNode.getTag(AqpDEFOPMarkPlainNodes.PLAIN_TOKEN_CONCATENATED)).length() + startingPosition; int startOffset = 0; int endOffset = 0; LinkedList<QueryNode> synChildren = new LinkedList<QueryNode>(); // The difficulty here is that we are looking into two streams of tokens // the buffer shows us synonyms, but we must find their source-tokens // inside chilren[] try { buffer.reset(); while (buffer.incrementToken()) { typeAtt = buffer.getAttribute(TypeAttribute.class); offsetAtt = buffer.getAttribute(OffsetAttribute.class); posAtt = buffer.getAttribute(PositionIncrementAttribute.class); termAtt = buffer.getAttribute(CharTermAttribute.class); //System.out.println(termAtt.toString() + "; type=" + typeAtt.type()); // seek until we find the first synonym if (!typeAtt.type().equals(SynonymFilter.TYPE_SYNONYM)) { if (posAtt.getPositionIncrement()==0 && synChildren.size()>0) { // add acronyms and other stuff (it is already analyzed, so we wrap it into AqpNonAnalyzedQN) synChildren.add(new AqpNonAnalyzedQueryNode(getNewNode((FieldQueryNode) fieldNode))); } if (synChildren.size()>0) { // we already have synonyms from the previous run // move the pointer to exclude tokens *before* the syn-tokens for (int j=i;j<children.size();j++) { FieldQueryNode tn = (FieldQueryNode) getTerminalNode(children.get(j)); if (tn.getBegin() >= startOffset+startingPosition && tn.getEnd() <= endOffset+startingPosition) { // token which is inside the synonym fieldNode = (FieldableNode) tn; //mark the current synonym source (its first token) i++; } else { break; } } newChildren.add(new ModifierQueryNode(new AqpOrQueryNode(synChildren), modifier)); synChildren = new LinkedList<QueryNode>(); } continue; } // test we deal with multi-token synonym (as opposed to single-token synonym) if (!isMultiTokenSynonym(offsetAtt.startOffset(), offsetAtt.endOffset())) continue; // discover offsets (the longest range) for synonyms startOffset = offsetAtt.startOffset(); endOffset = offsetAtt.endOffset() > endOffset ? offsetAtt.endOffset() : endOffset; // find tokens that are *before* the syn-tokens for (int j=i;j<children.size();j++) { FieldQueryNode tn = (FieldQueryNode) getTerminalNode(children.get(j)); if (tn.getBegin() >= startOffset+startingPosition && tn.getEnd() <= endOffset+startingPosition) { // token which is inside the synonym fieldNode = (FieldableNode) tn; //mark the current synonym source (its first token) break; } newChildren.add(children.get(j)); i++; } // add synonym (it is already analyzed, so we wrap it into AqpNonAnalyzedQN) synChildren.add(new AqpNonAnalyzedQueryNode(getNewNode((FieldQueryNode) fieldNode))); } } catch (IOException e) { getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_LOGGER).error(e.getLocalizedMessage()); } catch (CloneNotSupportedException e) { getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_LOGGER).error(e.getLocalizedMessage()); } // find tokens that are *after* the last syn-tokens (but not exceeding offset of the current buffer) for (int j=i;j<children.size();j++) { FieldQueryNode tn = (FieldQueryNode) getTerminalNode(children.get(j)); if (tn.getBegin() >= startOffset+startingPosition && tn.getEnd() <= endOffset+startingPosition) { // token which is inside the synonym i++; } else if(tn.getEnd() <= maxOffset) { newChildren.add(children.get(j)); i++; } else { break; } } // if there was a synonym at the end, it will be here if (synChildren.size()>0) { newChildren.add(new GroupQueryNode(new AqpOrQueryNode(synChildren))); synChildren = new LinkedList<QueryNode>(); } return i; } private boolean isMultiTokenSynonym(int startOffset, int endOffset) { for (int s[]: multiTokenSynonymSpans) { if (s[0]==startOffset && s[1]==endOffset) return true; } return false; } private FieldQueryNode getNewNode(FieldQueryNode master) throws CloneNotSupportedException { FieldableNode newNode = (FieldableNode) master.cloneTree(); if (buffer.hasAttribute(CharTermAttribute.class)) { termAtt = buffer.getAttribute(CharTermAttribute.class); ((TextableQueryNode) newNode).setText(termAtt.toString()); } else { numAtt = buffer.getAttribute(LegacyNumericTermAttribute.class); ((TextableQueryNode) newNode).setText(new Long(numAtt.getRawValue()).toString()); } return (FieldQueryNode) newNode; } private QueryNode getTerminalNode(QueryNode node) { if (node.isLeaf()) { return node; } for (QueryNode child: node.getChildren()) { if (child.containsTag(AqpDEFOPMarkPlainNodes.PLAIN_TOKEN)) { return child; } return getTerminalNode(child); } return null; } private int analyzeMultiToken(CharSequence field, String multiToken) { QueryConfigHandler config = this.getQueryConfigHandler(); Locale locale = getQueryConfigHandler().get(ConfigurationKeys.LOCALE); if (locale == null) { locale = Locale.getDefault(); } Analyzer analyzer = config.get(StandardQueryConfigHandler.ConfigurationKeys.ANALYZER); TokenStream source = null; try { source = analyzer.tokenStream(field.toString(), new StringReader(multiToken)); source.reset(); } catch (IOException e1) { return -1; } buffer = new CachingTokenFilter(source); ArrayList<int[]> spans = new ArrayList<int[]>(); ArrayList<int[]> synSpans = new ArrayList<int[]>(); try { while (buffer.incrementToken()) { typeAtt = buffer.getAttribute(TypeAttribute.class); offsetAtt = buffer.getAttribute(OffsetAttribute.class); if (typeAtt.type().equals(SynonymFilter.TYPE_SYNONYM)) { synSpans.add(new int[]{offsetAtt.startOffset(), offsetAtt.endOffset()}); } else { spans.add(new int[]{offsetAtt.startOffset(), offsetAtt.endOffset()}); } } } catch (IOException e) { // pass } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (IOException e) { // ignore } multiTokenSynonymSpans.clear(); for (int[] synSpan: synSpans) { for (int[] sourceSpan: spans) { if (synSpan[0] <= sourceSpan[0] && synSpan[1] >= sourceSpan[1] && synSpan[1]-synSpan[0] > sourceSpan[1]-sourceSpan[0]) { multiTokenSynonymSpans.add(synSpan); } } } return multiTokenSynonymSpans.size(); } @Override protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException { return node; } @Override protected List<QueryNode> setChildrenOrder(List<QueryNode> children) throws QueryNodeException { return children; } protected QueryNode expandMultiToken(QueryNode node) throws QueryNodeException { FieldableNode fieldNode = (FieldableNode) node; LinkedList<QueryNode> children = new LinkedList<QueryNode>(); //children.add(new AqpNonAnalyzedQueryNode((FieldQueryNode) fieldNode)); // original input try { buffer.reset(); while (buffer.incrementToken()) { posAtt = buffer.getAttribute(PositionIncrementAttribute.class); typeAtt = buffer.getAttribute(TypeAttribute.class); if (!typeAtt.type().equals(SynonymFilter.TYPE_SYNONYM)) { continue; } FieldableNode newNode = (FieldableNode) fieldNode.cloneTree(); if (buffer.hasAttribute(CharTermAttribute.class)) { termAtt = buffer.getAttribute(CharTermAttribute.class); ((TextableQueryNode) newNode).setText(termAtt.toString()); } else if(buffer.hasAttribute(LegacyNumericTermAttribute.class)) { numAtt = buffer.getAttribute(LegacyNumericTermAttribute.class); ((TextableQueryNode) newNode).setText(new Long(numAtt.getRawValue()).toString()); } children.add(new AqpNonAnalyzedQueryNode((FieldQueryNode) newNode)); } } catch (IOException e) { getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_LOGGER).error(e.getLocalizedMessage()); } catch (CloneNotSupportedException e) { getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_LOGGER).error(e.getLocalizedMessage()); } if (children.size() < 1) { throw new QueryNodeException(new MessageImpl( QueryParserMessages.PARAMETER_VALUE_NOT_SUPPORTED, "This should never hapeeeeennnn! Error expanding synonyms for: " + node.toString() + "")); } return new GroupQueryNode(new AqpOrQueryNode(children)); } }