package org.apache.lucene.queryparser.flexible.aqp.processors;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.lucene.queryparser.flexible.aqp.AqpAdsabsQueryParser;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpRequestParams;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAndQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNearQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNotQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages;
import org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TokenizedPhraseQueryNode;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.standard.nodes.MultiPhraseQueryNode;
import org.apache.solr.common.params.SolrParams;

/**
 * This processor must follow the {@link AqpAnalyzerQueryNodeProcessor}.
 * It builds a graph of the query node and handles the cases where a
 * synonym expansion spans several tokens. Basically, we build every
 * possible path from which a query can be constructed.
 *
 * You can supply your own query builder(s) which can do different things
 * based on the type of the resulting query graph, i.e. you may want to
 * add a boost to queries that were original input, or wrap queries with
 * many terms into a span query instead of a phrase. The options are many!
 *
 * This processor extracts all synonyms from the "multi token stream",
 * joins the synonyms with ORs and keeps the other tokens in place,
 * i.e. it creates a new tree:
 *
 * <pre>
 *    hubble space telescope goes home
 *                  |
 *          ----------------
 *         /             \    \
 * (hubble space telescope | HST) goes home
 * </pre>
 */
public class AqpPostAnalysisProcessor extends QueryNodeProcessorImpl {

  /** No post-processing is done; the node is returned unchanged. */
  @Override
  protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
    return node;
  }

  /**
   * Rewrites nodes carrying the {@code AqpAdsabsAnalyzerProcessor.ANALYZED} tag.
   *
   * <ul>
   *   <li>{@link TokenizedPhraseQueryNode}: returned unchanged.</li>
   *   <li>{@link GroupQueryNode} whose first child is a {@link BooleanQueryNode}
   *       with more than one child: the boolean's children are re-combined
   *       using the unfielded default operator (span/and/not/or).</li>
   *   <li>{@link MultiPhraseQueryNode}: rebuilt either as a near query (when the
   *       parent is a {@link FuzzyQueryNode}) or as a phrase node.</li>
   *   <li>Anything else: returned unchanged.</li>
   * </ul>
   *
   * @param node the node being visited
   * @return the replacement node, or {@code node} itself when nothing applies
   * @throws QueryNodeException when required configuration is missing or the
   *         query graph cannot be built
   */
  @Override
  protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException {
    if (node.getTag(AqpAdsabsAnalyzerProcessor.ANALYZED) == null) {
      return node;
    }

    AqpRequestParams req = getRequest();
    SolrParams params = req.getParams();

    // Without request params we fall back to "and"; with params present, the
    // operator is taken from the request and defaults to "or" when unset.
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    final String unfieldedDefaultOperator = (params == null)
        ? "and"
        : params.get(AqpAdsabsQueryParser.AQP_UNFIELDED_OPERATOR_PARAM, "or")
            .toLowerCase(Locale.ROOT);

    if (node instanceof TokenizedPhraseQueryNode) {
      // no need to do anything
      return node;
    }

    if (node instanceof GroupQueryNode) {
      // may have multi-token expansion (when it wasn't surrounded by "")
      List<QueryNode> groupChildren = node.getChildren();
      if (groupChildren.size() > 0
          && groupChildren.get(0) instanceof BooleanQueryNode
          && ((BooleanQueryNode) groupChildren.get(0)).getChildren().size() > 1) {

        List<List<List<QueryNode>>> queryStructure = extractQueries(groupChildren.get(0));
        final int proximity = getDefaultProximityValue();

        return buildNewQueryNode(queryStructure, new QueryBuilder() {
          @Override
          public QueryNode buildQuery(List<QueryNode> clauses) {
            if (unfieldedDefaultOperator.equals("span")) {
              return new AqpNearQueryNode(clauses, proximity);
            } else if (unfieldedDefaultOperator.equals("and")) {
              return new AqpAndQueryNode(clauses);
            } else if (unfieldedDefaultOperator.equals("not")) {
              return new AqpNotQueryNode(clauses);
            } else {
              return new AqpOrQueryNode(clauses);
            }
          }
        });
      }
      // do nothing, we don't know how to process this type
      return node;
    }

    if (node instanceof MultiPhraseQueryNode) {
      List<List<List<QueryNode>>> queryStructure = extractQueries(node);

      if (node.getParent() instanceof FuzzyQueryNode) {
        // "some span query"~3
        final FuzzyQueryNode parent = (FuzzyQueryNode) node.getParent();
        return buildNewQueryNode(queryStructure, new QueryBuilder() {
          @Override
          public QueryNode buildQuery(List<QueryNode> clauses) {
            // NOTE(review): the position increment of the fuzzy parent is used
            // as the span distance - confirm it carries the ~N value.
            return new AqpNearQueryNode(clauses, parent.getPositionIncrement());
          }
        });
      }

      // default: create boolean ((+a +b) OR (+a +(b|c)))
      return buildNewQueryNode(queryStructure, new QueryBuilder() {
        @Override
        public QueryNode buildQuery(List<QueryNode> clauses) {
          if (this.isMultiDimensional) {
            // at least one position held several tokens: flatten the
            // per-position groups into one multi-phrase node
            MultiPhraseQueryNode pq = new MultiPhraseQueryNode();
            for (QueryNode c : clauses) {
              if (c.isLeaf()) {
                pq.add(c);
              } else {
                for (QueryNode child : c.getChildren()) {
                  pq.add(child);
                }
              }
            }
            return pq;
          } else {
            TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
            pq.add(clauses);
            return pq;
          }
        }
      });
    }

    // do nothing, we don't know how to process this type
    return node;
  }

  /**
   * Fetches the request parameters stored under
   * {@code AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST}.
   *
   * @return the request parameters, never {@code null}
   * @throws QueryNodeException when there is no config handler or the
   *         SOLR_REQUEST entry is missing
   */
  private AqpRequestParams getRequest() throws QueryNodeException {
    QueryConfigHandler config = getQueryConfigHandler();

    // The handler must be null-checked before it is dereferenced.
    AqpRequestParams request = (config == null)
        ? null
        : config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST);

    if (request == null) {
      throw new QueryNodeException(new MessageImpl(
          QueryParserMessages.LUCENE_QUERY_CONVERSION_ERROR,
          "Configuration error: " + "SOLR_REQUEST is missing"));
    }
    return request;
  }

  /**
   * Reads the value stored under
   * {@code AqpStandardQueryConfigHandler.ConfigurationKeys.DEFAULT_PROXIMITY}.
   *
   * @return the configured default proximity
   * @throws QueryNodeException when there is no config handler or the key is absent
   */
  private Integer getDefaultProximityValue() throws QueryNodeException {
    QueryConfigHandler queryConfig = getQueryConfigHandler();
    if (queryConfig == null
        || !queryConfig.has(AqpStandardQueryConfigHandler.ConfigurationKeys.DEFAULT_PROXIMITY)) {
      throw new QueryNodeException(new MessageImpl(
          QueryParserMessages.LUCENE_QUERY_CONVERSION_ERROR,
          "Configuration error: " + "DefaultProximity value is missing"));
    }
    return queryConfig.get(AqpStandardQueryConfigHandler.ConfigurationKeys.DEFAULT_PROXIMITY);
  }

  /**
   * Builds a single query node out of the extracted query structure.
   *
   * @param queries list of queries; each query is a list of elements ordered to
   *        cover the query input, and each element is the list of nodes that
   *        share one span
   * @param queryBuilder strategy that turns elements, queries and the list of
   *        queries into nodes
   * @return the node produced by {@code queryBuilder.buildTopQuery}
   */
  protected QueryNode buildNewQueryNode(List<List<List<QueryNode>>> queries,
      QueryBuilder queryBuilder) {
    List<QueryNode> mainQueryClauses = new ArrayList<QueryNode>();

    for (List<List<QueryNode>> oneQuery : queries) {
      List<QueryNode> clauses = new ArrayList<QueryNode>();
      for (List<QueryNode> qElement : oneQuery) {
        clauses.add(queryBuilder.buildQueryElement(qElement));
      }
      // a single-element query needs no wrapping
      if (clauses.size() > 1) {
        mainQueryClauses.add(queryBuilder.buildQuery(clauses));
      } else {
        mainQueryClauses.add(clauses.get(0));
      }
    }
    return queryBuilder.buildTopQuery(mainQueryClauses);
  }

  /**
   * Builds the position graph from the children of {@code node} and returns
   * every query (path through the graph) that covers the longest distance.
   *
   * This method only knows how to handle {@link FieldQueryNode} children (it is
   * especially useful for {@link MultiPhraseQueryNode}); any other child type
   * fails on the cast inside {@code NodeOfQuery.consume}. We cannot process
   * such queries (and we shouldn't!).
   *
   * @param node parent whose children are the analyzed tokens
   * @return list of queries; inside a query every element is a list (if it
   *         holds more than one node, they share the same span)
   * @throws QueryNodeException wrapping a {@link CloneNotSupportedException}
   *         raised while copying nodes
   */
  protected List<List<List<QueryNode>>> extractQueries(QueryNode node)
      throws QueryNodeException {
    NodeOfQuery graph = new NodeOfQuery(-1, -1);
    for (QueryNode child : node.getChildren()) {
      graph.consume(child);
    }

    try {
      return graph.traverseGraphFindAllQueries();
    } catch (CloneNotSupportedException e) {
      throw new QueryNodeException(e);
    }
  }

  /** Children order is left untouched. */
  @Override
  protected List<QueryNode> setChildrenOrder(List<QueryNode> children)
      throws QueryNodeException {
    return children;
  }

  /**
   * One vertex of the token graph: a (start, end) span, the query nodes that
   * occupy exactly that span, and the vertices that follow it.
   */
  class NodeOfQuery {

    protected int startPos;
    protected int endPos = -1;
    private List<QueryNode> payload;
    private List<NodeOfQuery> children;

    /** Creates an empty vertex with the given span (used for the graph root). */
    public NodeOfQuery(int startPosition, int endPosition) {
      startPos = startPosition;
      endPos = endPosition;
      payload = new ArrayList<QueryNode>();
      children = new ArrayList<NodeOfQuery>();
    }

    /**
     * Creates a vertex from a node, taking the span from its begin/end.
     *
     * @param node must be a {@link FieldQueryNode}
     */
    public NodeOfQuery(QueryNode node) {
      FieldQueryNode fieldNode = (FieldQueryNode) node;
      startPos = fieldNode.getBegin();
      endPos = fieldNode.getEnd();
      payload = new ArrayList<QueryNode>();
      children = new ArrayList<NodeOfQuery>();
      payload.add(node);
    }

    @Override
    public String toString() {
      return prn(0);
    }

    /**
     * Renders this vertex and its descendants as an XML-like debug string.
     *
     * @param indent number of leading spaces for this level
     */
    public String prn(int indent) {
      StringBuilder pad = new StringBuilder();
      for (int i = 0; i < indent; i++) {
        pad.append(" ");
      }
      String ind = pad.toString();

      StringBuilder sb = new StringBuilder();
      sb.append(ind).append("<NodeOfQuery startPos=\"").append(this.startPos)
          .append("\" endPos=\"").append(this.endPos).append("\"/>\n");
      for (QueryNode item : payload) {
        sb.append(ind).append("<payload>").append(item).append("</payload>\n");
      }
      for (NodeOfQuery child : children) {
        sb.append(ind).append("<child>\n");
        sb.append(child.prn(indent + 2));
        sb.append(ind).append("</child>\n");
      }
      sb.append(ind).append("</NodeOfQuery>\n");
      return sb.toString();
    }

    /**
     * Inserts a token into the graph below this vertex.
     *
     * A child with the identical span receives the token as extra payload.
     * Otherwise the token is handed down to every child (visited before such a
     * match) whose start and end both lie before the token's begin and end.
     * If it was handed to no child and it begins after this vertex starts, it
     * becomes a new child of this vertex.
     *
     * @param qnode must be a {@link FieldQueryNode}
     */
    public void consume(QueryNode qnode) {
      FieldQueryNode node = (FieldQueryNode) qnode;
      boolean descended = false;

      for (NodeOfQuery child : children) {
        if (child.startPos == node.getBegin() && child.endPos == node.getEnd()) {
          child.addPayload(node);
          return;
        }
        if (child.startPos < node.getBegin() && child.endPos < node.getEnd()) {
          child.consume(qnode);
          descended = true;
        }
      }

      if (!descended && node.getBegin() > this.startPos) {
        children.add(new NodeOfQuery(node));
      }
    }

    /** Adds the node to this vertex unless the payload already contains it. */
    public void addPayload(QueryNode node) {
      if (!payload.contains(node)) {
        payload.add(node);
      }
    }

    /**
     * Depth-first walk that records, for every leaf, the sequence of
     * (start, end) pairs leading to it.
     */
    public void drillDown(QueryPath path) {
      if (children.size() == 0) {
        // terminal node
        path.terminus();
        return;
      }
      for (NodeOfQuery child : children) {
        path.push(child.startPos);
        path.push(child.endPos);
        child.drillDown(path);
        path.pop();
        path.pop();
      }
    }

    /**
     * Collects all root-to-leaf paths, scores them with
     * {@link #measurePathsInclGaps(List)} and returns the query elements of
     * every path that reaches the maximum score.
     *
     * @return one entry per selected path; each entry lists the payloads
     *         (cloned) of the vertices along the path
     * @throws CloneNotSupportedException if a payload node cannot be cloned
     */
    public List<List<List<QueryNode>>> traverseGraphFindAllQueries()
        throws CloneNotSupportedException {
      QueryPath path = new QueryPath();

      // find all queries
      drillDown(path);

      // measure how long a string the query covers
      List<List<Integer>> paths = path.getAllPaths();
      int[] measured = measurePathsInclGaps(paths);

      // we'll consider only the queries that cover the max distance
      int max = 0;
      for (int m : measured) {
        if (m > max) {
          max = m;
        }
      }

      // retrieve only the queries made of query elements that cover the longest distance
      List<List<List<QueryNode>>> queries = new ArrayList<List<List<QueryNode>>>();
      for (int i = 0; i < measured.length; i++) {
        if (measured[i] != max) {
          continue;
        }
        List<List<QueryNode>> oneQuery = new ArrayList<List<QueryNode>>();
        retrieveQueryElements(oneQuery, paths.get(i), 0);
        assert oneQuery.size() == paths.get(i).size() / 2;
        queries.add(oneQuery);
      }

      assert queries.size() > 0;
      return queries;
    }

    /**
     * Follows {@code path} from index {@code pos}, appending a copy of the
     * payload of every vertex on the way to {@code oneQuery}.
     *
     * @throws IllegalStateException if no child matches the span at {@code pos}
     */
    private void retrieveQueryElements(List<List<QueryNode>> oneQuery,
        List<Integer> path, int pos) throws CloneNotSupportedException {
      if (pos >= path.size()) {
        return;
      }

      Integer keyStart = path.get(pos);
      Integer keyEnd = path.get(pos + 1);

      for (NodeOfQuery child : children) {
        if (child.startPos == keyStart && child.endPos == keyEnd) {
          child.insertItself(oneQuery);
          child.retrieveQueryElements(oneQuery, path, pos + 2);
          return;
        }
      }
      throw new IllegalStateException("Trying to get query element that doesn't exist: "
          + keyStart + ":" + keyEnd);
    }

    /** Appends a list of cloned payload nodes to {@code oneQuery}. */
    private void insertItself(List<List<QueryNode>> oneQuery)
        throws CloneNotSupportedException {
      // clone so that a vertex shared by several paths yields independent nodes
      List<QueryNode> copyOfNodes = new ArrayList<QueryNode>(payload.size());
      for (QueryNode n : payload) {
        copyOfNodes.add(n.cloneTree());
      }
      oneQuery.add(copyOfNodes);
    }

    /**
     * Measures each path as the sum of its segment lengths plus one unit per
     * edge between segments. Not called from within this class.
     *
     * @param paths flat (start, end, start, end, ...) lists
     */
    private int[] measurePaths(List<List<Integer>> paths) {
      int[] measuredPaths = new int[paths.size()];
      int j = 0;
      for (List<Integer> path : paths) {
        assert path.size() % 2 == 0;
        int length = 0;
        for (int i = 0; i < path.size(); i = i + 2) {
          length += path.get(i + 1) - path.get(i);
        }
        // number of edges (assuming it equals 1 space, hm...)
        length += (path.size() / 2) - 1;
        measuredPaths[j++] = length;
      }
      return measuredPaths;
    }

    /**
     * Measures the length that each path covers, but penalizes gaps:
     * every gap between consecutive tokens that is bigger than 2 is
     * subtracted from the total length.
     *
     * @param paths flat (start, end, start, end, ...) lists, one per path
     * @return the score of each path, in the order of {@code paths}
     */
    private int[] measurePathsInclGaps(List<List<Integer>> paths) {
      int[] measuredPaths = new int[paths.size()];

      for (int j = 0; j < measuredPaths.length; j++) {
        List<Integer> path = paths.get(j);
        assert path.size() % 2 == 0;

        int pathLength = path.get(path.size() - 1) - path.get(0);

        // measure the gaps between tokens (end of one segment to start of the next)
        int gaps = 0;
        for (int i = 1; i < path.size() - 1; i = i + 2) {
          int g = path.get(i + 1) - path.get(i);
          if (g > 2) {
            gaps += g;
          }
        }

        // total length minus the penalized gaps
        measuredPaths[j] = pathLength - gaps;
      }
      return measuredPaths;
    }
  }

  /**
   * Stack of positions used while walking the graph; every time a leaf is
   * reached the current stack is snapshotted as one path.
   */
  class QueryPath {

    private ArrayList<Integer> data;
    private ArrayList<List<Integer>> paths;

    public QueryPath() {
      data = new ArrayList<Integer>();
      paths = new ArrayList<List<Integer>>();
    }

    /** Pushes a position onto the current path. */
    public void push(Integer position) {
      data.add(position);
    }

    /** Removes and returns the most recently pushed position. */
    public Integer pop() {
      return data.remove(data.size() - 1);
    }

    /** Records a copy of the current stack as a finished path. */
    public void terminus() {
      paths.add(new ArrayList<Integer>(data));
    }

    /** Returns every path recorded so far. */
    public List<List<Integer>> getAllPaths() {
      return paths;
    }
  }

  /**
   * Default strategy for assembling the new query: same-position tokens are
   * OR-ed, the elements of one query are AND-ed, alternative queries are OR-ed.
   * Subclasses override the individual steps.
   */
  class QueryBuilder {

    /**
     * Set to true once any element held more than one node. It is cleared only
     * by {@link #reset()}, which nothing in this file calls.
     */
    public boolean isMultiDimensional = false;

    /** Clears the multi-dimensional flag. */
    public void reset() {
      isMultiDimensional = false;
    }

    /**
     * Builds the node for one position of the query.
     *
     * @param samePositionElements nodes sharing the same span
     * @return the single node itself, or an OR over several nodes
     */
    public QueryNode buildQueryElement(List<QueryNode> samePositionElements) {
      if (samePositionElements.size() > 1) {
        // synonymous tokens at the same position/offset
        isMultiDimensional = true;
        return new AqpOrQueryNode(samePositionElements);
      }
      return samePositionElements.get(0);
    }

    /** Combines the elements of one query; the default is an AND node. */
    public QueryNode buildQuery(List<QueryNode> queryElements) {
      return new AqpAndQueryNode(queryElements);
    }

    /**
     * Combines the alternative queries.
     *
     * @return the sole clause when there is exactly one, otherwise an OR node
     */
    public QueryNode buildTopQuery(List<QueryNode> mainQueryClauses) {
      if (mainQueryClauses.size() == 1) {
        return mainQueryClauses.get(0);
      }
      return new AqpOrQueryNode(mainQueryClauses);
    }
  }
}