package org.apache.lucene.queryparser.flexible.aqp.processors;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.antlr.runtime.CharStream;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpANTLRNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpImmutableGroupQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpWhiteSpacedQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQProcessor;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.standard.parser.ParseException;

/**
 * Looks at the nodes below a DEFOP query node, marks the "bare" ones
 * and, if possible, concatenates adjacent bare nodes into one token.
 *
 * Using the following example:
 *
 * <pre>
 *                  DEFOP
 *                    |
 *            /       |       \
 *      MODIFIER    MOD..     CLAUSE
 *        /           |           \
 *   TMODIFIER    TMODIFIER      MODIFIER
 *      /             |             \
 *    FIELD         FIELD          .....
 *     /              |
 *  QNORMAL        QNORMAL
 *    /               |
 *  weak           lensing
 * </pre>
 *
 * The behaviour is selected by a strategy name, given either to the
 * constructor or through the <code>aqp.unfielded.tokens.strategy</code>
 * named parameter (default: <code>tag</code>):
 *
 * <ul>
 *   <li><code>tag</code> - the original nodes are kept; the terminal node of
 *       each is tagged ({@link #PLAIN_TOKEN_CONCATENATED}) with the
 *       concatenated value</li>
 *   <li><code>join</code> - REPLACE, see below</li>
 *   <li><code>add</code> - ADD, see below</li>
 *   <li><code>multiply</code> - the group is replaced by
 *       <code>(joined value) OR ("joined value")</code></li>
 * </ul>
 *
 * REPLACE - it will replace the group with the concatenated token; when the
 *           new token type is not 'simple' it will also check whether
 *           one of the parts was a wildcard search, in that case the
 *           new token will be marked as QTRUNCATED
 *
 * <pre>
 *                  DEFOP
 *                    |
 *            /               \
 *      MODIFIER              CLAUSE
 *        /                       \
 *   TMODIFIER                   MODIFIER
 *      /                           \
 *    FIELD                        .....
 *     /
 *  QNORMAL
 *    /
 *  weak lensing
 * </pre>
 *
 * ADD - it adds the concatenated token next to the originals
 *
 * <pre>
 *                        DEFOP
 *                          |\
 *                          | \------------
 *                          OR             \
 *                      ----                \
 *                     |     \               \
 *                   DEFOP   MODIFIER         \
 *              /      |         \             \
 *      MODIFIER     MOD..      TMOD..        CLAUSE
 *         /           |           \             \
 *   TMODIFIER     TMODIFIER      FIELD        MODIFIER
 *       /             |             \             \
 *     FIELD         FIELD         QNORMAL        .....
 *      /              |               \
 *   QNORMAL        QNORMAL        weak lensing
 *     /               |
 *   weak           lensing
 * </pre>
 *
 * <p>
 * Care is taken not to join when the fields are different and
 * when there is an operator/clause/modifier in between.
 */
public class AqpDEFOPUnfieldedTokens extends AqpQProcessor {

  public static String PLAIN_TOKEN = "PLAIN_TOKEN";
  public static String PLAIN_TOKEN_SEPARATOR = " ";
  public static String PLAIN_TOKEN_CONCATENATED = "PLAIN_TOKEN_CONCATENATED";

  /*
   * Nodes will be considered 'bare' even if they have any of these modifiers
   * e.g. +foo will be effectively treated as if it was 'foo'
   */
  private List<String> ignoreModifiers;

  /*
   * Ditto as above, for term modifiers
   */
  private List<String> ignoreTModifiers;

  /* Fields configured through the constructor that must never be joined */
  private List<String> ignoreFields;

  /* Token labels (e.g. QNORMAL) that are eligible for joining */
  private List<String> catchQTypes;

  /* Strategy fixed by the constructor; null means "read it from the request" */
  private String operationMode;

  /* Token labels that make the joined token a wildcard (QTRUNCATED) query */
  private List<String> wildcardQTypes;

  /*
   * Lazily built union of the 'aqp.unfielded.ignore.fields' named parameter
   * and ignoreFields; built on first use and then kept for the lifetime of
   * this instance (see getIgnoredFields)
   */
  private Set<String> aqpIgnorableFields = null;

  /**
   * Default constructor with sensible defaults; the strategy is left
   * unset and is looked up from the query configuration when needed.
   */
  public AqpDEFOPUnfieldedTokens() {
    ignoreModifiers = Arrays.asList("PLUS", "MINUS");
    ignoreTModifiers = Arrays.asList("");
    ignoreFields = null; //Arrays.asList("pubdate");
    catchQTypes = Arrays.asList("QNORMAL", "QTRUNCATED", "QDELIMITER");
    wildcardQTypes = Arrays.asList("QTRUNCATED");
    operationMode = null; // null == wait for the request
  }

  /**
   * @param firstChildAllowedModifiers modifiers that are ignored on the first node of a group
   * @param firstChildAllowedFields term modifiers that are ignored on the first node of a
   *        group (the value is stored as the ignorable t-modifiers)
   * @param ignoreFields fields that must not be joined; may be null
   * @param catchQTypes token labels eligible for joining
   * @param wildcardQTypes token labels that mark the joined token as QTRUNCATED
   * @param strategy one of "tag", "join", "add", "multiply"; null means the strategy
   *        is read from the query configuration
   */
  public AqpDEFOPUnfieldedTokens(
      List<String> firstChildAllowedModifiers,
      List<String> firstChildAllowedFields,
      List<String> ignoreFields,
      List<String> catchQTypes,
      List<String> wildcardQTypes,
      String strategy) {
    this.ignoreModifiers = firstChildAllowedModifiers;
    this.ignoreTModifiers = firstChildAllowedFields;
    this.ignoreFields = ignoreFields;
    this.catchQTypes = catchQTypes;
    this.wildcardQTypes = wildcardQTypes;
    this.operationMode = strategy;
  }

  /**
   * Accepts DEFOP nodes, except those that sit inside a fielded clause
   * (<code>field:(a b)</code>) or below a modifier that is not in the
   * ignorable list.
   *
   * @param node the node being visited
   * @return true when {@link #createQNode(AqpANTLRNode)} should process the node
   */
  public boolean nodeIsWanted(AqpANTLRNode node) {
    if (!node.getTokenLabel().equals("DEFOP")) {
      return false;
    }

    QueryNode immediateParent = node.getParent();
    QueryNode distantParent = immediateParent != null ? immediateParent.getParent() : null;
    if (immediateParent == null || distantParent == null) {
      return true; // nothing above us that could forbid the processing
    }

    // Both ancestors are cast below; an ancestor of another type used to
    // slip through for the grandparent and blow up with ClassCastException.
    if (!(immediateParent instanceof AqpANTLRNode)
        || !(distantParent instanceof AqpANTLRNode)) {
      return false;
    }
    AqpANTLRNode immediateP = (AqpANTLRNode) immediateParent;
    AqpANTLRNode distantP = (AqpANTLRNode) distantParent;

    // FIELD(name, CLAUSE(DEFOP ...)) with an actual field name: leave alone
    if (immediateP.getTokenName().equals("CLAUSE")
        && distantP.getTokenName().equals("FIELD")
        && distantP.getChildren().size() == 2
        && ((AqpANTLRNode) (distantP.getChildren().get(0))).getTokenInput() != null) {
      return false;
    }

    // MODIFIER(x, TMODIFIER(DEFOP ...)) where x is not an ignorable modifier
    if (immediateP.getTokenName().equals("TMODIFIER")
        && distantP.getTokenName().equals("MODIFIER")
        && distantP.getChildren().size() == 2) {
      AqpANTLRNode modifier = ((AqpANTLRNode) (distantP.getChildren().get(0)));
      if (modifier.getTokenName() != null
          && !ignoreModifiers.contains(modifier.getTokenName())) {
        return false;
      }
    }
    return true;
  }

  /**
   * Walks the children of the DEFOP node, collects runs of adjacent bare
   * nodes and hands every run of two or more to the configured strategy.
   * Everything else is kept as it was.
   *
   * @param node the DEFOP node
   * @return the same node, with its children replaced
   * @throws QueryNodeException when the strategy is unknown or a node cannot be cloned
   */
  public QueryNode createQNode(AqpANTLRNode node) throws QueryNodeException {

    // only one child, do nothing
    if (node.getChildren().size() == 1) {
      return node;
    }

    // harvest node info into a manageable form
    List<NodeInfo> nodeInfos = harvestNodeChildren(node);

    List<NodeInfo> newGroup = new ArrayList<NodeInfo>();
    List<QueryNode> newChildren = new ArrayList<QueryNode>();

    // go through the list of nodes, decide what to do with them
    // insert the results into 'newChildren'
    try {
      for (NodeInfo ninfo : nodeInfos) {
        if (ninfo.isBareNode(newGroup.size() == 0)) {
          newGroup.add(ninfo);
          continue;
        }
        // the current run is over; emit it and see whether this
        // node can open a new one
        flushGroup(newChildren, newGroup);
        if (ninfo.isBareNode(true)) {
          newGroup.add(ninfo);
        } else {
          newChildren.add(ninfo.getOriginalNode());
        }
      }
      flushGroup(newChildren, newGroup);
    } catch (CloneNotSupportedException e) {
      throw new QueryNodeException(e);
    }

    // set the modifications back into the parent
    node.set(newChildren);
    return node;
  }

  /*
   * Emits the pending group into newChildren and empties it: a group of
   * two or more goes through the strategy, a single node is kept as is.
   */
  private void flushGroup(List<QueryNode> newChildren, List<NodeInfo> newGroup)
      throws CloneNotSupportedException, QueryNodeException {
    if (newGroup.size() > 1) {
      decideInsertChild(newChildren, newGroup);
    } else if (newGroup.size() == 1) {
      newChildren.add(newGroup.get(0).getOriginalNode());
    }
    newGroup.clear();
  }

  /*
   * Applies the strategy ("tag", "join", "add" or "multiply") to one group
   * of bare nodes and appends the outcome to newChildren.
   *
   * The strategy given to the constructor wins; otherwise it is read from
   * the query configuration on every call. It is deliberately NOT stored
   * back into operationMode, so the value of one request cannot stick to
   * the instance.
   */
  private void decideInsertChild(List<QueryNode> newChildren, List<NodeInfo> newGroup)
      throws CloneNotSupportedException, QueryNodeException {

    String mode = operationMode != null ? operationMode : getStrategy();

    if ("tag".equals(mode)) {
      tagChildren(newGroup);
      for (NodeInfo ninfo : newGroup) {
        newChildren.add(ninfo.getOriginalNode());
      }
    } else if ("join".equals(mode)) {
      // concatenates into one single node
      newChildren.add(createReplacementNode(newGroup, null));
    } else if ("add".equals(mode)) {
      // (original original...) OR (single node)
      QueryNode replacementNode = createReplacementNode(newGroup, null);

      QueryNode defopNode = cloneNode(newGroup.get(0).getOriginalNode().getParent());
      ArrayList<QueryNode> defopChildren = new ArrayList<QueryNode>();
      for (NodeInfo n : newGroup) {
        defopChildren.add(n.getOriginalNode()); // shall we clone?
      }
      defopNode.set(defopChildren);

      ArrayList<QueryNode> orClauses = new ArrayList<QueryNode>();
      orClauses.add(new AqpImmutableGroupQueryNode(defopNode));
      orClauses.add(new AqpImmutableGroupQueryNode(replacementNode));

      newChildren.add(new AqpOrQueryNode(orClauses));
    } else if ("multiply".equals(mode)) {
      // (single node) OR ("single node")
      // this strategy is best for ADS as we want to support
      // multi-token synonym replacement, edismax, and also
      // non-quoted strings should be searched in sensible field
      AqpWhiteSpacedQueryNode normalNode =
          (AqpWhiteSpacedQueryNode) createReplacementNode(newGroup, "simple");
      AqpWhiteSpacedQueryNode phraseNode = normalNode.cloneTree();
      phraseNode.setValue("\"" + phraseNode.getValue() + "\"");

      ArrayList<QueryNode> orClauses = new ArrayList<QueryNode>();
      orClauses.add(normalNode);
      orClauses.add(phraseNode);

      newChildren.add(new AqpOrQueryNode(orClauses));
    } else {
      throw new ParseException(new MessageImpl("Unknown strategy: " + mode));
    }
  }

  /*
   * Folds the field of the first node of the group back into its token
   * (see removeField). Not called from within this class.
   */
  private void fixTheFieldProblem(List<NodeInfo> newGroup) {
    NodeInfo firstNode = newGroup.get(0);
    String f = firstNode.getField();
    if (f != null) {
      QueryNode n = firstNode.getOriginalNode();
      removeField(n);
    }
  }

  /*
   * Recursively looks for FIELD nodes with more than one child; the first
   * child (the field name) is taken out of the children list and prepended,
   * as "name:", to the input of the terminal node of the next child.
   */
  private void removeField(QueryNode n) {
    if (!n.isLeaf() && n instanceof AqpANTLRNode) {
      if (((AqpANTLRNode) n).getTokenLabel().equals("FIELD")) {
        List<QueryNode> children = n.getChildren();
        if (children.size() > 1) {
          AqpANTLRNode f = (AqpANTLRNode) children.remove(0);
          AqpANTLRNode c = (AqpANTLRNode) AqpQProcessor.getTerminalNode(children.get(0));
          c.setTokenInput(f.getTokenInput() + ":" + c.getTokenInput());
          c.setInputTokenStart(f.getInputTokenStart());
          c.setTokenStart(f.getInputTokenStart());
        }
      }
      for (QueryNode child : n.getChildren()) {
        removeField(child);
      }
    }
  }

  /*
   * Builds the node that stands for the whole group.
   *
   * tt is the type of the new token; null means "ask the configuration".
   * For "simple" a fresh AqpWhiteSpacedQueryNode is returned, otherwise a
   * copy of the first node of the group is reused and its terminal gets
   * the concatenated value.
   */
  private QueryNode createReplacementNode(List<NodeInfo> newGroup, String tt)
      throws CloneNotSupportedException, QueryNodeException {

    String newValue = getConcatenatedValue(newGroup);

    if (tt == null) {
      tt = getNewTokenType();
    }

    if (tt.equals("simple")) {
      String firstField = newGroup.get(0).getField();
      boolean hasField = firstField != null && firstField.length() > 0;
      return new AqpWhiteSpacedQueryNode(hasField ? firstField : null, newValue, -1, -1);
    }

    boolean isWildcard = false;
    for (NodeInfo ninfo : newGroup) {
      if (wildcardQTypes.contains(ninfo.getQType())) {
        isWildcard = true;
        break;
      }
    }

    // we'll reuse the first node (but make its copy)
    QueryNode firstNode = cloneNode(newGroup.get(0).getOriginalNode());

    // inject the new value
    AqpANTLRNode terminalNode = (AqpANTLRNode) AqpQProcessor.getTerminalNode(firstNode);
    terminalNode.setTokenInput(newValue);
    terminalNode.setTokenName("INJECTED");

    // change parent's type
    // TODO: change the position information?
    AqpANTLRNode terminalParent = (AqpANTLRNode) terminalNode.getParent();
    if (isWildcard) {
      terminalParent.setTokenName("QTRUNCATED");
      terminalParent.setTokenLabel("QTRUNCATED");
    } else {
      if (tt.contains("QPHRASE")) {
        terminalNode.setTokenInput("\"" + newValue + "\"");
      }
      terminalParent.setTokenName(tt);
      terminalParent.setTokenLabel(tt);
    }

    return firstNode;
  }

  /*
   * Clones the tree and then re-sets the children on every level.
   * This is necessary, because the QueryNodeImpl is NOT resetting
   * the parent when cloning.
   */
  private QueryNode cloneNode(QueryNode node) throws CloneNotSupportedException {
    QueryNode n = node.cloneTree();
    fixClone(n);
    return n;
  }

  /* Re-attaches the children of a (non-leaf) node, recursively. */
  private void fixClone(QueryNode node) {
    ArrayList<QueryNode> newChildren = new ArrayList<QueryNode>();
    newChildren.addAll(node.getChildren());
    node.set(newChildren);
    for (QueryNode qn : node.getChildren()) {
      if (!qn.isLeaf()) {
        fixClone(qn);
      }
    }
  }

  /*
   * Returns the text that the group stands for. When all terminals are
   * ANTLR nodes the value is cut out of the original input stream (from the
   * start of the first token to the end of the last one), otherwise the
   * individual inputs are glued together with PLAIN_TOKEN_SEPARATOR.
   */
  private String getConcatenatedValue(List<NodeInfo> newGroup) throws ParseException {
    boolean allIsAqp = true;
    for (NodeInfo ninfo : newGroup) {
      if (!(ninfo.getTerminalNode() instanceof AqpANTLRNode)) {
        allIsAqp = false;
        break;
      }
    }

    if (allIsAqp) {
      AqpANTLRNode first = (AqpANTLRNode) newGroup.get(0).getTerminalNode();
      AqpANTLRNode last = (AqpANTLRNode) newGroup.get(newGroup.size() - 1).getTerminalNode();
      int start = first.getInputTokenStart();
      int end = last.getInputTokenEnd();

      CharStream is = AqpQProcessor.getInputStream(first);
      return is.substring(start, end);
    }

    StringBuilder concatenated = new StringBuilder();
    boolean needsSeparator = false;
    for (NodeInfo ninfo : newGroup) {
      if (needsSeparator) {
        concatenated.append(PLAIN_TOKEN_SEPARATOR);
      }
      concatenated.append(ninfo.getInput());
      needsSeparator = true;
    }
    return concatenated.toString();
  }

  /* Tags the terminal node of every group member with the concatenated value. */
  private void tagChildren(List<NodeInfo> newGroup) throws ParseException {
    String value = getConcatenatedValue(newGroup);
    for (NodeInfo ninfo : newGroup) {
      ninfo.getTerminalNode().setTag(PLAIN_TOKEN_CONCATENATED, value);
    }
  }

  /* Wraps every child of the node into a NodeInfo, preserving the order. */
  private List<NodeInfo> harvestNodeChildren(AqpANTLRNode node) {
    ArrayList<NodeInfo> out = new ArrayList<NodeInfo>();
    for (QueryNode child : node.getChildren()) {
      out.add(new NodeInfo(child));
    }
    return out;
  }

  private boolean isFieldIgnored(String fld) {
    return getIgnoredFields().contains(fld);
  }

  /*
   * Reads one value from the NAMED_PARAMETER map of the query config.
   * Returns null when the key, or the whole map, is missing.
   */
  private Object _getConfigVal(String key) {
    Map<String, String> args = getQueryConfigHandler().get(
        AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER);
    if (args == null) {
      return null; // no named parameters were configured at all
    }
    return args.get(key);
  }

  /* Strategy from the 'aqp.unfielded.tokens.strategy' parameter; default "tag". */
  private String getStrategy() {
    Object obj = _getConfigVal("aqp.unfielded.tokens.strategy");
    if (obj == null) {
      return "tag";
    }
    return (String) obj;
  }

  /*
   * Union of the whitespace separated 'aqp.unfielded.ignore.fields'
   * parameter and the ignoreFields given to the constructor. Computed on
   * the first call and cached afterwards.
   */
  private Set<String> getIgnoredFields() {
    if (aqpIgnorableFields != null) {
      return aqpIgnorableFields;
    }

    Set<String> ignored = new HashSet<String>();
    Object obj = _getConfigVal("aqp.unfielded.ignore.fields");
    if (obj != null) {
      ignored.addAll(Arrays.asList(StringUtils.split((String) obj)));
    }
    if (ignoreFields != null) {
      ignored.addAll(ignoreFields);
    }
    aqpIgnorableFields = ignored;
    return aqpIgnorableFields;
  }

  /*
   * Maps the 'aqp.unfielded.tokens.new.type' parameter onto a token type:
   * anything containing "phrase" -> QPHRASE, "simple" -> simple, other
   * values -> QNORMAL; a missing parameter means "simple".
   */
  private String getNewTokenType() {
    Object obj = _getConfigVal("aqp.unfielded.tokens.new.type");
    if (obj == null) {
      return "simple";
    }

    // Locale.ROOT: with the default locale "SIMPLE" would not lower-case
    // to "simple" on a Turkish system (dotless i)
    String m = ((String) obj).toLowerCase(Locale.ROOT);
    if (m.contains("phrase")) {
      return "QPHRASE";
    } else if (m.contains("simple")) {
      return "simple";
    } else {
      return "QNORMAL";
    }
  }

  /*
   * Flattened view of one child of the DEFOP node: the labels and values
   * found on the MODIFIER/TMODIFIER/FIELD/Q* path. When the child does not
   * have this structure all values stay null and the node is never bare.
   */
  private class NodeInfo {

    private QueryNode originalNode;
    private String qType;
    private String field;
    private String tModifier;
    private String modifier;
    private String input;

    public NodeInfo(QueryNode node) {
      originalNode = node;
      qType = null;
      field = null;
      tModifier = null;
      modifier = null;
      initValues();
    }

    /**
     * @return the user input this node was built from
     * @throws ParseException when the input cannot be determined
     */
    public String getInput() throws ParseException {
      if (originalNode instanceof AqpANTLRNode) {
        return AqpQProcessor.getOriginalInput((AqpANTLRNode) originalNode, new HashSet()).value;
      }

      QueryNode terminal = getTerminalNode();
      if (terminal instanceof FieldQueryNode) {
        return ((FieldQueryNode) terminal).getTextAsString();
      }
      if (input != null && input.length() > 0) {
        return input;
      }
      throw new ParseException(new MessageImpl("Hmmm, should never happen that we don't know how to get user input. Error is ours, not yours though!"));
    }

    public String getField() {
      return field;
    }

    public QueryNode getOriginalNode() {
      return originalNode;
    }

    /**
     * We allow only the following: /MODIFIER/TMODIFIER/FIELD/QNORMAL
     * and all elements must be either empty or have values that can
     * be ignored (which is tolerated only on the first node of a group).
     *
     * @param isFirstInGroup whether the node would open a new group
     * @return true when the node can be joined with its neighbours
     */
    public boolean isBareNode(boolean isFirstInGroup) {
      // value comparison: an empty value coming from a token is not
      // necessarily the same object as the "" literal
      if (!("".equals(modifier)
          || (isFirstInGroup && ignoreModifiers.contains(modifier)))) {
        return false;
      }
      if (!("".equals(tModifier)
          || (isFirstInGroup && ignoreTModifiers.contains(tModifier)))) {
        return false;
      }
      if (!("".equals(field)
          || (isFirstInGroup && !isFieldIgnored(field)))) {
        return false;
      }
      return catchQTypes.contains(qType);
    }

    public String getQType() {
      return qType;
    }

    private void initValues() {
      Map<Integer, String> labels = new HashMap<Integer, String>();
      harvestLabels(originalNode, labels, 5, 0);

      // check the node has correct structure, otherwise pull out
      // that leaves the values to be null, which means 'get out!'
      boolean wellFormed = "MODIFIER".equals(labels.get(0))
          && "TMODIFIER".equals(labels.get(1))
          && "FIELD".equals(labels.get(2));
      if (!wellFormed) {
        return;
      }

      if (labels.containsKey(3)) {
        qType = labels.get(3);
      }

      Map<Integer, String> values = new HashMap<Integer, String>();
      harvestValues(originalNode, values, 5, 0);

      modifier = values.containsKey(1) ? values.get(1) : "";
      tModifier = values.containsKey(2) ? values.get(2) : "";
      field = values.containsKey(3) ? values.get(3) : "";
      input = values.containsKey(4) ? values.get(4) : "";
    }

    /*
     * Records, per depth, the value of the leaf found there: the token
     * input, or the token name when there is no input ("???" for nodes of
     * another type). A later leaf on the same depth overwrites an earlier one.
     */
    private void harvestValues(QueryNode node, Map<Integer, String> data,
        int maxDepth, int level) {
      assert maxDepth >= level;

      if (maxDepth == level) {
        return;
      }

      if (node.isLeaf()) {
        if (node instanceof AqpANTLRNode) {
          AqpANTLRNode leaf = (AqpANTLRNode) node;
          data.put(level, leaf.getTokenInput() != null
              ? leaf.getTokenInput()
              : leaf.getTokenName());
        } else {
          data.put(level, "???");
        }
      } else {
        for (QueryNode n : node.getChildren()) {
          harvestValues(n, data, maxDepth, level + 1);
        }
      }
    }

    /*
     * Records, per depth, the label of the first non-leaf node found
     * there ("???" for nodes of another type).
     */
    private void harvestLabels(QueryNode node, Map<Integer, String> data,
        int maxDepth, int level) {
      assert maxDepth >= level;

      if (maxDepth == level) {
        return;
      }

      if (!node.isLeaf()) {
        if (!data.containsKey(level)) {
          if (node instanceof AqpANTLRNode) {
            data.put(level, ((AqpANTLRNode) node).getTokenLabel());
          } else {
            data.put(level, "???");
          }
        }
        for (QueryNode n : node.getChildren()) {
          harvestLabels(n, data, maxDepth, level + 1);
        }
      }
    }

    public QueryNode getTerminalNode() {
      return AqpQProcessor.getTerminalNode(originalNode);
    }
  }
}