package org.apache.lucene.queryparser.flexible.aqp; import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler; import org.apache.lucene.queryparser.flexible.core.processors.NoChildOptimizationQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline; import org.apache.lucene.queryparser.flexible.core.processors.RemoveDeletedQueryNodesProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.AllowLeadingWildcardProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.BooleanSingleChildOptimizationQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.BoostQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.DefaultPhraseSlopQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.FuzzyQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.MatchAllDocsQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.MultiFieldQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.MultiTermRewriteMethodProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.LegacyNumericQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.LegacyNumericRangeQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.PhraseSlopQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.RemoveEmptyNonLeafQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.standard.processors.TermRangeQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsAnalyzerProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsAuthorPreProcessor; import 
org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsCarefulAnalyzerProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsExpandAuthorSearchProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFieldNodePreAnalysisProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFixQPOSITIONProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsMODIFIERProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQDELIMITERProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQTRUNCATEDProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFieldMapperProcessorPostAnalysis; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpDEFOPUnfieldedTokens; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpPostAnalysisProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQREGEXProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpUnfieldedSearchProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQNORMALProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQPOSITIONProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsRegexNodeProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsSynonymNodeProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpBOOSTProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpBibcodeProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpCLAUSEProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpDEFOPProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFIELDProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFUZZYProcessor; import 
org.apache.lucene.queryparser.flexible.aqp.processors.AqpFieldMapperProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFuzzyModifierProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpGroupQueryOptimizerProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpLowercaseExpandedTermsQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpNullDefaultFieldProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpOPERATORProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpOptimizationProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQANYTHINGProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQDATEProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQFUNCProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQIDENTIFIERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQPHRASEProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQPHRASETRUNCProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQRANGEINProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpTMODIFIERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpTreeRewriteProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpVirtualFieldsQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpWhiteSpacedQueryNodeProcessor;
import org.apache.solr.search.AqpAdsabsQParser;

/**
 * This is the MAIN PIPELINE for the ADS query language: it lays out, in order,
 * the processors that turn the parsed syntax tree into the final query node
 * tree (including how exceptional/special cases are dealt with).
 *
 * <p>The order in which processors are registered in the constructor is
 * significant; later processors rely on the transformations made by earlier
 * ones.
 *
 * @see AqpAdsabsQueryConfigHandler
 * @see AqpAdsabsQueryTreeBuilder
 * @see AqpAdsabsQParser
 */
public class AqpAdsabsNodeProcessorPipeline extends QueryNodeProcessorPipeline {

  /**
   * Builds the complete ADS processor chain.
   *
   * <p>The Solr-only processors are registered only when
   * {@link AqpAdsabsQueryConfigHandler.ConfigurationKeys#SOLR_READY} is set to
   * {@code true}; a missing (unset) key is treated the same as {@code false}.
   *
   * @param queryConfig the configuration handed to the parent pipeline; it is
   *     read here (via {@code getQueryConfigHandler()}) to decide whether the
   *     Solr-specific processors are added
   */
  public AqpAdsabsNodeProcessorPipeline(QueryConfigHandler queryConfig) {
    super(queryConfig);

    QueryConfigHandler config = getQueryConfigHandler();

    // function queries are handled first, because they will be parsed
    // again (during resolution)
    add(new AqpAdsabsFixQPOSITIONProcessor()); // handles QPHRASE:"^some phrase$" and QNORMAL:word$
    add(new AqpAdsabsQPOSITIONProcessor()); // rewrites ^author$ into a functional form
    add(new AqpQFUNCProcessor()); // prepares function node (may decide which implementation to call)

    // find separate tokens and join them into one 'string'
    // true=modify the parse tree, otherwise values are 'added'
    // into the first token

    // this was the original behaviour, you can still activate:
    // add(new AqpDEFOPMarkPlainNodes());

    // this was the 2nd strategy
    // add(new AqpDEFOPMarkPlainNodes(true, Arrays.asList("+", "-"),
    //     Arrays.asList("author", "first_author")));

    // and this is the best so far, it can nicely extend the query
    // and is configurable through url params
    add(new AqpDEFOPUnfieldedTokens());

    /*
     * Most of the processors immediately below can be seen as
     * 'transducers' from ANTLR to QueryNode, because ANTLR is
     * producing Abstract Syntax Trees (AST). You can view these
     * processors as a layer between 'pure syntax parsing' (ANTLR)
     * and query building. If you need to check for 'semantics'
     * (e.g. invalid combinations, for example: this NEAR5 wildca*)
     * this is the right place to do it.
     */
    add(new AqpDEFOPProcessor()); // sets DEFOP to be AND|OR....
    add(new AqpTreeRewriteProcessor()); // makes (AND(AND(AND... to be (AND...
    add(new AqpAdsabsMODIFIERProcessor()); // extends PLUS and MINUS with # and =
    add(new AqpOPERATORProcessor());
    add(new AqpCLAUSEProcessor());
    add(new AqpTMODIFIERProcessor()); // changes AST to more manageable form
    add(new AqpBOOSTProcessor());
    add(new AqpFUZZYProcessor());
    add(new AqpQRANGEINProcessor());
    // add(new AqpQRANGEEXProcessor()); // exclusive ranges not used by ADS
    add(new AqpQDATEProcessor());
    add(new AqpQPHRASEProcessor());
    add(new AqpQREGEXProcessor());
    add(new AqpAdsabsQNORMALProcessor()); // keeps the tag information (AqpDEFOPMarkPlainNodes)
    add(new AqpQPHRASETRUNCProcessor());
    add(new AqpAdsabsQTRUNCATEDProcessor());
    add(new AqpQANYTHINGProcessor());
    add(new AqpQIDENTIFIERProcessor());
    add(new AqpFIELDProcessor()); // sets the field name (if user specified one, or there is a default)

    /*
     * After this point, the AST tree usually does not contain ANTLR
     * specific nodes, but it is made entirely from the transformed
     * QueryNode(s). The tree is also much simpler and 'flatter'.
     */
    add(new AqpBibcodeProcessor()); // finds bibcode and converts to AqpAdslabsIdentifier
    add(new AqpFuzzyModifierProcessor());
    // this
    // add(new WildcardQueryNodeProcessor());

    // expands to multiple fields if field=null (ie. unfielded search)
    // however, this solution is the old-style lucene mechanism which
    // was superseded by edismax() function calls; it works when
    // ConfigurationKeys.MULTI_FIELDS are not empty. It is kept here
    // because of unittest compatibility
    add(new MultiFieldQueryNodeProcessor());
    add(new AqpNullDefaultFieldProcessor());

    // expands virtual fields into real fields, that can be analyzed
    // normal ways (it also adds boosts, if necessary)
    add(new AqpVirtualFieldsQueryNodeProcessor());
    add(new FuzzyQueryNodeProcessor());
    add(new MatchAllDocsQueryNodeProcessor());

    /*
     * Analysis block: here we use the Solr/Lucene analyzers.
     * This is the most complex part of the analysis chain;
     * it can generate synonyms, multi-token synonyms,
     * call functions() etc... we are changing the AST tree and
     * doing some other acrobatics.
     */

    // ADS specific modification of the tree before the analysis
    // helps with syntactic sugar
    add(new AqpAdsabsFieldNodePreAnalysisProcessor());

    // translate the field name before we try to find the tokenizer chain
    // useful when you want to use particular analyzer for several fields
    // but you don't want to define these fields as separate indexes
    add(new AqpFieldMapperProcessor());

    // was the old-time behaviour, before the AqpDEFOPMarkPlainNodes() was
    // modifying the AST; now this is obsolete, we solve it elsewhere
    // find synonyms if we have 'plain word token group', this processor
    // add(new AqpMultiWordProcessor());

    // this block applies only when the query parser is run inside SOLR.
    // Boolean.TRUE.equals(...) is used instead of '== true' so that a handler
    // on which SOLR_READY was never set does not fail on unboxing a null value;
    // an unset key simply means the Solr-only processors are skipped.
    if (Boolean.TRUE.equals(config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_READY))) {
      // take the 'unfielded search' values and wrap them into edismax('xxxxx') call
      // it will be executed/built in the 'build' phase (after processors finished)
      add(new AqpUnfieldedSearchProcessor());

      // this takes the unfielded search (multi-token group)
      add(new AqpWhiteSpacedQueryNodeProcessor());

      // TEMPORARY solution for the unfielded multi-token searches, edismax
      // does not know how to handle properly token expansions spanning several
      // positions; so we extract these and add them next to the edismax generated query
      // 28/10/13 - I've finally modified edismax (to call our aqp) when building
      // a query, so this is being taken care of by us
      // add(new AqpAdsabsExtractMultisynonymsProcessor());
    }

    add(new LegacyNumericQueryNodeProcessor());
    add(new LegacyNumericRangeQueryNodeProcessor());
    add(new TermRangeQueryNodeProcessor());
    add(new AqpAdsabsRegexNodeProcessor()); // wraps regex QN w/ NonAnalyzedQueryNode

    // simply wraps into non-analyzed node; possibly to remove because we now use
    // AqpAdsabsCarefulAnalyzerProcessor below
    add(new AqpAdsabsSynonymNodeProcessor());

    // ADS is doing lots of things with regard to parsing author searches
    // here we clean up author searches before analysis stage
    add(new AqpAdsabsAuthorPreProcessor());

    // the main analysis happens here (but not for wildcard nodes and similar
    // non-analyzed nodes), these will stay untouched
    add(new AqpAdsabsAnalyzerProcessor());

    // here we analyze input that was not analyzed by the previous step
    // this applies mostly to wildcard, regex, fuzzy searches; the ADS
    // convention is that if a '<field>_wildcard' will be used to analyze
    // and create Wildcard node etc
    add(new AqpAdsabsCarefulAnalyzerProcessor());

    // lowercase everything else which wasn't caught by the previous steps
    // a special case are non-analyzed nodes - these are left =UnTouchEd
    add(new AqpLowercaseExpandedTermsQueryNodeProcessor());

    // author search: 'kurtz, michael' is expanded with "kurtz, michael *" ...
    // ADS has a 'very special' requirement for expanding the author search
    // this expansion cannot be solved inside the analysis chain (because
    // it depends on the context [knowing the original input]), so it is here
    add(new AqpAdsabsExpandAuthorSearchProcessor());

    // translate the field names back into their index-name variants
    add(new AqpAdsabsFieldMapperProcessorPostAnalysis());

    // deals with the the-same-position tokens:
    // "(word | synonym) phrase query" becomes "word phrase query" | synonym
    add(new AqpPostAnalysisProcessor());

    /*
     * Analysis-phase is over, these are the standard flex guys
     * massaging remaining query elements.
     */
    add(new AqpAdsabsQDELIMITERProcessor());
    add(new PhraseSlopQueryNodeProcessor());
    add(new AllowLeadingWildcardProcessor());
    // add(new GroupQueryNodeProcessor()); // this removes the boolean operator precedence
    add(new NoChildOptimizationQueryNodeProcessor());
    add(new RemoveDeletedQueryNodesProcessor());
    add(new RemoveEmptyNonLeafQueryNodeProcessor());
    add(new BooleanSingleChildOptimizationQueryNodeProcessor());
    add(new DefaultPhraseSlopQueryNodeProcessor());
    add(new BoostQueryNodeProcessor());
    add(new MultiTermRewriteMethodProcessor());

    // purely aesthetic, final touches to the query
    // we are rewriting the tree, for example:
    // '+(+this +that) +what' becomes '+this +that +what'
    add(new AqpGroupQueryOptimizerProcessor());
    add(new AqpOptimizationProcessor());
  }
}