// AqpAdsabsNodeProcessorPipeline.java -- example taken from montysolr-master (contrib)
package org.apache.lucene.queryparser.flexible.aqp;

import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.processors.NoChildOptimizationQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline;
import org.apache.lucene.queryparser.flexible.core.processors.RemoveDeletedQueryNodesProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.AllowLeadingWildcardProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.BooleanSingleChildOptimizationQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.BoostQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.DefaultPhraseSlopQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.FuzzyQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.MatchAllDocsQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.MultiFieldQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.MultiTermRewriteMethodProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.LegacyNumericQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.LegacyNumericRangeQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.PhraseSlopQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.RemoveEmptyNonLeafQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.standard.processors.TermRangeQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsAnalyzerProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsAuthorPreProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsCarefulAnalyzerProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsExpandAuthorSearchProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFieldNodePreAnalysisProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFixQPOSITIONProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsMODIFIERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQDELIMITERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQTRUNCATEDProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsFieldMapperProcessorPostAnalysis;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpDEFOPUnfieldedTokens;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpPostAnalysisProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQREGEXProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpUnfieldedSearchProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQNORMALProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsQPOSITIONProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsRegexNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAdsabsSynonymNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpBOOSTProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpBibcodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpCLAUSEProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpDEFOPProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFIELDProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFUZZYProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFieldMapperProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpFuzzyModifierProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpGroupQueryOptimizerProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpLowercaseExpandedTermsQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpNullDefaultFieldProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpOPERATORProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpOptimizationProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQANYTHINGProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQDATEProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQFUNCProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQIDENTIFIERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQPHRASEProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQPHRASETRUNCProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpQRANGEINProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpTMODIFIERProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpTreeRewriteProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpVirtualFieldsQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpWhiteSpacedQueryNodeProcessor;
import org.apache.solr.search.AqpAdsabsQParser;

/**
 * This is the MAIN PIPELINE - it sets out how to build query,
 * deal with exceptions and stuff for ADS language
 * 
 * @see AqpAdsabsQueryConfigHandler
 * @see AqpAdsabsQueryTreeBuilder
 * @see AqpAdsabsQParser
 *
 */
public class AqpAdsabsNodeProcessorPipeline extends QueryNodeProcessorPipeline {

	public AqpAdsabsNodeProcessorPipeline(QueryConfigHandler queryConfig) {
		super(queryConfig);
		
		QueryConfigHandler config = getQueryConfigHandler();
		
		// function queries are handled first, because they will be parsed
		// again (during resolution)
		add(new AqpAdsabsFixQPOSITIONProcessor()); // handles QPHRASE:"^some phrase$" and QNORMAL:word$
		add(new AqpAdsabsQPOSITIONProcessor()); // rewrites ^author$ into a functional form
		add(new AqpQFUNCProcessor()); // prepares function node (may decide which implementation to call)
		
	
		// find separate tokens and join them into one 'string'
		// true=modify the parse tree, otherwise values are 'added'
		// into the first token
	  // this was the original behaviour, you can still activate:
		// add(new AqpDEFOPMarkPlainNodes());
		// this was the 2nd strategy
		// add(new AqpDEFOPMarkPlainNodes(true, Arrays.asList("+", "-"),
		//		Arrays.asList("author", "first_author")));
		// and this is the best so far, it can nicely extend the query
		// and is configurable through url params
		add(new AqpDEFOPUnfieldedTokens());
		

		/**
		 * Most of the processors immediately below can be seen as
		 * 'transducers' from ANTLR to QueryNode. Because ANTLR is
		 * producing Abstract Syntax Trees (AST). You can view these
		 * processors as a layer between 'pure syntax parsing' (ANTLR)
		 * and query building. If you need to check for 'semantics'
		 * (e.g. invalid combinations, for example: this NEAR5 wildca*)
		 * this is the right place to do it
		 */
		
		add(new AqpDEFOPProcessor()); // sets DEFOP to be AND|OR....
		add(new AqpTreeRewriteProcessor()); // makes (AND(AND(AND... to be (AND...
		
		
		add(new AqpAdsabsMODIFIERProcessor()); // extends PLUS and MINUS with # and =
		add(new AqpOPERATORProcessor()); 
		add(new AqpCLAUSEProcessor());
		
		add(new AqpTMODIFIERProcessor()); // changes AST to more manageable form
		add(new AqpBOOSTProcessor());
		add(new AqpFUZZYProcessor());
	
		add(new AqpQRANGEINProcessor());
		//add(new AqpQRANGEEXProcessor());  // exclusive ranges not used by ADS
		
		add(new AqpQDATEProcessor());
		add(new AqpQPHRASEProcessor());
		add(new AqpQREGEXProcessor());
		add(new AqpAdsabsQNORMALProcessor()); // keeps the tag information (AqpDEFOPMarkPlainNodes)
		add(new AqpQPHRASETRUNCProcessor());
		add(new AqpAdsabsQTRUNCATEDProcessor());
		add(new AqpQANYTHINGProcessor());
		add(new AqpQIDENTIFIERProcessor());
		add(new AqpFIELDProcessor()); // sets the field name (if user specified one, or there is a default)
		
		
		/**
		 * After this point, the AST tree usually does not contain ANTLR
		 * specific nodes, but it is made entirely from the transformed
		 * QueryNode(s). The tree is also much simpler and 'flatter' 
		 */
		
		
		add(new AqpBibcodeProcessor()); // finds bibcode and converts to AqpAdslabsIdentifier		
		add(new AqpFuzzyModifierProcessor());
		
		//this
		//add(new WildcardQueryNodeProcessor());
		
		
	  // expands to multiple fields if field=null (ie. unfielded search)
		// however, this solution is the old-style lucene mechanism which 
		// was superseded by edismax() function calls; it works when
		// ConfigurationKeys.MULTI_FIELDS are not empty. It is kept here 
		// because of unittest compatibility
	  add(new MultiFieldQueryNodeProcessor()); 
		add(new AqpNullDefaultFieldProcessor());
		
		// expands virtual fields into real fields, that can be analyzed
		// normal ways (it also adds boosts, if necessary)
		add(new AqpVirtualFieldsQueryNodeProcessor());
		
		add(new FuzzyQueryNodeProcessor());
		add(new MatchAllDocsQueryNodeProcessor());
		
		
		
		
		/**
		 * Analysis block: here we use the Solr/Lucene analyzers
		 * This is the most complex part of the analysis chain
		 * it can generate synonyms, multi-token synonyms,
		 * call functions() etc...we are changing AST tree and 
		 * doing some other acrobatics
		 */
		
	  // ADS specific modification of the tree before the analysis
		// helps with syntactic sugar 
		add(new AqpAdsabsFieldNodePreAnalysisProcessor()); 
		
	  // translate the field name before we try to find the tokenizer chain
		// useful when you want to use particular analyzer for several fields
		// but you don't want to define these fields as separate indexes
		add(new AqpFieldMapperProcessor()); 
		
		// was the old-time behaviour, before the AqpDEFOPMarkPlainNodes() was
		// modifying the AST; now this is obsolete, we solve it elsewhere
	  // find synonyms if we have 'plain word token group', this processor
		// add(new AqpMultiWordProcessor()); 
		
		
		// this block applies only when query parser is ran inside SOLR
		if (config.get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_READY) == true) {
			
			// take the 'unfielded search' values and wrap them into edismax('xxxxx') call
			// it will be executed/built in the 'build' phase (after processors finished)
      add(new AqpUnfieldedSearchProcessor());
      
      // this takes the unfielded search (multi-token group)
      add(new AqpWhiteSpacedQueryNodeProcessor());
      
      // TEMPORARY solution for the unfielded multi-token searches, edismax
      // does not know how to handle properly token expansions spanning several
      // positions; so we extract these and add them next to the edismax generated query
      // 28/10/13 - I've finally modified edismax (to call our aqp) when building
      // a query, so this is being taken care off by us
      // add(new AqpAdsabsExtractMultisynonymsProcessor());
      
    }
		
		add(new LegacyNumericQueryNodeProcessor());
    add(new LegacyNumericRangeQueryNodeProcessor());
		add(new TermRangeQueryNodeProcessor());
		add(new AqpAdsabsRegexNodeProcessor()); // wraps regex QN w/ NonAnalyzedQueryNode
		
    // simply wraps into non-analyzed node; possibly to remove because we now use 
		// AqpAdsabsCarefulAnalyzerProcessor below
		add(new AqpAdsabsSynonymNodeProcessor());  
		

	  // ADS is doing lots of things with regard to parsing author searches
		// here we clean up author searches before analysis stage
		add(new AqpAdsabsAuthorPreProcessor()); 
		
		
  	// the main analysis happens here (but not for wildcard nodes and similar
		// non-analyzed nodes), these will stay untouched
		add(new AqpAdsabsAnalyzerProcessor()); 
		
		
		// here we analyze input that was not analyzed by the previous step
		// this applies mostly to wildcard, regex, fuzzy searches; the ADS 
		// convention is that if a '<field>_wildcard' will be used to analyze
		// and create Wildcard node etc
		add(new AqpAdsabsCarefulAnalyzerProcessor());
		
	  
		// lowercase everything else which wasn't caught by the previous steps
		// a special case are non-analyzed nodes - these are left =UnTouchEd
		add(new AqpLowercaseExpandedTermsQueryNodeProcessor());

		
		// author search: 'kurtz, michael' is expanded with "kurtz, michael *" ...
	  // ADS has a 'very special' requirement for expanding the author search
		// this expansion cannot be solved inside the analysis chain (because
		// it depends on the context [knowing the original input]), so it is here
		add(new AqpAdsabsExpandAuthorSearchProcessor()); 
		
		
    // translate the field names back into their index-name variants
		add(new AqpAdsabsFieldMapperProcessorPostAnalysis()); 
		
		
	  // deals with the the-same-position tokens: 
		// "(word | synonym) phrase query" becomes "word phrase query" | synonym
		add(new AqpPostAnalysisProcessor()); 
		
		
		/**
		 * Analysis-phase is over, these are the standard flex guys
		 * massaging remaining query elements
		 */
		add(new AqpAdsabsQDELIMITERProcessor());
		add(new PhraseSlopQueryNodeProcessor());
		add(new AllowLeadingWildcardProcessor());
		// add(new GroupQueryNodeProcessor()); // this removes the boolean opearator precedence
		add(new NoChildOptimizationQueryNodeProcessor());
		add(new RemoveDeletedQueryNodesProcessor());
		add(new RemoveEmptyNonLeafQueryNodeProcessor());
		add(new BooleanSingleChildOptimizationQueryNodeProcessor());
		add(new DefaultPhraseSlopQueryNodeProcessor());
		add(new BoostQueryNodeProcessor());
		add(new MultiTermRewriteMethodProcessor());
	
		
		// purely aestetical, final touches to the query
		// we are rewriting the tree, for example:
		// +(+this +that) +what' becomes '+this +that +what'
		add(new AqpGroupQueryOptimizerProcessor());
		add(new AqpOptimizationProcessor());
		
		
	}

}