// AqpAnalyzerQueryNodeProcessor.java example
//
// Explorer
// montysolr-master
// - contrib
package org.apache.lucene.queryparser.flexible.aqp.processors;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.NoTokenFoundQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QuotedFieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.RangeQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TokenizedPhraseQueryNode;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys;
import org.apache.lucene.queryparser.flexible.standard.nodes.MultiPhraseQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.StandardBooleanQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.lucene.queryparser.flexible.standard.processors.AnalyzerQueryNodeProcessor;

/**
 * An extended variant of {@link AnalyzerQueryNodeProcessor}. In addition to
 * what the standard processor does, it records the character offsets of every
 * produced token (shifted by the begin offset of the analyzed node), which is
 * needed for proper parsing of expanded queries, and it stores the token type
 * reported by the analyzer as a tag on each produced node (see
 * {@link #TYPE_ATTRIBUTE}).
 * 
 * TODO: send a patch and make them accept it
 * <p>
 * This processor verifies if {@link ConfigurationKeys#ANALYZER} is defined in
 * the {@link QueryConfigHandler}. If it is and the analyzer is not
 * <code>null</code>, it looks for every {@link FieldQueryNode} (the
 * {@link TextableQueryNode} implementation this processor knows how to
 * rewrite) that is not a {@link WildcardQueryNode}, a {@link FuzzyQueryNode}
 * or a {@link RegexpQueryNode} and whose parent is not a
 * {@link RangeQueryNode}, and applies the analyzer to the text of that node.
 * <p>
 * If the analyzer returns exactly one term, the term is set on the original
 * {@link FieldQueryNode}, which is then returned.
 * <p>
 * If the analyzer returns more than one term:
 * <ul>
 * <li>for a node that is not a {@link QuotedFieldQueryNode}, or for a quoted
 * node whose terms all share a single position, a {@link GroupQueryNode}
 * wrapping a {@link StandardBooleanQueryNode} of the terms is returned;</li>
 * <li>for a quoted node with several terms at the same position (spread over
 * more than one position), a {@link MultiPhraseQueryNode} is returned;</li>
 * <li>for any other quoted node a {@link TokenizedPhraseQueryNode} is
 * returned.</li>
 * </ul>
 * <p>
 * If no term is returned by the analyzer, a {@link NoTokenFoundQueryNode} is
 * returned.
 * 
 * @see ConfigurationKeys#ANALYZER
 * @see Analyzer
 * @see TokenStream
 */

public class AqpAnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {

  /**
   * Name of the tag under which the token type reported by the analyzer is
   * stored on every node produced by this processor. Kept as a public,
   * non-final instance field for backward compatibility with existing callers.
   */
  public String TYPE_ATTRIBUTE = "token_type_attribute";

  /** Analyzer taken from the configuration by the last {@link #process} call. */
  private Analyzer analyzer;

  /** Whether phrase nodes should honour the analyzer's position increments. */
  private boolean positionIncrementsEnabled;

  public AqpAnalyzerQueryNodeProcessor() {
    // empty constructor
  }

  /**
   * Runs the processor over the tree if an analyzer is configured; otherwise
   * the tree is returned untouched.
   * 
   * @param queryTree root of the query node tree
   * @return the processed tree, or <code>queryTree</code> itself when no
   *         analyzer is configured
   * @throws QueryNodeException propagated from the tree traversal
   */
  @Override
  public QueryNode process(QueryNode queryTree) throws QueryNodeException {
    Analyzer configuredAnalyzer = getQueryConfigHandler().get(
        ConfigurationKeys.ANALYZER);

    if (configuredAnalyzer == null) {
      return queryTree;
    }

    this.analyzer = configuredAnalyzer;

    // a missing setting means "disabled"
    Boolean enabled = getQueryConfigHandler().get(
        ConfigurationKeys.ENABLE_POSITION_INCREMENTS);
    this.positionIncrementsEnabled = Boolean.TRUE.equals(enabled);

    return super.process(queryTree);
  }

  /**
   * Analyzes the text of an eligible {@link FieldQueryNode} and replaces the
   * node according to the number and positions of the produced tokens (see the
   * class description). Nodes that are not eligible are returned unchanged.
   * 
   * @param node the node being visited
   * @return the original node, the original node with its text replaced, or a
   *         newly built replacement node
   * @throws RuntimeException wrapping the {@link IOException} if the token
   *         stream cannot be obtained or reset
   */
  @Override
  protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {

    // Only FieldQueryNode instances can be rewritten below; checking for the
    // concrete type (instead of TextableQueryNode) avoids a ClassCastException
    // for textable nodes that are not field nodes.
    if (!(node instanceof FieldQueryNode)
        || node instanceof WildcardQueryNode
        || node instanceof FuzzyQueryNode
        || node instanceof RegexpQueryNode
        || node.getParent() instanceof RangeQueryNode) {
      return node;
    }

    FieldQueryNode fieldNode = (FieldQueryNode) node;
    int queryStart = Math.max(fieldNode.getBegin(), 0); // could be -1
    String text = fieldNode.getTextAsString();
    String field = fieldNode.getFieldAsString();

    TokenStream source = null;
    try {
      source = this.analyzer.tokenStream(field, new StringReader(text));
      source.reset();
    } catch (IOException e) {
      // do not leave a half-opened stream behind
      closeQuietly(source);
      throw new RuntimeException(e);
    }

    CachingTokenFilter buffer = new CachingTokenFilter(source);

    PositionIncrementAttribute posIncrAtt = buffer
        .hasAttribute(PositionIncrementAttribute.class) ? buffer
        .getAttribute(PositionIncrementAttribute.class) : null;
    TypeAttribute typeAtt = buffer.hasAttribute(TypeAttribute.class) ? buffer
        .getAttribute(TypeAttribute.class) : null;

    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    try {
      // first pass: count tokens and positions while the filter caches them
      try {
        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        }
      } catch (IOException ignored) {
        // best effort, as in the original processor: continue with the
        // tokens counted so far
      }

      // rewind the buffer stream for the second pass
      rewindQuietly(buffer);
    } finally {
      // close original stream - all tokens buffered; done in a finally block
      // so the stream is released even if analysis fails unexpectedly
      closeQuietly(source);
    }

    if (!buffer.hasAttribute(CharTermAttribute.class) || numTokens == 0) {
      return new NoTokenFoundQueryNode();
    }

    OffsetAttribute offsetAtt = buffer.hasAttribute(OffsetAttribute.class) ? buffer
        .getAttribute(OffsetAttribute.class) : null;

    TokenReplay tokens = new TokenReplay(buffer,
        buffer.getAttribute(CharTermAttribute.class), posIncrAtt, offsetAtt,
        typeAtt, field, queryStart);

    if (numTokens == 1) {
      return applySingleToken(fieldNode, tokens);
    }

    boolean quoted = fieldNode instanceof QuotedFieldQueryNode;

    if (!quoted || (severalTokensAtSamePosition && positionCount == 1)) {
      // no phrase query
      return buildBooleanGroup(tokens, numTokens, positionCount == 1);
    }

    if (severalTokensAtSamePosition) {
      return buildMultiPhrase(tokens, numTokens);
    }

    return buildTokenizedPhrase(tokens, numTokens);
  }

  @Override
  protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException {
    // nothing to do before the children are visited
    return node;
  }

  @Override
  protected List<QueryNode> setChildrenOrder(List<QueryNode> children)
      throws QueryNodeException {
    // the order of the children is left as it is
    return children;
  }

  /** Writes the only produced token back into the original field node. */
  private QueryNode applySingleToken(FieldQueryNode fieldNode,
      TokenReplay tokens) {
    tokens.advance();

    fieldNode.setText(tokens.term);
    if (tokens.hasOffsets) {
      fieldNode.setBegin(tokens.offsetStart);
      fieldNode.setEnd(tokens.offsetEnd);
    }
    if (tokens.type != null) {
      fieldNode.setTag(TYPE_ATTRIBUTE, tokens.type);
    }
    return fieldNode;
  }

  /** Wraps all tokens as clauses of a grouped boolean node. */
  private QueryNode buildBooleanGroup(TokenReplay tokens, int numTokens,
      boolean disableCoord) {
    List<QueryNode> children = new ArrayList<QueryNode>(numTokens);

    for (int i = 0; i < numTokens; i++) {
      tokens.advance();
      children.add(newTermNode(tokens));
    }

    return new GroupQueryNode(new StandardBooleanQueryNode(children,
        disableCoord));
  }

  /** Builds a multi-phrase node; tokens sharing a position form one group. */
  private QueryNode buildMultiPhrase(TokenReplay tokens, int numTokens) {
    MultiPhraseQueryNode phrase = new MultiPhraseQueryNode();
    List<FieldQueryNode> sameSlot = new ArrayList<FieldQueryNode>();
    int position = -1;
    int termGroupCount = 0;

    for (int i = 0; i < numTokens; i++) {
      tokens.advance();

      // a positive increment starts a new slot: emit the finished group first
      if (tokens.positionIncrement > 0 && !sameSlot.isEmpty()) {
        flushTermGroup(phrase, sameSlot,
            this.positionIncrementsEnabled ? position : termGroupCount);
        // only increment once for each "group" of terms that were in the
        // same position
        termGroupCount++;
      }

      position += tokens.positionIncrement;
      sameSlot.add(newTermNode(tokens));
    }

    flushTermGroup(phrase, sameSlot,
        this.positionIncrementsEnabled ? position : termGroupCount);

    return phrase;
  }

  /** Builds a plain phrase node with one term per token. */
  private QueryNode buildTokenizedPhrase(TokenReplay tokens, int numTokens) {
    TokenizedPhraseQueryNode phrase = new TokenizedPhraseQueryNode();
    int position = -1;

    for (int i = 0; i < numTokens; i++) {
      tokens.advance();
      FieldQueryNode termNode = newTermNode(tokens);

      if (this.positionIncrementsEnabled) {
        position += tokens.positionIncrement;
        termNode.setPositionIncrement(position);
      } else {
        termNode.setPositionIncrement(i);
      }

      phrase.add(termNode);
    }

    return phrase;
  }

  /** Assigns the given position to every node of the group, adds them to the phrase and empties the group. */
  private static void flushTermGroup(MultiPhraseQueryNode phrase,
      List<FieldQueryNode> group, int position) {
    for (FieldQueryNode termNode : group) {
      termNode.setPositionIncrement(position);
      phrase.add(termNode);
    }
    group.clear();
  }

  /** Creates a field node for the token the replay currently points at. */
  private FieldQueryNode newTermNode(TokenReplay tokens) {
    FieldQueryNode termNode = new FieldQueryNode(tokens.field, tokens.term,
        tokens.offsetStart, tokens.offsetEnd);
    // Never store a null tag value: the other code paths only tag a node when
    // a type is available. NOTE(review): the tag store of the node
    // implementation is believed to reject null values - confirm.
    if (tokens.type != null) {
      termNode.setTag(TYPE_ATTRIBUTE, tokens.type);
    }
    return termNode;
  }

  /** Rewinds the stream, ignoring an I/O failure (best effort). */
  private static void rewindQuietly(TokenStream stream) {
    try {
      stream.reset();
    } catch (IOException ignored) {
      // best effort, as in the original processor
    }
  }

  /** Closes the stream if there is one, ignoring an I/O failure. */
  private static void closeQuietly(TokenStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException ignored) {
      // nothing sensible can be done about a failed close here
    }
  }

  /**
   * Second-pass reader over the buffered tokens. Each {@link #advance()} call
   * moves to the next token and captures its term, position increment,
   * offsets (shifted by the query start) and type.
   */
  private static final class TokenReplay {

    private final TokenStream buffer;
    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt; // may be null
    private final OffsetAttribute offsetAtt; // may be null
    private final TypeAttribute typeAtt; // may be null
    private final String field;
    private final int queryStart;

    private String term;
    private int positionIncrement = 1;
    private int offsetStart = -1;
    private int offsetEnd = -1;
    private boolean hasOffsets;
    private String type;

    TokenReplay(TokenStream buffer, CharTermAttribute termAtt,
        PositionIncrementAttribute posIncrAtt, OffsetAttribute offsetAtt,
        TypeAttribute typeAtt, String field, int queryStart) {
      this.buffer = buffer;
      this.termAtt = termAtt;
      this.posIncrAtt = posIncrAtt;
      this.offsetAtt = offsetAtt;
      this.typeAtt = typeAtt;
      this.field = field;
      this.queryStart = queryStart;
    }

    /**
     * Moves to the next buffered token. If reading fails, the captured values
     * fall back to their defaults (null term and type, increment 1, offsets
     * -1).
     */
    void advance() {
      term = null;
      positionIncrement = 1;
      offsetStart = -1;
      offsetEnd = -1;
      hasOffsets = false;
      type = null;

      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext : "fewer buffered tokens than counted";

        term = termAtt.toString();
        if (posIncrAtt != null) {
          positionIncrement = posIncrAtt.getPositionIncrement();
        }
        if (offsetAtt != null) {
          offsetStart = queryStart + offsetAtt.startOffset();
          offsetEnd = queryStart + offsetAtt.endOffset();
          hasOffsets = true;
        }
        if (typeAtt != null) {
          type = typeAtt.type();
        }
      } catch (IOException ignored) {
        // safe to ignore, because we know the number of tokens
      }
    }
  }

}