BigramQueryRewriter.java example

Explorer

xtf-dsc-master
- WEB-INF
  - contrib
    - xtf-lucene
      - src
        java
        org
        apache
        lucene
        bigram
        BigramQueryRewriter.java
        BigramSpanRangeQuery.java
        BigramSpanWildcardQuery.java
        BigramStopFilter.java
        chunk
        Chunk.java
        ChunkMarkPos.java
        ChunkSource.java
        ChunkedWordIter.java
        DocNumMap.java
        SpanChunkedNotQuery.java
        SpanDechunkingQuery.java
        SparseStringComparator.java
        limit
        ExcessiveWorkException.java
        LimIndexReader.java
        LimTermDocs.java
        LimTermPositions.java
        TermLimitException.java
        mark
        BasicMarkPos.java
        BasicWordIter.java
        ContextMarker.java
        MarkCollector.java
        MarkPos.java
        WordIter.java
        search
        FieldSpanSource.java
        FlippableStringComparator.java
        QueryRewriter.java
        QueryTraverser.java
        RecordingSearcher.java
        SpanHitCollector.java
        spans
        EmptySpans.java
        FieldSpans.java
        NearSpans.java
        OrNearSpans.java
        Span.java
        SpanFirstQuery.java
        SpanNearQuery.java
        SpanNotNearQuery.java
        SpanNotQuery.java
        SpanOrNearQuery.java
        SpanOrQuery.java
        SpanPosComparator.java
        SpanQuery.java
        SpanRangeQuery.java
        SpanRecordingScorer.java
        SpanScorer.java
        SpanTermQuery.java
        SpanWeight.java
        SpanWildcardQuery.java
        Spans.java
        spell
        SpellKeywordTest.java
        spelt
        DoubleMetaphone.java
        FreqData.java
        LuceneIndexToDict.java
        MinimalAnalyzer.java
        QuerySpeller.java
        SimpleQueryRewriter.java
        SpellReader.java
        SpellTestCmdLine.java
        SpellWriter.java
        SpellWritingAnalyzer.java
        SpellWritingFilter.java
        TRStringDistance2.java
        WordEquiv.java
        util
        CountedInputStream.java
        CountedOutputStream.java
        FileSorter.java
        Hash64.java
        IntList.java
        LongList.java
        LongSet.java
        Prime.java
        PriorityQueue.java
        ProgressTracker.java
        RandomAccessInputStream.java
        StringUtil.java
      - test
        java
        org
        apache
        lucene
        spelt
        DoubleMetaphoneTest.java
        FreqDataTest.java
        LuceneIndexToDictTest.java
        QuerySpellerTest.java
        SimpleQueryRewriterTest.java
        SpellReadWriteTest.java
        SpellWritingAnalyzerTest.java
        TRStringDistance2Test.java
        util
        CountedInputStreamTest.java
        CountedOutputStreamTest.java
        FileSorterTest.java
        Hash64Test.java
        IntListTest.java
        LongListTest.java
        LongSetTest.java
        PrimeTest.java
        ProgressTrackerTest.java
        RandomAccessInputStreamTest.java
        StringUtilTest.java
  - src
    - net
      - sf
        saxon
        tinytree
        HackedTinyBuilder.java
        trans
        KeyManager.java
    - org
      - cdlib
        xtf
        cache
        Cache.java
        CacheDependency.java
        Dependency.java
        FileDependency.java
        GeneratingCache.java
        SimpleCache.java
        StringCache.java
        crossQuery
        CrossQuery.java
        CrossQueryConfig.java
        QueryRoute.java
        QueryRouteException.java
        TimeProfilingListener.java
        raw
        RawQuery.java
        test
        TestableCrossQuery.java
        dynaXML
        Authenticator.java
        DefaultDocLocator.java
        DocLocator.java
        DocRequest.java
        DynaXML.java
        DynaXMLConfig.java
        DynaXMLException.java
        InvalidDocumentException.java
        IpList.java
        NoPermissionException.java
        UnsupportedQueryException.java
        test
        TestableDynaXML.java
        lazyTree
        AncestorEnumeration.java
        AttributeEnumeration.java
        AttributeImpl.java
        ChildEnumeration.java
        DescendantEnumeration.java
        ElementImpl.java
        FastNodeTestPattern.java
        Flag.java
        FollowingEnumeration.java
        FollowingSiblingEnumeration.java
        LazyDocument.java
        LazyHashMap.java
        LazyKeyManager.java
        LazyProfilingListener.java
        LazyTreeBuilder.java
        NodeImpl.java
        ParentNodeImpl.java
        PersistentTree.java
        PrecedingEnumeration.java
        PrecedingOrAncestorEnumeration.java
        PrecedingSiblingEnumeration.java
        ProxyAttributeEnumeration.java
        ProxyAttributeImpl.java
        ProxyElement.java
        SearchElement.java
        SearchElementImpl.java
        SearchNode.java
        SearchTextImpl.java
        SearchTree.java
        TextImpl.java
        TreeEnumeration.java
        saxonExt
        ElementWithContent.java
        Exec.java
        Image.java
        InstructionWithContent.java
        Mail.java
        Pipe.java
        Redirect.java
        SQL.java
        exec
        ArgElement.java
        InputElement.java
        PipeImageElement.java
        PipeImageInstruction.java
        RunElement.java
        RunInstruction.java
        image
        ImageCache.java
        OutputElement.java
        mail
        SendElement.java
        pipe
        PipeBufferPool.java
        PipeFileElement.java
        PipeFopElement.java
        PipeRequestElement.java
        redirect
        HttpErrorElement.java
        RedirectElement.java
        sql
        SQLClose.java
        SQLColumn.java
        SQLConnect.java
        SQLDelete.java
        SQLInsert.java
        SQLProperty.java
        SQLQuery.java
        SQLUpdate.java
        servletBase
        CQLParseException.java
        DTDSuppressingXMLReader.java
        LatencyCutoffStream.java
        RedirectException.java
        SessionURLRewriter.java
        StylesheetCache.java
        TextConfig.java
        TextServlet.java
        test
        FakeOutputStream.java
        FakeServletConfig.java
        FakeServletContext.java
        FakeServletRequest.java
        FakeServletResponse.java
        NullOutputStream.java
        RegressTest.java
        textEngine
        AccentFoldingRewriter.java
        BoostSet.java
        BoostSetParams.java
        BoundedMarkPos.java
        BoundedWordIter.java
        ConfigCache.java
        Constants.java
        DefaultQueryProcessor.java
        DocHit.java
        DocHitImpl.java
        FlippingDirectory.java
        HitLoadException.java
        HitQueue.java
        IndexUtil.java
        IndexValidator.java
        IndexWarmer.java
        MoreLikeThisQuery.java
        NativeFSDirectory.java
        NumericFieldData.java
        NumericRangeQuery.java
        PluralFoldingRewriter.java
        QueryContext.java
        QueryGenException.java
        QueryProcessor.java
        QueryRequest.java
        QueryRequestParser.java
        QueryResult.java
        RefieldingQueryRewriter.java
        SlopFixupRewriter.java
        Snippet.java
        SnippetMaker.java
        SpanExactQuery.java
        SpanSectionTypeQuery.java
        SpellSuggRewriter.java
        SpellcheckParams.java
        SpellingSuggestion.java
        StdTermFilter.java
        StdTermRewriter.java
        TotalHitsComparator.java
        UnspanningQueryRewriter.java
        XtfBigramQueryRewriter.java
        XtfChunk.java
        XtfChunkMarkPos.java
        XtfChunkSource.java
        XtfChunkedWordIter.java
        XtfDocNumMap.java
        XtfLimIndexReader.java
        XtfQueryRewriter.java
        XtfQueryTraverser.java
        XtfSearcher.java
        XtfSpanRangeQuery.java
        XtfSpanWildcardQuery.java
        XtfWordEquiv.java
        facet
        ChildSelector.java
        DescendantSelector.java
        DocsSelector.java
        DynamicGroupData.java
        EmptySelector.java
        FRBRData.java
        FRBRGroupData.java
        FacetSpec.java
        GroupCounts.java
        GroupData.java
        GroupSelector.java
        MarkSelector.java
        NameSelector.java
        PageSelector.java
        ParseException.java
        RangeSelector.java
        ResultFacet.java
        ResultGroup.java
        RootSelector.java
        SelectedSelector.java
        SelectorParser.java
        SelectorParserConstants.java
        SelectorParserTokenManager.java
        SiblingSelector.java
        SimpleCharStream.java
        SingletonSelector.java
        StaticGroupData.java
        Token.java
        TokenMgrError.java
        TopChoiceSelector.java
        UnionSelector.java
        freeform
        CharStream.java
        FreeformQueryParser.java
        FreeformQueryParserConstants.java
        FreeformQueryParserTokenManager.java
        ParseException.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        textIndexer
        AccentFoldingFilter.java
        CrimsonBugWorkaround.java
        DocSelCache.java
        FacetTokenizer.java
        HTMLIndexSource.java
        HTMLToString.java
        IdxTreeCleaner.java
        IdxTreeCuller.java
        IdxTreeDictMaker.java
        IdxTreeOptimizer.java
        IndexDump.java
        IndexInfo.java
        IndexMerge.java
        IndexRecord.java
        IndexSource.java
        IndexStats.java
        IndexSync.java
        IndexerConfig.java
        MARCIndexSource.java
        MSWordIndexSource.java
        PDFIndexSource.java
        PDFToString.java
        PluralFoldingFilter.java
        SectionInfo.java
        SectionInfoStack.java
        SpellWritingFilter.java
        SrcTreeProcessor.java
        StartEndFilter.java
        StructuredFileProxy.java
        TagFilter.java
        TextIndexSource.java
        TextIndexer.java
        TextIndexerException.java
        XMLConfigParser.java
        XMLIndexSource.java
        XMLTextProcessor.java
        XTFTextAnalyzer.java
        XtfSpecialTokensFilter.java
        tokenizer
        CharStream.java
        FastCharStream.java
        ParseException.java
        Token.java
        TokenMgrError.java
        Tokenizer.java
        XTFTokenizer.java
        XTFTokenizerConstants.java
        XTFTokenizerTokenManager.java
        util
        ArrayUtil.java
        Attrib.java
        AttribList.java
        Base64.java
        CharMap.java
        CheckingTokenStream.java
        CircularQueue.java
        ConsecutiveMap.java
        DirSync.java
        DiskHashReader.java
        DiskHashWriter.java
        DocTypeDeclRemover.java
        EasyNode.java
        EmbeddedList.java
        FastIntCache.java
        FastStringCache.java
        FastStringReader.java
        FastTokenizer.java
        FileWalker.java
        FloatList.java
        GeneralException.java
        IntHash.java
        IntMultiMap.java
        LimitedOutputStream.java
        LineReader.java
        Linkable.java
        LinkableImpl.java
        Normalizer.java
        PackedByteBuf.java
        Path.java
        ProcessRunner.java
        StringHash.java
        StructuredFile.java
        StructuredStore.java
        SubDirFilter.java
        SubFileReader.java
        SubFileWriter.java
        SubStoreReader.java
        SubStoreWriter.java
        TagArray.java
        TagChars.java
        Tester.java
        ThreadWatcher.java
        Trace.java
        TraceWriter.java
        WordMap.java
        XMLFormatter.java
        XMLWriter.java
        XTFSaxonErrorListener.java
        xslt
        CharUtils.java
        FileUtils.java
        FreeformQuery.java
        Session.java
        XMLStubReader.java
        zing
        SRU.java
        SRUConfig.java

package org.apache.lucene.bigram;


/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;
import org.apache.lucene.chunk.SpanChunkedNotQuery;
import org.apache.lucene.chunk.SpanDechunkingQuery;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryRewriter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotNearQuery;
import org.apache.lucene.search.spans.SpanOrNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

/**
 * Rewrites a query to eliminate stop words by combining them with
 * adjacent non-stop-words, forming "bi-grams" (n-grams of two words).
 * This is a fairly in-depth process, as bi-gramming across NEAR and OR
 * queries is complex.
 */
public class BigramQueryRewriter extends QueryRewriter 
{
  /**
   * Set of stop-words (e.g. "the", "a", "and", etc.) to remove. The term
   * text of each query is looked up in this set to decide whether it is a
   * stop word.
   */
  protected Set stopSet;

  /**
   * Maximum slop to allow in a query, based on the index being queried.
   * Supplied through the constructor.
   */
  protected int maxSlop;

  /**
   * Keeps track of all stop-words removed from the query, so that the user
   * can later be notified about them.
   */
  protected HashSet removedTerms = new HashSet();

  /**
   * Creates a rewriter that removes or bi-grams the given stop words.
   *
   * @param stopSet   Stop words to remove or bi-gram. An easy way to build
   *                  one is {@link #makeStopSet(String)}.
   * @param maxSlop   Largest slop permitted in a query, which depends on
   *                  the index being queried.
   */
  public BigramQueryRewriter(Set stopSet, int maxSlop) 
  {
    this.maxSlop = maxSlop;
    this.stopSet = stopSet;
  } // constructor

  /**
   * Builds a stop-word set from a list of stop words delimited by spaces,
   * commas, or semicolons. The work is delegated to
   * {@link BigramStopFilter#makeStopSet(String)}.
   *
   * @param stopWords   String of words to make into a set
   *
   * @return            A stop word set suitable for use when constructing
   *                    a {@link BigramQueryRewriter}.
   */
  public static Set makeStopSet(String stopWords) 
  {
    Set result = BigramStopFilter.makeStopSet(stopWords);
    return result;
  } // makeStopSet()

  /**
   * Determines if the given string is a bi-gram of a real word with a
   * stop-word. A bi-gram is recognized as two non-empty parts separated by
   * a tilde (for instance "man~of"), at least one of which is a stop-word.
   *
   * @param stopWords   The set of stop-words
   * @param str         The string to check
   * @return            true if it's a bi-gram; false if the string has no
   *                    tilde, if nothing precedes or follows the first
   *                    tilde, or if neither part is a stop-word
   */
  public static boolean isBigram(Set stopWords, String str) 
  {
    // A tilde tells us if it's a bigram.
    int pos = str.indexOf('~');
    if (pos < 0)
      return false;

    // Sanity check: a bi-gram needs text on BOTH sides of the tilde, so
    // reject a leading tilde as well as a trailing one.
    //
    if (pos == 0 || pos >= str.length() - 1)
      return false;
    String before = str.substring(0, pos);
    String after = str.substring(pos + 1);

    // Sometimes the original token itself contains a tilde. Don't barf on
    // it: a real bi-gram always has a stop-word on at least one side.
    //
    return stopWords.contains(before) || stopWords.contains(after);
  } // isBigram()

  /**
   * Rewrites a BooleanQuery one clause at a time. A clause whose term text
   * is a stop word is dropped and recorded in the removed-terms set; every
   * other clause is rewritten recursively and kept under its original
   * occurrence type (required, prohibited, or allowed). No bi-gramming is
   * performed across the clauses of a BooleanQuery.
   *
   * @param bq  The query to rewrite
   * @return    The rewritten query; 'bq' itself if nothing changed; or null
   *            if no clauses survived.
   */
  protected Query rewrite(BooleanQuery bq) 
  {
    // Buckets for the surviving clauses, one per occurrence type.
    Vector mustClauses = new Vector();
    Vector mustNotClauses = new Vector();
    Vector shouldClauses = new Vector();
    boolean changed = false;

    BooleanClause[] clauses = bq.getClauses();
    for (int i = 0; i < clauses.length; i++) 
    {
      BooleanClause clause = clauses[i];
      Query subQuery = clause.getQuery();

      // A clause that is nothing but a stop word gets dropped outright. Note
      // it in the removed list so the user can be told about it.
      //
      if (stopSet.contains(extractTermText(subQuery))) {
        removedTerms.add(extractTermText(subQuery));
        changed = true;
        continue;
      }

      // Recursively rewrite the clause. Getting a different object back
      // means something underneath was changed.
      //
      Query rewritten = rewriteQuery(subQuery);
      if (rewritten != subQuery)
        changed = true;

      // A null result means nothing is left of this clause.
      if (rewritten == null)
        continue;

      // File the survivor under its occurrence type.
      Vector bucket;
      if (clause.getOccur() == BooleanClause.Occur.MUST_NOT)
        bucket = mustNotClauses;
      else if (clause.getOccur() == BooleanClause.Occur.MUST)
        bucket = mustClauses;
      else
        bucket = shouldClauses;
      bucket.add(rewritten);
    } // for i

    // NOTE:
    // The required clauses are deliberately NOT bi-grammed with each other:
    // they have no real order, and may come from entirely different fields.

    // Nothing touched? Hand back the very same query.
    if (!changed)
      return bq;

    // Everything removed? Tell the caller there's nothing left.
    int nKept = mustClauses.size() + mustNotClauses.size() +
                shouldClauses.size();
    if (nKept == 0)
      return null;

    // Exactly one clause left, and it is a required one: return it directly
    // (merging in the parent's boost) rather than wrapping it.
    //
    if (nKept == 1 && mustClauses.size() == 1)
      return combineBoost(bq, (Query)mustClauses.get(0));

    // Otherwise build a fresh BooleanQuery: required clauses first, then
    // prohibited, then allowed.
    //
    BooleanQuery result = (BooleanQuery)copyBoost(bq,
      new BooleanQuery(bq.isCoordDisabled()));

    for (int i = 0; i < mustClauses.size(); i++)
      result.add((Query)mustClauses.get(i), BooleanClause.Occur.MUST);

    for (int i = 0; i < mustNotClauses.size(); i++)
      result.add((Query)mustNotClauses.get(i), BooleanClause.Occur.MUST_NOT);

    for (int i = 0; i < shouldClauses.size(); i++)
      result.add((Query)shouldClauses.get(i), BooleanClause.Occur.SHOULD);

    return result;
  } // rewrite()

  /**
   * Rewrites a span NEAR query, bi-gramming stop words into the terms next
   * to them.
   *
   * @param q  The query to rewrite
   * @return   Rewritten version, or 'q' unchanged if no change was needed.
   */
  protected Query rewrite(final SpanNearQuery q) 
  {
    // Re-joins rewritten clauses into a NEAR query with the original slop.
    SpanClauseJoiner nearJoiner = new SpanClauseJoiner() 
    {
      public SpanQuery join(SpanQuery[] clauses) {
        return new SpanNearQuery(clauses, q.getSlop(), false);
      }
    };

    // Single surviving clauses may be promoted, and bi-gramming is on.
    boolean shuntSingle = true;
    boolean bigram = true;
    return rewriteClauses(q, q.getClauses(), shuntSingle, bigram,
                          q.getSlop(), nearJoiner);
  } // rewrite()

  /**
   * Rewrites a span OR-NEAR query, bi-gramming stop words into the terms
   * next to them.
   *
   * @param q  The query to rewrite
   * @return   Rewritten version, or 'q' unchanged if no change was needed.
   */
  protected Query rewrite(final SpanOrNearQuery q) 
  {
    // Re-joins rewritten clauses into an OR-NEAR query with the original
    // slop.
    //
    SpanClauseJoiner orNearJoiner = new SpanClauseJoiner() 
    {
      public SpanQuery join(SpanQuery[] clauses) {
        return new SpanOrNearQuery(clauses, q.getSlop(), false);
      }
    };

    // Single surviving clauses may be promoted, and bi-gramming is on.
    boolean shuntSingle = true;
    boolean bigram = true;
    return rewriteClauses(q, q.getClauses(), shuntSingle, bigram,
                          q.getSlop(), orNearJoiner);
  } // rewrite()

  /**
   * Rewrites a span-based OR query. This case is the simple one: stop words
   * are just removed, and no bi-gramming takes place.
   *
   * @param q  The query to rewrite
   * @return   Rewritten version, or 'q' unchanged if no change was needed.
   */
  protected Query rewrite(final SpanOrQuery q) 
  {
    // Re-joins rewritten clauses into a plain span OR query.
    SpanClauseJoiner orJoiner = new SpanClauseJoiner() 
    {
      public SpanQuery join(SpanQuery[] clauses) {
        return new SpanOrQuery(clauses);
      }
    };

    // Single surviving clauses may be promoted; bi-gramming is off, so the
    // slop value is irrelevant and zero is passed.
    //
    boolean shuntSingle = true;
    boolean bigram = false;
    return rewriteClauses(q, q.getClauses(), shuntSingle, bigram, 0,
                          orJoiner);
  } // rewrite()

  /**
   * Utility function that takes care of rewriting a series of span query
   * clauses. Each clause is rewritten recursively; clauses that vanish
   * (rewrite to null) are dropped; stop words are either removed (when not
   * bi-gramming) or bi-grammed into their neighbors.
   *
   * @param oldQuery    Query being rewritten
   * @param oldClauses  Clauses to rewrite
   * @param shuntSingle true to allow single-clause result to be returned,
   *                    false to force wrapping.
   * @param bigram      true to bigram stop-words, false to simply remove them
   * @param slop        if bigramming, 0 for phrase, non-zero for near
   * @param joiner      Handles joining new clauses into wrapper query
   * @return            New rewritten query; 'oldQuery' if no changes; or
   *                    null if no clauses remain.
   */
  protected Query rewriteClauses(Query oldQuery, SpanQuery[] oldClauses,
                                 boolean shuntSingle, boolean bigram, int slop,
                                 SpanClauseJoiner joiner) 
  {
    Vector newClauseVec = new Vector();
    boolean anyChanges = false;

    for (int i = 0; i < oldClauses.length; i++) 
    {
      // A different object coming back means the clause (or something
      // beneath it) was changed.
      //
      SpanQuery clause = (SpanQuery)rewriteQuery(oldClauses[i]);
      if (clause != oldClauses[i])
        anyChanges = true;

      // If the clause ended up null, skip it.
      if (clause == null)
        continue;

      // Skip stop-words if we're not bigramming. Record them so the user
      // can be notified.
      //
      if (!bigram && stopSet.contains(extractTermText(clause))) {
        removedTerms.add(extractTermText(clause));
        anyChanges = true;
        continue;
      }

      // Retain everything else.
      newClauseVec.add(clause);
    } // for i

    SpanQuery[] newClauses = (SpanQuery[])newClauseVec.toArray(
      new SpanQuery[newClauseVec.size()]);

    // Apply bi-gramming to the rewritten clauses if requested. Skip it when
    // every clause was eliminated: bigramQueries() asserts that its input
    // is non-empty, and there is nothing to bi-gram anyway. The empty case
    // is reported to the caller (as null) further below.
    //
    boolean alreadyJoined = false;
    if (bigram && newClauses.length > 0) 
    {
      // bigramQueries() hands back the same array when it made no changes.
      SpanQuery[] bigrammedClauses = bigramQueries(newClauses, slop, joiner);
      if (bigrammedClauses != newClauses) {
        assert bigrammedClauses.length <= 1 : "bigramQueries should result in one clause if any bigramming performed";
        newClauses = bigrammedClauses;
        anyChanges = true;
        alreadyJoined = true;
      }
    }

    // If no changes, just return the original query (unless a rewrite is
    // being forced for this query).
    //
    boolean force = forceRewrite(oldQuery);
    if (!anyChanges && !force)
      return oldQuery;

    // If we ended up with zero clauses, let the caller know they can delete
    // the query.
    //
    if (newClauses.length == 0)
      return null;

    // If only one clause (and we're allowed to shunt), just return the single
    // clause instead of a wrapping query. A clause produced by bi-gramming
    // has already been joined, so it is never wrapped again.
    //
    if (newClauses.length == 1 && (alreadyJoined || (shuntSingle && !force))) 
    {
      // Since we're getting rid of the parent, pass on its boost to the
      // child.
      //
      return combineBoost(oldQuery, newClauses[0]);
    }

    // Construct a new query joining all the rewritten clauses.
    Query newQuery = joiner.join(newClauses);
    return copyBoost(oldQuery, newQuery);
  }

  /**
   * Gets rid of the stop words in a run of consecutive queries by fusing
   * each one with a neighboring non-stop-word. Every stop word encountered
   * is recorded in the removed-terms set.
   *
   * @param clauses     array of queries to work on
   * @param slop        zero for exact matching, non-zero for 'near' matching.
   * @param joiner      used to join the resulting bi-grammed clauses
   * @return            the original array if it has no stop words; an empty
   *                    array if it has nothing but stop words; otherwise a
   *                    one-element array holding the bi-grammed query
   */
  protected SpanQuery[] bigramQueries(SpanQuery[] clauses, int slop,
                                      SpanClauseJoiner joiner) 
  {
    assert clauses.length > 0 : "cannot bigram empty list";

    // Pull out the term text of every clause up front.
    final int nClauses = clauses.length;
    String[] terms = new String[nClauses];
    for (int i = 0; i < nClauses; i++)
      terms[i] = extractTermText(clauses[i]);

    // A lone clause that isn't a stop word needs no work at all.
    if (nClauses == 1 && !stopSet.contains(terms[0]))
      return clauses;

    // Tally the stop words and the longest unbroken run of them. Each stop
    // word is also noted for the user, since one way or another it is going
    // to disappear from the query.
    //
    int totalStops = 0;
    int currentRun = 0;
    int longestRun = 0;
    for (int i = 0; i < nClauses; i++) 
    {
      if (stopSet.contains(terms[i])) {
        removedTerms.add(terms[i]);
        totalStops++;
        currentRun++;
        longestRun = Math.max(longestRun, currentRun);
      }
      else
        currentRun = 0;
    }

    // Easy way out: no stop words means no change.
    if (totalStops == 0)
      return clauses;

    // Nothing but stop words can't produce anything useful; an empty
    // (and different) array signals to the caller that a change was made.
    //
    if (totalStops == nClauses)
      return new SpanQuery[0];

    // From here on there is at least one stop word and at least one real
    // word. Three situations remain:
    //    - phrase search (slop of zero)
    //    - near search whose longest stop-word run is at most 2
    //    - near search with a run of 3 or more stop words
    //
    SpanQuery combined;
    if (slop == 0) 
    {
      // Phrase search: exact bi-gramming.
      combined = bigramTermsExact(clauses, terms, joiner);
    }
    else if (longestRun > 2) 
    {
      // Near search with a long run of stop words. The inexact form drops
      // at least one stop word here, so an exact form is offered as well
      // and the better match wins. The inexact alternative gets the reduced
      // boost, which favors the exact one.
      //
      SpanQuery[] both = new SpanQuery[2];
      both[0] = bigramTermsExact(clauses, terms, joiner);
      both[1] = bigramTermsInexact(clauses, terms, joiner);
      reduceBoost(both[1]);
      combined = new SpanOrQuery(both);
    }
    else 
    {
      // Near search with short stop-word runs: inexact bi-gramming.
      combined = bigramTermsInexact(clauses, terms, joiner);
    }

    // A fresh one-element array tells the caller changes were made.
    return new SpanQuery[] { combined };
  } // bigramQueries()

  /**
   * Builds the "near" form of a sequence that mixes stop words and real
   * words: the result matches on the real words at minimum, while ranking
   * higher those hits where the adjoining stop word is present too.
   *
   * Examples:  "man of the world"
   *                    -> "(man or man-of) near (the-world or world)"
   *            "hello there"
   *                    -> "hello there"
   *            "it is not a problem"
   *                    -> "(a-problem or problem)"
   *
   * @param queries Original queries in the sequence
   * @param terms   Corresponding term text of each query
   * @param joiner  Used to join the resulting bi-grammed clauses
   *
   * @return        A new query possibly containing bi-grams
   */
  protected SpanQuery bigramTermsInexact(Query[] queries, String[] terms,
                                         SpanClauseJoiner joiner) 
  {
    Vector pieces = new Vector(terms.length);
    final int last = terms.length - 1;

    // Walk the terms, classifying each by what it is and what follows it:
    //    stop + nothing, stop + stop   -> dropped
    //    stop + real                   -> (stop-real OR real)
    //    real + nothing, real + real   -> real, as-is
    //    real + stop                   -> (real OR real-stop)
    //
    for (int i = 0; i <= last; i++) 
    {
      boolean hasNext = (i < last);
      boolean nextIsStop = hasNext && stopSet.contains(terms[i + 1]);

      if (stopSet.contains(terms[i])) 
      {
        // Only a stop word directly followed by a real word produces
        // anything; trailing stop words and stop-stop pairs are discarded.
        //
        if (hasNext && !nextIsStop) 
        {
          // (stop-real OR real), with the lone real word's boost reduced
          // so the stop-real pair scores higher.
          //
          SpanQuery[] pair = new SpanQuery[2];
          pair[0] = convertToSpanQuery(glomQueries(queries[i], queries[i + 1]));
          pair[1] = convertToSpanQuery(queries[i + 1]);
          reduceBoost(pair[1]);
          pieces.add(new SpanOrQuery(pair));
        }
        continue;
      }

      // From here on, terms[i] is a real word. If a stop word came right
      // before it, this word was already folded into that stop-real pair.
      //
      if (i > 0 && stopSet.contains(terms[i - 1]))
        continue;

      if (nextIsStop) 
      {
        // (real OR real-stop), with the lone real word's boost reduced so
        // the real-stop pair scores higher.
        //
        SpanQuery[] pair = new SpanQuery[2];
        pair[0] = convertToSpanQuery(queries[i]);
        pair[1] = convertToSpanQuery(glomQueries(queries[i], queries[i + 1]));
        reduceBoost(pair[0]);
        pieces.add(new SpanOrQuery(pair));
      }
      else 
      {
        // Real word at the end, or followed by another real word: nothing
        // to glom.
        //
        pieces.add(convertToSpanQuery(queries[i]));
      }
    } // for i

    // A single resulting clause is returned bare.
    if (pieces.size() == 1)
      return (SpanQuery)pieces.get(0);

    // Otherwise, hand them all to the joiner.
    SpanQuery[] joined = (SpanQuery[])pieces.toArray(
      new SpanQuery[pieces.size()]);
    return joiner.join(joined);
  } // bigramTermsInexact()

  /**
   * Produces a span query from any supported query: span queries are handed
   * back as-is, and term queries are wrapped in a span term query that
   * carries the same boost. A bi-gram term is given a term length of 2, any
   * other term a length of 1.
   *
   * @param q   Query to convert (span or non-span)
   * @return    Equivalent SpanQuery; null (after a failed assertion) for
   *            any other kind of query.
   */
  protected SpanQuery convertToSpanQuery(Query q) 
  {
    // Already a span query: nothing to do.
    if (q instanceof SpanQuery)
      return (SpanQuery)q;

    // Anything besides a plain term query is unsupported.
    if (!(q instanceof TermQuery)) {
      assert false : "case not handled";
      return null;
    }

    Term term = ((TermQuery)q).getTerm();
    int termLength;
    if (isBigram(stopSet, term.text()))
      termLength = 2;
    else
      termLength = 1;
    SpanTermQuery spanTerm = new SpanTermQuery(term, termLength);
    return (SpanQuery)copyBoost(q, spanTerm);
  } // convertToSpanQuery()

  /**
   * Builds a Term from a field name and term text. Use this rather than
   * calling Term's constructor directly: it asserts that the text is not a
   * stop word.
   *
   * @param field   Field being queried
   * @param text    Text for the new term
   *
   * @return        The newly constructed Term
   */
  protected Term newTerm(String field, String text) 
  {
    assert !stopSet.contains(text) : "cannot directly query a stop-word";
    Term term = new Term(field, text);
    return term;
  } // newTerm()

  /**
   * Builds the bi-grammed clause sequence needed for an exact match, given
   * a sequence of terms that mixes stop words and real words. Every stop
   * word is glommed (via glomQueries()) with the term that follows it, and
   * every real word that is directly followed by a stop word is glommed with
   * that stop word. A real word is kept on its own only when neither of its
   * neighbors is a stop word.
   *
   * Examples (assuming "of", "the", "it", "is", "not" and "a" are in the
   * stop set):
   *            "man of the world"     -> "man-of of-the the-world"
   *            "hello there"          -> "hello there"
   *            "it is not a problem"  -> "it-is is-not not-a a-problem"
   *
   * @param queries Original queries in the sequence
   * @param terms   Corresponding term text of each query
   * @param joiner  Used to join the resulting bi-grammed clauses
   *
   * @return        The query produced by the joiner from the new clauses
   */
  protected SpanQuery bigramTermsExact(Query[] queries, String[] terms,
                                       SpanClauseJoiner joiner) 
  {
    Vector result = new Vector(queries.length * 2);
    final int lastIdx = terms.length - 1;

    // Walk the terms, deciding for each one what (if anything) it 
    // contributes based on whether it and its neighbors are stop words.
    for (int pos = 0; pos <= lastIdx; pos++) 
    {
      boolean isLast = (pos == lastIdx);

      // Term is a stop word.
      if (stopSet.contains(terms[pos])) 
      {
        // A trailing stop word contributes nothing by itself; it is dropped
        // on the assumption that the preceding term was glommed with it. 
        // Any other stop word is glommed with whatever follows it.
        //
        if (!isLast)
          result.add(glomQueries(queries[pos], queries[pos + 1]));
        continue;
      }

      // Term is a real word followed by a stop word: stick the two together.
      if (!isLast && stopSet.contains(terms[pos + 1])) {
        result.add(glomQueries(queries[pos], queries[pos + 1]));
        continue;
      }

      // Term is a real word followed by nothing, or by another real word. 
      // If a stop word came just before, that stop word was already glommed 
      // with this term, so adding the term again would duplicate it.
      //
      boolean alreadyIncorporated = pos > 0 && stopSet.contains(terms[pos - 1]);
      if (!alreadyIncorporated)
        result.add(queries[pos]);
    } // for pos

    // Copy the collected clauses into an array the joiner can consume.
    SpanQuery[] clauses = new SpanQuery[result.size()];
    result.toArray(clauses);

    // Let the joiner combine them into the final query.
    return joiner.join(clauses);
  } // bigramTermsExact()

  /**
   * Joins two adjacent queries into one, typically a stop word with a real
   * word or vice-versa. Two term queries (span or plain) are fused into a
   * single term whose text is the two texts separated by "~". When one side 
   * is a span term and the other is an OR, NOT or NOT NEAR query, the term is
   * glommed into that query by the matching glomInside() method. When the 
   * other side is a NEAR query, the NEAR query is returned untouched.
   *
   * Examples:  the rabbit -> the-rabbit
   *            the (white OR beige) -> the-white OR the-beige
   *
   * @param q1  First query
   * @param q2  Second query
   * @return    A query representing the join. For any combination not 
   *            listed above an assertion fails; if assertions are disabled,
   *            null is returned instead.
   */
  protected Query glomQueries(Query q1, Query q2) 
  {
    if (q1 instanceof SpanTermQuery) 
    {
      SpanTermQuery left = (SpanTermQuery)q1;

      // Two span terms: the easy case, fuse them into one span term.
      if (q2 instanceof SpanTermQuery) 
      {
        SpanTermQuery right = (SpanTermQuery)q2;
        String field = left.getField();
        String joined = left.getTerm().text() + "~" + right.getTerm().text();
        Term fused = newTerm(field, joined);

        int termLength = 1;
        if (isBigram(stopSet, fused.text()))
          termLength = 2;
        return copyBoost(left, right, new SpanTermQuery(fused, termLength));
      }

      // Term followed by an OR query: prepend the term inside the OR.
      if (q2 instanceof SpanOrQuery)
        return glomInside((SpanOrQuery)q2, left, true);

      // Term followed by a NOT query: only its include clause is glommed
      // (the exclude clause is independent.)
      //
      if (q2 instanceof SpanChunkedNotQuery)
        return glomInside((SpanChunkedNotQuery)q2, left, true);
      if (q2 instanceof SpanNotNearQuery)
        return glomInside((SpanNotNearQuery)q2, left, true);

      // Term followed by a near query: leave the near query alone.
      if (q2 instanceof SpanNearQuery)
        return q2;
    }
    else if (q2 instanceof SpanTermQuery) 
    {
      SpanTermQuery right = (SpanTermQuery)q2;

      // OR query followed by a term: append the term inside the OR.
      if (q1 instanceof SpanOrQuery)
        return glomInside((SpanOrQuery)q1, right, false);

      // NOT query followed by a term: only its include clause is glommed
      // (the exclude clause is independent.)
      //
      if (q1 instanceof SpanChunkedNotQuery)
        return glomInside((SpanChunkedNotQuery)q1, right, false);
      if (q1 instanceof SpanNotNearQuery)
        return glomInside((SpanNotNearQuery)q1, right, false);

      // Near query followed by a term: leave the near query alone.
      if (q1 instanceof SpanNearQuery)
        return q1;
    }
    else if (q1 instanceof TermQuery && q2 instanceof TermQuery) 
    {
      // Two plain (non-span) terms: fuse them into one plain term.
      TermQuery left = (TermQuery)q1;
      TermQuery right = (TermQuery)q2;
      String field = left.getTerm().field();
      String joined = left.getTerm().text() + "~" + right.getTerm().text();
      Term fused = newTerm(field, joined);
      return copyBoost(left, right, new TermQuery(fused));
    }

    // Every other combination is unsupported.
    assert false : "case not handled yet";
    return null;
  } // glomQueries()

  /**
   * Gloms the term onto each clause within an OR query, descending into
   * nested OR queries as needed.
   *
   * The clause array obtained from the query is copied before any clause is
   * replaced, so the array handed out by <code>oq</code> is never written to.
   *
   * @param oq      Query to glom into
   * @param term    Term to glom on
   * @param before  true to prepend the term, false to append.
   * @return        A new glommed OR query (passed through copyBoost() 
   *                together with <code>oq</code>), or <code>oq</code> itself
   *                if none of its clauses changed.
   */
  protected SpanQuery glomInside(SpanOrQuery oq, SpanTermQuery term,
                                 boolean before) 
  {
    // Work on a private copy of the clause array. Replacing entries in the
    // array returned by getClauses() directly could alter the original query
    // if that array is the query's own storage.
    //
    SpanQuery[] clauses = (SpanQuery[])oq.getClauses().clone();
    boolean anyChanges = false;
    for (int i = 0; i < clauses.length; i++) 
    {
      if (clauses[i] instanceof SpanTermQuery) 
      {
        // Plain term clause: replace it with the glommed two-word term.
        String ctText = extractTermText(clauses[i]);
        String newText = before ? (extractTermText(term) + "~" + ctText)
                         : (ctText + "~" + extractTermText(term));
        SpanQuery oldClause = clauses[i];
        int termLength = isBigram(stopSet, newText) ? 2 : 1;
        clauses[i] = new SpanTermQuery(newTerm(term.getTerm().field(), newText),
                                       termLength);
        copyBoost(oldClause, term, clauses[i]);
        anyChanges = true;
      }
      else if (clauses[i] instanceof SpanOrQuery) {
        // Nested OR: recurse. The recursive call returns its argument when
        // nothing changed, so compare against the nested clause that was 
        // passed in (not against the outer query) to detect a change.
        //
        SpanQuery newq = glomInside((SpanOrQuery)clauses[i], term, before);
        if (newq != clauses[i]) {
          clauses[i] = newq;
          anyChanges = true;
        }
      }
      else
        assert false : "case not handled";
    } // for i

    // No changes? Return the unaltered original query.
    if (!anyChanges)
      return oq;

    // All done!
    return (SpanQuery)copyBoost(oq, new SpanOrQuery(clauses));
  } // glomInside()

  /**
   * Gloms the term onto the 'include' clause of a NOT query. The 'exclude'
   * clause is carried over as it is, since it is independent of the term.
   *
   * @param nq      Query to glom into
   * @param term    Term to glom on
   * @param before  true to prepend the term, false to append.
   * @return        A new NOT query built around the glommed include clause,
   *                or <code>nq</code> itself if glomming left the include 
   *                clause unchanged.
   */
  protected SpanQuery glomInside(SpanChunkedNotQuery nq, SpanTermQuery term,
                                 boolean before) 
  {
    // Put the term on the requested side of the include clause.
    SpanQuery glommed = (SpanQuery)(before
                                    ? glomQueries(term, nq.getInclude())
                                    : glomQueries(nq.getInclude(), term));

    // A different include clause calls for a new NOT query around it, with
    // the same exclude clause and slop as before.
    //
    if (glommed != nq.getInclude()) {
      SpanChunkedNotQuery rebuilt =
        new SpanChunkedNotQuery(glommed, nq.getExclude(), nq.getSlop());
      return (SpanQuery)copyBoost(nq, rebuilt);
    }

    // Include clause came back unchanged, so the NOT query can stay as is.
    return nq;
  } // glomInside()

  /**
   * Gloms the term onto the 'include' clause of a NOT NEAR query. The 
   * 'exclude' clause is carried over as it is, since it is independent of
   * the term.
   *
   * @param nq      Query to glom into
   * @param term    Term to glom on
   * @param before  true to prepend the term, false to append.
   * @return        A new NOT NEAR query built around the glommed include
   *                clause, or <code>nq</code> itself if glomming left the
   *                include clause unchanged.
   */
  protected SpanQuery glomInside(SpanNotNearQuery nq, SpanTermQuery term,
                                 boolean before) 
  {
    // Put the term on the requested side of the include clause.
    Query joined;
    if (!before)
      joined = glomQueries(nq.getInclude(), term);
    else
      joined = glomQueries(term, nq.getInclude());
    SpanQuery glommed = (SpanQuery)joined;

    // Include clause came back unchanged, so the query can stay as is.
    if (glommed == nq.getInclude())
      return nq;

    // Otherwise wrap the new include clause in a fresh NOT NEAR query that
    // keeps the same exclude clause and slop.
    //
    SpanNotNearQuery rebuilt =
      new SpanNotNearQuery(glommed, nq.getExclude(), nq.getSlop());
    return (SpanQuery)copyBoost(nq, rebuilt);
  } // glomInside()

  /**
   * Gets the term text out of whatever is passed in: a plain string is 
   * returned directly, anything else is resolved to a Term through 
   * extractTerm() and that term's text is returned. Having this in one place
   * saves callers from writing their own type checks.
   *
   * @param obj   String, Term, TermQuery, or SpanTermQuery to check
   * @return      text of the term, or the empty string if no term could be
   *              extracted from <code>obj</code>
   */
  protected String extractTermText(Object obj) {
    // Strings are their own text.
    if (obj instanceof String)
      return (String)obj;

    // Everything else goes through the Term, when there is one.
    Term term = extractTerm(obj);
    return (term != null) ? term.text() : "";
  } // extractTermText()

  /**
   * Gets the Term out of whatever is passed in. A Term is returned directly;
   * a TermQuery or SpanTermQuery yields its term; a SpanDechunkingQuery is
   * unwrapped and the extraction is retried on the wrapped query. Having 
   * this in one place saves callers from writing their own type checks.
   *
   * @param obj   Term, TermQuery, SpanTermQuery, or SpanDechunkingQuery to
   *              check
   * @return      the Term, or null if <code>obj</code> is none of those
   */
  protected Term extractTerm(Object obj) {
    Term found = null;

    if (obj instanceof Term)
      found = (Term)obj;
    else if (obj instanceof TermQuery)
      found = ((TermQuery)obj).getTerm();
    else if (obj instanceof SpanTermQuery)
      found = ((SpanTermQuery)obj).getTerm();
    else if (obj instanceof SpanDechunkingQuery) {
      // Look through the dechunking wrapper at the query inside it.
      SpanDechunkingQuery dechunker = (SpanDechunkingQuery)obj;
      found = extractTerm(dechunker.getWrapped());
    }

    return found;
  } // extractTerm()

  /**
   * Scales the boost factor of a query down to 80% of its current value.
   * Typically applied to the non-bigram of a pair in an OR, so that the 
   * bigram will get scored higher.
   *
   * @param query   Query whose boost is lowered in place
   */
  protected void reduceBoost(Query query) {
    float lowered = query.getBoost() * 0.8f;
    query.setBoost(lowered);
  } // reduceBoost()
} // class BigramQueryRewriter