/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Test BooleanQuery2 against BooleanQuery by overriding the standard query parser.
 * This also tests the scoring order of BooleanQuery.
 */
public class TestBoolean2 extends LuceneTestCase {
  private static IndexSearcher searcher;
  private static IndexSearcher singleSegmentSearcher;
  private static IndexSearcher bigSearcher;
  private static IndexReader reader;
  private static IndexReader littleReader;
  private static IndexReader singleSegmentReader;

  /** num of empty docs injected between every doc in the (main) index */
  private static int NUM_FILLER_DOCS;
  /** num of empty docs injected prior to the first doc in the (main) index */
  private static int PRE_FILLER_DOCS;
  /** num "extra" docs containing value in "field2" added to the "big" clone of the index */
  private static final int NUM_EXTRA_DOCS = 6000;

  public static final String field = "field";
  private static Directory directory;
  private static Directory singleSegmentDirectory;
  private static Directory dir2;
  private static int mulFactor;

  /** Copies {@code dir} to a new FSDirectory, skipping any "extra" files and syncing each copied file. */
  private static Directory copyOf(Directory dir) throws IOException {
    Directory copy = newFSDirectory(createTempDir());
    for (String name : dir.listAll()) {
      if (name.startsWith("extra")) {
        continue;
      }
      copy.copyFrom(dir, name, name, IOContext.DEFAULT);
      copy.sync(Collections.singleton(name));
    }
    return copy;
  }
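  // The setup below builds three views of the same logical documents: the main
  // randomized index (searched via "searcher"/"littleReader"), a force-merged
  // single-segment copy of it, and a "big" index that repeatedly doubles the main
  // index via addIndexes and then appends NUM_EXTRA_DOCS docs with only "field2".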
  @BeforeClass
  public static void beforeClass() throws Exception {
    // in some runs, test immediate adjacency of matches - in others, force a full bucket gap between docs
    NUM_FILLER_DOCS = random().nextBoolean() ? 0 : BooleanScorer.SIZE;
    PRE_FILLER_DOCS = TestUtil.nextInt(random(), 0, (NUM_FILLER_DOCS / 2));
    if (VERBOSE) {
      System.out.println("TEST: NUM_FILLER_DOCS=" + NUM_FILLER_DOCS + " PRE_FILLER_DOCS=" + PRE_FILLER_DOCS);
    }

    if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
      directory = newFSDirectory(createTempDir());
    } else {
      directory = newDirectory();
    }

    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
    // we'll make a ton of docs, disable store/norms/vectors
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setOmitNorms(true);

    Document doc = new Document();
    for (int filler = 0; filler < PRE_FILLER_DOCS; filler++) {
      writer.addDocument(doc);
    }
    for (int i = 0; i < docFields.length; i++) {
      doc.add(new Field(field, docFields[i], ft));
      writer.addDocument(doc);

      doc = new Document();
      for (int filler = 0; filler < NUM_FILLER_DOCS; filler++) {
        writer.addDocument(doc);
      }
    }
    writer.close();

    littleReader = DirectoryReader.open(directory);
    searcher = newSearcher(littleReader);
    // this is intentionally using the baseline sim, because it compares against bigSearcher (which uses a random one)
    searcher.setSimilarity(new ClassicSimilarity());

    // make a copy of our index using a single segment
    if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
      singleSegmentDirectory = newFSDirectory(createTempDir());
    } else {
      singleSegmentDirectory = newDirectory();
    }

    // TODO: this test does not need to be doing this crazy stuff.  please improve it!
    for (String fileName : directory.listAll()) {
      if (fileName.startsWith("extra")) {
        continue;
      }
      singleSegmentDirectory.copyFrom(directory, fileName, fileName, IOContext.DEFAULT);
      singleSegmentDirectory.sync(Collections.singleton(fileName));
    }

    iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    // we need docID order to be preserved:
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    iwc.setMergePolicy(newLogMergePolicy());
    try (IndexWriter w = new IndexWriter(singleSegmentDirectory, iwc)) {
      w.forceMerge(1, true);
    }

    singleSegmentReader = DirectoryReader.open(singleSegmentDirectory);
    singleSegmentSearcher = newSearcher(singleSegmentReader);
    singleSegmentSearcher.setSimilarity(searcher.getSimilarity(true));

    // Make big index
    dir2 = copyOf(directory);

    // First multiply small test index:
    mulFactor = 1;
    int docCount = 0;
    if (VERBOSE) {
      System.out.println("\nTEST: now copy index...");
    }
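    // The do/while below re-adds the whole index to itself on each pass, so after
    // k passes the index holds (initial doc count) * 2^k docs and mulFactor == 2^k;
    // when NUM_FILLER_DOCS is 0 the bound is 0 and the loop exits after one pass.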
    do {
      if (VERBOSE) {
        System.out.println("\nTEST: cycle...");
      }
      final Directory copy = copyOf(dir2);
      iwc = newIndexWriterConfig(new MockAnalyzer(random()));
      // randomized codecs are sometimes too costly for this test:
      iwc.setCodec(Codec.forName("Lucene70"));
      RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
      w.addIndexes(copy);
      copy.close();
      docCount = w.maxDoc();
      w.close();
      mulFactor *= 2;
    } while (docCount < 3000 * NUM_FILLER_DOCS);

    iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000));
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);

    doc = new Document();
    doc.add(new Field("field2", "xxx", ft));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++) {
      w.addDocument(doc);
    }
    doc = new Document();
    doc.add(new Field("field2", "big bad bug", ft));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++) {
      w.addDocument(doc);
    }

    reader = w.getReader();
    bigSearcher = newSearcher(reader);
    w.close();
  }

  @AfterClass
  public static void afterClass() throws Exception {
    reader.close();
    littleReader.close();
    singleSegmentReader.close();
    dir2.close();
    directory.close();
    singleSegmentDirectory.close();
    singleSegmentSearcher = null;
    singleSegmentReader = null;
    singleSegmentDirectory = null;
    searcher = null;
    reader = null;
    littleReader = null;
    dir2 = null;
    directory = null;
    bigSearcher = null;
  }

  private static String[] docFields = {
    "w1 w2 w3 w4 w5",
    "w1 w3 w2 w3",
    "w1 xx w2 yy w3",
    "w1 w3 xx w2 yy mm"
  };

  public void queriesTest(Query query, int[] expDocNrs) throws Exception {
    // adjust the expected doc numbers according to our filler docs
    if (0 < NUM_FILLER_DOCS) {
      expDocNrs = Arrays.copyOf(expDocNrs, expDocNrs.length);
      for (int i = 0; i < expDocNrs.length; i++) {
        expDocNrs[i] = PRE_FILLER_DOCS + ((NUM_FILLER_DOCS + 1) * expDocNrs[i]);
      }
    }
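    // e.g. (illustrative values only) with PRE_FILLER_DOCS=2 and NUM_FILLER_DOCS=4,
    // each real doc is followed by 4 fillers, so logical doc 1 sits behind the 2
    // leading fillers plus one (doc + fillers) group of 5: 2 + (4 + 1) * 1 = docid 7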

    final int topDocsToCheck = atLeast(1000);
    // The asserting searcher will sometimes return the bulk scorer and
    // sometimes return a default impl around the scorer so that we can
    // compare BS1 and BS2
    TopScoreDocCollector collector = TopScoreDocCollector.create(topDocsToCheck);
    searcher.search(query, collector);
    ScoreDoc[] hits1 = collector.topDocs().scoreDocs;
    collector = TopScoreDocCollector.create(topDocsToCheck);
    searcher.search(query, collector);
    ScoreDoc[] hits2 = collector.topDocs().scoreDocs;

    CheckHits.checkHitsQuery(query, hits1, hits2, expDocNrs);

    // Since we have no deleted docs, we should also be able to verify identical matches &
    // scores against a single-segment copy of our index
    collector = TopScoreDocCollector.create(topDocsToCheck);
    singleSegmentSearcher.search(query, collector);
    hits2 = collector.topDocs().scoreDocs;
    CheckHits.checkHitsQuery(query, hits1, hits2, expDocNrs);

    // sanity check expected num matches in bigSearcher
    assertEquals(mulFactor * collector.totalHits, bigSearcher.search(query, 1).totalHits);

    // now check 2 diff scorers from the bigSearcher as well
    collector = TopScoreDocCollector.create(topDocsToCheck);
    bigSearcher.search(query, collector);
    hits1 = collector.topDocs().scoreDocs;
    collector = TopScoreDocCollector.create(topDocsToCheck);
    bigSearcher.search(query, collector);
    hits2 = collector.topDocs().scoreDocs;

    // NOTE: just comparing results, not vetting against expDocNrs
    // since we have dups in bigSearcher
    CheckHits.checkEqual(query, hits1, hits2);
  }

  @Test
  public void testQueries01() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST);
    int[] expDocNrs = {2, 3};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries02() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
    int[] expDocNrs = {2, 3, 1, 0};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries03() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
    int[] expDocNrs = {2, 3, 1, 0};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries04() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
    int[] expDocNrs = {1, 0};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries05() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
    int[] expDocNrs = {1, 0};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries06() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
    query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
    int[] expDocNrs = {1};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries07() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST_NOT);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
    query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
    int[] expDocNrs = {};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries08() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
    int[] expDocNrs = {2, 3, 1};
    queriesTest(query.build(), expDocNrs);
  }

  @Test
  public void testQueries09() throws Exception {
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "w2")), BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term(field, "zz")), BooleanClause.Occur.SHOULD);
    int[] expDocNrs = {2, 3};
    queriesTest(query.build(), expDocNrs);
  }
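  // Illustrative sketch, not part of the original suite: demonstrates the Callback
  // hook (declared near the end of this class) for tweaking every BooleanQuery that
  // randBoolQuery generates; here it forces minimumNumberShouldMatch=1 at each level
  // of the random query tree. The vals/level used are arbitrary example values.
  @Test
  public void testRandBoolQueryCallbackExample() throws Exception {
    String[] vals = {"w1", "w2", "w3", "xx"};
    BooleanQuery q = randBoolQuery(new Random(random().nextLong()), false, 2, field, vals,
        builder -> builder.setMinimumNumberShouldMatch(1)).build();
    // sanity-check the query against the standard QueryUtils invariants; queries whose
    // clauses are all MUST_NOT (or that cannot satisfy minShouldMatch) simply match nothing
    QueryUtils.check(random(), q, searcher);
  }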
  @Test
  public void testRandomQueries() throws Exception {
    String[] vals = {"w1", "w2", "w3", "w4", "w5", "xx", "yy", "zzz"};

    int tot = 0;

    BooleanQuery q1 = null;
    try {
      // increase number of iterations for more complete testing
      int num = atLeast(20);
      for (int i = 0; i < num; i++) {
        int level = random().nextInt(3);
        q1 = randBoolQuery(new Random(random().nextLong()), random().nextBoolean(), level, field, vals, null).build();

        // Can't sort by relevance since floating point numbers may not quite
        // match up.
        Sort sort = Sort.INDEXORDER;

        QueryUtils.check(random(), q1, searcher); // baseline sim
        try {
          // a little hackish, QueryUtils.check is too costly to do on bigSearcher in this loop.
          searcher.setSimilarity(bigSearcher.getSimilarity(true)); // random sim
          QueryUtils.check(random(), q1, searcher);
        } finally {
          searcher.setSimilarity(new ClassicSimilarity()); // restore
        }

        // check diff (randomized) scorers (from AssertingSearcher) produce the same results
        TopFieldCollector collector = TopFieldCollector.create(sort, 1000, false, true, true);
        searcher.search(q1, collector);
        ScoreDoc[] hits1 = collector.topDocs().scoreDocs;
        collector = TopFieldCollector.create(sort, 1000, false, true, true);
        searcher.search(q1, collector);
        ScoreDoc[] hits2 = collector.topDocs().scoreDocs;
        tot += hits2.length;
        CheckHits.checkEqual(q1, hits1, hits2);

        BooleanQuery.Builder q3 = new BooleanQuery.Builder();
        q3.add(q1, BooleanClause.Occur.SHOULD);
        q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
        TopDocs hits4 = bigSearcher.search(q3.build(), 1);
        assertEquals(mulFactor * collector.totalHits + NUM_EXTRA_DOCS / 2, hits4.totalHits);

        // test diff (randomized) scorers produce the same results on bigSearcher as well
        collector = TopFieldCollector.create(sort, 1000 * mulFactor, false, true, true);
        bigSearcher.search(q1, collector);
        hits1 = collector.topDocs().scoreDocs;
        collector = TopFieldCollector.create(sort, 1000 * mulFactor, false, true, true);
        bigSearcher.search(q1, collector);
        hits2 = collector.topDocs().scoreDocs;
        CheckHits.checkEqual(q1, hits1, hits2);
      }

    } catch (Exception e) {
      // For easier debugging
      System.out.println("failed query: " + q1);
      throw e;
    }

    // System.out.println("Total hits:" + tot);
  }

  // used to set properties or change every BooleanQuery
  // generated from randBoolQuery.
  public static interface Callback {
    public void postCreate(BooleanQuery.Builder q);
  }

  // Random rnd is passed in so that the exact same random query may be created
  // more than once.
  public static BooleanQuery.Builder randBoolQuery(Random rnd, boolean allowMust, int level, String field, String[] vals, Callback cb) {
    BooleanQuery.Builder current = new BooleanQuery.Builder();
    for (int i = 0; i < rnd.nextInt(vals.length) + 1; i++) {
      int qType = 0; // term query
      if (level > 0) {
        qType = rnd.nextInt(10);
      }
      Query q;
      if (qType < 3) {
        q = new TermQuery(new Term(field, vals[rnd.nextInt(vals.length)]));
      } else if (qType < 4) {
        String t1 = vals[rnd.nextInt(vals.length)];
        String t2 = vals[rnd.nextInt(vals.length)];
        q = new PhraseQuery(10, field, t1, t2); // slop increases possibility of matching
      } else if (qType < 7) {
        q = new WildcardQuery(new Term(field, "w*"));
      } else {
        q = randBoolQuery(rnd, allowMust, level - 1, field, vals, cb).build();
      }

      int r = rnd.nextInt(10);
      BooleanClause.Occur occur;
      if (r < 2) {
        occur = BooleanClause.Occur.MUST_NOT;
      } else if (r < 5) {
        if (allowMust) {
          occur = BooleanClause.Occur.MUST;
        } else {
          occur = BooleanClause.Occur.SHOULD;
        }
      } else {
        occur = BooleanClause.Occur.SHOULD;
      }

      current.add(q, occur);
    }
    if (cb != null) cb.postCreate(current);
    return current;
  }
}