// TestSimilarity2.java example
//
// Explorer
// heliosearch-master
// - lucene
// - solr
package org.apache.lucene.search.similarities;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Tests against all the similarities we have
 */
public class TestSimilarity2 extends LuceneTestCase {
  List<Similarity> sims;
  
  @Override
  public void setUp() throws Exception {
    super.setUp();
    sims = new ArrayList<>();
    sims.add(new DefaultSimilarity());
    sims.add(new BM25Similarity());
    // TODO: not great that we dup this all with TestSimilarityBase
    for (BasicModel basicModel : TestSimilarityBase.BASIC_MODELS) {
      for (AfterEffect afterEffect : TestSimilarityBase.AFTER_EFFECTS) {
        for (Normalization normalization : TestSimilarityBase.NORMALIZATIONS) {
          sims.add(new DFRSimilarity(basicModel, afterEffect, normalization));
        }
      }
    }
    for (Distribution distribution : TestSimilarityBase.DISTRIBUTIONS) {
      for (Lambda lambda : TestSimilarityBase.LAMBDAS) {
        for (Normalization normalization : TestSimilarityBase.NORMALIZATIONS) {
          sims.add(new IBSimilarity(distribution, lambda, normalization));
        }
      }
    }
    sims.add(new LMDirichletSimilarity());
    sims.add(new LMJelinekMercerSimilarity(0.1f));
    sims.add(new LMJelinekMercerSimilarity(0.7f));
  }
  
  /** because of stupid things like querynorm, its possible we computeStats on a field that doesnt exist at all
   *  test this against a totally empty index, to make sure sims handle it
   */
  public void testEmptyIndex() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      assertEquals(0, is.search(new TermQuery(new Term("foo", "bar")), 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** similar to the above, but ORs the query with a real field */
  public void testEmptyField() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("foo", "bar", Field.Store.NO));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      BooleanQuery query = new BooleanQuery(true);
      query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
      query.add(new TermQuery(new Term("bar", "baz")), BooleanClause.Occur.SHOULD);
      assertEquals(1, is.search(query, 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** similar to the above, however the field exists, but we query with a term that doesnt exist too */
  public void testEmptyTerm() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("foo", "bar", Field.Store.NO));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      BooleanQuery query = new BooleanQuery(true);
      query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
      query.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
      assertEquals(1, is.search(query, 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** make sure we can retrieve when norms are disabled */
  public void testNoNorms() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setOmitNorms(true);
    ft.freeze();
    doc.add(newField("foo", "bar", ft));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      BooleanQuery query = new BooleanQuery(true);
      query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
      assertEquals(1, is.search(query, 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** make sure all sims work if TF is omitted */
  public void testOmitTF() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_ONLY);
    ft.freeze();
    Field f = newField("foo", "bar", ft);
    doc.add(f);
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      BooleanQuery query = new BooleanQuery(true);
      query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
      assertEquals(1, is.search(query, 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** make sure all sims work if TF and norms is omitted */
  public void testOmitTFAndNorms() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_ONLY);
    ft.setOmitNorms(true);
    ft.freeze();
    Field f = newField("foo", "bar", ft);
    doc.add(f);
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      BooleanQuery query = new BooleanQuery(true);
      query.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
      assertEquals(1, is.search(query, 10).totalHits);
    }
    ir.close();
    dir.close();
  }
  
  /** make sure all sims work with spanOR(termX, termY) where termY does not exist */
  public void testCrazySpans() throws Exception {
    // The problem: "normal" lucene queries create scorers, returning null if terms dont exist
    // This means they never score a term that does not exist.
    // however with spans, there is only one scorer for the whole hierarchy:
    // inner queries are not real queries, their boosts are ignored, etc.
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    doc.add(newField("foo", "bar", ft));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    
    for (Similarity sim : sims) {
      is.setSimilarity(sim);
      SpanTermQuery s1 = new SpanTermQuery(new Term("foo", "bar"));
      SpanTermQuery s2 = new SpanTermQuery(new Term("foo", "baz"));
      Query query = new SpanOrQuery(s1, s2);
      TopDocs td = is.search(query, 10);
      assertEquals(1, td.totalHits);
      float score = td.scoreDocs[0].score;
      assertTrue(score >= 0.0f);
      assertFalse("inf score for " + sim, Float.isInfinite(score));
    }
    ir.close();
    dir.close();
  }
}