/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
/**
 * Tests for {@link ClassicSimilarity}.
 *
 * <p>The query tests run against a one-document index built in {@link #setUp()} whose only field
 * is {@code test} with the single un-analyzed value {@code hit}. The remaining tests check the
 * norm tables, the encoded field norm reported in explanations, and that
 * {@link ClassicSimilarity} and {@link BM25Similarity} compute the same norm.
 */
public class TestClassicSimilarity extends LuceneTestCase {
  private Directory directory;
  private IndexReader indexReader;
  private IndexSearcher indexSearcher;

  /** Builds the one-document index and a searcher that scores with {@link ClassicSimilarity}. */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    try (IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig())) {
      Document document = new Document();
      document.add(new StringField("test", "hit", Store.NO));
      indexWriter.addDocument(document);
      indexWriter.commit();
    }
    indexReader = DirectoryReader.open(directory);
    indexSearcher = newSearcher(indexReader);
    indexSearcher.setSimilarity(new ClassicSimilarity());
  }

  /** Releases the reader and directory opened by {@link #setUp()}. */
  @Override
  public void tearDown() throws Exception {
    try {
      IOUtils.close(indexReader, directory);
    } finally {
      // Always let the base class clean up, even if closing our resources failed.
      super.tearDown();
    }
  }

  /**
   * Runs {@code query} against the fixture index and asserts that exactly one document matches
   * and that its score is non-zero.
   */
  private void assertSingleScoredHit(Query query) throws IOException {
    TopDocs topDocs = indexSearcher.search(query, 1);
    assertEquals(1, topDocs.totalHits);
    assertEquals(1, topDocs.scoreDocs.length);
    assertTrue(topDocs.scoreDocs[0].score != 0);
  }

  /** Runs {@code query} against the fixture index and asserts that nothing matches. */
  private void assertNoHits(Query query) throws IOException {
    TopDocs topDocs = indexSearcher.search(query, 1);
    assertEquals(0, topDocs.totalHits);
  }

  /** A term query on the indexed term matches with a non-zero score. */
  public void testHit() throws IOException {
    assertSingleScoredHit(new TermQuery(new Term("test", "hit")));
  }

  /** A term query on an existing field but a missing term matches nothing. */
  public void testMiss() throws IOException {
    assertNoHits(new TermQuery(new Term("test", "miss")));
  }

  /** A term query on a field that was never indexed matches nothing. */
  public void testEmpty() throws IOException {
    assertNoHits(new TermQuery(new Term("empty", "miss")));
  }

  /** A single-clause boolean query on the indexed term still scores non-zero. */
  public void testBQHit() throws IOException {
    Query query = new BooleanQuery.Builder()
        .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
        .build();
    assertSingleScoredHit(query);
  }

  /** A boolean query combining the indexed term with a missing term scores non-zero. */
  public void testBQHitOrMiss() throws IOException {
    Query query = new BooleanQuery.Builder()
        .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
        .add(new TermQuery(new Term("test", "miss")), Occur.SHOULD)
        .build();
    assertSingleScoredHit(query);
  }

  /** A boolean query combining the indexed term with a missing field scores non-zero. */
  public void testBQHitOrEmpty() throws IOException {
    Query query = new BooleanQuery.Builder()
        .add(new TermQuery(new Term("test", "hit")), Occur.SHOULD)
        .add(new TermQuery(new Term("empty", "miss")), Occur.SHOULD)
        .build();
    assertSingleScoredHit(query);
  }

  /** A single-disjunct dismax query on the indexed term scores non-zero. */
  public void testDMQHit() throws IOException {
    Query query = new DisjunctionMaxQuery(
        Arrays.asList(
            new TermQuery(new Term("test", "hit"))),
        0);
    assertSingleScoredHit(query);
  }

  /** A dismax query over the indexed term and a missing term scores non-zero. */
  public void testDMQHitOrMiss() throws IOException {
    Query query = new DisjunctionMaxQuery(
        Arrays.asList(
            new TermQuery(new Term("test", "hit")),
            new TermQuery(new Term("test", "miss"))),
        0);
    assertSingleScoredHit(query);
  }

  /** A dismax query over the indexed term and a missing field scores non-zero. */
  public void testDMQHitOrEmpty() throws IOException {
    Query query = new DisjunctionMaxQuery(
        Arrays.asList(
            new TermQuery(new Term("test", "hit")),
            new TermQuery(new Term("empty", "miss"))),
        0);
    assertSingleScoredHit(query);
  }

  /** Asserts that a decoded norm-table entry is neither negative, infinite, nor NaN. */
  private static void assertSaneBoost(float boost, int encodedByte) {
    assertFalse("negative boost: " + boost + ", byte=" + encodedByte, boost < 0.0f);
    assertFalse("inf boost: " + boost + ", byte=" + encodedByte, Float.isInfinite(boost));
    assertFalse("nan boost for byte=" + encodedByte, Float.isNaN(boost));
  }

  /**
   * Checks all 256 entries of both norm tables: every value must be finite and non-negative,
   * {@code TFIDFSimilarity.OLD_NORM_TABLE} must be strictly increasing and the per-weight
   * {@code normTable} must be strictly decreasing.
   */
  public void testSaneNormValues() throws IOException {
    ClassicSimilarity sim = new ClassicSimilarity();
    for (int i = 0; i < 256; i++) {
      float boost = TFIDFSimilarity.OLD_NORM_TABLE[i];
      assertSaneBoost(boost, i);
      if (i > 0) {
        assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > TFIDFSimilarity.OLD_NORM_TABLE[i-1]);
      }
    }

    // The weight is computed against an empty reader; close that reader once the stats exist.
    final TFIDFSimilarity.IDFStats stats;
    try (MultiReader emptyReader = new MultiReader()) {
      stats = (IDFStats) sim.computeWeight(1f, new IndexSearcher(emptyReader).collectionStatistics("foo"));
    }
    for (int i = 0; i < 256; i++) {
      float boost = stats.normTable[i];
      assertSaneBoost(boost, i);
      if (i > 0) {
        assertTrue("boost is not decreasing: " + boost + ",byte=" + i, boost < stats.normTable[i-1]);
      }
    }
  }

  /**
   * For indices stamped with the 6.0 major version and with the latest major version, indexes a
   * field of {@code length} identical tokens and asserts that the {@code fieldNorm} entry of the
   * score explanation equals {@code 1/sqrt(length)} exactly.
   */
  public void testNormEncodingBackwardCompatibility() throws IOException {
    Similarity similarity = new ClassicSimilarity();
    for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major}) {
      for (int length : new int[] {1, 4, 16 }) { // these length values are encoded accurately on both cases
        // try-with-resources so a failing assertion does not leak the directory, writer or reader
        try (Directory dir = newDirectory()) {
          // set the version on the directory
          new SegmentInfos(indexCreatedVersionMajor).commit(dir);
          try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity))) {
            Document doc = new Document();
            String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
            doc.add(new TextField("foo", value, Store.NO));
            w.addDocument(doc);
            try (IndexReader reader = DirectoryReader.open(w)) {
              IndexSearcher searcher = newSearcher(reader);
              searcher.setSimilarity(similarity);
              Explanation expl = searcher.explain(new TermQuery(new Term("foo", "b")), 0);
              Explanation fieldNorm = findExplanation(expl, "fieldNorm");
              assertNotNull(fieldNorm);
              assertEquals(fieldNorm.toString(), 1/Math.sqrt(length), fieldNorm.getValue(), 0f);
            }
          }
        }
      }
    }
  }

  /**
   * Depth-first search of an explanation tree.
   *
   * @param expl root of the (sub)tree to search
   * @param text prefix the description must start with
   * @return the first explanation whose description starts with {@code text}, or {@code null} if
   *     there is none
   */
  private static Explanation findExplanation(Explanation expl, String text) {
    if (expl.getDescription().startsWith(text)) {
      return expl;
    }
    for (Explanation sub : expl.getDetails()) {
      Explanation match = findExplanation(sub, text);
      if (match != null) {
        return match;
      }
    }
    return null;
  }

  /**
   * Asserts that {@link ClassicSimilarity} and {@link BM25Similarity} (with overlaps discounted)
   * compute identical norms for 100 randomly generated field states.
   */
  public void testSameNormsAsBM25() {
    ClassicSimilarity sim1 = new ClassicSimilarity();
    BM25Similarity sim2 = new BM25Similarity();
    sim2.setDiscountOverlaps(true);
    for (int iter = 0; iter < 100; ++iter) {
      final int length = TestUtil.nextInt(random(), 1, 1000);
      final int position = random().nextInt(length);
      final int numOverlaps = random().nextInt(length);
      FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
      assertEquals(
          sim2.computeNorm(state),
          sim1.computeNorm(state),
          0f);
    }
  }
}