package org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; /** * Tests the results of fuzzy against pre-recorded output * The format of the file is the following: * * Header Row: # of bits: generate 2^n sequential documents * with a value of Integer.toBinaryString * * Entries: an entry is a param spec line, a resultCount line, and * then 'resultCount' results lines. The results lines are in the * expected order. * * param spec line: a comma-separated list of params to FuzzyQuery * (query, prefixLen, pqSize, minScore) * query = query text as a number (expand with Integer.toBinaryString) * prefixLen = prefix length * pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite * minScore = minimum similarity * * resultCount line: total number of expected hits. * * results line: comma-separated docID, score pair **/ public class TestFuzzyQuery2 extends LuceneTestCase { /** epsilon for score comparisons */ static final float epsilon = 0.00001f; private Random random; @Override public void setUp() throws Exception { super.setUp(); random = newRandom(); } public void testFromTestData() throws Exception { // TODO: randomize! assertFromTestData(new int[] { 0x40, 0x41 }); assertFromTestData(new int[] { 0x40, 0x0195 }); assertFromTestData(new int[] { 0x40, 0x0906 }); assertFromTestData(new int[] { 0x40, 0x1040F }); assertFromTestData(new int[] { 0x0194, 0x0195 }); assertFromTestData(new int[] { 0x0194, 0x0906 }); assertFromTestData(new int[] { 0x0194, 0x1040F }); assertFromTestData(new int[] { 0x0905, 0x0906 }); assertFromTestData(new int[] { 0x0905, 0x1040F }); assertFromTestData(new int[] { 0x1040E, 0x1040F }); } public void assertFromTestData(int codePointTable[]) throws Exception { InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt"); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); int bits = Integer.parseInt(reader.readLine()); int terms = (int) Math.pow(2, bits); Directory dir = newDirectory(random); RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); doc.add(field); for (int i = 0; i < terms; i++) { field.setValue(mapInt(codePointTable, i)); writer.addDocument(doc); } IndexReader r = writer.getReader(); IndexSearcher searcher = new IndexSearcher(r); writer.close(); String line; while ((line = reader.readLine()) != null) { String params[] = line.split(","); String query = mapInt(codePointTable, Integer.parseInt(params[0])); int prefix = Integer.parseInt(params[1]); int pqSize = Integer.parseInt(params[2]); float minScore = Float.parseFloat(params[3]); FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix); q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize)); int expectedResults = Integer.parseInt(reader.readLine()); TopDocs docs = searcher.search(q, expectedResults); assertEquals(expectedResults, docs.totalHits); for (int i = 0; i < expectedResults; i++) { String scoreDoc[] = reader.readLine().split(","); assertEquals(Integer.parseInt(scoreDoc[0]), docs.scoreDocs[i].doc); assertEquals(Float.parseFloat(scoreDoc[1]), docs.scoreDocs[i].score, epsilon); } } searcher.close(); r.close(); dir.close(); } /* map bits to unicode codepoints */ private static String mapInt(int codePointTable[], int i) { StringBuilder sb = new StringBuilder(); String binary = Integer.toBinaryString(i); for (int j = 0; j < binary.length(); j++) sb.appendCodePoint(codePointTable[binary.charAt(j) - '0']); return sb.toString(); } /* Code to generate test data public static void main(String args[]) throws Exception { int bits = 3; System.out.println(bits); int terms = (int) Math.pow(2, bits); RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); doc.add(field); for (int i = 0; i < terms; i++) { field.setValue(Integer.toBinaryString(i)); writer.addDocument(doc); } writer.optimize(); writer.close(); IndexSearcher searcher = new IndexSearcher(dir); for (int prefix = 0; prefix < bits; prefix++) for (int pqsize = 1; pqsize <= terms; pqsize++) for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F) for (int query = 0; query < terms; query++) { FuzzyQuery q = new FuzzyQuery( new Term("field", Integer.toBinaryString(query)), minscore, prefix); q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize)); System.out.println(query + "," + prefix + "," + pqsize + "," + minscore); TopDocs docs = searcher.search(q, terms); System.out.println(docs.totalHits); for (int i = 0; i < docs.totalHits; i++) System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score); } } */ }