TestFuzzyQuery2.java example

Explorer
solrcene-master
package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

/** 
 * Tests the results of fuzzy against pre-recorded output 
 * The format of the file is the following:
 * 
 * Header Row: # of bits: generate 2^n sequential documents 
 * with a value of Integer.toBinaryString
 * 
 * Entries: an entry is a param spec line, a resultCount line, and
 * then 'resultCount' results lines. The results lines are in the
 * expected order.
 * 
 * param spec line: a comma-separated list of params to FuzzyQuery
 *   (query, prefixLen, pqSize, minScore)
 * query = query text as a number (expand with Integer.toBinaryString)
 * prefixLen = prefix length
 * pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite
 * minScore = minimum similarity
 * 
 * resultCount line: total number of expected hits.
 * 
 * results line: comma-separated docID, score pair
 **/
public class TestFuzzyQuery2 extends LuceneTestCase {
  /** epsilon for score comparisons */
  static final float epsilon = 0.00001f;
  private Random random;
  
  @Override
  public void setUp() throws Exception {
    super.setUp();
    random = newRandom();
  }

  public void testFromTestData() throws Exception {
    // TODO: randomize!
    assertFromTestData(new int[] { 0x40, 0x41 });
    assertFromTestData(new int[] { 0x40, 0x0195 });
    assertFromTestData(new int[] { 0x40, 0x0906 });
    assertFromTestData(new int[] { 0x40, 0x1040F });
    assertFromTestData(new int[] { 0x0194, 0x0195 });
    assertFromTestData(new int[] { 0x0194, 0x0906 });
    assertFromTestData(new int[] { 0x0194, 0x1040F });
    assertFromTestData(new int[] { 0x0905, 0x0906 });
    assertFromTestData(new int[] { 0x0905, 0x1040F });
    assertFromTestData(new int[] { 0x1040E, 0x1040F });
  }

  public void assertFromTestData(int codePointTable[]) throws Exception {
    InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    
    int bits = Integer.parseInt(reader.readLine());
    int terms = (int) Math.pow(2, bits);
    
    Directory dir = newDirectory(random);
    RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false));
    
    Document doc = new Document();
    Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
    doc.add(field);
    
    for (int i = 0; i < terms; i++) {
      field.setValue(mapInt(codePointTable, i));
      writer.addDocument(doc);
    }   
    
    IndexReader r = writer.getReader();
    IndexSearcher searcher = new IndexSearcher(r);
    writer.close();
    String line;
    while ((line = reader.readLine()) != null) {
      String params[] = line.split(",");
      String query = mapInt(codePointTable, Integer.parseInt(params[0]));
      int prefix = Integer.parseInt(params[1]);
      int pqSize = Integer.parseInt(params[2]);
      float minScore = Float.parseFloat(params[3]);
      FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix);
      q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
      int expectedResults = Integer.parseInt(reader.readLine());
      TopDocs docs = searcher.search(q, expectedResults);
      assertEquals(expectedResults, docs.totalHits);
      for (int i = 0; i < expectedResults; i++) {
        String scoreDoc[] = reader.readLine().split(",");
        assertEquals(Integer.parseInt(scoreDoc[0]), docs.scoreDocs[i].doc);
        assertEquals(Float.parseFloat(scoreDoc[1]), docs.scoreDocs[i].score, epsilon);
      }
    }
    searcher.close();
    r.close();
    dir.close();
  }
  
  /* map bits to unicode codepoints */
  private static String mapInt(int codePointTable[], int i) {
    StringBuilder sb = new StringBuilder();
    String binary = Integer.toBinaryString(i);
    for (int j = 0; j < binary.length(); j++)
      sb.appendCodePoint(codePointTable[binary.charAt(j) - '0']);
    return sb.toString();
  }

  /* Code to generate test data
  public static void main(String args[]) throws Exception {
    int bits = 3;
    System.out.println(bits);
    int terms = (int) Math.pow(2, bits);
    
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
        IndexWriter.MaxFieldLength.UNLIMITED);
    
    Document doc = new Document();
    Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
    doc.add(field);

    for (int i = 0; i < terms; i++) {
      field.setValue(Integer.toBinaryString(i));
      writer.addDocument(doc);
    }
    
    writer.optimize();
    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    for (int prefix = 0; prefix < bits; prefix++)
      for (int pqsize = 1; pqsize <= terms; pqsize++)
        for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F)
          for (int query = 0; query < terms; query++) {
            FuzzyQuery q = new FuzzyQuery(
                new Term("field", Integer.toBinaryString(query)), minscore, prefix);
            q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize));
            System.out.println(query + "," + prefix + "," + pqsize + "," + minscore);
            TopDocs docs = searcher.search(q, terms);
            System.out.println(docs.totalHits);
            for (int i = 0; i < docs.totalHits; i++)
              System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score);
          }
  }
  */
}