TestSynonymTokenFilter.java example

Explorer
solrcene-master
package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

/**
 * Tests {@link SynonymTokenFilter} by analyzing short phrases with synonyms
 * loaded from the {@code testSynonyms.txt} classpath resource and checking
 * the terms, start/end offsets and position increments that come out.
 */
public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
  /** Name of the synonym resource, resolved relative to this class. */
  final String testFile = "testSynonyms.txt";

  /**
   * Analyzes a phrase with no limit on the number of expansions and expects
   * both synonyms of "woods" to be emitted at the same position and offsets
   * as the original term.
   */
  public void testSynonyms() throws Exception {
    /* all expansions */
    Analyzer analyzer = newAnalyzer(Integer.MAX_VALUE);
    assertAnalyzesTo(analyzer, "Lost in the woods",
        new String[] { "lost", "in", "the", "woods", "forest", "wood" },
        new int[] { 0, 5, 8, 12, 12, 12 },   // start offsets
        new int[] { 4, 7, 11, 17, 17, 17 },  // end offsets
        new int[] { 1, 1, 1, 1, 0, 0 });     // position increments
  }

  /**
   * Analyzes a single-word input with no limit on expansions; only the
   * emitted terms are checked.
   */
  public void testSynonymsSingleQuote() throws Exception {
    /* all expansions */
    Analyzer analyzer = newAnalyzer(Integer.MAX_VALUE);
    assertAnalyzesTo(analyzer, "king",
        new String[] { "king", "baron" });
  }

  /**
   * Analyzes a phrase while allowing at most one synonym per term.
   */
  public void testSynonymsLimitedAmount() throws Exception {
    /* limit to one synonym expansion */
    Analyzer analyzer = newAnalyzer(1);
    /* wood comes before forest due to
     * the input file, not lexicographic order
     */
    assertAnalyzesTo(analyzer, "Lost in the woods",
        new String[] { "lost", "in", "the", "woods", "wood" },
        new int[] { 0, 5, 8, 12, 12 },   // start offsets
        new int[] { 4, 7, 11, 17, 17 },  // end offsets
        new int[] { 1, 1, 1, 1, 0 });    // position increments
  }

  /**
   * Runs two different inputs through the same analyzer via the reuse
   * assertion, so the second input goes through an analyzer that has already
   * been used once.
   */
  public void testReusableTokenStream() throws Exception {
    /* limit to one synonym expansion */
    Analyzer analyzer = newAnalyzer(1);
    assertAnalyzesToReuse(analyzer, "Lost in the woods",
        new String[] { "lost", "in", "the", "woods", "wood" },
        new int[] { 0, 5, 8, 12, 12 },
        new int[] { 4, 7, 11, 17, 17 },
        new int[] { 1, 1, 1, 1, 0 });
    assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
        new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
          "the", "forest", "woods" },
        new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
        new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
        new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
  }

  /**
   * Builds the analyzer under test on top of a freshly loaded synonym map.
   *
   * @param maxSynonyms value handed to {@link SynonymTokenFilter} as its
   *        synonym limit
   * @return a new analyzer backed by the test synonym resource
   * @throws Exception if the synonym resource is missing or cannot be loaded
   */
  private Analyzer newAnalyzer(int maxSynonyms) throws Exception {
    return new SynonymWhitespaceAnalyzer(loadSynonyms(), maxSynonyms);
  }

  /**
   * Loads the synonym map from the {@link #testFile} classpath resource.
   *
   * @return the synonym map built from the resource
   * @throws Exception if the resource is not on the classpath, or if
   *         {@link SynonymMap} fails to read it
   */
  private SynonymMap loadSynonyms() throws Exception {
    InputStream in = getClass().getResourceAsStream(testFile);
    if (in == null) {
      // getResourceAsStream signals "not found" with null; fail here with a
      // clear message instead of handing a null stream to SynonymMap.
      throw new IOException("synonym resource not found on classpath: " + testFile);
    }
    // NOTE(review): the stream is not closed here, matching the original
    // tests; presumably SynonymMap consumes and closes it - confirm.
    return new SynonymMap(in);
  }

  /**
   * Analyzer that tokenizes with {@link MockTokenizer} in
   * {@code WHITESPACE} mode and then applies {@link SynonymTokenFilter}.
   * It is static because it never touches the enclosing test instance.
   */
  private static final class SynonymWhitespaceAnalyzer extends Analyzer {
    private final SynonymMap synonyms;
    private final int maxSynonyms;

    /**
     * @param synonyms synonym map passed to the filter
     * @param maxSynonyms synonym limit passed to the filter
     */
    public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
      this.synonyms = synonyms;
      this.maxSynonyms = maxSynonyms;
    }

    /** Creates a brand-new tokenizer + synonym filter chain for the reader. */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new SynonymTokenFilter(newTokenizer(reader), synonyms, maxSynonyms);
    }

    /** Holder for the tokenizer/filter pair kept between reuse calls. */
    private static final class SavedStreams {
      Tokenizer source;
      TokenStream result;
    }

    /**
     * Returns the saved filter chain, creating and saving it when no previous
     * chain exists, and otherwise re-pointing the saved tokenizer at the new
     * reader and resetting the filter.
     */
    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader)
        throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        // Nothing saved yet: build the chain and save it for later calls.
        streams = new SavedStreams();
        streams.source = newTokenizer(reader);
        streams.result = new SynonymTokenFilter(streams.source, synonyms, maxSynonyms);
        setPreviousTokenStream(streams);
      } else {
        streams.source.reset(reader);
        streams.result.reset(); // reset the SynonymTokenFilter
      }
      return streams.result;
    }

    /**
     * Single place where the tokenizer is configured, shared by both stream
     * factories. The {@code true} flag presumably enables lower-casing (the
     * tests expect lower-cased terms) - confirm against MockTokenizer.
     */
    private Tokenizer newTokenizer(Reader reader) {
      return new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
    }
  }

}