TestSynonymFilter.java example

Explorer
solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.synonym;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

/**
 * @version $Id: TestSynonymFilter.java 950008 2010-06-01 10:35:13Z rmuir $
 */
public class TestSynonymFilter extends BaseTokenStreamTestCase {

  static List<String> strings(String str) {
    String[] arr = str.split(" ");
    return Arrays.asList(arr);
  }

  static void assertTokenizesTo(SynonymMap dict, String input,
      String expected[]) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected);
  }
  
  static void assertTokenizesTo(SynonymMap dict, String input,
      String expected[], int posIncs[]) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, posIncs);
  }
  
  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
      String expected[], int posIncs[])
      throws IOException {
    TokenStream tokenizer = new IterTokenStream(input);
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, posIncs);
  }
  
  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
      String expected[], int startOffsets[], int endOffsets[], int posIncs[])
      throws IOException {
    TokenStream tokenizer = new IterTokenStream(input);
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
        posIncs);
  }
  
  public void testMatching() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = false;
    boolean merge = true;
    map.add(strings("a b"), tokens("ab"), orig, merge);
    map.add(strings("a c"), tokens("ac"), orig, merge);
    map.add(strings("a"), tokens("aa"), orig, merge);
    map.add(strings("b"), tokens("bb"), orig, merge);
    map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
    map.add(strings("x c"), tokens("xc"), orig, merge);

    assertTokenizesTo(map, "$", new String[] { "$" });
    assertTokenizesTo(map, "a", new String[] { "aa" });
    assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
    assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
    assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
    assertTokenizesTo(map, "b", new String[] { "bb" });
    assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
    assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });

    // repeats
    map.add(strings("a b"), tokens("ab"), orig, merge);
    map.add(strings("a b"), tokens("ab"), orig, merge);
    
    // FIXME: the below test intended to be { "ab" }
    assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab"  });

    // check for lack of recursion
    map.add(strings("zoo"), tokens("zoo"), orig, merge);
    assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
    map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
    // FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
    // maybe this was just a typo in the old test????
    assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
  }

  public void testIncludeOrig() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = true;
    boolean merge = true;
    map.add(strings("a b"), tokens("ab"), orig, merge);
    map.add(strings("a c"), tokens("ac"), orig, merge);
    map.add(strings("a"), tokens("aa"), orig, merge);
    map.add(strings("b"), tokens("bb"), orig, merge);
    map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
    map.add(strings("x c"), tokens("xc"), orig, merge);

    assertTokenizesTo(map, "$", 
        new String[] { "$" },
        new int[] { 1 });
    assertTokenizesTo(map, "a", 
        new String[] { "a", "aa" },
        new int[] { 1, 0 });
    assertTokenizesTo(map, "a", 
        new String[] { "a", "aa" },
        new int[] { 1, 0 });
    assertTokenizesTo(map, "$ a", 
        new String[] { "$", "a", "aa" },
        new int[] { 1, 1, 0 });
    assertTokenizesTo(map, "a $", 
        new String[] { "a", "aa", "$" },
        new int[] { 1, 0, 1 });
    assertTokenizesTo(map, "$ a !", 
        new String[] { "$", "a", "aa", "!" },
        new int[] { 1, 1, 0, 1 });
    assertTokenizesTo(map, "a a", 
        new String[] { "a", "aa", "a", "aa" },
        new int[] { 1, 0, 1, 0 });
    assertTokenizesTo(map, "b", 
        new String[] { "b", "bb" },
        new int[] { 1, 0 });
    assertTokenizesTo(map, "z x c v",
        new String[] { "z", "zxcv", "x", "c", "v" },
        new int[] { 1, 0, 1, 1, 1 });
    assertTokenizesTo(map, "z x c $",
        new String[] { "z", "x", "xc", "c", "$" },
        new int[] { 1, 1, 0, 1, 1 });

    // check for lack of recursion
    map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
    // CHECKME: I think the previous test (with 4 zoo's), was just a typo.
    assertTokenizesTo(map, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
        new int[] { 1, 0, 1, 1, 1 });

    map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
    assertTokenizesTo(map, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
        new int[] { 1, 0, 1, 1, 1, 0, 1 });
  }


  public void testMapMerge() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = false;
    boolean merge = true;
    map.add(strings("a"), tokens("a5,5"), orig, merge);
    map.add(strings("a"), tokens("a3,3"), orig, merge);

    assertTokenizesTo(map, "a",
        new String[] { "a3", "a5" },
        new int[] { 1, 2 });

    map.add(strings("b"), tokens("b3,3"), orig, merge);
    map.add(strings("b"), tokens("b5,5"), orig, merge);

    assertTokenizesTo(map, "b",
        new String[] { "b3", "b5" },
        new int[] { 1, 2 });

    map.add(strings("a"), tokens("A3,3"), orig, merge);
    map.add(strings("a"), tokens("A5,5"), orig, merge);
    
    assertTokenizesTo(map, "a",
        new String[] { "a3", "A3", "a5", "A5" },
        new int[] { 1, 0, 2, 0 });

    map.add(strings("a"), tokens("a1"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a1", "a3", "A3", "a5", "A5" },
        new int[] { 1, 2, 0, 2, 0 });

    map.add(strings("a"), tokens("a2,2"), orig, merge);
    map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
        new int[] { 1, 1, 1, 0, 1, 1, 0, 1  });
  }


  public void testOverlap() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = false;
    boolean merge = true;
    map.add(strings("qwe"), tokens("qq/ww/ee"), orig, merge);
    map.add(strings("qwe"), tokens("xx"), orig, merge);
    map.add(strings("qwe"), tokens("yy"), orig, merge);
    map.add(strings("qwe"), tokens("zz"), orig, merge);
    assertTokenizesTo(map, "$", new String[] { "$" });
    assertTokenizesTo(map, "qwe",
        new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
        new int[] { 1, 0, 0, 0, 0, 0 });

    // test merging within the map

    map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
    map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
        new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
  }

  public void testPositionIncrements() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = false;
    boolean merge = true;

    // test that generated tokens start at the same posInc as the original
    map.add(strings("a"), tokens("aa"), orig, merge);
    assertTokenizesTo(map, tokens("a,5"), 
        new String[] { "aa" },
        new int[] { 5 });
    assertTokenizesTo(map, tokens("a,0"),
        new String[] { "aa" },
        new int[] { 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"),
        new String[] { "bb" },
        new int[] { 5 });
    assertTokenizesTo(map, tokens("b,0"),
        new String[] { "bb" },
        new int[] { 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"),
        new String[] { "cc", "c2" },
        new int[] { 5, 2 });
    assertTokenizesTo(map, tokens("c,0"),
        new String[] { "cc", "c2" },
        new int[] { 0, 2 });
  }


  public void testPositionIncrementsWithOrig() throws IOException {
    SynonymMap map = new SynonymMap();

    boolean orig = true;
    boolean merge = true;

    // test that generated tokens start at the same offset as the original
    map.add(strings("a"), tokens("aa"), orig, merge);
    assertTokenizesTo(map, tokens("a,5"),
        new String[] { "a", "aa" },
        new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("a,0"),
        new String[] { "a", "aa" },
        new int[] { 0, 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"),
        new String[] { "b", "bb" },
        new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("b,0"),
        new String[] { "b", "bb" },
        new int[] { 0, 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"),
        new String[] { "c", "cc", "c2" },
        new int[] { 5, 0, 2 });
    assertTokenizesTo(map, tokens("c,0"),
        new String[] { "c", "cc", "c2" },
        new int[] { 0, 0, 2 });
  }


  public void testOffsetBug() throws IOException {
    // With the following rules:
    // a a=>b
    // x=>y
    // analysing "a x" causes "y" to have a bad offset (end less than start)
    // SOLR-167
    SynonymMap map = new SynonymMap();

    boolean orig = false;
    boolean merge = true;

    map.add(strings("a a"), tokens("b"), orig, merge);
    map.add(strings("x"), tokens("y"), orig, merge);

    // "a a x" => "b y"
    assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
        new String[] { "b", "y" },
        new int[] { 0, 4 },
        new int[] { 3, 5 },
        new int[] { 1, 1 });
  }

  
  /***
   * Return a list of tokens according to a test string format:
   * a b c  =>  returns List<Token> [a,b,c]
   * a/b   => tokens a and b share the same spot (b.positionIncrement=0)
   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
   * @deprecated does not support attributes api
   */
  private List<Token> tokens(String str) {
    String[] arr = str.split(" ");
    List<Token> result = new ArrayList<Token>();
    for (int i=0; i<arr.length; i++) {
      String[] toks = arr[i].split("/");
      String[] params = toks[0].split(",");

      int posInc;
      int start;
      int end;

      if (params.length > 1) {
        posInc = Integer.parseInt(params[1]);
      } else {
        posInc = 1;
      }

      if (params.length > 2) {
        start = Integer.parseInt(params[2]);
      } else {
        start = 0;
      }

      if (params.length > 3) {
        end = Integer.parseInt(params[3]);
      } else {
        end = start + params[0].length();
      }

      Token t = new Token(params[0],start,end,"TEST");
      t.setPositionIncrement(posInc);
      
      result.add(t);
      for (int j=1; j<toks.length; j++) {
        t = new Token(toks[j],0,0,"TEST");
        t.setPositionIncrement(0);
        result.add(t);
      }
    }
    return result;
  }
  
  /**
   * @deprecated does not support custom attributes
   */
  private static class IterTokenStream extends TokenStream {
    final Token tokens[];
    int index = 0;
    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    
    public IterTokenStream(Token... tokens) {
      super();
      this.tokens = tokens;
    }
    
    public IterTokenStream(Collection<Token> tokens) {
      this(tokens.toArray(new Token[tokens.size()]));
    }
    
    public boolean incrementToken() throws IOException {
      if (index >= tokens.length)
        return false;
      else {
        clearAttributes();
        Token token = tokens[index++];
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        posIncAtt.setPositionIncrement(token.getPositionIncrement());
        flagsAtt.setFlags(token.getFlags());
        typeAtt.setType(token.type());
        payloadAtt.setPayload(token.getPayload());
        return true;
      }
    }
  }
}