package org.apache.lucene.analysis.synonym;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory.SynonymParser;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.CharsRef;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

public class TestNewMultiWordSynonyms extends BaseTokenStreamTestCase {

  private StringMockResourceLoader getSyn() {
    return new StringMockResourceLoader(
        "hubble\0space\0telescope,HST,hs telescope\n" +
        "foo\0bar,foo ba,fu ba,foobar\n" +
        "foo\0baz,fu ba");
  }

  private StringMockResourceLoader getSemicolonSingleSyn() {
    return new StringMockResourceLoader(
        "žščřdťň, á;zscrdtn, a\n" +
        "fůů, bar => foo, bar; fuu, bar\n" +
        "ADAMŠuk, m; ADAMGuk, m;ADAMČuk, m\n");
  }

  private StringMockResourceLoader getSolrSingleSyn() {
    return new StringMockResourceLoader(
        "žščřdťň\\,\\ á,zscrdtn\\,\\ a\n" +
        "fůů\\,\\ bar => foo\\,\\ bar, fuu\\,\\ bar\n");
  }

  String O = TypeAttribute.DEFAULT_TYPE;
  String S = SynonymFilter.TYPE_SYNONYM;

  public void testSingleWordSolrSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", KeywordTokenizerFactory.class.getCanonicalName());

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSolrSingleSyn());

    TokenStream ts = factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts,
        new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] { 0, 0 },     // startOffset
        new int[] { 10, 10 },   // endOffset
        new String[] { S, S },  // type
        new int[] { 1, 0 }      // posIncr
    );
  }

  public void testSingleWordSemicolonSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("format", "semicolon");
    args.put("tokenizerFactory", KeywordTokenizerFactory.class.getCanonicalName());

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSemicolonSingleSyn());

    TokenStream ts = factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts,
        new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] { 0, 0 },     // startOffset
        new int[] { 10, 10 },   // endOffset
        new String[] { S, S },  // type
        new int[] { 1, 0 }      // posIncr
    );

    // a second stream created from the same factory must yield the same tokens
    ts =
        factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts,
        new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] { 0, 0 },     // startOffset
        new int[] { 10, 10 },   // endOffset
        new String[] { S, S },  // type
        new int[] { 1, 0 }      // posIncr
    );
  }

  /*
   * This parser is useful if you want to index multi-token synonyms (as one token)
   * as well as their components. E.g. "hubble space telescope was..." will be
   * indexed as
   *   0: hubble|hubble space telescope|HST
   *   1: space
   *   2: telescope
   */
  public static class TestParserReplaceNullsInclOrig extends NewSynonymFilterFactory.SynonymBuilderFactory {

    public TestParserReplaceNullsInclOrig(Map<String,String> args) {
      super(args);
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      return new NewSolrSynonymParser(true, true, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          super.add(input, NewSynonymFilterFactory.replaceNulls(output), true);
        }
      };
    }
  }

  /**
   * @since solr 1.4
   */
  public void testMultiWordSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(new StringMockResourceLoader("a b c,d"));

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("a e")));
    // This fails because ["e","e"] is the value of the token stream
    assertTokenStreamContents(ts, new String[] { "a", "e" });
  }

  public void testMultiWordSynonymsReplaceNullsCustomInclOrigAnalyzer() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", NewSynonymFilterFactory.BestEffortSearchLowercase.class.getName());

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts,
        new String[] { "foo", "hubble", "hubble space telescope", "HST", "hs telescope", "space", "telescope" },
        new int[] { 0, 4, 4, 4, 4, 11, 17 },      // startOffset
        new int[] { 3, 10, 26, 26, 26, 16, 26 },  // endOffset
        new String[] { O, O, S, S, S, O, O },     // type
        new int[] { 1, 1, 0, 0, 0, 1, 1 }         // posIncr
    );

    // test ignoreCase=true
    ts = factory.create(whitespaceMockTokenizer(new StringReader("hst")));
    assertTokenStreamContents(ts,
        new String[] { "hubble space telescope", "HST", "hs telescope" },
        new int[] { 0, 0, 0 },
        new int[] { 3, 3, 3 },
        new String[] { S, S, S },
        new int[] { 1, 0, 0 }
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foo", "foo bar", "foo ba", "fu ba", "foobar", "bar" },
        new int[] { 0, 5, 5, 5, 5, 5, 9 },       // startOffset
        new int[] { 4, 8, 12, 12, 12, 12, 12 },  // endOffset
        new String[] { O, O, S, S, S, S, O },    // type
        new int[] { 1, 1, 0, 0, 0, 0, 1 }        // posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar" },
        new int[] { 0, 5, 5, 5, 5, 5 },       // startOffset
        new int[] { 4, 11, 11, 11, 11, 11 },  // endOffset
        new String[] { O, S, S, S, S, S },    // type
        new int[] { 1, 1, 0, 0, 0, 1 }        // posIncr
    );
  }

  public void testMultiWordSynonymsReplaceNullsInclOrig() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("ignoreCase",
"true"); args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory"); args.put("builderFactory", TestParserReplaceNullsInclOrig.class.getName()); NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args); factory.inform(getSyn()); TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope"))); assertTokenStreamContents(ts, new String[] { "foo", "hubble", "hubble space telescope", "hst", "hs telescope", "space", "telescope" }, new int[] {0, 4, 4, 4, 4,11,17}, //startOffset new int[] {3,10,26,26,26,16,26}, //endOffset new String[] {O, O, S, S, S, O, O}, //type new int[] {1, 1, 0, 0, 0, 1, 1} //posIncr ); ts = factory.create(whitespaceMockTokenizer(new StringReader("hst"))); assertTokenStreamContents(ts, new String[] { "hst", "hubble space telescope", "hst", "hs telescope"}, new int[] {0, 0, 0, 0}, new int[] {3, 3, 3, 3}, new String[] {O, S, S, S}, new int[] {1, 0, 0, 0} ); ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar"))); assertTokenStreamContents(ts, new String[] { "some", "foo", "foo bar", "foo ba", "fu ba", "foobar", "bar" }, new int[] {0, 5, 5, 5, 5, 5, 9}, //startOffset new int[] {4, 8,12,12,12,12,12}, //endOffset new String[] {O, O, S, S, S, S, O}, //type new int[] {1, 1, 0, 0, 0, 0, 1} //posIncr ); ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar"))); assertTokenStreamContents(ts, new String[] { "some", "foobar", "foo bar", "foo ba", "fu ba", "foobar"}, new int[] {0, 5, 5, 5, 5, 5, 5}, //startOffset new int[] {4,11,11,11,11,11,11}, //endOffset new String[] {O, O, S, S, S, S, S}, //type new int[] {1, 1, 0, 0, 0, 0, 1} //posIncr ); } public void testMultiWordSynonymsNullReplaced() throws IOException { Map<String,String> args = new HashMap<String,String>(); args.put("synonyms", "synonyms.txt"); args.put("ignoreCase", "false"); args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory"); args.put("builderFactory", NewSynonymFilterFactory.MultiTokenReplaceNulls.class.getName()); NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args); factory.inform(getSyn()); TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope"))); assertTokenStreamContents(ts, new String[] { "foo", "hubble space telescope", "HST", "hs telescope" }, new int[] {0, 4, 4, 4}, //startOffset new int[] {3,26,26,26}, //endOffset new String[] {O, S, S, S}, //type new int[] {1, 1, 0, 0} //posIncr ); ts = factory.create(whitespaceMockTokenizer(new StringReader("HST"))); assertTokenStreamContents(ts, new String[] { "hubble space telescope", "HST", "hs telescope"}, new int[] {0, 0, 0}, new int[] {3, 3, 3}, new String[] {S, S, S}, new int[] {1, 0, 0} ); ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar"))); assertTokenStreamContents(ts, new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar" }, new int[] {0, 5, 5, 5, 5}, //startOffset new int[] {4,12,12,12,12}, //endOffset new String[] {O, S, S, S, S}, //type new int[] {1, 1, 0, 0, 0} //posIncr ); ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar"))); assertTokenStreamContents(ts, new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar"}, new int[] {0, 5, 5, 5, 5, 5}, //startOffset new int[] {4,11,11,11,11,11}, //endOffset new String[] {O, S, S, S, S, S}, //type new int[] {1, 1, 0, 0, 0, 1} //posIncr ); } public void testMultiWordSynonymsDefault() throws IOException { Map<String,String> args = new 
        HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts,
        new String[] { "foo", "hubble", "HST", "hs telescope", "space", "telescope" },
        new int[] { 0, 4, 4, 4, 11, 17 },     // startOffset
        new int[] { 3, 10, 26, 26, 16, 26 },  // endOffset
        new String[] { O, S, S, S, S, S },    // type
        new int[] { 1, 1, 0, 0, 1, 1 }        // posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("HST")));
    assertTokenStreamContents(ts,
        new String[] { "hubble", "HST", "hs telescope", "space", "telescope" },
        new int[] { 0, 0, 0, 0, 0 },
        new int[] { 3, 3, 3, 3, 3 },
        new String[] { S, S, S, S, S },
        new int[] { 1, 0, 0, 1, 1 }
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foo", "foo ba", "fu ba", "foobar", "bar" },
        new int[] { 0, 5, 5, 5, 5, 9 },      // startOffset
        new int[] { 4, 8, 12, 12, 12, 12 },  // endOffset
        new String[] { O, S, S, S, S, S },   // type
        new int[] { 1, 1, 0, 0, 0, 1 }       // posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foo", "foo ba", "fu ba", "foobar", "bar" },
        new int[] { 0, 5, 5, 5, 5, 5 },       // startOffset
        new int[] { 4, 11, 11, 11, 11, 11 },  // endOffset
        new String[] { O, S, S, S, S, S },    // type
        new int[] { 1, 1, 0, 0, 0, 1 }        // posIncr
    );
  }

  /*
   * The default behaviour but the original tokens are emitted
   * before the synonyms
   */
  public void testMultiWordSynonymsInclOrig() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("ignoreCase", "true");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", NewSynonymFilterFactory.AlwaysIncludeOriginal.class.getName());

    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts,
        new String[] { "foo", "hubble", "hubble", "hst", "hs telescope", "space", "space", "telescope", "telescope" },
        new int[] { 0, 4, 4, 4, 4, 11, 11, 17, 17 },      // startOffset
        new int[] { 3, 10, 10, 26, 26, 16, 16, 26, 26 },  // endOffset
        new String[] { O, O, S, S, S, O, S, O, S },       // type
        new int[] { 1, 1, 0, 0, 0, 1, 0, 1, 0 }           // posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("hst")));
    assertTokenStreamContents(ts,
        new String[] { "hst", "hubble", "hst", "hs telescope", "space", "telescope" },
        new int[] { 0, 0, 0, 0, 0, 0 },
        new int[] { 3, 3, 3, 3, 3, 3 },
        new String[] { O, S, S, S, S, S },
        new int[] { 1, 0, 0, 0, 1, 1 }
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foo", "foo", "foo ba", "fu ba", "foobar", "bar", "bar" },
        new int[] { 0, 5, 5, 5, 5, 5, 9, 9 },       // startOffset
        new int[] { 4, 8, 8, 12, 12, 12, 12, 12 },  // endOffset
        new String[] { O, O, S, S, S, S, O, S },    // type
        new int[] { 1, 1, 0, 0, 0, 0, 1, 0 }        // posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts,
        new String[] { "some", "foobar", "foo", "foo ba", "fu ba", "foobar", "bar" },
        new int[] { 0, 5, 5, 5, 5, 5, 5 },  // startOffset
        new int[]
            { 4, 11, 11, 11, 11, 11, 11 },        // endOffset
        new String[] { O, O, S, S, S, S, S },     // type
        new int[] { 1, 1, 0, 0, 0, 0, 1 }         // posIncr
    );
  }
}

class StringMockResourceLoader implements ResourceLoader {

  String text;

  public StringMockResourceLoader(String text) {
    this.text = text;
  }

  public <T> T newInstance(String cname, Class<T> expectedType) {
    try {
      Class<? extends T> clazz = Class.forName(cname).asSubclass(expectedType);
      return clazz.newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public InputStream openResource(String resource) throws IOException {
    return new ByteArrayInputStream(text.getBytes("UTF-8"));
  }

  @Override
  public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
    try {
      return Class.forName(cname, true, Thread.currentThread().getContextClassLoader()).asSubclass(expectedType);
    } catch (Exception e) {
      throw new RuntimeException("Cannot load class: " + cname, e);
    }
  }
}