package org.xbib.elasticsearch.index.analysis.worddelimiter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Test;
import org.xbib.elasticsearch.index.analysis.BaseTokenStreamTest;
import org.xbib.elasticsearch.index.analysis.MockTokenizer;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import static org.xbib.elasticsearch.MapperTestUtils.tokenFilterFactory;
import static org.xbib.elasticsearch.MapperTestUtils.tokenizerFactory;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.ALL_PARTS_AT_SAME_POSITION;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.CATENATE_ALL;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.CATENATE_WORDS;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.GENERATE_NUMBER_PARTS;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.GENERATE_WORD_PARTS;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.SPLIT_ON_CASE_CHANGE;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.SPLIT_ON_NUMERICS;
import static org.xbib.elasticsearch.index.analysis.worddelimiter.WordDelimiterFilter2.STEM_ENGLISH_POSSESSIVE;
/**
 * Tests for {@link WordDelimiterFilter2}: token splitting, catenation,
 * possessive stemming, and position increment handling.
 */
public class WordDelimiterFilter2Tests extends BaseTokenStreamTest {
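/**
 * Split parts ("foo", "bar") and the catenated form ("foobar") must all carry
 * offsets that point back into the original input "foo-bar".
 */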
@Test
public void testOffsets() throws IOException {
String resource = "org/xbib/elasticsearch/index/analysis/worddelimiter/worddelimiter.json";
Tokenizer tokenizer = tokenizerFactory(resource, "keyword").create();
tokenizer.setReader(new StringReader("foo-bar"));
TokenStream ts = tokenFilterFactory(resource, "wd").create(tokenizer);
assertTokenStreamContents(ts,
new String[]{"foo", "bar", "foobar"},
new int[]{0, 4, 0},
new int[]{3, 7, 7},
null, null, null, null, false);
}
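/**
 * A token that contains no delimiters must pass through with its offsets unchanged.
 */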
@Test
public void testOffsetChange() throws Exception {
String resource = "org/xbib/elasticsearch/index/analysis/worddelimiter/worddelimiter.json";
Tokenizer tokenizer = tokenizerFactory(resource, "keyword").create();
tokenizer.setReader(new StringReader("übelkeit"));
TokenStream ts = tokenFilterFactory(resource, "wd").create(tokenizer);
assertTokenStreamContents(ts,
new String[]{"übelkeit" },
new int[]{0},
new int[]{8});
}
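/**
 * Runs the input through a keyword tokenizer and a {@link WordDelimiterFilter2}
 * configured with the standard split flags, and asserts the expected parts.
 */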
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
tokenizer.setReader(new StringReader(input));
WordDelimiterFilter2 wdf = new WordDelimiterFilter2(tokenizer, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
}
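/**
 * Splitting must respect Unicode categories: delimiters and case changes split,
 * while combining marks, modifier letters, non-ASCII digits, and surrogate
 * pairs are kept intact.
 */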
@Test
public void testSplits() throws Exception {
doSplit("basic-split", "basic", "split");
doSplit("camelCase", "camel", "Case");
// non-space marking symbol shouldn't cause split
// this is an example in Thai
doSplit("\u0e1a\u0e49\u0e32\u0e19", "\u0e1a\u0e49\u0e32\u0e19");
// possessive followed by delimiter
doSplit("test's'", "test");
// some russian upper and lowercase
doSplit("Роберт", "Роберт");
// now cause a split (russian camelCase)
doSplit("РобЕрт", "Роб", "Ерт");
// a composed titlecase character, don't split
doSplit("aDžungla", "aDžungla");
// a modifier letter, don't split
doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
// enclosing mark, don't split
doSplit("test⃝", "test⃝");
// combining spacing mark (the virama), don't split
doSplit("हिन्दी", "हिन्दी");
// don't split non-ascii digits
doSplit("١٢٣٤", "١٢٣٤");
// don't split supplementaries into unpaired surrogates
doSplit("𠀀𠀀", "𠀀𠀀");
}
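/**
 * Like {@link #doSplit}, but with STEM_ENGLISH_POSSESSIVE toggled by the first
 * argument (1 = stem the trailing "'s", 0 = treat the quote as a plain delimiter).
 */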
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
tokenizer.setReader(new StringReader(input));
WordDelimiterFilter2 wdf = new WordDelimiterFilter2(tokenizer, flags, null);
assertTokenStreamContents(wdf, output);
}
/**
 * Tests the option that disables the special "'s" stemming, treating the
 * single quote like any other delimiter instead.
 */
@Test
public void testPossessives() throws Exception {
doSplitPossessive(1, "ra's", "ra");
doSplitPossessive(0, "ra's", "ra", "s");
}
/**
 * Sets a position increment of 10 on tokens equal to "largegap" or "/".
 */
private final class LargePosIncTokenFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
protected LargePosIncTokenFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String term = termAtt.toString();
if (term.equals("largegap") || term.equals("/")) {
posIncAtt.setPositionIncrement(10);
}
return true;
} else {
return false;
}
}
}
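/**
 * Checks how position increments are assigned when delimiter-only tokens are
 * dropped, including tokens given a large increment by
 * {@link LargePosIncTokenFilter} and gaps introduced by a stop filter.
 */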
@Test
public void testPositionIncrements() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final Set<String> protWords = new HashSet<>(Collections.singletonList("NUTCH"));
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(
tokenizer,
flags, protWords));
}
};
/* the "/" is a pure delimiter token and is dropped without increasing the increment of "SOLR" */
assertAnalyzesTo(a, "LUCENE / SOLR", new String[]{"LUCENE", "SOLR" },
new int[]{0, 9},
new int[]{6, 13},
null,
new int[]{1, 1},
null,
false);
/* "solR" splits on case change; the dropped "/" does not add to the increment of "sol", and the catenated "solR" stacks at the position of the last part */
assertAnalyzesTo(a, "LUCENE / solR", new String[]{"LUCENE", "sol", "R", "solR" },
new int[]{0, 9, 12, 9},
new int[]{6, 12, 13, 13},
null,
new int[]{1, 1, 1, 0},
null,
false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[]{"LUCENE", "NUTCH", "SOLR" },
new int[]{0, 9, 15},
new int[]{6, 14, 19},
null,
new int[]{1, 1, 1},
null,
false);
assertAnalyzesTo(a, "LUCENE4.0.0", new String[]{"LUCENE", "4", "0", "0", "LUCENE400" },
new int[]{0, 6, 8, 10, 0},
new int[]{6, 7, 9, 11, 11},
null,
new int[]{1, 1, 1, 1, 0},
null,
false);
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
};
/* increment of "largegap" is preserved */
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[]{"LUCENE", "largegap", "SOLR" },
new int[]{0, 7, 16},
new int[]{6, 15, 20},
null,
new int[]{1, 10, 1},
null,
false);
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[]{"LUCENE", "SOLR" },
new int[]{0, 9},
new int[]{6, 13},
null,
new int[]{1, 11},
null,
false);
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[]{"LUCENE", "sol", "R", "solR" },
new int[]{0, 9, 12, 9},
new int[]{6, 12, 13, 13},
null,
new int[]{1, 11, 1, 0},
null,
false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[]{"LUCENE", "NUTCH", "SOLR" },
new int[]{0, 9, 15},
new int[]{6, 14, 19},
null,
new int[]{1, 11, 1},
null,
false);
Analyzer a3 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
// position increments are always enabled in current Lucene StopFilter versions
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr",
new String[]{"lucene", "solr", "lucenesolr" },
new int[]{0, 7, 0},
new int[]{6, 11, 11},
null,
new int[]{1, 1, 0},
null,
false);
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
new String[]{"lucene", "solr", "lucenesolr" },
new int[]{4, 11, 4},
new int[]{10, 15, 15},
null,
new int[]{2, 1, 0},
null,
false);
final int flags4 = flags | CATENATE_WORDS;
Analyzer a4 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(filter, flags4, protWords));
}
};
assertAnalyzesTo(a4, "LUCENE4.0.0", new String[]{"LUCENE", "4", "0", "0", "LUCENE400" },
new int[]{0, 6, 8, 10, 0},
new int[]{6, 7, 9, 11, 11},
null,
new int[]{1, 1, 1, 1, 0},
null,
false);
}
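/**
 * Same scenarios as {@link #testPositionIncrements()}, but with
 * ALL_PARTS_AT_SAME_POSITION set, so all parts and catenations of a split
 * token stack at a single position.
 */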
@Test
public void testPositionIncrementsCollapsePositions() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | ALL_PARTS_AT_SAME_POSITION;
final Set<String> protWords = new HashSet<>(Collections.singletonList("NUTCH"));
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(
tokenizer,
flags, protWords));
}
};
/* the "/" is a pure delimiter token and is dropped without increasing the increment of "SOLR" */
assertAnalyzesTo(a, "LUCENE / SOLR", new String[]{"LUCENE", "SOLR" },
new int[]{0, 9},
new int[]{6, 13},
null,
new int[]{1, 1});
/* with ALL_PARTS_AT_SAME_POSITION, "R" and the catenated "solR" stack at the position of "sol" */
assertAnalyzesTo(a, "LUCENE / solR", new String[]{"LUCENE", "sol", "R", "solR" },
new int[]{0, 9, 12, 9},
new int[]{6, 12, 13, 13},
null,
new int[]{1, 1, 0, 0},
null,
false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[]{"LUCENE", "NUTCH", "SOLR" },
new int[]{0, 9, 15},
new int[]{6, 14, 19},
null,
new int[]{1, 1, 1});
assertAnalyzesTo(a, "LUCENE4.0.0", new String[]{"LUCENE", "4", "0", "0", "LUCENE400" },
new int[]{0, 6, 8, 10, 0},
new int[]{6, 7, 9, 11, 11},
null,
new int[]{1, 0, 0, 0, 0},
null,
false);
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
};
/* increment of "largegap" is preserved */
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[]{"LUCENE", "largegap", "SOLR" },
new int[]{0, 7, 16},
new int[]{6, 15, 20},
null,
new int[]{1, 10, 1});
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[]{"LUCENE", "SOLR" },
new int[]{0, 9},
new int[]{6, 13},
null,
new int[]{1, 11});
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[]{"LUCENE", "sol", "R", "solR" },
new int[]{0, 9, 12, 9},
new int[]{6, 12, 13, 13},
null,
new int[]{1, 11, 0, 0},
null,
false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[]{"LUCENE", "NUTCH", "SOLR" },
new int[]{0, 9, 15},
new int[]{6, 14, 19},
null,
new int[]{1, 11, 1});
Analyzer a3 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr",
new String[]{"lucene", "solr", "lucenesolr" },
new int[]{0, 7, 0},
new int[]{6, 11, 11},
null,
new int[]{1, 0, 0},
null,
false);
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
new String[]{"lucene", "solr", "lucenesolr" },
new int[]{4, 11, 4},
new int[]{10, 15, 15},
null,
new int[]{2, 0, 0},
null,
false);
final int flags4 = flags | CATENATE_WORDS;
Analyzer a4 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter2(filter, flags4, protWords));
}
};
assertAnalyzesTo(a4, "LUCENE4.0.0", new String[]{"LUCENE", "4", "0", "0", "LUCENE400" },
new int[]{0, 6, 8, 10, 0},
new int[]{6, 7, 9, 11, 11},
null,
new int[]{1, 0, 0, 0, 0},
null,
false);
}
}