/*
* Copyright 2011 Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.es;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.WordDelimiterFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * Token filter factory that wraps a Solr {@link WordDelimiterFilter} configured
 * with a custom character type table and a fixed set of flags.
 *
 * <p>All flags are hard-coded in this class; the {@code settings} handed to the
 * constructor are only forwarded to the superclass.
 *
 * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
 */
public class JetwickFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Number of entries in the character type table built by
     * {@link #myCreate}. Only characters with a code below this value are
     * classified by this class.
     */
    private static final int CHAR_TABLE_SIZE = 256;

    // The values below are handed to WordDelimiterFilter unchanged; see that
    // class for the exact meaning of each flag.
    private final CharArraySet protectedWords = null;
    private final int generateWordParts = 1;
    private final int generateNumberParts = 1;
    private final int catenateWords = 0;
    private final int catenateNumbers = 0;
    private final int catenateAll = 0;
    private final int splitOnCaseChange = 0;
    private final int splitOnNumerics = 1;
    private final int preserveOriginal = 1;
    private final int stemEnglishPossessive = 0;

    /** Extra characters that are classified like lower case letters. */
    private final String handleAsChar = "";

    /**
     * Extra characters that are classified like digits.
     *
     * <p>NOTE(review): the table only covers codes 0-255, so of this set only
     * '@', '#' and '$' are actually consulted when the table is built. How the
     * remaining characters are classified is left to WordDelimiterFilter -
     * confirm whether that matches the intent.
     */
    private final String handleAsDigit = "@#$€₱č₤";

    /**
     * Creates the factory; all arguments are passed straight to the superclass.
     *
     * @param index the index this factory belongs to
     * @param indexSettings the index level settings
     * @param name the name assigned to this filter
     * @param settings the filter specific settings (not read by this class)
     */
    @Inject
    public JetwickFilterFactory(Index index, @IndexSettings Settings indexSettings,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
    }

    /**
     * Wraps the given stream using the defaults stored in this factory.
     *
     * @param tokenStream the stream to wrap
     * @return the result of {@link #myCreate} for this factory's defaults
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return myCreate(tokenStream, handleAsChar, handleAsDigit,
                generateWordParts, generateNumberParts,
                catenateWords, catenateNumbers, catenateAll,
                splitOnCaseChange, preserveOriginal,
                splitOnNumerics, stemEnglishPossessive, protectedWords);
    }

    /**
     * @return always the literal {@code "jetwickfilter"}
     */
    @Override
    public String name() {
        return "jetwickfilter";
    }

    /**
     * Builds a {@link WordDelimiterFilter} around {@code tokenStream} using a
     * freshly computed character type table.
     *
     * @param tokenStream the stream to wrap
     * @param handleAsChar characters to classify like lower case letters;
     *        {@code null} is treated as empty
     * @param handleAsDigit characters to classify like digits; {@code null} is
     *        treated as empty
     * @param generateWordParts forwarded to WordDelimiterFilter
     * @param generateNumberParts forwarded to WordDelimiterFilter
     * @param catenateWords forwarded to WordDelimiterFilter
     * @param catenateNumbers forwarded to WordDelimiterFilter
     * @param catenateAll forwarded to WordDelimiterFilter
     * @param splitOnCaseChange forwarded to WordDelimiterFilter
     * @param preserveOriginal forwarded to WordDelimiterFilter
     * @param splitOnNumerics forwarded to WordDelimiterFilter
     * @param stemEnglishPossessive forwarded to WordDelimiterFilter
     * @param protectedWords forwarded to WordDelimiterFilter
     * @return the new filter
     */
    public static TokenStream myCreate(TokenStream tokenStream, String handleAsChar, String handleAsDigit,
            int generateWordParts,
            int generateNumberParts,
            int catenateWords,
            int catenateNumbers,
            int catenateAll,
            int splitOnCaseChange,
            int preserveOriginal,
            int splitOnNumerics,
            int stemEnglishPossessive,
            CharArraySet protectedWords) {
        byte[] charTypeTable = buildCharTypeTable(handleAsChar, handleAsDigit);
        return new WordDelimiterFilter(tokenStream, charTypeTable,
                generateWordParts, generateNumberParts,
                catenateWords, catenateNumbers, catenateAll,
                splitOnCaseChange, preserveOriginal,
                splitOnNumerics, stemEnglishPossessive, protectedWords);
    }

    /**
     * Computes the type of every character with a code below
     * {@link #CHAR_TABLE_SIZE}.
     *
     * <p>The checks are ordered: a character listed in {@code handleAsChar}
     * becomes LOWER even if it is an upper case letter or a digit, and
     * {@code handleAsDigit} is only consulted for characters that are neither
     * lower nor upper case. Everything that matches no rule is a sub word
     * delimiter.
     */
    private static byte[] buildCharTypeTable(String handleAsChar, String handleAsDigit) {
        // A missing set simply means "no additional characters".
        String extraLetters = handleAsChar == null ? "" : handleAsChar;
        String extraDigits = handleAsDigit == null ? "" : handleAsDigit;

        byte[] table = new byte[CHAR_TABLE_SIZE];
        for (int ch = 0; ch < table.length; ch++) {
            byte type = 0;
            // indexOf(int) avoids allocating a one character String per lookup.
            if (Character.isLowerCase(ch) || extraLetters.indexOf(ch) >= 0) {
                type |= WordDelimiterFilter.LOWER;
            } else if (Character.isUpperCase(ch)) {
                type |= WordDelimiterFilter.UPPER;
            } else if (Character.isDigit(ch) || extraDigits.indexOf(ch) >= 0) {
                type |= WordDelimiterFilter.DIGIT;
            }
            if (type == 0) {
                type = WordDelimiterFilter.SUBWORD_DELIM;
            }
            table[ch] = type;
        }
        return table;
    }
}