package de.jetwick.solrplugin; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.solr.analysis.BaseTokenFilterFactory; import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; /** * * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net */ public class TWordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public static final String PROTECTED_TOKENS = "protected"; public void inform(ResourceLoader loader) { String wordFiles = args.get(PROTECTED_TOKENS); if (wordFiles != null) { try { File protectedWordFiles = new File(wordFiles); if (protectedWordFiles.exists()) { List<String> wlist = loader.getLines(wordFiles); //This cast is safe in Lucene protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally } else { List<String> files = StrUtils.splitFileNames(wordFiles); for (String file : files) { List<String> wlist = loader.getLines(file.trim()); if (protectedWords == null) protectedWords = new CharArraySet(wlist, false); else protectedWords.addAll(wlist); } } } catch (IOException e) { throw new RuntimeException(e); } } } private CharArraySet protectedWords = null; int generateWordParts = 0; int generateNumberParts = 0; int catenateWords = 0; int catenateNumbers = 0; int catenateAll = 0; int splitOnCaseChange = 0; int splitOnNumerics = 0; int preserveOriginal = 0; int stemEnglishPossessive = 0; String handleAsChar = ""; String handleAsDigit = ""; @Override public void init(Map<String, String> args) { super.init(args); generateWordParts = getInt("generateWordParts", 1); generateNumberParts = getInt("generateNumberParts", 1); catenateWords = getInt("catenateWords", 0); catenateNumbers = getInt("catenateNumbers", 0); catenateAll = getInt("catenateAll", 0); splitOnCaseChange = getInt("splitOnCaseChange", 1); splitOnNumerics = getInt("splitOnNumerics", 1); preserveOriginal = getInt("preserveOriginal", 0); stemEnglishPossessive = getInt("stemEnglishPossessive", 1); handleAsChar = getArgs().get("handleAsChar"); if (handleAsChar == null) handleAsChar = ""; handleAsDigit = getArgs().get("handleAsDigit"); if (handleAsDigit == null) handleAsDigit = ""; } public TWordDelimiterFilter create(TokenStream input) { byte[] tab = new byte[256]; for (int i = 0; i < 256; i++) { byte code = 0; if (Character.isLowerCase(i) || handleAsChar.contains(String.valueOf((char) i))) { code |= TWordDelimiterFilter.LOWER; } else if (Character.isUpperCase(i)) { code |= TWordDelimiterFilter.UPPER; } else if (Character.isDigit(i) || handleAsDigit.contains(String.valueOf((char) i))) { code |= TWordDelimiterFilter.DIGIT; } if (code == 0) { code = TWordDelimiterFilter.SUBWORD_DELIM; } tab[i] = code; } return new TWordDelimiterFilter(input, tab, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protectedWords); } }