/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.plugin.analyzer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.collections.MapUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.util.Version;

import com.senseidb.plugin.SenseiPluginFactory;
import com.senseidb.plugin.SenseiPluginRegistry;

/**
 * A {@code SenseiPluginFactory} that instantiates a {@link PatternAnalyzer}. It can be configured through
 * the following properties:
 *
 * <ul>
 * <li><code>matchVersion</code>: the Lucene version compatibility (default: LUCENE_35). See {@link Version}.
 * <li><code>pattern</code>: a regular expression delimiting tokens (default: "<code>\\s+</code>")
 * <li><code>toLowerCase</code>: if true, tokens are converted to lower case (default: true)
 * <li><code>stopWords</code>: a comma-separated list of stop words; whitespace around each word is
 *     trimmed and empty entries are ignored (defaults to an empty list)
 * </ul>
 *
 * Example configuration:
 *
 * <pre>
 * sensei.index.analyzer.class=com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory
 * sensei.index.analyzer.pattern=[ -_./:]
 * </pre>
 *
 * @author jgrande
 *
 */
public class LucenePatternAnalyzerFactory implements SenseiPluginFactory<Analyzer> {

  /**
   * Builds a {@link PatternAnalyzer} from the supplied configuration properties, falling back to the
   * documented default for every property that is absent.
   *
   * @param initProperties the plugin configuration; the keys <code>matchVersion</code>,
   *        <code>pattern</code>, <code>toLowerCase</code> and <code>stopWords</code> are read
   * @param fullPrefix not used by this factory
   * @param pluginRegistry not used by this factory
   * @return a new {@link PatternAnalyzer} configured from the properties
   * @throws IllegalArgumentException if <code>matchVersion</code> does not name a {@link Version} constant
   * @throws java.util.regex.PatternSyntaxException if <code>pattern</code> is not a valid regular expression
   */
  @Override
  public Analyzer getBean(Map<String, String> initProperties, String fullPrefix,
      SenseiPluginRegistry pluginRegistry) {
    Version matchVersion = Version.valueOf(MapUtils.getString(initProperties, "matchVersion", "LUCENE_35"));
    Pattern pattern = Pattern.compile(MapUtils.getString(initProperties, "pattern", "\\s+"));
    boolean toLowerCase = MapUtils.getBoolean(initProperties, "toLowerCase", true);

    // NOTE(review): PatternAnalyzer presumably checks stop words against tokens that were already
    // lower-cased when toLowerCase is true, so stop words should be configured in lower case -- confirm.
    Set<String> stopWords = parseStopWords(MapUtils.getString(initProperties, "stopWords", ""));

    return new PatternAnalyzer(matchVersion, pattern, toLowerCase, stopWords);
  }

  /**
   * Splits a comma-separated list into a set of stop words, trimming each entry and dropping blanks.
   *
   * <p>A plain {@code split} is not sufficient here: splitting the empty string yields a single empty
   * element, which would turn the default configuration into the set <code>{""}</code> instead of an
   * empty set, and whitespace at the very start or end of the list would stay attached to the first
   * or last word.
   *
   * @param stopWordsStr the raw property value; may be {@code null} or empty
   * @return a mutable set of the non-empty, trimmed stop words (empty if there are none)
   */
  private static Set<String> parseStopWords(String stopWordsStr) {
    Set<String> stopWords = new HashSet<String>();
    if (stopWordsStr == null) {
      return stopWords;
    }

    for (String candidate : stopWordsStr.split(",")) {
      String word = candidate.trim();
      if (word.length() > 0) {
        stopWords.add(word);
      }
    }
    return stopWords;
  }
}