LucenePerFieldAnalyzerFactory.java example

Explorer
sensei-master
/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.plugin.analyzer;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

import com.senseidb.plugin.SenseiPluginFactory;
import com.senseidb.plugin.SenseiPluginRegistry;

/**
 * A {@code SenseiPluginFactory} that instantiates a {@link PerFieldAnalyzerWrapper}. It reads
 * the field names as a comma-separated list from the <code>fields</code> property. To specify
 * the analyzer for each field, you must do it the same way you normally would, but appending
 * <code>.fields.<i>fieldname</i></code> to the usual prefix. You can set the default analyzer
 * with the <code>default</code> property. If this property is missing, the default analyzer
 * is set to {@link StandardAnalyzer} with version {@code LUCENE_35}. Example configuration:
 *
 * <pre>
 * sensei.index.analyzer.class=com.senseidb.plugin.analyzer.LucenePerFieldAnalyzerFactory
 * sensei.index.analyzer.default.class=com.senseidb.plugin.analyzer.LuceneKeywordAnalyzerFactory
 * sensei.index.analyzer.fields=content_tokenized,content_keyword
 *
 * sensei.index.analyzer.fields.content_tokenized.class=com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory
 * sensei.index.analyzer.fields.content_tokenized.pattern=[ -_./:]
 *
 * sensei.index.analyzer.fields.content_keyword.class=com.senseidb.plugin.analyzer.LuceneKeywordAnalyzerFactory
 * </pre>
 *
 * @author jgrande
 *
 */
public class LucenePerFieldAnalyzerFactory implements SenseiPluginFactory<Analyzer> {

    /**
     * Builds a {@link PerFieldAnalyzerWrapper} from the plugin configuration.
     *
     * <p>The default analyzer is looked up under <code>fullPrefix + ".default"</code>; when the
     * registry returns {@code null}, a {@link StandardAnalyzer} with {@code Version.LUCENE_35}
     * is used instead. Field names are read from the comma-separated <code>fields</code>
     * property; each name is trimmed and blank entries are ignored, so a missing or empty
     * property yields a wrapper with no per-field analyzers. The analyzer for each field is
     * looked up under <code>fullPrefix + ".fields." + fieldName</code>.
     *
     * @param initProperties properties of this plugin; only the <code>fields</code> key is read here
     * @param fullPrefix     configuration prefix of this plugin, used to derive the prefixes of
     *                       the default and per-field analyzers
     * @param pluginRegistry registry used to resolve the nested analyzer beans
     * @return a wrapper that delegates to the configured per-field analyzers
     */
    @Override
    public Analyzer getBean(Map<String, String> initProperties, String fullPrefix, SenseiPluginRegistry pluginRegistry) {
        Analyzer defaultAnalyzer = pluginRegistry.getBeanByFullPrefix(fullPrefix + ".default", Analyzer.class);
        if (defaultAnalyzer == null) {
            defaultAnalyzer = new StandardAnalyzer(Version.LUCENE_35);
        }

        Map<String, Analyzer> analyzers = new HashMap<String, Analyzer>();
        String fieldList = MapUtils.getString(initProperties, "fields", "");
        for (String rawField : fieldList.split(",")) {
            // Trim every entry: splitting on "\\s*,\\s*" left whitespace at the very start and
            // end of the list attached to the first and last field names.
            String field = rawField.trim();
            if (field.isEmpty()) {
                // "".split(...) yields a single empty string, and "a,,b" yields an empty entry;
                // neither is a real field name.
                continue;
            }

            String analyzerPrefix = fullPrefix + ".fields." + field;
            Analyzer analyzer = pluginRegistry.getBeanByFullPrefix(analyzerPrefix, Analyzer.class);
            if (analyzer != null) {
                analyzers.put(field, analyzer);
            }
            // NOTE(review): a field with no resolvable analyzer is left unmapped; presumably the
            // wrapper then falls back to the default analyzer for it - confirm against the
            // PerFieldAnalyzerWrapper version in use.
        }

        return new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzers);
    }

}