/* * Copyright 2004-2009 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.compass.core.lucene.engine.analyzer; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; import java.util.StringTokenizer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.compass.core.config.CompassConfigurable; import org.compass.core.config.CompassSettings; import org.compass.core.engine.SearchEngineException; import org.compass.core.lucene.LuceneEnvironment; import org.compass.core.util.ClassUtils; import org.compass.core.util.StringUtils; /** * @author kimchy */ public class DefaultLuceneAnalyzerFactory implements LuceneAnalyzerFactory { private static final Log log = LogFactory.getLog(DefaultLuceneAnalyzerFactory.class); private static final Set<String> extednedAnalyzers; private static final Set<String> coreAnalyzers; static { coreAnalyzers = new HashSet<String>(); coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.WHITESPACE); coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.STANDARD); coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.SIMPLE); coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.STOP); extednedAnalyzers = new HashSet<String>(); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.BRAZILIAN); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.CJK); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.CHINESE); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.CZECH); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.GERMAN); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.GREEK); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.FRENCH); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.DUTCH); extednedAnalyzers.add(LuceneEnvironment.Analyzer.ExtendedTypes.RUSSIAN); } public Analyzer createAnalyzer(String analyzerName, CompassSettings settings) throws SearchEngineException { Object obj = settings.getSettingAsObject(LuceneEnvironment.Analyzer.TYPE); if (obj instanceof Analyzer) { if (obj instanceof CompassConfigurable) { ((CompassConfigurable) obj).configure(settings); } return (Analyzer) obj; } Analyzer analyzer; String analyzerSetting = settings.getSetting(LuceneEnvironment.Analyzer.TYPE, LuceneEnvironment.Analyzer.CoreTypes.STANDARD); if (log.isDebugEnabled()) { log.debug("Analyzer [" + analyzerName + "] uses Lucene analyzer [" + analyzerSetting + "]"); } if (coreAnalyzers.contains(analyzerSetting.toLowerCase())) { AnalyzerBuilderDelegate analyzerBuilderDelegate = new CoreAnalyzerBuilderDelegate(); analyzer = analyzerBuilderDelegate.buildAnalyzer(analyzerName, settings, this); } else if (LuceneEnvironment.Analyzer.Snowball.SNOWBALL.equalsIgnoreCase(analyzerSetting)) { AnalyzerBuilderDelegate analyzerBuilderDelegate = new SnowballAnalyzerBuilderDelegate(); analyzer = analyzerBuilderDelegate.buildAnalyzer(analyzerName, settings, this); } else if (extednedAnalyzers.contains(analyzerSetting.toLowerCase())) { AnalyzerBuilderDelegate analyzerBuilderDelegate = new ExtendedAnalyzerBuilderDelegate(); analyzer = analyzerBuilderDelegate.buildAnalyzer(analyzerName, settings, this); } else { // the analyzer must be a fully qualified class, try to instansiate try { analyzer = (Analyzer) ClassUtils.forName(analyzerSetting, settings.getClassLoader()).newInstance(); } catch (Exception e) { throw new SearchEngineException("Cannot instantiate Lucene Analyzer [" + analyzerSetting + "] for analyzer [" + analyzerName + "]. Please verify the analyzer setting at [" + LuceneEnvironment.Analyzer.TYPE + "]", e); } if (analyzer instanceof CompassConfigurable) { ((CompassConfigurable) analyzer).configure(settings); } } return analyzer; } public String[] parseStopWords(String analyzerName, CompassSettings settings, String[] defaultStopWords) { String stopWords = settings.getSetting(LuceneEnvironment.Analyzer.STOPWORDS); if (stopWords == null) { if (log.isTraceEnabled()) { log.trace("Anayzer [" + analyzerName + "] uses default stop words [" + StringUtils.arrayToCommaDelimitedString(defaultStopWords) + "]"); } return defaultStopWords; } boolean addStopWords = false; if (stopWords.startsWith("+")) { addStopWords = true; stopWords = stopWords.substring(1); } StringTokenizer st = new StringTokenizer(stopWords, ","); ArrayList<String> listStopWords = new ArrayList<String>(); while (st.hasMoreTokens()) { String stopword = st.nextToken().trim(); if (StringUtils.hasLength(stopword)) { listStopWords.add(stopword); } } String[] arrStopWords = listStopWords.toArray(new String[listStopWords.size()]); if (addStopWords) { if (log.isTraceEnabled()) { log.trace("Analyzer [" + analyzerName + "] uses default stop words [" + StringUtils.arrayToCommaDelimitedString(defaultStopWords) + "]"); log.trace("Analyzer [" + analyzerName + "] and uses user stop words [" + StringUtils.arrayToCommaDelimitedString(arrStopWords) + "]"); } String[] tempStopWords = arrStopWords; arrStopWords = new String[tempStopWords.length + defaultStopWords.length]; System.arraycopy(defaultStopWords, 0, arrStopWords, 0, defaultStopWords.length); System.arraycopy(tempStopWords, 0, arrStopWords, defaultStopWords.length, tempStopWords.length); } else { if (log.isTraceEnabled()) { log.trace("Analyzer [" + analyzerName + "] uses user stop words [" + StringUtils.arrayToCommaDelimitedString(arrStopWords) + "]"); } } return arrStopWords; } }