/** * Distribution License: * JSword is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License, version 2.1 as published by * the Free Software Foundation. This program is distributed in the hope * that it will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * The License is available on the internet at: * http://www.gnu.org/copyleft/lgpl.html * or by writing to: * Free Software Foundation, Inc. * 59 Temple Place - Suite 330 * Boston, MA 02111-1307, USA * * Copyright: 2007 * The copyright to this program is held by it's authors. * * ID: $Id: $ */ package org.crosswire.jsword.index.lucene.analysis; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import org.apache.lucene.analysis.LowerCaseTokenizer; import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.nl.DutchAnalyzer; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.util.Version; import org.crosswire.jsword.book.Book; /** * An Analyzer whose {@link TokenStream} is built from a * {@link LowerCaseTokenizer} filtered with {@link SnowballFilter} (optional) * and {@link StopFilter} (optional) Default behavior: Stemming is done, Stop * words not removed A snowball stemmer is configured according to the language * of the Book. Currently it takes following stemmer names (available stemmers * in lucene snowball package net.sf.snowball.ext) * * <pre> * Danish * Dutch * English * Finnish * French * German2 * German * Italian * Kp * Lovins * Norwegian * Porter * Portuguese * Russian * Spanish * Swedish * </pre> * * This list is expected to expand, as and when Snowball project support more * languages * * @see gnu.lgpl.License for license details.<br> * The copyright to this program is held by it's authors. * @author sijo cherian [sijocherian at yahoo dot com] */ public class ConfigurableSnowballAnalyzer extends AbstractBookAnalyzer { public ConfigurableSnowballAnalyzer() { } /** * Filters {@link LowerCaseTokenizer} with {@link StopFilter} if enabled and * {@link SnowballFilter}. */ @Override public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new LowerCaseTokenizer(reader); if (doStopWords && stopSet != null) { result = new StopFilter(false, result, stopSet); } // Configure Snowball filter based on language/stemmerName if (doStemming) { result = new SnowballFilter(result, stemmerName); } return result; } /* (non-Javadoc) * @see org.apache.lucene.analysis.Analyzer#reusableTokenStream(java.lang.String, java.io.Reader) */ @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(new LowerCaseTokenizer(reader)); if (doStopWords && stopSet != null) { streams.setResult(new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.getResult(), stopSet)); } if (doStemming) { streams.setResult(new PorterStemFilter(streams.getResult())); } setPreviousTokenStream(streams); } else { streams.getSource().reset(reader); } return streams.getResult(); } @Override public void setBook(Book newBook) { book = newBook; stemmerName = null; if (book != null) { // stemmer name are same as language name, in most cases pickStemmer(book.getLanguage().getCode()); } } /** * Given the name of a stemmer, use that one. * * @param languageCode */ public void pickStemmer(String languageCode) { if (languageCode != null) { // Check for allowed stemmers if (languageCodeToStemmerLanguageNameMap.containsKey(languageCode)) { stemmerName = languageCodeToStemmerLanguageNameMap.get(languageCode); } else { throw new IllegalArgumentException("SnowballAnalyzer configured for unavailable stemmer " + stemmerName); } // Initialize the default stop words if (defaultStopWordMap.containsKey(languageCode)) { stopSet = defaultStopWordMap.get(languageCode); } } } /** * The name of the stemmer to use. */ private String stemmerName; private static Map<String, String> languageCodeToStemmerLanguageNameMap = new HashMap<String, String>(); static { languageCodeToStemmerLanguageNameMap.put("da", "Danish"); languageCodeToStemmerLanguageNameMap.put("nl", "Dutch"); languageCodeToStemmerLanguageNameMap.put("en", "English"); languageCodeToStemmerLanguageNameMap.put("fi", "Finnish"); languageCodeToStemmerLanguageNameMap.put("fr", "French"); languageCodeToStemmerLanguageNameMap.put("de", "German"); languageCodeToStemmerLanguageNameMap.put("it", "Italian"); languageCodeToStemmerLanguageNameMap.put("no", "Norwegian"); languageCodeToStemmerLanguageNameMap.put("pt", "Portuguese"); languageCodeToStemmerLanguageNameMap.put("ru", "Russian"); languageCodeToStemmerLanguageNameMap.put("es", "Spanish"); languageCodeToStemmerLanguageNameMap.put("sv", "Swedish"); } // Maps StemmerName > String array of standard stop words private static HashMap<String, Set<?>> defaultStopWordMap = new HashMap<String, Set<?>>(); static { defaultStopWordMap.put("fr", FrenchAnalyzer.getDefaultStopSet()); defaultStopWordMap.put("de", GermanAnalyzer.getDefaultStopSet()); defaultStopWordMap.put("nl", DutchAnalyzer.getDefaultStopSet()); defaultStopWordMap.put("en", StopAnalyzer.ENGLISH_STOP_WORDS_SET); } private final Version matchVersion = Version.LUCENE_29; }