/*
* This library is part of OpenCms -
* the Open Source Content Management System
*
* Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* For further information about Alkacon Software, please see the
* company website: http://www.alkacon.com
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.opencms.search.galleries;
import org.opencms.search.CmsSearchIndex;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
* Special analyzer for multiple languages, used in the OpenCms gallery search index.<p>
*
* The gallery search is done in one single index that may contain multiple languages.<p>
*
* According to the Lucene JavaDocs (3.0 version), the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer} is already using
* "a good tokenizer for most European-language documents". The only caveat is that a
* list of English only stop words is used.<p>
*
* This extended analyzer uses a compound list of stop words compiled from the following languages:<ul>
* <li>English
* <li>German
* <li>Spanish
* <li>Italian
* <li>French
* <li>Portuguese
* <li>Danish
* <li>Dutch
* <li>Catalan
* <li>Czech
* </ul>
*
* @since 8.0.0
*/
public class CmsGallerySearchAnalyzer extends StopwordAnalyzerBase {

    /** Default maximum allowed token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /** Name of the stop word list resource, resolved relative to this class. */
    private static final String STOPWORDS_RESOURCE = "stopwords_multilanguage.txt";

    /** Prefix passed to the word list loader as the comment marker. */
    private static final String STOPWORDS_COMMENT = "#";

    /**
     * Constructor with version parameter.<p>
     *
     * The stop words handed to the superclass are loaded from the
     * <code>stopwords_multilanguage.txt</code> resource located next to this class.<p>
     *
     * @param version the Lucene standard analyzer version to match
     *
     * @throws IOException if the stop word list resource cannot be found or loaded
     */
    public CmsGallerySearchAnalyzer(Version version)
    throws IOException {

        // initialize superclass
        super(version, loadStopWords());
    }

    /**
     * Loads the compound stop word list from the class path.<p>
     *
     * @return the stop words read from the resource
     *
     * @throws IOException if the resource is missing or reading it fails
     */
    private static Set<?> loadStopWords() throws IOException {

        InputStream stream = CmsGallerySearchAnalyzer.class.getResourceAsStream(STOPWORDS_RESOURCE);
        if (stream == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear
            // message instead of a NullPointerException raised inside InputStreamReader
            throw new IOException(
                "Stop word list resource '"
                    + STOPWORDS_RESOURCE
                    + "' not found relative to class "
                    + CmsGallerySearchAnalyzer.class.getName());
        }
        // NOTE(review): the stream is decoded with the platform default charset, exactly as
        // before - confirm the encoding of the stop word file and pass it explicitly
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
        try {
            return WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT, CmsSearchIndex.LUCENE_VERSION);
        } finally {
            // release the stream in every case; closing an already closed reader has no effect
            reader.close();
        }
    }

    /**
     * Builds the token stream: standard tokenizer, standard filter, lower case filter
     * and finally the stop filter using the stop words of this analyzer.<p>
     *
     * This is taken from the Lucene StandardAnalyzer, which is final since 3.1.<p>
     *
     * @see org.apache.lucene.analysis.ReusableAnalyzerBase#createComponents(java.lang.String, java.io.Reader)
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);
        tok = new StopFilter(matchVersion, tok, stopwords);
        return new TokenStreamComponents(src, tok) {

            /**
             * Re-applies the maximum token length to the tokenizer before delegating
             * the reset to the superclass.<p>
             */
            @Override
            protected boolean reset(final Reader r) throws IOException {

                src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
                return super.reset(r);
            }
        };
    }
}