package org.apache.lucene.analysis.el; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.util.Version; import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * {@link Analyzer} for the Greek language. * <p> * Supports an external list of stopwords (words * that will not be indexed at all). * A default set of stopwords is used unless an alternative list is specified. * </p> * * <p><b>NOTE</b>: This class uses the same {@link Version} * dependent settings as {@link StandardAnalyzer}.</p> */ public final class GreekAnalyzer extends Analyzer { // the letters are indexes to the charset array (see GreekCharsets.java) private static char A = 6; private static char B = 7; private static char G = 8; private static char D = 9; private static char E = 10; private static char Z = 11; private static char H = 12; private static char TH = 13; private static char I = 14; private static char K = 15; private static char L = 16; private static char M = 17; private static char N = 18; private static char KS = 19; private static char O = 20; private static char P = 21; private static char R = 22; private static char S = 24; // skip final sigma private static char T = 25; private static char Y = 26; private static char F = 27; private static char X = 28; private static char PS = 29; private static char W = 30; /** * List of typical Greek stopwords. */ private static char[][] GREEK_STOP_WORDS = { {O}, {H}, {T, O}, {O, I}, {T, A}, {T, O, Y}, {T, H, S}, {T, W, N}, {T, O, N}, {T, H, N}, {K, A, I}, {K, I}, {K}, {E, I, M, A, I}, {E, I, S, A, I}, {E, I, N, A, I}, {E, I, M, A, S, T, E}, {E, I, S, T, E}, {S, T, O}, {S, T, O, N}, {S, T, H}, {S, T, H, N}, {M, A}, {A, L, L, A}, {A, P, O}, {G, I, A}, {P, R, O, S}, {M, E}, {S, E}, {W, S}, {P, A, R, A}, {A, N, T, I}, {K, A, T, A}, {M, E, T, A}, {TH, A}, {N, A}, {D, E}, {D, E, N}, {M, H}, {M, H, N}, {E, P, I}, {E, N, W}, {E, A, N}, {A, N}, {T, O, T, E}, {P, O, Y}, {P, W, S}, {P, O, I, O, S}, {P, O, I, A}, {P, O, I, O}, {P, O, I, O, I}, {P, O, I, E, S}, {P, O, I, W, N}, {P, O, I, O, Y, S}, {A, Y, T, O, S}, {A, Y, T, H}, {A, Y, T, O}, {A, Y, T, O, I}, {A, Y, T, W, N}, {A, Y, T, O, Y, S}, {A, Y, T, E, S}, {A, Y, T, A}, {E, K, E, I, N, O, S}, {E, K, E, I, N, H}, {E, K, E, I, N, O}, {E, K, E, I, N, O, I}, {E, K, E, I, N, E, S}, {E, K, E, I, N, A}, {E, K, E, I, N, W, N}, {E, K, E, I, N, O, Y, S}, {O, P, W, S}, {O, M, W, S}, {I, S, W, S}, {O, S, O}, {O, T, I} }; /** * Contains the stopwords used with the {@link StopFilter}. */ private Set stopSet = new HashSet(); /** * Charset for Greek letters. * Represents encoding for 24 lowercase Greek letters. * Predefined charsets can be taken from {@link GreekCharsets} class * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ private char[] charset; private final Version matchVersion; /** @deprecated Use {@link #GreekAnalyzer(Version)} instead */ public GreekAnalyzer() { this(Version.LUCENE_23); } public GreekAnalyzer(Version matchVersion) { charset = GreekCharsets.UnicodeGreek; stopSet = StopFilter.makeStopSet( makeStopWords(GreekCharsets.UnicodeGreek)); this.matchVersion = matchVersion; } /** * Builds an analyzer. * @deprecated Use {@link #GreekAnalyzer(Version)} instead. */ public GreekAnalyzer(char[] charset) { this.charset = charset; stopSet = StopFilter.makeStopSet(makeStopWords(charset)); matchVersion = Version.LUCENE_23; } /** * Builds an analyzer with the given stop words. * @param stopwords Array of stopwords to use. * * @deprecated Use {@link #GreekAnalyzer(Version, String[])} instead */ public GreekAnalyzer(String [] stopwords) { this(Version.LUCENE_23, stopwords); } /** * Builds an analyzer with the given stop words. * @param stopwords Array of stopwords to use. */ public GreekAnalyzer(Version matchVersion, String [] stopwords) { charset = GreekCharsets.UnicodeGreek; stopSet = StopFilter.makeStopSet(stopwords); this.matchVersion = matchVersion; } /** * Builds an analyzer with the given stop words. * @deprecated Use {@link #GreekAnalyzer(Version, String[])} instead. */ public GreekAnalyzer(char[] charset, String[] stopwords) { this.charset = charset; stopSet = StopFilter.makeStopSet(stopwords); matchVersion = Version.LUCENE_23; } /** * Takes greek stop words and translates them to a String array, using * the given charset. * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ private static String[] makeStopWords(char[] charset) { String[] res = new String[GREEK_STOP_WORDS.length]; for (int i = 0; i < res.length; i++) { char[] theStopWord = GREEK_STOP_WORDS[i]; // translate the word,using the charset StringBuffer theWord = new StringBuffer(); for (int j = 0; j < theStopWord.length; j++) { theWord.append(charset[theStopWord[j]]); } res[i] = theWord.toString(); } return res; } /** * Builds an analyzer with the given stop words. * @deprecated Use {@link #GreekAnalyzer(Version, Map)} instead. */ public GreekAnalyzer(char[] charset, Map stopwords) { this.charset = charset; stopSet = new HashSet(stopwords.keySet()); matchVersion = Version.LUCENE_23; } /** * Builds an analyzer with the given stop words. * * @deprecated Use {@link #GreekAnalyzer(Version,Map)} instead */ public GreekAnalyzer(Map stopwords) { this(Version.LUCENE_23, stopwords); } /** * Builds an analyzer with the given stop words. */ public GreekAnalyzer(Version matchVersion, Map stopwords) { charset = GreekCharsets.UnicodeGreek; stopSet = new HashSet(stopwords.keySet()); this.matchVersion = matchVersion; } /** * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with * {@link GreekLowerCaseFilter} and {@link StopFilter} */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(matchVersion, reader); result = new GreekLowerCaseFilter(result, charset); result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet); return result; } private class SavedStreams { Tokenizer source; TokenStream result; }; /** * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text * in the provided {@link Reader}. * * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with * {@link GreekLowerCaseFilter} and {@link StopFilter} */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); streams.source = new StandardTokenizer(matchVersion, reader); streams.result = new GreekLowerCaseFilter(streams.source, charset); streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.result, stopSet); setPreviousTokenStream(streams); } else { streams.source.reset(reader); } return streams.result; } }