package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * {@link Analyzer} for the Greek language.
 * <p>
 * Supports an external list of stopwords (words that will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 * </p>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
public final class GreekAnalyzer extends Analyzer
{
  // The letters are indexes into the charset array (see GreekCharsets.java).
  // They are compile-time constants and are never reassigned.
  private static final char A = 6;
  private static final char B = 7;
  private static final char G = 8;
  private static final char D = 9;
  private static final char E = 10;
  private static final char Z = 11;
  private static final char H = 12;
  private static final char TH = 13;
  private static final char I = 14;
  private static final char K = 15;
  private static final char L = 16;
  private static final char M = 17;
  private static final char N = 18;
  private static final char KS = 19;
  private static final char O = 20;
  private static final char P = 21;
  private static final char R = 22;
  private static final char S = 24; // skip final sigma
  private static final char T = 25;
  private static final char Y = 26;
  private static final char F = 27;
  private static final char X = 28;
  private static final char PS = 29;
  private static final char W = 30;

  /**
   * List of typical Greek stopwords. Each word is stored as a sequence of
   * charset indexes (see the letter constants above) and is translated to a
   * real string by {@link #makeStopWords(char[])}.
   */
  private static final char[][] GREEK_STOP_WORDS = {
    {O},
    {H},
    {T, O},
    {O, I},
    {T, A},
    {T, O, Y},
    {T, H, S},
    {T, W, N},
    {T, O, N},
    {T, H, N},
    {K, A, I},
    {K, I},
    {K},
    {E, I, M, A, I},
    {E, I, S, A, I},
    {E, I, N, A, I},
    {E, I, M, A, S, T, E},
    {E, I, S, T, E},
    {S, T, O},
    {S, T, O, N},
    {S, T, H},
    {S, T, H, N},
    {M, A},
    {A, L, L, A},
    {A, P, O},
    {G, I, A},
    {P, R, O, S},
    {M, E},
    {S, E},
    {W, S},
    {P, A, R, A},
    {A, N, T, I},
    {K, A, T, A},
    {M, E, T, A},
    {TH, A},
    {N, A},
    {D, E},
    {D, E, N},
    {M, H},
    {M, H, N},
    {E, P, I},
    {E, N, W},
    {E, A, N},
    {A, N},
    {T, O, T, E},
    {P, O, Y},
    {P, W, S},
    {P, O, I, O, S},
    {P, O, I, A},
    {P, O, I, O},
    {P, O, I, O, I},
    {P, O, I, E, S},
    {P, O, I, W, N},
    {P, O, I, O, Y, S},
    {A, Y, T, O, S},
    {A, Y, T, H},
    {A, Y, T, O},
    {A, Y, T, O, I},
    {A, Y, T, W, N},
    {A, Y, T, O, Y, S},
    {A, Y, T, E, S},
    {A, Y, T, A},
    {E, K, E, I, N, O, S},
    {E, K, E, I, N, H},
    {E, K, E, I, N, O},
    {E, K, E, I, N, O, I},
    {E, K, E, I, N, E, S},
    {E, K, E, I, N, A},
    {E, K, E, I, N, W, N},
    {E, K, E, I, N, O, Y, S},
    {O, P, W, S},
    {O, M, W, S},
    {I, S, W, S},
    {O, S, O},
    {O, T, I}
  };

  /**
   * Contains the stopwords used with the {@link StopFilter}.
   * Declared as a raw {@link Set} because this file is written without generics.
   */
  private final Set stopSet;

  /**
   * Charset for Greek letters.
   * Represents encoding for 24 lowercase Greek letters.
   * Predefined charsets can be taken from {@link GreekCharsets} class
   * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
   */
  private final char[] charset;

  /** Version handed to the tokenizer and used to choose the {@link StopFilter} default. */
  private final Version matchVersion;

  /**
   * Canonical constructor; every public constructor funnels into this one so
   * that each field is assigned exactly once.
   *
   * @param matchVersion version used when building the token stream
   * @param charset charset passed to {@link GreekLowerCaseFilter}
   * @param stopSet stop words passed to {@link StopFilter}
   */
  private GreekAnalyzer(Version matchVersion, char[] charset, Set stopSet)
  {
    this.matchVersion = matchVersion;
    this.charset = charset;
    this.stopSet = stopSet;
  }

  /**
   * Builds an analyzer with the default stop words, using
   * {@link Version#LUCENE_23} as the match version.
   *
   * @deprecated Use {@link #GreekAnalyzer(Version)} instead
   */
  public GreekAnalyzer() {
    this(Version.LUCENE_23);
  }

  /**
   * Builds an analyzer with the default stop words, translated with
   * {@link GreekCharsets#UnicodeGreek}.
   *
   * @param matchVersion version used when building the token stream
   */
  public GreekAnalyzer(Version matchVersion) {
    this(matchVersion,
         GreekCharsets.UnicodeGreek,
         StopFilter.makeStopSet(makeStopWords(GreekCharsets.UnicodeGreek)));
  }

  /**
   * Builds an analyzer with the default stop words, translated with the given
   * charset, using {@link Version#LUCENE_23} as the match version.
   *
   * @param charset charset used for the stop words and for lower-casing
   * @deprecated Use {@link #GreekAnalyzer(Version)} instead.
   */
  public GreekAnalyzer(char[] charset)
  {
    this(Version.LUCENE_23,
         charset,
         StopFilter.makeStopSet(makeStopWords(charset)));
  }

  /**
   * Builds an analyzer with the given stop words, using
   * {@link Version#LUCENE_23} as the match version.
   *
   * @param stopwords Array of stopwords to use.
   *
   * @deprecated Use {@link #GreekAnalyzer(Version, String[])} instead
   */
  public GreekAnalyzer(String [] stopwords)
  {
    this(Version.LUCENE_23, stopwords);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion version used when building the token stream
   * @param stopwords Array of stopwords to use.
   */
  public GreekAnalyzer(Version matchVersion, String [] stopwords)
  {
    this(matchVersion,
         GreekCharsets.UnicodeGreek,
         StopFilter.makeStopSet(stopwords));
  }

  /**
   * Builds an analyzer with the given charset and stop words, using
   * {@link Version#LUCENE_23} as the match version.
   *
   * @param charset charset used for lower-casing
   * @param stopwords Array of stopwords to use.
   * @deprecated Use {@link #GreekAnalyzer(Version, String[])} instead.
   */
  public GreekAnalyzer(char[] charset, String[] stopwords)
  {
    this(Version.LUCENE_23,
         charset,
         StopFilter.makeStopSet(stopwords));
  }

  /**
   * Takes greek stop words and translates them to a String array, using
   * the given charset.
   *
   * @param charset lookup table indexed by the letter constants of this class
   * @return one string per entry of {@link #GREEK_STOP_WORDS}, in the same order
   * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
   */
  private static String[] makeStopWords(char[] charset)
  {
    String[] res = new String[GREEK_STOP_WORDS.length];
    for (int i = 0; i < res.length; i++)
    {
      char[] theStopWord = GREEK_STOP_WORDS[i];
      // Translate the word letter by letter, using the charset.
      // StringBuffer is kept (rather than StringBuilder) because this file
      // uses no Java 5 features.
      StringBuffer theWord = new StringBuffer(theStopWord.length);
      for (int j = 0; j < theStopWord.length; j++)
      {
        theWord.append(charset[theStopWord[j]]);
      }
      res[i] = theWord.toString();
    }
    return res;
  }

  /**
   * Builds an analyzer with the given charset and stop words, using
   * {@link Version#LUCENE_23} as the match version. The keys of the map are
   * copied into a new set and used as the stop words.
   *
   * @param charset charset used for lower-casing
   * @param stopwords map whose keys are the stop words
   * @deprecated Use {@link #GreekAnalyzer(Version, Map)} instead.
   */
  public GreekAnalyzer(char[] charset, Map stopwords)
  {
    this(Version.LUCENE_23,
         charset,
         new HashSet(stopwords.keySet()));
  }

  /**
   * Builds an analyzer with the given stop words, using
   * {@link Version#LUCENE_23} as the match version.
   *
   * @param stopwords map whose keys are the stop words
   * @deprecated Use {@link #GreekAnalyzer(Version,Map)} instead
   */
  public GreekAnalyzer(Map stopwords)
  {
    this(Version.LUCENE_23, stopwords);
  }

  /**
   * Builds an analyzer with the given stop words. The keys of the map are
   * copied into a new set and used as the stop words.
   *
   * @param matchVersion version used when building the token stream
   * @param stopwords map whose keys are the stop words
   */
  public GreekAnalyzer(Version matchVersion, Map stopwords)
  {
    this(matchVersion,
         GreekCharsets.UnicodeGreek,
         new HashSet(stopwords.keySet()));
  }

  /**
   * Wraps the given stream with the filters shared by
   * {@link #tokenStream(String, Reader)} and
   * {@link #reusableTokenStream(String, Reader)}: first
   * {@link GreekLowerCaseFilter}, then {@link StopFilter}.
   */
  private TokenStream applyFilters(TokenStream source)
  {
    TokenStream lowerCased = new GreekLowerCaseFilter(source, charset);
    return new StopFilter(
        StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
        lowerCased, stopSet);
  }

  /**
   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
   *         {@link GreekLowerCaseFilter} and {@link StopFilter}
   */
  public TokenStream tokenStream(String fieldName, Reader reader)
  {
    return applyFilters(new StandardTokenizer(matchVersion, reader));
  }

  /**
   * Holder for the tokenizer and the fully filtered stream that are saved
   * between calls to {@link #reusableTokenStream(String, Reader)}.
   * Static because it never needs access to the enclosing analyzer.
   */
  private static final class SavedStreams {
    Tokenizer source;
    TokenStream result;
  }

  /**
   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
   * in the provided {@link Reader}.
   * <p>
   * If no streams were saved by a previous call, a new chain is built and
   * saved; otherwise the saved tokenizer is reset onto the new reader and the
   * saved chain is returned.
   * </p>
   *
   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
   *         {@link GreekLowerCaseFilter} and {@link StopFilter}
   * @throws IOException declared for the call that resets the saved tokenizer
   */
  public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(matchVersion, reader);
      streams.result = applyFilters(streams.source);
      setPreviousTokenStream(streams);
    } else {
      // Only the tokenizer at the head of the chain needs the new reader.
      streams.source.reset(reader);
    }
    return streams.result;
  }
}