package weiweiwang.github.search.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.util.Version;
import java.io.Reader;
/**
 * Analyzer that turns letter runs into lower-cased front edge n-grams,
 * optionally passing them through a {@code T9Filter} first.
 *
 * <p>The chain assembled by {@link #createComponents(String, Reader)} is:
 * <ol>
 *   <li>{@link LetterTokenizer} as the token source;</li>
 *   <li>{@link LowerCaseFilter};</li>
 *   <li>{@code T9Filter}, only when {@code convertToT9} is {@code true}
 *       (presumably maps letters to phone-keypad digits; confirm against
 *       the {@code T9Filter} implementation);</li>
 *   <li>{@link EdgeNGramTokenFilter} anchored at the front of each token,
 *       with gram sizes from {@value #MIN_GRAM} to {@value #MAX_GRAM}.</li>
 * </ol>
 *
 * @author wangweiwei
 * Date: 8/5/12
 * Time: 4:46 PM
 */
public final class PinyinAnalyzer extends Analyzer {

    /** Smallest edge n-gram size emitted for a token. */
    private static final int MIN_GRAM = 1;

    /** Largest edge n-gram size emitted for a token. */
    private static final int MAX_GRAM = 10;

    /** Lucene version handed to every version-aware component of the chain. */
    protected final Version matchVersion;

    /** Whether a {@code T9Filter} is inserted after lower-casing. */
    protected final boolean convertToT9;

    /**
     * Creates the analyzer.
     *
     * @param version     Lucene version passed to the tokenizer and filters
     * @param convertToT9 {@code true} to add a {@code T9Filter} to the chain
     */
    public PinyinAnalyzer(Version version, boolean convertToT9) {
        this.matchVersion = version;
        this.convertToT9 = convertToT9;
    }

    /**
     * Builds the tokenizer/filter chain described in the class documentation.
     *
     * @param fieldName name of the field being analyzed; not used, the same
     *                  chain is built for every field
     * @param reader    source of the text to tokenize
     * @return components whose source is the {@link LetterTokenizer} and whose
     *         sink is the {@link EdgeNGramTokenFilter}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new LetterTokenizer(matchVersion, reader);
        TokenStream chain = new LowerCaseFilter(matchVersion, source);
        if (convertToT9) {
            chain = new T9Filter(matchVersion, chain);
        }
        // The n-gram filter goes last so prefixes are taken from the final
        // (lower-cased, optionally T9-converted) form of each token.
        chain = new EdgeNGramTokenFilter(chain, EdgeNGramTokenFilter.Side.FRONT, MIN_GRAM, MAX_GRAM);
        return new TokenStreamComponents(source, chain);
    }

    /**
     * Returns {@code reader} unchanged; this analyzer installs no CharFilter.
     *
     * @param fieldName name of the field being analyzed (unused)
     * @param reader    original reader
     * @return the same {@code reader} instance that was passed in
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        return reader;
    }
}