package org.wikibrain.lucene.tokenizers; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter; import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; import org.wikibrain.core.lang.Language; import org.wikibrain.lucene.TokenizerOptions; import java.io.Reader; /** * @author Ari Weiland */ public class JapaneseTokenizer extends LanguageTokenizer { protected JapaneseTokenizer(Version version, TokenizerOptions options, Language language) { super(version, options, language); } @Override public Tokenizer makeTokenizer(Reader r) { return new org.apache.lucene.analysis.ja.JapaneseTokenizer(r, null, false, org.apache.lucene.analysis.ja.JapaneseTokenizer.DEFAULT_MODE); } @Override public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) { TokenStream stream = new JapaneseBaseFormFilter(tokenizer); stream = new CJKWidthFilter(stream); if (caseInsensitive) stream = new LowerCaseFilter(matchVersion, stream); if (useStopWords) { stream = new JapanesePartOfSpeechStopFilter(true, stream, JapaneseAnalyzer.getDefaultStopTags()); stream = new StopFilter(matchVersion, stream, JapaneseAnalyzer.getDefaultStopSet()); } if (useStem) stream = new JapaneseKatakanaStemFilter(stream); return stream; } }