/* * Copyright 2011-2013 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.kr; import org.apache.lucene.analysis.*; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.List; import java.util.Set; /** * Filters StandardTokenizer with StandardFilter, * {@link org.apache.lucene.analysis.LowerCaseFilter} and {@link org.apache.lucene.analysis.StopFilter}, * using a list of English stop words. * * @version $Id: KoreanAnalyzer.java,v 1.2 2013/04/07 13:10:27 smlee0818 Exp $ */ public class KoreanAnalyzer extends StopwordAnalyzerBase { private static final Logger log = LoggerFactory.getLogger(KoreanAnalyzer.class); private static final boolean isTraceEnabled = log.isTraceEnabled(); private static final boolean isDebugEnabled = log.isDebugEnabled(); /** Default maximum allowed token length */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; /** * Specifies whether deprecated acronyms should be replaced with HOST type. * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"} */ private boolean replaceInvalidAcronym = true; private Set stopSet; private boolean bigrammable = true; private boolean hasOrigin = true; private boolean exactMatch = false; private boolean originCNoun = true; public static final String DIC_ENCODING = "UTF-8"; /** * An unmodifiable set containing some common English words that are usually not * useful for searching. */ public static final Set<?> STOP_WORDS_SET; static { List<String> stopWords = Arrays.asList( "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "이", "그", "저", "요", "것", "수", "등", "들", "및", "에", "에서", "그리고", "그래서", "또", "또는", "꼭", "잘", "?", "!", ";", ".", "-"); CharArraySet stopSet = new CharArraySet(Version.LUCENE_36, stopWords.size(), false); stopSet.addAll(stopWords); STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } public KoreanAnalyzer() { this(Version.LUCENE_36, STOP_WORDS_SET); } /** 검색을 위한 형태소분석 */ public KoreanAnalyzer(boolean exactMatch) { this(Version.LUCENE_36, STOP_WORDS_SET); this.exactMatch = exactMatch; } public KoreanAnalyzer(Version matchVersion, String[] stopWords) throws IOException { this(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords)); } /** Builds an analyzer with the stop words from the given file. */ public KoreanAnalyzer(Version matchVersion) throws IOException { this(matchVersion, STOP_WORDS_SET); } // /** Builds an analyzer with the stop words from the given file. // * @see WordlistLoader#getWordSet(File) // */ // public KoreanAnalyzer(Version matchVersion, File stopwords) throws IOException { // this(matchVersion, WordlistLoader.getWordSet(new InputStreamReader(new FileInputStream(stopwords), DIC_ENCODING))); // } // // /** Builds an analyzer with the stop words from the given file. // * @see WordlistLoader#getWordSet(File) // */ // public KoreanAnalyzer(Version matchVersion, File stopwords, String encoding) throws IOException { // this(matchVersion, WordlistLoader.getWordSet(new InputStreamReader(new FileInputStream(stopwords), encoding))); // } // // /** Builds an analyzer with the stop words from the given reader. // * @see WordlistLoader#getWordSet(Reader) // */ // public KoreanAnalyzer(Version matchVersion, Reader stopwords) throws IOException { // this(matchVersion, WordlistLoader.getWordSet(stopwords)); // } /** Builds an analyzer with the stop words from the given reader. */ public KoreanAnalyzer(Version matchVersion, Set<?> stopWords) { super(matchVersion, stopWords); replaceInvalidAcronym = true; // matchVersion.onOrAfter(Version.LUCENE_36); } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { if (isDebugEnabled) log.debug("TokenStreamComponents를 생성합니다. fieldName=[{}]", fieldName); final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader); src.setMaxTokenLength(maxTokenLength); //src.setReplaceInvalidAcronym(replaceInvalidAcronym); TokenStream tok = new KoreanFilter(src, bigrammable, hasOrigin, exactMatch); tok = new LowerCaseFilter(matchVersion, tok); tok = new StopFilter(matchVersion, tok, stopwords); return new TokenStreamComponents(src, tok) { @Override protected boolean reset(final Reader reader) throws IOException { src.setMaxTokenLength(KoreanAnalyzer.this.maxTokenLength); return super.reset(reader); } }; } /** * determine whether the bigram index term is returned or not if a input word is failed to analysis * If true is set, the bigram index term is returned. If false is set, the bigram index term is not returned. */ public void setBigrammable(boolean is) { bigrammable = is; } /** determin whether the original term is returned or not if a input word is analyzed morphically. */ public void setHasOrigin(boolean has) { hasOrigin = has; } /** determin whether the original compound noun is returned or not if a input word is analyzed morphically. */ public void setOriginCNoun(boolean cnoun) { originCNoun = cnoun; } /** determin whether the original compound noun is returned or not if a input word is analyzed morphically. */ public void setExactMatch(boolean exact) { exactMatch = exact; } }