/*
* Copyright 2011-2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kr.morph.*;
import org.apache.lucene.analysis.kr.utils.DictionaryUtil;
import org.apache.lucene.analysis.kr.utils.HanjaUtils;
import org.apache.lucene.analysis.kr.utils.SynonymUtil;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
@SuppressWarnings( "unchecked" )
public class KoreanFilter extends TokenFilter {

    private static final Logger log = LoggerFactory.getLogger(KoreanFilter.class);

    // NOTE(review): the level flags are sampled once at class-load time, so a
    // log level changed at runtime is not picked up by these shortcuts.
    private static final boolean isTraceEnabled = log.isTraceEnabled();
    private static final boolean isDebugEnabled = log.isDebugEnabled();

    /** Index words extracted from the current source token; emitted one per {@link #incrementToken()} call. */
    private LinkedList<IndexWord> morphQueue;
    private MorphAnalyzer morph;
    private WordSpaceAnalyzer wsAnal;

    private boolean bigrammable = true;   // emit bigram terms for low-confidence analyses
    private boolean hasOrigin = true;     // also index the original surface form
    private boolean originCNoun = true;   // index the stem even when a compound-noun split exists
    private boolean exactMatch = false;   // suppress compound/bigram expansion

    private char[] curTermBuffer;
    private int curTermLength;            // length of the current source token (written, never read here)
    private String curType;
    private String curSource;             // unused in this class
    private int tokStart;                 // start offset of the current source token
    private int hanStart = 0;             // start position of the Hangul part, in the compound-noun case
    private int chStart = 0;              // unused in this class

    private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
    private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];

    public KoreanFilter(TokenStream input) {
        super(input);
        if (isDebugEnabled)
            log.debug("KoreanFilter를 생성합니다...");
        morphQueue = new LinkedList<IndexWord>();
        morph = new MorphAnalyzer();
        wsAnal = new WordSpaceAnalyzer();
        cnAnalyzer.setExactMach(false); // (sic) the upstream API spells it "Mach"
    }

    /**
     * @param input  input token stream
     * @param bigram whether bigram index terms are returned or not
     */
    public KoreanFilter(TokenStream input, boolean bigram) {
        this(input);
        bigrammable = bigram;
    }

    public KoreanFilter(TokenStream input, boolean bigram, boolean has) {
        this(input, bigram);
        hasOrigin = has;
    }

    public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match) {
        this(input, bigram, has);
        this.exactMatch = match;
    }

    public KoreanFilter(TokenStream input, boolean bigram, boolean has, boolean match, boolean cnoun) {
        this(input, bigram, has, match);
        this.originCNoun = cnoun;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        // Drain index words queued for the previous source token first.
        if (curTermBuffer != null && morphQueue.size() > 0) {
            setTermBufferByQueue(false);
            return true;
        }

        // Loop (instead of the original tail recursion) so that a long run of
        // source tokens producing no index words cannot overflow the stack.
        while (true) {
            if (!input.incrementToken())
                return false;

            curTermBuffer = termAtt.buffer().clone();
            curTermLength = termAtt.length();
            tokStart = offsetAtt.startOffset();
            curType = typeAtt.type();

            try {
                String term = new String(curTermBuffer, 0, termAtt.length());
                if (KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) {
                    analysisKorean(term);
                } else if (KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) {
                    analysisChinese(term);
                } else {
                    analysisETC(term);
                }

                // Enrich the queue with synonyms of every extracted index word.
                // Iterate a snapshot because we append to morphQueue inside.
                for (IndexWord indexWord : ImmutableList.copyOf(morphQueue)) {
                    String word = indexWord.getWord();
                    for (String syn : SynonymUtil.getSynonym(word)) {
                        if (!word.equals(syn)) {
                            if (isTraceEnabled)
                                log.trace("동의어를 추가합니다. word=[{}], syn=[{}]", word, syn);
                            morphQueue.add(new IndexWord(syn, indexWord.getOffset()));
                        }
                    }
                }
            } catch (MorphException e) {
                log.error("MorphException이 발생했습니다.", e);
                throw new IOException("Korean Filter MorphException\n" + e.getMessage(), e);
            }

            if (morphQueue.size() > 0) {
                setTermBufferByQueue(true);
                return true;
            }
            // The current token produced no index words; advance to the next one.
        }
    }

    /** Pops the next queued index word and copies it into the token attributes. */
    private void setTermBufferByQueue(boolean isFirst) {
        clearAttributes();
        IndexWord iw = morphQueue.removeFirst();
        int pos = iw.getOffset();
        termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length());
        offsetAtt.setOffset(tokStart + pos, tokStart + pos + iw.getWord().length());
        // A follow-up word extracted at the head of the source token stacks at
        // the same position as the first emitted term.
        if (!isFirst && iw.getOffset() == 0) {
            posIncrAtt.setPositionIncrement(0);
        }
    }

    /**
     * Analyzes a Hangul token and queues the extracted index words.
     *
     * @throws org.apache.lucene.analysis.kr.morph.MorphException
     */
    private void analysisKorean(String input) throws MorphException {
        if (isTraceEnabled)
            log.trace("한글을 분석합니다. input=[{}]", input);

        List<AnalysisOutput> outputs = morph.analyze(input);
        if (outputs.size() == 0) return;

        Map<String, IndexWord> map = new LinkedHashMap<String, IndexWord>();
        if (hasOrigin) map.put(input, new IndexWord(input, 0));

        if (outputs.get(0).getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
            extractKeyword(outputs, map);
        } else {
            // Low-confidence analysis: retry after word-space analysis.
            try {
                List<AnalysisOutput> list = wsAnal.analyze(input);
                List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
                if (list.size() > 1) {
                    for (AnalysisOutput o : list) {
                        if (hasOrigin)
                            map.put(o.getSource(), new IndexWord(o.getSource(), 0));
                        results.addAll(morph.analyze(o.getSource()));
                    }
                } else {
                    results.addAll(list);
                }
                extractKeyword(results, map);
            } catch (Exception e) {
                // Best effort: fall back to the un-spaced analysis on any failure.
                extractKeyword(outputs, map);
            }
        }

        for (Map.Entry<String, IndexWord> entry : map.entrySet()) {
            if (entry.getKey().length() <= 1) continue; // single characters are too noisy to index
            morphQueue.add(entry.getValue());
            if (isTraceEnabled)
                log.trace("큐에 추출한 인덱스를 추가합니다. indexWord=[{}]", entry.getValue());
        }
    }

    /** Collects index words from the analysis outputs into {@code map} (keyed by word to deduplicate). */
    private void extractKeyword(List<AnalysisOutput> outputs, Map<String, IndexWord> map) throws MorphException {
        if (isTraceEnabled)
            log.trace("키워드를 추출합니다...");

        for (AnalysisOutput output : outputs) {
            if (output.getPos() != PatternConstants.POS_VERB) {
                // Index the stem unless compound-noun originals are suppressed
                // and this analysis carries a compound-noun decomposition.
                if (originCNoun || output.getCNounList().size() == 0) {
                    map.put(output.getStem(), new IndexWord(output.getStem(), 0));
                }
            }

            if (exactMatch) continue; // exact-match mode: no compound/bigram expansion

            if (output.getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
                List<CompoundEntry> cnouns = output.getCNounList();
                int start = 0; // character offset of the current component within the stem
                for (int jj = 0; jj < cnouns.size(); jj++) {
                    CompoundEntry cnoun = cnouns.get(jj);
                    if (cnoun.getWord().length() > 1)
                        map.put(cnoun.getWord(), new IndexWord(cnoun.getWord(), start));
                    if (jj == 0 && cnoun.getWord().length() == 1) {
                        // A single-character head is merged with the following component.
                        // Bounds guard added: the original threw IndexOutOfBoundsException
                        // on a one-entry compound list.
                        if (jj + 1 < cnouns.size()) {
                            map.put(cnoun.getWord() + cnouns.get(jj + 1).getWord(),
                                    new IndexWord(cnoun.getWord(), start));
                        }
                    } else if (jj > 1 && cnoun.getWord().length() == 1) {
                        // A single-character tail is merged with the preceding component.
                        String iw = cnouns.get(jj - 1).getWord() + cnoun.getWord();
                        map.put(iw, new IndexWord(iw, start - cnouns.get(jj - 1).getWord().length()));
                    }
                    start += cnoun.getWord().length();
                }
            } else if (bigrammable) {
                addBiagramToMap(output.getStem(), map);
            }
        }
    }

    /** Splits {@code input} into overlapping 2-character grams (runs of alphanumerics kept whole) and adds them to {@code map}. */
    private void addBiagramToMap(String input, Map<String, IndexWord> map) {
        if (isTraceEnabled)
            log.trace("Biagram을 분석해서 맵에 추가. input=[{}]", input);

        int offset = 0;
        int strlen = input.length();
        while (offset < strlen - 1) {
            if (isAlphaNumChar(input.charAt(offset))) {
                String text = findAlphaNumeric(input.substring(offset));
                map.put(text, new IndexWord(text, offset));
                offset += text.length();
            } else {
                String text = input.substring(offset, Math.min(offset + 2, strlen));
                map.put(text, new IndexWord(text, offset));
                offset++;
            }
        }
    }

    /** @return the leading run of alphanumeric characters of {@code text} (possibly empty). */
    private String findAlphaNumeric(String text) {
        if (isTraceEnabled)
            log.trace("AlphaNumeric을 찾습니다. text=[{}]", text);
        int pos = 0;
        for (char c : text.toCharArray()) {
            if (!isAlphaNumChar(c))
                break;
            pos++;
        }
        return text.substring(0, pos);
    }

    /**
     * Analyzes a Hanja token. A hanja may be read as more than one hangul
     * sound (this is distinct from the initial-sound law).
     *
     * @param term the token text
     * @throws org.apache.lucene.analysis.kr.morph.MorphException
     */
    private void analysisChinese(String term) throws MorphException {
        if (isTraceEnabled)
            log.trace("한자를 분석합니다. term=[{}]", term);

        morphQueue.add(new IndexWord(term, 0));
        if (term.length() < 2) return; // no hangul index word for a single hanja

        List<StringBuilder> candiList = new ArrayList<StringBuilder>();
        candiList.add(new StringBuilder());

        for (int i = 0; i < term.length(); i++) {
            char[] chs = HanjaUtils.convertToHangul(term.charAt(i));
            if (chs == null) continue;
            // Only the first reading of each hanja is used. The original code's
            // branch that forked a new candidate per extra reading was dead
            // (it broke out of its loop before appending anything) and has
            // been removed; candiList therefore never grows past one entry.
            for (StringBuilder sb : candiList) {
                sb.append(chs[0]);
            }
        }

        int maxCandidate = Math.min(5, candiList.size());
        for (int i = 0; i < maxCandidate; i++) {
            morphQueue.add(new IndexWord(candiList.get(i).toString(), 0));
        }

        Map<String, String> cnounMap = new HashMap<String, String>();

        // If an extracted noun is a compound noun, split it.
        for (int i = 0; i < maxCandidate; i++) {
            List<CompoundEntry> results = confirmCNoun(candiList.get(i).toString());
            int pos = 0;    // end offset of the current component
            int offset = 0; // start offset of the current component
            for (CompoundEntry entry : results) {
                pos += entry.getWord().length();
                if (cnounMap.get(entry.getWord()) != null) continue; // already queued

                // Queue the hanja substring matching the hangul component.
                IndexWord indexWord = new IndexWord(term.substring(offset, pos), offset);
                morphQueue.add(indexWord);
                if (isTraceEnabled)
                    log.trace("한글과 매치되는 한자를 큐에 저장한다. indexWord=[{}]", indexWord);

                cnounMap.put(entry.getWord(), entry.getWord());
                if (entry.getWord().length() < 2) continue; // store hangul of 2+ characters only

                // Queue the split hangul component.
                morphQueue.add(new IndexWord(entry.getWord(), offset));
                offset = pos;
            }
        }
    }

    /** Looks {@code input} up in the compound-noun dictionary, falling back to the compound-noun analyzer. */
    private List<CompoundEntry> confirmCNoun(String input) throws MorphException {
        if (isTraceEnabled)
            log.trace("한자 명사인지 확인합니다. input=[{}]", input);
        WordEntry cnoun = DictionaryUtil.getCNoun(input);
        // Feature value '2' presumably marks an entry with a recorded compound
        // decomposition — TODO confirm against WordEntry's feature encoding.
        if (cnoun != null && cnoun.getFeature(WordEntry.IDX_NOUN) == '2') {
            return cnoun.getCompounds();
        }
        return cnAnalyzer.analyze(input);
    }

    /**
     * Handles non-Korean, non-Chinese tokens, mirroring Lucene's ClassicFilter:
     * strips a trailing 's from APOSTROPHE tokens and dots from ACRONYM tokens.
     */
    private void analysisETC(String term) throws MorphException {
        if (isTraceEnabled)
            log.trace("부가적인 분석을 수행합니다. term=[{}]", term);

        final char[] buffer = termAtt.buffer();
        final int bufferLength = termAtt.length();
        final String type = typeAtt.type();

        if (type.equals(APOSTROPHE_TYPE) && // remove 's
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
            // Strip last 2 characters off
            morphQueue.add(new IndexWord(term.substring(0, bufferLength - 2), 0));
        } else if (type.equals(ACRONYM_TYPE)) { // remove dots
            int upto = 0;
            for (int i = 0; i < bufferLength; i++) {
                char c = buffer[i];
                if (c != '.')
                    buffer[upto++] = c;
            }
            // Bug fix: use the compacted buffer. The original queued
            // term.substring(0, upto), which for "U.S.A." yielded "U.S"
            // instead of "USA".
            morphQueue.add(new IndexWord(new String(buffer, 0, upto), 0));
        } else {
            morphQueue.add(new IndexWord(term, 0));
        }
    }

    /**
     * @return whether {@code c} is an ASCII digit or in the range 'A'..'z'.
     *         NOTE(review): the 65..122 range also admits the punctuation
     *         characters 91-96 ('[', '\\', ']', '^', '_', '`'); kept as-is
     *         because bigram splitting depends on the current behavior.
     */
    private boolean isAlphaNumChar(int c) {
        return (c >= 48 && c <= 57) || (c >= 65 && c <= 122);
    }

    public void setHasOrigin(boolean has) {
        hasOrigin = has;
    }

    public void setExactMatch(boolean match) {
        this.exactMatch = match;
    }
}