/*
* Copyright 2011-2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr.test;
import junit.framework.TestCase;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.kr.morph.*;
import org.apache.lucene.analysis.kr.utils.DictionaryUtil;
import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
import java.util.*;
/**
 * Smoke tests for the Korean morphological analyzer classes
 * ({@link MorphAnalyzer}, {@link MorphAnalyzerManager}, {@link CompoundNounAnalyzer})
 * and the dictionary lookup in {@link DictionaryUtil}.
 *
 * <p>Most tests only print analysis results to standard output for manual inspection;
 * only a few of them assert on a value.
 *
 * <p>NOTE(review): this class extends the JUnit 3 {@link TestCase} while also carrying
 * JUnit 4 annotations. When executed by a JUnit 3 style runner only public {@code test*}
 * methods are picked up and annotations such as {@code @Ignore} are not consulted —
 * confirm which runner the build actually uses.
 */
@Slf4j
public class MorphAnalyzerTest extends TestCase {

    /**
     * Joins the words of the compound-noun list of an analysis result,
     * appending {@code "/"} after every word (including the last one).
     *
     * @param output analysis result whose compound-noun list is rendered
     * @return the concatenated words, or an empty string when the list is empty
     */
    private static String joinCompoundNouns(AnalysisOutput output) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < output.getCNounList().size(); i++) {
            joined.append(output.getCNounList().get(i).getWord()).append("/");
        }
        return joined.toString();
    }

    /**
     * Analyzes a fixed list of Korean words and prints every analysis candidate
     * together with its compound nouns, pattern and score, followed by the elapsed time.
     *
     * @throws Exception if the analyzer fails
     */
    @Test
    public void testMorphAnalyzer() throws Exception {
        String[] inputs = new String[] {
            "고물가시대의",
            "대외적",
            "합쳐져", "뛰어오르고", "급여생활자나", "영세자영업자", "영세농어민", "서민계층들은", "온몸으로", "엄습하고",
            "드라마가",
            "과장광고", "과소비",
            "날을", "아울러", "휴대전화기능처리부와", "코발트의",
            "달라", "포함하고",
            "사랑받아봄을", "하는",
            "아딸떡볶이"
            , "발행일", "출원인" // analyzed as a case that ends with a verbal ending (eomi)
            , "노란",
            "만능청약통장", "가시밭같다",
            "정책적", "시리즈를", "자리잡은", "찜통이다", "지난해",
            "데모입니다",
            "바이오및뇌공학",
            "급락조짐을",
            "4.19의거는",
            "고스트x를",
            "검색서비스를",
            "장애물이"
        };

        MorphAnalyzer analyzer = new MorphAnalyzer();
        long start = 0;

        for (String input : inputs) {
            List<AnalysisOutput> candidates = analyzer.analyze(input);
            for (AnalysisOutput o : candidates) {
                System.out.println(o.toString() + "->"
                        + joinCompoundNouns(o)
                        + o.getPatn()
                        + "<" + o.getScore() + ">");
            }
            // The clock is started only after the first word has been processed, so the
            // first analyze() call is excluded from the measurement
            // (presumably to leave out one-time initialization cost — confirm).
            if (start == 0) {
                start = System.currentTimeMillis();
            }
        }

        System.out.println((System.currentTimeMillis() - start) + "ms");
    }

    /**
     * Verifies that cloning an {@link AnalysisOutput} carries the stem over to the copy.
     *
     * @throws Exception if cloning fails
     */
    @Test
    public void testCloneAnalysisOutput() throws Exception {
        AnalysisOutput output = new AnalysisOutput();
        output.setStem("aaaa");

        AnalysisOutput clone = output.clone();

        assertEquals("aaaa", clone.getStem());
        System.out.println(clone.getStem());
    }

    /**
     * Runs {@link MorphAnalyzerManager#analyze} on a short sentence; passes when no
     * exception is thrown.
     *
     * @throws Exception if the analysis fails
     */
    @Test
    public void testMorphAnalyzerManager() throws Exception {
        String input = "나는 학교에 갔습니다";
        MorphAnalyzerManager manager = new MorphAnalyzerManager();
        manager.analyze(input);
    }

    /**
     * Prints each character of a digit/letter sample together with its numeric code.
     *
     * @throws Exception never thrown by the current body
     */
    @Test
    public void testAlphaNumeric() throws Exception {
        String str = "0123456789azAZ";
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            System.out.println(c + ":" + (int) c);
        }
    }

    /**
     * Looks up a compound noun in the dictionary and prints the word of the entry found.
     *
     * @throws Exception if the dictionary lookup fails
     */
    @Test
    public void testGetWordEntry() throws Exception {
        String s = "밤하늘";
        WordEntry we = DictionaryUtil.getCNoun(s);

        // Fail with a readable message instead of a NullPointerException when the
        // dictionary has no entry for the word.
        assertNotNull("no compound-noun dictionary entry for " + s, we);
        System.out.println(we.getWord());
    }

    /**
     * Test case used to combine "하다" / "되다" type verbs from the Sejong dictionary
     * with substantives (nouns).
     *
     * <p>Reads the verb list, analyzes every entry ending in "하다" or "되다", groups the
     * rendered results by the entry minus its last two characters, merges them into the
     * noun list and writes three sorted result files under {@code data/}.
     *
     * <p>NOTE(review): the files are read and written with the platform default charset;
     * since the data is Korean text an explicit encoding may be required — confirm the
     * encoding of the data files.
     *
     * @throws Exception if a data file cannot be read or written, or the analysis fails
     */
    @Test
    @Ignore( "용언 데이터 파일이 없습니다." )
    public void yongonAnalysis() throws Exception {
        String fname = "data/용언_상세.txt";
        List<String> verbs = FileUtils.readLines(new File(fname));

        // key: entry without its trailing two characters, value: rendered analysis result(s)
        Map<String, String> younons = new HashMap<String, String>();
        MorphAnalyzer analyzer = new MorphAnalyzer();

        // entries that end with neither "하다" nor "되다" are passed through unanalyzed
        List<String> youngOutputs = new ArrayList<String>();

        for (String input : verbs) {
            if (!input.endsWith("하다") && !input.endsWith("되다")) {
                youngOutputs.add(input);
                continue;
            }

            String eogan = input.substring(0, input.length() - 2);

            List<AnalysisOutput> analyses = analyzer.analyze(input);
            assertFalse("no analysis result for " + input, analyses.isEmpty());
            AnalysisOutput o = analyses.get(0); // only the first candidate is recorded

            String result = o.toString() + "->" + joinCompoundNouns(o) + "<" + o.getScore() + ">";

            String previous = younons.get(eogan);
            if (previous == null) {
                younons.put(eogan, result);
            } else {
                younons.put(eogan, previous + "| " + result);
            }
        }

        fname = "data/체언_상세.txt";
        String cheonOutfile = "data/cheon.txt";
        String youngOutfile = "data/youngon.txt";

        List<String> cheons = FileUtils.readLines(new File(fname));
        List<String> merged = new ArrayList<String>();
        System.out.println(younons.size());

        // Attach the analysis to every noun that has one; matched keys are removed so
        // that only the unmatched ones remain in the map afterwards.
        for (String cheon : cheons) {
            String analysis = younons.remove(cheon);
            merged.add(analysis == null ? cheon : cheon + "=> " + analysis);
        }

        // Append the results whose key did not appear in the noun list.
        for (Map.Entry<String, String> rest : younons.entrySet()) {
            merged.add(rest.getKey() + "=> " + rest.getValue());
        }

        Collections.sort(merged);
        Collections.sort(youngOutputs);

        FileUtils.writeLines(new File(cheonOutfile), merged);
        FileUtils.writeLines(new File(youngOutfile), youngOutputs);

        merged.addAll(youngOutputs);
        Collections.sort(merged);
        FileUtils.writeLines(new File("data/all.txt"), merged);
    }

    /**
     * Prints the components of a compound noun stored in the dictionary, provided the
     * entry exists and its noun feature equals {@code '2'}.
     *
     * @throws Exception if the dictionary lookup fails
     */
    @Test
    public void testCompoundNounsWithinDic() throws Exception {
        String input = "고투자율";
        WordEntry cnoun = DictionaryUtil.getCNoun(input);

        if (cnoun == null || cnoun.getFeature(WordEntry.IDX_NOUN) != '2') {
            return;
        }

        List<CompoundEntry> parts = cnoun.getCompounds();
        for (int j = 0; j < parts.size(); j++) {
            System.out.println(parts.get(j).getWord());
        }
    }

    /**
     * Decomposes a compound noun with {@link CompoundNounAnalyzer} in exact-match mode
     * and prints each component; does nothing when the analyzer returns {@code null}.
     *
     * @throws Exception if the analysis fails
     */
    @Test
    public void testCompoundNouns() throws Exception {
        String input = "가돌리늄착화합물";

        CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
        cnAnalyzer.setExactMach(true);

        List<CompoundEntry> parts = cnAnalyzer.analyze(input);
        if (parts == null) {
            return;
        }

        for (CompoundEntry entry : parts) {
            System.out.println(entry.getWord());
        }
    }
}