/*
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.mecab;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.JCasIterable;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken;
/**
 * Detailed test for {@link MeCabTagger}: runs the tagger over a single Japanese
 * sentence and verifies the full morphological analysis (surface form, POS,
 * lemma, IBO chunk tag, katakana reading, inflection form and inflection type)
 * of every token.
 */
public class MeCabTaggerDetailedTest {

    @Test
    public void testMeCabTagger() throws UIMAException, IOException {
        CollectionReaderDescription reader = createReaderDescription(
                TextReader.class,
                TextReader.PARAM_SOURCE_LOCATION, "src/test/resources",
                TextReader.PARAM_LANGUAGE, "ja",
                TextReader.PARAM_PATTERNS, new String[] { "[+]detailedTest.txt" });
        AnalysisEngine jTagger = createEngine(MeCabTagger.class);
        try {
            JCas jcas = new JCasIterable(reader).iterator().next();
            Collection<Sentence> totalFound = getSentences(jTagger, jcas);
            assertEquals(1, totalFound.size());
            evaluateSentence(totalFound, jcas);
            // print the found sentences for manual inspection
            for (Sentence s : totalFound) {
                System.out.println(s.getCoveredText());
            }
        } finally {
            // release native MeCab resources even if an assertion fails
            jTagger.destroy();
        }
    }

    /**
     * Checks the expected detailed analysis of all 15 tokens of the single
     * test sentence.
     */
    private void evaluateSentence(Collection<Sentence> totalFound, JCas jcas) {
        Sentence sent = totalFound.iterator().next();
        List<JapaneseToken> tokens = JCasUtil.selectCovered(jcas, JapaneseToken.class, sent.getBegin(),
                sent.getEnd());
        assertEquals(15, tokens.size());

        // Token 1: the POS tag is only compared by its first 6 characters
        // (名詞-副詞可) — presumably because dictionary variants differ in the
        // suffix, so the exact-match check is skipped via the null argument.
        assertEquals("名詞-副詞可", getPOS(jcas, tokens.get(0)).substring(0, 6));
        assertToken(jcas, tokens.get(0), "今", null, "今", "O", "イマ", "", "");
        // Token 2
        assertToken(jcas, tokens.get(1), "まで", "助詞-副助詞", "まで", "O", "マデ", "", "");
        // Token 3
        assertToken(jcas, tokens.get(2), "旅行", "名詞-サ変接続", "旅行", "O", "リョコウ", "", "");
        // Token 4
        assertToken(jcas, tokens.get(3), "し", "動詞-自立", "する", "B", "シ", "連用形", "サ変・スル");
        // Token 5
        assertToken(jcas, tokens.get(4), "た", "助動詞", "た", "I", "タ", "基本形", "特殊・タ");
        // Token 6
        assertToken(jcas, tokens.get(5), "国", "名詞-一般", "国", "O", "クニ", "", "");
        // Token 7
        assertToken(jcas, tokens.get(6), "の", "助詞-連体化", "の", "O", "ノ", "", "");
        // Token 8
        assertToken(jcas, tokens.get(7), "中", "名詞-非自立-副詞可能", "中", "O", "ナカ", "", "");
        // Token 9
        assertToken(jcas, tokens.get(8), "で", "助詞-格助詞-一般", "で", "O", "デ", "", "");
        // Token 10
        assertToken(jcas, tokens.get(9), "日本", "名詞-固有名詞-地域-国", "日本", "O", "ニッポン", "", "");
        // Token 11
        assertToken(jcas, tokens.get(10), "が", "助詞-格助詞-一般", "が", "O", "ガ", "", "");
        // Token 12
        assertToken(jcas, tokens.get(11), "一番", "名詞-副詞可能", "一番", "O", "イチバン", "", "");
        // Token 13
        assertToken(jcas, tokens.get(12), "楽しかっ", "形容詞-自立", "楽しい", "B", "タノシカッ", "連用タ接続", "形容詞・イ段");
        // Token 14
        assertToken(jcas, tokens.get(13), "た", "助動詞", "た", "I", "タ", "基本形", "特殊・タ");
        // Token 15
        assertToken(jcas, tokens.get(14), "。", "記号-句点", "。", "O", "。", "", "");
    }

    /**
     * Asserts the expected detailed analysis of a single token.
     *
     * @param pos the expected POS value, or {@code null} to skip the POS check
     *            (used when the caller compares the POS by prefix instead)
     */
    private void assertToken(JCas jcas, JapaneseToken token, String form, String pos, String lemma,
            String ibo, String kana, String kei, String dan) {
        assertEquals(form, getForm(token));
        if (pos != null) {
            assertEquals(pos, getPOS(jcas, token));
        }
        assertEquals(lemma, getLemma(jcas, token));
        assertEquals(ibo, token.getIbo());
        assertEquals(kana, token.getKana());
        assertEquals(kei, token.getKei());
        assertEquals(dan, token.getDan());
    }

    /**
     * Returns the POS value of the single {@link POS} annotation covering the
     * given token, or the empty string if there is not exactly one.
     */
    private String getPOS(JCas jcas, Token token) {
        List<POS> selectCovered = JCasUtil.selectCovered(jcas, POS.class, token.getBegin(), token.getEnd());
        if (selectCovered.size() == 1) {
            return selectCovered.get(0).getPosValue();
        }
        return "";
    }

    /**
     * Returns the value of the single {@link Lemma} annotation covering the
     * given token, or the empty string if there is not exactly one.
     */
    private String getLemma(JCas jcas, Token token) {
        List<Lemma> selectCovered = JCasUtil.selectCovered(jcas, Lemma.class, token.getBegin(), token.getEnd());
        if (selectCovered.size() == 1) {
            return selectCovered.get(0).getValue();
        }
        return "";
    }

    /**
     * Returns the covered text (surface form) of the given token.
     */
    private String getForm(Token token) {
        return token.getCoveredText();
    }

    /**
     * Processes the CAS with the given engine and returns all sentence
     * annotations it produced.
     */
    private Collection<Sentence> getSentences(AnalysisEngine jTagger, JCas jcas)
            throws AnalysisEngineProcessException {
        jTagger.process(jcas);
        return JCasUtil.select(jcas, Sentence.class);
    }
}