/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.languagetool;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.languagetool.Language;
import org.languagetool.Languages;
import cn.com.cjf.CJFBeanFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
/**
 * Segmenter using LanguageTool to do the heavy lifting. LanguageTool internally uses different
 * strategies for tokenization.
 */
@LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh",
        "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta",
        "tl", "uk" })
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" })
public class LanguageToolSegmenter extends SegmenterBase
{
    /**
     * Segments the given zone text into sentences and tokens using the LanguageTool tokenizers
     * registered for the document language.
     *
     * @param aJCas
     *            the CAS to annotate.
     * @param aText
     *            the text of the current zone.
     * @param aZoneBegin
     *            offset of the zone within the document text; all created annotation offsets are
     *            shifted by this amount.
     * @throws AnalysisEngineProcessException
     *             declared by the framework contract.
     */
    @Override
    protected void process(JCas aJCas, String aText, int aZoneBegin)
        throws AnalysisEngineProcessException
    {
        // The document language is loop-invariant; fetch it once instead of calling
        // getLanguage() again for every token.
        String languageCode = getLanguage(aJCas);
        Language lang = Languages.getLanguageForShortName(languageCode);

        // If LanguageTool defines a default variant for this language, prefer it - the variant
        // may carry a more specific tokenizer configuration than the plain language.
        Language defaultVariant = lang.getDefaultLanguageVariant();
        if (defaultVariant != null) {
            getLogger().debug(
                    "Using default variant [" + defaultVariant.getShortNameWithCountryAndVariant()
                            + "] for language [" + languageCode + "]");
            lang = defaultVariant;
        }

        // Locate each sentence in the zone text. The search start advances monotonically so
        // that repeated identical sentences are matched at increasing offsets.
        int lastSStart = 0;
        for (String s : lang.getSentenceTokenizer().tokenize(aText)) {
            int sStart = aText.indexOf(s, lastSStart);
            if (sStart == -1) {
                // Fail fast instead of silently creating a sentence with bogus negative offsets.
                throw new IllegalStateException("Sentence [" + s + "] not found in zone text");
            }
            int sEnd = sStart + s.length();
            lastSStart = sEnd;
            createSentence(aJCas, sStart + aZoneBegin, sEnd + aZoneBegin);
            processSentenceTokens(aJCas, lang, languageCode, s, sStart + aZoneBegin);
        }
    }

    /**
     * Tokenizes a single sentence and creates token annotations for it, including the
     * workarounds needed for the Chinese tokenizer.
     *
     * @param aJCas
     *            the CAS to annotate.
     * @param aLang
     *            the (possibly variant-resolved) LanguageTool language.
     * @param aLanguageCode
     *            the original document language code (drives the Chinese workarounds).
     * @param aSentence
     *            the sentence text.
     * @param aSentenceBegin
     *            absolute offset of the sentence within the document text.
     */
    private void processSentenceTokens(JCas aJCas, Language aLang, String aLanguageCode,
            String aSentence, int aSentenceBegin)
    {
        // Search start advances monotonically so repeated identical tokens match in order.
        int lastTStart = 0;
        for (String t : aLang.getWordTokenizer().tokenize(aSentence)) {
            int tStart = aSentence.indexOf(t, lastTStart);

            // The Chinese tokenizer adds some /xxx suffixes, try to remove that
            if ("zh".equals(aLanguageCode) && tStart == -1) {
                int suffix = t.indexOf('/');
                if (suffix != -1) {
                    t = t.substring(0, suffix);
                }
                tStart = aSentence.indexOf(t, lastTStart);
            }

            // The Chinese tokenizer normalizes from traditional to simplified Chinese.
            // Maybe we have to undo this transformation.
            if ("zh".equals(aLanguageCode) && tStart == -1) {
                String trad = CJFBeanFactory.getChineseJF().chineseJan2Fan(t);
                tStart = aSentence.indexOf(trad, lastTStart);
            }

            if (tStart == -1) {
                throw new IllegalStateException("Token [" + t + "] not found in sentence ["
                        + aSentence + "]");
            }

            // NOTE(review): in the traditional-Chinese fallback the length of the simplified
            // token is used for the span, matching the original behavior; this assumes the
            // Jan2Fan conversion is length-preserving.
            int tEnd = tStart + t.length();
            lastTStart = tEnd;
            createToken(aJCas, tStart + aSentenceBegin, tEnd + aSentenceBegin);
        }
    }
}