/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.icu;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import com.ibm.icu.text.BreakIterator;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
/**
 * ICU segmenter.
 * <p>
 * Sentence boundaries and, within each sentence, word boundaries are obtained from ICU
 * {@link BreakIterator} instances created for the locale returned by {@code getLocale(JCas)}.
 * The annotations themselves are created through the {@code createSentence} and
 * {@code createToken} helpers inherited from {@link SegmenterBase}.
 */
@LanguageCapability({ "af", "ak", "am", "ar", "as", "az", "be", "bg", "bm", "bn", "bo", "br", "bs",
        "ca", "ce", "cs", "cy", "da", "de", "dz", "ee", "el", "en", "eo", "es", "et", "eu", "fa",
        "ff", "fi", "fo", "fr", "fy", "ga", "gd", "gl", "gu", "gv", "ha", "hi", "hr", "hu", "hy",
        "ig", "ii", "in", "is", "it", "iw", "ja", "ji", "ka", "ki", "kk", "kl", "km", "kn", "ko",
        "ks", "kw", "ky", "lb", "lg", "ln", "lo", "lt", "lu", "lv", "mg", "mk", "ml", "mn", "mr",
        "ms", "mt", "my", "nb", "nd", "ne", "nl", "nn", "om", "or", "os", "pa", "pl", "ps", "pt",
        "qu", "rm", "rn", "ro", "ru", "rw", "se", "sg", "si", "sk", "sl", "sn", "so", "sq", "sr",
        "sv", "sw", "ta", "te", "th", "ti", "to", "tr", "ug", "uk", "ur", "uz", "vi", "yo", "zh",
        "zu" })
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class IcuSegmenter
    extends SegmenterBase
{
    /**
     * Per default, the segmenter does not split off contractions like {@code John's} into two
     * tokens. When this parameter is enabled, a non-default token split is generated when an
     * apostrophe ({@code '}) is encountered.
     */
    public static final String PARAM_SPLIT_AT_APOSTROPHE = "splitAtApostrophe";
    @ConfigurationParameter(name = PARAM_SPLIT_AT_APOSTROPHE, mandatory = true, defaultValue = "false")
    private boolean splitAtApostrophe;

    /**
     * Splits the given text into sentences and tokenizes each sentence.
     * <p>
     * When {@code isWriteSentence()} is true, a sentence annotation is requested via
     * {@code createSentence} and tokens are created inside its covered text (nothing is tokenized
     * if {@code createSentence} returns {@code null}). Otherwise the sentence span is only trimmed
     * via {@code trim} and tokenized without creating a sentence annotation.
     *
     * @param aJCas
     *            the CAS to which annotations are added.
     * @param text
     *            the text to segment.
     * @param zoneBegin
     *            offset added to every boundary found in {@code text} to obtain document offsets
     *            (presumably the position of {@code text} within the document text - confirm
     *            against {@link SegmenterBase}).
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method signature.
     */
    @Override
    protected void process(JCas aJCas, String text, int zoneBegin)
        throws AnalysisEngineProcessException
    {
        BreakIterator bi = BreakIterator.getSentenceInstance(getLocale(aJCas));
        bi.setText(text);

        // Boundaries reported by the iterator are relative to "text"; shift them by zoneBegin.
        int last = bi.first() + zoneBegin;
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            cur += zoneBegin;
            if (isWriteSentence()) {
                Annotation segment = createSentence(aJCas, last, cur);
                if (segment != null) {
                    processSentence(aJCas, segment.getCoveredText(), segment.getBegin());
                }
            }
            else {
                // No sentence annotation requested: tokenize the trimmed span directly.
                String documentText = aJCas.getDocumentText();
                int[] span = new int[] { last, cur };
                trim(documentText, span);
                processSentence(aJCas, documentText.substring(span[0], span[1]), span[0]);
            }
            last = cur;
            cur = bi.next();
        }
    }

    /**
     * Process the sentence to create tokens.
     * <p>
     * One token is requested via {@code createToken} per word-boundary span. If
     * {@link #PARAM_SPLIT_AT_APOSTROPHE} is enabled and the created token contains an apostrophe
     * at any position other than its first character, the token is cut at the first apostrophe:
     * the original token keeps the part before it and a second token is created that starts at
     * the apostrophe.
     *
     * @param aJCas
     *            the CAS to which tokens are added.
     * @param text
     *            the sentence text.
     * @param zoneBegin
     *            offset added to every boundary found in {@code text} to obtain document offsets.
     */
    private void processSentence(JCas aJCas, String text, int zoneBegin)
    {
        BreakIterator bi = BreakIterator.getWordInstance(getLocale(aJCas));
        bi.setText(text);

        int last = bi.first() + zoneBegin;
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            cur += zoneBegin;
            Annotation token = createToken(aJCas, last, cur);
            if (token != null && splitAtApostrophe) {
                int i = token.getCoveredText().indexOf('\'');
                // i == 0 means the token starts with the apostrophe; there is nothing to split off.
                if (i > 0) {
                    i += token.getBegin();
                    createToken(aJCas, i, token.getEnd());
                    // The end offset is a key of the annotation index. Assuming createToken has
                    // already indexed the token (TODO confirm against SegmenterBase), it must be
                    // taken out of the indexes while the offset changes and re-added afterwards,
                    // otherwise the index may become corrupted.
                    token.removeFromIndexes();
                    token.setEnd(i);
                    token.addToIndexes();
                }
            }
            last = cur;
            cur = bi.next();
        }
    }
}