/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.tokit;
import java.text.BreakIterator;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
/**
 * Segmenter based on the Java {@link BreakIterator}.
 * <p>
 * Sentence boundaries are detected with a sentence-instance {@link BreakIterator} and token
 * boundaries within each sentence with a word-instance {@link BreakIterator}. Both iterators are
 * obtained for the locale returned by {@code getLocale(JCas)}.
 */
@LanguageCapability({ "ar", "be", "bg", "ca", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr",
        "ga", "hi", "hr", "hu", "in", "is", "it", "iw", "ja", "ko", "lt", "lv", "mk", "ms", "mt",
        "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "th", "tr", "uk", "vi",
        "zh" })
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class BreakIteratorSegmenter
    extends SegmenterBase
{
    /**
     * Per default the Java {@link BreakIterator} does not split off contractions like
     * {@code John's} into two tokens. When this parameter is enabled, a non-default token split is
     * generated when an apostrophe ({@code '}) is encountered. Only the first apostrophe in a
     * token is considered, and only if it is not the first character of the token.
     */
    public static final String PARAM_SPLIT_AT_APOSTROPHE = "splitAtApostrophe";
    @ConfigurationParameter(name = PARAM_SPLIT_AT_APOSTROPHE, mandatory = true, defaultValue = "false")
    private boolean splitAtApostrophe;

    /**
     * Splits the given zone text into sentences and tokenizes each sentence.
     * <p>
     * If {@code isWriteSentence()} is true, a sentence annotation is created for every segment
     * and its covered text is tokenized. Otherwise no sentence annotation is created; the segment
     * span is trimmed and the corresponding part of the document text is tokenized directly.
     *
     * @param aJCas
     *            the CAS to which annotations are added.
     * @param text
     *            the text of the zone to be segmented.
     * @param zoneBegin
     *            the offset that is added to the {@link BreakIterator} boundaries (which are
     *            relative to {@code text}) to obtain the offsets passed on for annotation.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; not thrown directly by this implementation.
     */
    @Override
    protected void process(JCas aJCas, String text, int zoneBegin)
        throws AnalysisEngineProcessException
    {
        BreakIterator bi = BreakIterator.getSentenceInstance(getLocale(aJCas));
        bi.setText(text);
        int last = bi.first() + zoneBegin;
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            // Boundaries are relative to "text"; shift them by the zone offset.
            cur += zoneBegin;
            if (isWriteSentence()) {
                Annotation segment = createSentence(aJCas, last, cur);
                // createSentence may return null, in which case there is nothing to tokenize.
                if (segment != null) {
                    processSentence(aJCas, segment.getCoveredText(), segment.getBegin());
                }
            }
            else {
                String documentText = aJCas.getDocumentText();
                int[] span = new int[] { last, cur };
                // NOTE(review): trim presumably narrows the span in place; the adjusted
                // boundaries are read back from the array below.
                trim(documentText, span);
                processSentence(aJCas, documentText.substring(span[0], span[1]), span[0]);
            }
            last = cur;
            cur = bi.next();
        }
    }

    /**
     * Process the sentence to create tokens.
     *
     * @param aJCas
     *            the CAS to which the tokens are added.
     * @param text
     *            the text of the sentence.
     * @param zoneBegin
     *            the offset that is added to the {@link BreakIterator} boundaries (which are
     *            relative to {@code text}) to obtain the token offsets.
     */
    private void processSentence(JCas aJCas, String text, int zoneBegin)
    {
        BreakIterator bi = BreakIterator.getWordInstance(getLocale(aJCas));
        bi.setText(text);
        int last = bi.first() + zoneBegin;
        int cur = bi.next();
        while (cur != BreakIterator.DONE) {
            cur += zoneBegin;
            Annotation token = createToken(aJCas, last, cur);
            // createToken may return null; then there is no token to split.
            if (token != null && splitAtApostrophe) {
                splitTokenAtApostrophe(aJCas, token);
            }
            last = cur;
            cur = bi.next();
        }
    }

    /**
     * Splits the given token in two at its first apostrophe, provided the apostrophe is not the
     * first character: the given token is shortened to end before the apostrophe and a new token
     * is created from the apostrophe to the original end.
     *
     * @param aJCas
     *            the CAS to which the additional token is added.
     * @param token
     *            the token to split; left untouched if no split applies.
     */
    private void splitTokenAtApostrophe(JCas aJCas, Annotation token)
    {
        int offset = token.getCoveredText().indexOf('\'');
        if (offset <= 0) {
            return;
        }

        int splitAt = token.getBegin() + offset;
        int originalEnd = token.getEnd();

        // The end offset is a key of the annotation index. The token is expected to be indexed
        // already by createToken, so it has to be taken out of the indexes while the key is
        // changed; otherwise the index may become corrupted.
        token.removeFromIndexes();
        token.setEnd(splitAt);
        token.addToIndexes();

        createToken(aJCas, splitAt, originalEnd);
    }
}