/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.segmentation;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.CasUtil.select;
import java.util.Iterator;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm;
/**
*/
public abstract
class SegmenterBase
extends JCasAnnotator_ImplBase
{
/**
* A list of type names used for zoning.
*/
public final static String PARAM_ZONE_TYPES = "zoneTypes";
@ConfigurationParameter(name=PARAM_ZONE_TYPES, mandatory=false, defaultValue = {
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div" })
private String[] zoneTypes;
/**
* Strict zoning causes the segmentation to be applied only within the
* boundaries of a zone annotation. This works only if a single zone type
* is specified (the zone annotations should NOT overlap) or if no zone
* type is specified - in which case the whole document is taken as a zone.
* If strict zoning is turned off, multiple zone types can be specified.
* A list of all zone boundaries (start and end) is created and segmentation
* happens between them.
*/
public final static String PARAM_STRICT_ZONING = "strictZoning";
@ConfigurationParameter(name=PARAM_STRICT_ZONING, mandatory=true, defaultValue="false")
private boolean strictZoning;
/**
* The language.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false)
private String language;
/**
* Create {@link Token} annotations.
*/
public static final String PARAM_WRITE_TOKEN = ComponentParameters.PARAM_WRITE_TOKEN;
@ConfigurationParameter(name=PARAM_WRITE_TOKEN, mandatory=true, defaultValue="true")
private boolean writeToken;
/**
* Create {@link TokenForm} annotations.
*/
public static final String PARAM_WRITE_FORM = ComponentParameters.PARAM_WRITE_FORM;
@ConfigurationParameter(name=PARAM_WRITE_FORM, mandatory=true, defaultValue="true")
private boolean writeForm;
/**
* Create {@link Sentence} annotations.
*/
public static final String PARAM_WRITE_SENTENCE = ComponentParameters.PARAM_WRITE_SENTENCE;
@ConfigurationParameter(name=PARAM_WRITE_SENTENCE, mandatory=true, defaultValue="true")
private boolean writeSentence;
public boolean isStrictZoning()
{
return strictZoning;
}
public boolean isWriteSentence()
{
return writeSentence;
}
public boolean isWriteToken()
{
return writeToken;
}
public String[] getZoneTypes()
{
return zoneTypes;
}
@Override
public void process(JCas jcas)
throws AnalysisEngineProcessException
{
String text = jcas.getDocumentText();
String[] zones = getZoneTypes();
if (isStrictZoning()) {
if (zones == null || zones.length == 0) {
process(jcas, text.substring(0, text.length()), 0);
}
else if (zones.length != 1) {
throw new AnalysisEngineProcessException(new IllegalStateException(
"Strict zoning cannot use multiple zone types"));
} else {
CAS cas = jcas.getCas();
for (AnnotationFS zone : select(cas, getType(cas, zones[0]))) {
int[] adjusted = limit(text, zone.getBegin(), zone.getEnd());
process(jcas, text.substring(adjusted[0], adjusted[1]), adjusted[0]);
}
}
}
else {
// This set collects all zone boundaries.
SortedSet<Integer> boundarySet = new TreeSet<Integer>();
boundarySet.add(0); // Add start boundary
boundarySet.add(text.length()); // Add end boundary
// If zoneTypes have been define then get the boundaries, otherwise we will
// simply have one big zone covering the whole document.
if (zones != null) {
// Iterate over all the zone indices and create sentences respecting
// the zone boundaries. If the zoneTypes overlap... well... bad luck!
for (String zoneName : zones) {
CAS cas = jcas.getCas();
for (AnnotationFS zone : select(cas, getType(cas, zoneName))) {
int[] adjusted = limit(text, zone.getBegin(), zone.getEnd());
boundarySet.add(adjusted[0]);
boundarySet.add(adjusted[1]);
}
}
}
// Now process all zoneTypes. There will be at least two entries in the
// boundary set (see above).
Iterator<Integer> bi = boundarySet.iterator();
int begin = bi.next();
while (bi.hasNext()) {
int end = bi.next();
process(jcas, text.substring(begin, end), begin);
begin = end;
}
}
}
/**
* Adjust the values in the two numeric arguments to be within the limits of the specified text.
* If the limits have to be adjusted, a warning is issued to the log. Illegal zone boundaries
* hint to a bug in the AE that produced the zone annotations.
*
* @param text
* the text.
* @param aBegin
* the zone begin.
* @param aEnd
* the zone end.
* @return reduced offsets.
*/
protected int[] limit(String text, int aBegin, int aEnd)
{
// checking to avoid out-of-bounds
int maxEnd = text.length();
int begin = aBegin < 0 ? 0 : aBegin;
begin = begin > maxEnd ? maxEnd : begin;
int end = aEnd < 0 ? 0 : aEnd;
end = end > maxEnd ? maxEnd : end;
if (begin != aBegin || end != aEnd) {
getLogger().warn(
"Adjusted out-of-bounds zone [" + aBegin + "-" + aEnd + "] to [" + begin + "-"
+ end + "]");
}
int[] offsets = { begin, end };
return offsets;
}
protected Sentence createSentence(final JCas aJCas, final int aBegin,
final int aEnd)
{
int[] span = new int[] { aBegin, aEnd };
trim(aJCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1]) && isWriteSentence()) {
Sentence seg = new Sentence(aJCas, span[0], span[1]);
seg.addToIndexes(aJCas);
return seg;
}
else {
return null;
}
}
/**
* @deprecated use {@link #createToken(JCas, int, int)}
*/
@Deprecated
protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd, final int aIndex)
{
return createToken(aJCas, null, aBegin, aEnd);
}
protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd)
{
return createToken(aJCas, null, aBegin, aEnd);
}
protected Token createToken(final JCas aJCas, final String aForm, final int aBegin,
final int aEnd)
{
int[] span = new int[] { aBegin, aEnd };
trim(aJCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1]) && isWriteToken()) {
Token seg = new Token(aJCas, span[0], span[1]);
if (aForm != null && writeForm) {
seg.setText(aForm);
}
seg.addToIndexes(aJCas);
return seg;
}
else {
return null;
}
}
protected abstract void process(JCas aJCas, String text, int zoneBegin)
throws AnalysisEngineProcessException;
/**
* Remove trailing or leading whitespace from the annotation.
* @param aText the text.
* @param aSpan the offsets.
*/
public void trim(String aText, int[] aSpan)
{
int begin = aSpan[0];
int end = aSpan[1]-1;
String data = aText;
while (
(begin < (data.length()-1))
&& trimChar(data.charAt(begin))
) {
begin ++;
}
while (
(end > 0)
&& trimChar(data.charAt(end))
) {
end --;
}
end++;
aSpan[0] = begin;
aSpan[1] = end;
}
public boolean isEmpty(int aBegin, int aEnd)
{
return aBegin >= aEnd;
}
public boolean trimChar(final char aChar)
{
switch (aChar) {
case '\n': return true; // Line break
case '\r': return true; // Carriage return
case '\t': return true; // Tab
case '\u200E': return true; // LEFT-TO-RIGHT MARK
case '\u200F': return true; // RIGHT-TO-LEFT MARK
case '\u2028': return true; // LINE SEPARATOR
case '\u2029': return true; // PARAGRAPH SEPARATOR
default:
return Character.isWhitespace(aChar);
}
}
public String getLanguage(JCas aJCas)
{
if (language != null) {
return language;
}
else {
return aJCas.getDocumentLanguage();
}
}
/**
* Get the locale from the parameter, then from the document if available.
* If no locale is set get the default locale from the VM.
* @param aJCas the JCas.
* @return the locale.
*/
public Locale getLocale(JCas aJCas)
{
String lang = getLanguage(aJCas);
if (lang != null) {
return new Locale(lang);
}
else {
return Locale.getDefault();
}
}
}