/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.tokit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
/**
 * Creates {@link Paragraph} annotations for the given input document. The document text is split
 * at every match of a configurable regular expression (see {@link #PARAM_SPLIT_PATTERN}). With the
 * default pattern, two or more consecutive line breaks (Unix or Windows style) are regarded as the
 * boundary between paragraphs.
 */
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph" })
public class ParagraphSplitter
    extends JCasAnnotator_ImplBase
{
    /**
     * Matches a run of one or more line breaks, either Windows style ({@code \r\n}) or Unix style
     * ({@code \n}). The Windows alternative comes first so that the carriage return is consumed
     * as part of the separator instead of being left at the end of the preceding paragraph.
     */
    public static final String SINGLE_LINE_BREAKS_PATTERN = "((\r\n)+(\r\n)*)|((\n)+(\n)*)";

    /**
     * Matches a run of two or more line breaks, either Windows style ({@code \r\n}) or Unix style
     * ({@code \n}).
     */
    public static final String DOUBLE_LINE_BREAKS_PATTERN = "((\r\n\r\n)+(\r\n)*)|((\n\n)+(\n)*)";

    /**
     * A regular expression used to detect paragraph splits.
     *
     * Default: {@link #DOUBLE_LINE_BREAKS_PATTERN} (split on two consecutive line breaks)
     */
    public static final String PARAM_SPLIT_PATTERN = "splitPattern";
    @ConfigurationParameter(name = PARAM_SPLIT_PATTERN, defaultValue = DOUBLE_LINE_BREAKS_PATTERN)
    private Pattern splitPattern;

    /**
     * Splits the document text at every match of the configured split pattern and adds one
     * {@link Paragraph} annotation for each non-empty stretch of text between two matches,
     * before the first match, and after the last match.
     *
     * @param aJCas
     *            the CAS whose document text is split and to whose indexes the paragraphs are
     *            added.
     * @throws AnalysisEngineProcessException
     *             if the document text is missing or empty.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        String input = aJCas.getDocumentText();
        if (input == null || input.isEmpty()) {
            throw new AnalysisEngineProcessException(
                    new IllegalStateException("Document text is empty."));
        }

        Matcher matcher = splitPattern.matcher(input);
        int paragraphBegin = 0;
        // Plain find() resumes after the previous match and always makes progress, even when a
        // custom split pattern matches the empty string.
        while (matcher.find()) {
            addParagraph(aJCas, paragraphBegin, matcher.start());
            paragraphBegin = matcher.end();
        }
        // Text following the last separator (or the whole document if nothing matched).
        addParagraph(aJCas, paragraphBegin, input.length());
    }

    /**
     * Adds a paragraph annotation covering {@code [begin, end)} to the CAS indexes. Zero-length
     * spans, which occur when the text starts with a separator or when two separator matches
     * are adjacent, are skipped.
     */
    private static void addParagraph(JCas aJCas, int begin, int end)
    {
        if (begin < end) {
            new Paragraph(aJCas, begin, end).addToIndexes();
        }
    }
}