/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.text;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
/**
* UIMA collection reader for plain text files, one sentence per line.
*/
@TypeCapability(
outputs={
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})
public class LineOrientedTextReader
extends JCasResourceCollectionReader_ImplBase
{
@Override
public void getNext(JCas aJCas)
throws IOException, CollectionException
{
Resource res = nextFile();
initCas(aJCas, res);
try (InputStream is = new BufferedInputStream(res.getInputStream())) {
aJCas.setDocumentText(IOUtils.toString(is, "UTF-8"));
}
String t = aJCas.getDocumentText();
int start = 0;
int end = t.indexOf('\n');
while (end >= 0) {
createSentence(aJCas, start, end);
start = end + 1;
if (start < t.length()) {
end = t.indexOf('\n', start);
}
else {
end = -1;
}
}
if (start < t.length()) {
createSentence(aJCas, start, t.length());
}
}
protected Sentence createSentence(final JCas aJCas, final int aBegin,
final int aEnd)
{
int[] span = new int[] { aBegin, aEnd };
trim(aJCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
Sentence seg = new Sentence(aJCas, span[0], span[1]);
seg.addToIndexes(aJCas);
return seg;
}
else {
return null;
}
}
/**
* Remove trailing or leading whitespace from the annotation.
* @param aText the text.
* @param aSpan the offsets.
*/
public void trim(String aText, int[] aSpan)
{
int begin = aSpan[0];
int end = aSpan[1]-1;
String data = aText;
while (
(begin < (data.length()-1))
&& trimChar(data.charAt(begin))
) {
begin ++;
}
while (
(end > 0)
&& trimChar(data.charAt(end))
) {
end --;
}
end++;
aSpan[0] = begin;
aSpan[1] = end;
}
public boolean isEmpty(int aBegin, int aEnd)
{
return aBegin >= aEnd;
}
public boolean trimChar(final char aChar)
{
switch (aChar) {
case '\n': return true; // Line break
case '\r': return true; // Carriage return
case '\t': return true; // Tab
case '\u200E': return true; // LEFT-TO-RIGHT MARK
case '\u200F': return true; // RIGHT-TO-LEFT MARK
case '\u2028': return true; // LINE SEPARATOR
case '\u2029': return true; // PARAGRAPH SEPARATOR
default:
return Character.isWhitespace(aChar);
}
}
}