/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.aclanthology;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import org.apache.commons.io.IOUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import com.ibm.icu.text.CharsetDetector;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
/**
 * <p>Reads the ACL anthology corpus and outputs CASes with plain text documents.</p>
 *
 * <p>The reader tries to strip out hyphenation and replace problematic characters to produce a
 * cleaned text. Otherwise, it is a plain text reader.</p>
 *
 * <p>Processing steps applied to every document, in order:</p>
 * <ol>
 * <li>decode the input using the configured (or auto-detected) encoding,</li>
 * <li>replace every character listed in {@link #replaceChars} by a blank,</li>
 * <li>join words that were hyphenated across a line break,</li>
 * <li>collapse whitespace runs and line breaks into single blanks.</li>
 * </ol>
 */
@MimeTypeCapability(MimeTypes.TEXT_PLAIN)
@TypeCapability(
        outputs={
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})
public class AclAnthologyReader
    extends ResourceCollectionReaderBase
{
    /**
     * Name of configuration parameter that contains the character encoding used by the input
     * files. If not specified, {@code UTF-8} is used. The special value {@code auto}
     * (case-insensitive) lets the reader detect the encoding of each file.
     */
    public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    // Characters that are replaced by a blank before any further processing
    // (control characters and symbols that showed up as noise in the corpus).
    final char[] replaceChars = new char[] { (char) 1, (char) 8, (char) 11, (char) 12, (char) 14,
            (char) 15, (char) 16, (char) 17, (char) 18, (char) 19, (char) 20, (char) 21, (char) 22,
            (char) 24, (char) 25, (char) 28, (char) 30, (char) 31, (char) 127, (char) 154,
            (char) 159, (char) 167, (char) 168, (char) 169, (char) 171, (char) 174, (char) 176,
            (char) 177, (char) 182, (char) 187, (char) 405, (char) 406, (char) 407, (char) 534,
            (char) 543, (char) 596, (char) 726, (char) 937, (char) 1227, (char) 1366, (char) 1367,
            (char) 1372, (char) 1378, (char) 1390, (char) 1426, (char) 1436, (char) 1462,
            (char) 1490, (char) 1525, (char) 1562, (char) 1697, (char) 1720, (char) 1802,
            (char) 1954, (char) 8222, (char) 8226, (char) 8228, (char) 8249, (char) 8250,
            (char) 9632, (char) 9642, (char) 10003, (char) 65279, (char) 65533 };

    /**
     * Reads the next resource, cleans its text and stores the result as the document text of the
     * given CAS.
     *
     * @param aCAS
     *            the CAS to fill.
     * @throws IOException
     *             if the resource cannot be read or its encoding cannot be auto-detected.
     * @throws CollectionException
     *             declared by the overridden method.
     */
    @Override
    public void getNext(CAS aCAS)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aCAS, res);

        InputStream is = null;
        try {
            is = new BufferedInputStream(res.getInputStream());
            aCAS.setDocumentText(cleanText(readText(is)));
        }
        finally {
            closeQuietly(is);
        }
    }

    /**
     * Decodes the complete stream into a string, using either the configured encoding or, if the
     * encoding is set to {@code auto}, the encoding detected from the stream content.
     *
     * @param aStream
     *            the stream to read; it is not closed by this method.
     * @return the decoded text.
     * @throws IOException
     *             if reading fails or no encoding could be detected.
     */
    private String readText(InputStream aStream)
        throws IOException
    {
        // equalsIgnoreCase avoids the locale-dependent toLowerCase()
        if ("auto".equalsIgnoreCase(encoding)) {
            Reader reader = new CharsetDetector().getReader(aStream, null);
            // the detector reports failure by returning null instead of throwing
            if (reader == null) {
                throw new IOException(
                        "Unable to auto-detect the character encoding of the input");
            }
            return IOUtils.toString(reader);
        }
        return IOUtils.toString(aStream, encoding);
    }

    /**
     * Applies the character replacement, de-hyphenation and whitespace normalization to the given
     * text.
     *
     * @param aText
     *            the raw document text.
     * @return the cleaned text, without line breaks.
     */
    private String cleanText(String aText)
    {
        // replace special chars
        String cleaned = aText;
        for (char c : replaceChars) {
            cleaned = cleaned.replace(c, ' ');
        }

        // replace hyphens; this must happen while the line breaks are still present
        cleaned = replaceHyphens(cleaned);

        cleaned = cleaned.replaceAll("\\s{2,}", " ");
        cleaned = cleaned.replaceAll("\\r?\\n", " ");
        return cleaned;
    }

    /**
     * Joins words that were split by a hyphen at the end of a line. A line break is treated as a
     * hyphenation if the line ends with {@code -}, the next line starts with a lower-case letter
     * and the first word of the next line does not itself contain a hyphen. In that case the
     * hyphen is dropped and the first word of the next line is moved up to the current line.
     *
     * @param text
     *            the text, with line breaks.
     * @return the text with hyphenated words joined; every emitted line is terminated by
     *         {@code \n}.
     */
    private String replaceHyphens(String text)
    {
        String[] lines = text.split("\\r?\\n");
        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < lines.length - 1; i++) {
            String current = lines[i];
            String next = lines[i + 1];

            // hyphen heuristic
            boolean hyphenated = current.endsWith("-")
                    && next.length() > 0
                    && Character.isLowerCase(next.charAt(0))
                    && !next.split(" ")[0].contains("-");

            if (!hyphenated) {
                sb.append(current).append("\n");
                continue;
            }

            // combine wordA[-\n]wordB into one word
            String[] nextWords = next.split(" ");

            // take current line without hyphen, but with complete word
            sb.append(current, 0, current.length() - 1).append(nextWords[0]).append("\n");

            // delete 2nd word part from following line
            StringBuilder rest = new StringBuilder();
            for (int j = 1; j < nextWords.length; j++) {
                if (rest.length() > 0) {
                    rest.append(" ");
                }
                rest.append(nextWords[j]);
            }
            lines[i + 1] = rest.toString();
        }

        // The loop only emits lines that have a successor, so the final line has to be added
        // here; otherwise the end of every document would be lost. The array is empty if the
        // text consisted of line breaks only, and the last line may be empty if its only word
        // was moved up by the hyphen handling.
        if (lines.length > 0 && lines[lines.length - 1].length() > 0) {
            sb.append(lines[lines.length - 1]).append("\n");
        }

        return sb.toString();
    }
}