/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.penntree;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
/**
 * Penn Treebank combined format reader.
 *
 * <p>Each input resource is read line by line. Blank lines are ignored. A line that starts in the
 * first column with an opening round bracket begins a new tree; all following non-blank lines up
 * to the next such line belong to the same tree. For every tree, the constituent structure is
 * added to the CAS via {@link PennTreeToJCasConverter} and a {@link Sentence} annotation spanning
 * the root constituent is created. A line break is appended to the document text after every
 * tree.</p>
 */
@MimeTypeCapability({MimeTypes.TEXT_X_PTB_COMBINED})
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent" })
public class PennTreebankCombinedReader
    extends JCasResourceCollectionReader_ImplBase
{
    /**
     * Name of configuration parameter that contains the character encoding used by the input
     * files.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /**
     * Use this part-of-speech tag set to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Sets whether to create or not to create POS tags. The creation of
     * constituent tags must be turned on for this to work.
     *
     * <p>Default: {@code true}</p>
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean createPosTags;

    /**
     * Use this constituent tag set to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_CONSTITUENT_TAG_SET = ComponentParameters.PARAM_CONSTITUENT_TAG_SET;
    @ConfigurationParameter(name = PARAM_CONSTITUENT_TAG_SET, mandatory = false)
    protected String constituentTagset;

    /**
     * Load the constituent tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false)
    protected String constituentMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * <p>Default: {@code true}</p>
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Whether to remove trace nodes, i.e. nodes labeled {@code -NONE-} (together with any chain
     * of single-child ancestors leading to them), from each tree before it is converted.
     *
     * <p>Default: {@code true}</p>
     */
    public static final String PARAM_REMOVE_TRACES = "removeTraces";
    @ConfigurationParameter(name = PARAM_REMOVE_TRACES, mandatory = false, defaultValue = "true")
    private boolean removeTraces;

    /**
     * This flag is handed to {@link PennTreeToJCasConverter#setWriteTracesToText(boolean)}.
     * NOTE(review): presumably it controls whether traces that were not removed show up in the
     * document text - confirm against the converter.
     *
     * <p>Default: {@code false}</p>
     */
    public static final String PARAM_WRITE_TRACES_TO_TEXT = "writeTracesToText";
    @ConfigurationParameter(name = PARAM_WRITE_TRACES_TO_TEXT, mandatory = false, defaultValue = "false")
    private boolean writeTracesToText;

    /** Label marking a trace node. */
    private static final String NONE = "-NONE-";

    private MappingProvider posMappingProvider;
    private MappingProvider constituentMappingProvider;
    private PennTreeToJCasConverter converter;

    /** Number of lines consumed so far from the resource currently being read. */
    private int lineNumber = 0;

    /**
     * First line of the next tree. It has already been consumed from the line iterator while
     * looking for the end of the previous tree, so it must be processed before reading on.
     * {@code null} if there is no such pending line.
     */
    private String lineBuffer = null;

    /**
     * Sets up the tag mapping providers and the tree converter from the configured parameters.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                posTagset, getLanguage());

        constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider(
                constituentMappingLocation, constituentTagset, getLanguage());

        converter = new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider);
        converter.setInternTags(internTags);
        converter.setWriteTracesToText(writeTracesToText);
        converter.setCreatePosTags(createPosTags);
    }

    /**
     * Reads all trees of the next resource into the given CAS. Every tree yields one
     * {@link Sentence}; the document text is assembled while converting the trees.
     *
     * @param aJCas
     *            the CAS to fill.
     * @throws IOException
     *             if the resource cannot be read or the mapping providers cannot be configured.
     * @throws CollectionException
     *             declared by the reader contract.
     */
    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas.getCas(), res);

        try {
            posMappingProvider.configure(aJCas.getCas());
            constituentMappingProvider.configure(aJCas.getCas());
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        // Reset the per-resource parsing state so that nothing left over from a previous
        // resource (e.g. after a parse failure) can leak into this document.
        lineNumber = 0;
        lineBuffer = null;

        StringBuilder text = new StringBuilder();
        try (InputStream is = res.getInputStream()) {
            LineIterator li = IOUtils.lineIterator(is, encoding);
            // The buffered line must be taken into account as well: if the last tree of the
            // resource fits on a single line, it sits in the buffer while the iterator is
            // already exhausted.
            while (li.hasNext() || lineBuffer != null) {
                String treeText = readTreeText(li);
                if (treeText.length() == 0) {
                    // Only blank lines were left - there is no further tree.
                    break;
                }

                PennTreeNode tree = parseTree(treeText);
                if (removeTraces) {
                    doRemoveTraces(tree);
                }

                Constituent root = converter.convertPennTree(aJCas, text, tree);
                Sentence sentence = new Sentence(aJCas, root.getBegin(), root.getEnd());
                sentence.addToIndexes();
                text.append('\n');
            }
        }

        aJCas.setDocumentText(text.toString());
    }

    /**
     * Remove traces such as having the form {@code (NP-SBJ (-NONE- *))}.
     *
     * @param aTree
     *            the (sub-)tree to clean up in place.
     * @return {@code true} if the given node itself is a trace, or a chain of single-child nodes
     *         ending in a trace, and should therefore be removed by the caller.
     */
    private boolean doRemoveTraces(PennTreeNode aTree)
    {
        if (NONE.equals(aTree.getLabel())) {
            return true;
        }
        else if (aTree.getChildren().size() == 1) {
            // A unary node is a trace exactly if its only child is one.
            return doRemoveTraces(aTree.getChildren().get(0));
        }
        else {
            // Iterate over a snapshot because children are removed from the list on the way.
            // NOTE(review): a node whose children are all traces ends up without children but
            // is not removed itself - confirm that this is intended.
            PennTreeNode[] children = aTree.getChildren().toArray(
                    new PennTreeNode[aTree.getChildren().size()]);
            for (PennTreeNode c : children) {
                boolean removeChild = doRemoveTraces(c);
                if (removeChild) {
                    aTree.getChildren().remove(c);
                }
            }
        }

        return false;
    }

    /**
     * Collects the lines of the next tree. Blank lines are skipped. Reading stops at the end of
     * the input or when a line starting with an opening round bracket is found after the tree
     * has already begun; that line is kept in {@link #lineBuffer} for the next call.
     *
     * @param aLi
     *            the line source.
     * @return the text of the tree, or an empty string if no non-blank line was left.
     */
    private String readTreeText(LineIterator aLi)
    {
        StringBuilder tree = new StringBuilder();

        while (lineBuffer != null || aLi.hasNext()) {
            String line;
            if (lineBuffer != null) {
                // This line was already counted when it was read from the iterator.
                line = lineBuffer;
                lineBuffer = null;
            }
            else {
                line = aLi.nextLine();
                lineNumber++;
            }

            if (StringUtils.isBlank(line)) {
                continue;
            }

            // If the next line starts at the beginning and with an opening round bracket
            if ((tree.length() > 0) && line.charAt(0) == '(') {
                lineBuffer = line;
                break;
            }

            tree.append(line);
            tree.append('\n'); // Actually not needed - just in case we want to debug ;)
        }

        return tree.toString();
    }

    /**
     * Parses the text of a single tree. If parsing fails, the offending text is logged together
     * with the current line number and the exception is passed on.
     *
     * @param aTreeText
     *            the bracketed tree.
     * @return the parsed tree.
     */
    private PennTreeNode parseTree(String aTreeText)
    {
        try {
            return PennTreeUtils.parsePennTree(aTreeText);
        }
        catch (RuntimeException e) {
            getLogger().error(
                    "Unable to parse tree before line [" + lineNumber + "]:\n" + aTreeText);
            throw e;
        }
    }
}