/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.tuepp;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppBaseform;
import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppPos;
import de.tudarmstadt.ukp.dkpro.core.io.tuepp.internal.model.TueppToken;
/**
 * UIMA collection reader for Tübingen Partially Parsed Corpus of Written German (TüPP-D/Z) XML
 * files. Every article ({@code ART} element) found in an input file is returned as one CAS; a
 * single file may therefore yield several CASes.
 * <ul>
 * <li>Only the part-of-speech with the best rank (rank 1) is read. If there is a tie between
 * multiple tags, the first one from the XML file is read.</li>
 * <li>Only the first lemma (baseform) from the XML file is read.</li>
 * <li>Tokens are read, but not the specific kind of token (e.g. TEL, AREA, etc.).</li>
 * <li>Article boundaries are not read.</li>
 * <li>Paragraph boundaries are not read.</li>
 * <li>Lemma information is read, but morphological information is not read.</li>
 * <li>Chunk, field, and clause information is not read.</li>
 * <li>Meta data headers are not read (except for the unique article ID, which is used as the
 * document ID).</li>
 * </ul>
 */
@MimeTypeCapability({MimeTypes.APPLICATION_X_TUEPP_XML})
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" })
public class TueppReader
    extends JCasResourceCollectionReader_ImplBase
{
    /**
     * Day
     */
    private static final String TAG_DAY = "DAY";

    /**
     * Article
     */
    private static final String TAG_ART = "ART";

    /**
     * Forced line break
     */
    private static final String TAG_BR = "BR";

    /**
     * Main title
     */
    private static final String TAG_TI = "TI";

    /**
     * Heading element {@code H2} (presumably a sub-heading; not documented in the original
     * source - TODO confirm against the corpus documentation).
     */
    private static final String TAG_H2 = "H2";

    /**
     * Heading element {@code H3} (presumably a sub-heading; not documented in the original
     * source - TODO confirm against the corpus documentation).
     */
    private static final String TAG_H3 = "H3";

    /**
     * Text body
     */
    private static final String TAG_TX = "TX";

    /**
     * Text type
     */
    private static final String TAG_AR = "AR";

    /**
     * Author
     */
    private static final String TAG_AU = "AU";

    /**
     * Publishing date
     */
    private static final String TAG_DT = "DT";

    /**
     * Short title
     */
    private static final String TAG_KT = "KT";

    /**
     * Source
     */
    private static final String TAG_QU = "QU";

    /**
     * Subject area
     */
    private static final String TAG_RE = "RE";

    /**
     * Page number
     */
    private static final String TAG_SE = "SE";

    /**
     * Unique article ID
     */
    private static final String TAG_TP = "TP";

    /**
     * Number of lines
     */
    private static final String TAG_ZE = "ZE";

    /**
     * Paragraph
     */
    private static final String TAG_P = "p";

    /**
     * Sentence
     */
    private static final String TAG_SENTENCE = "s";

    /**
     * Token
     */
    private static final String TAG_TOKEN = "t";

    /**
     * Location of the mapping file for part-of-speech tags to UIMA types.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String mappingPosLocation;

    /**
     * Use this part-of-speech tag set to resolve the tag set mapping instead of using the tag
     * set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Character encoding of the input data.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /** Maps the part-of-speech tags found in the corpus to UIMA types. */
    private MappingProvider posMappingProvider;

    // XML stuff
    private JAXBContext context;
    private Unmarshaller unmarshaller;
    private XMLInputFactory xmlInputFactory;

    // State between files: the resource currently being read, its (possibly decompressed)
    // stream and the event reader on top of it. All three are null while no file is open.
    private Resource res;
    private InputStream is;
    private XMLEventReader xmlEventReader;

    /**
     * Sets up the part-of-speech mapping and the XML machinery and then seeks the first article,
     * so that {@link #hasNext()} can answer right away.
     *
     * @param aContext
     *            the UIMA context.
     * @throws ResourceInitializationException
     *             if the JAXB context cannot be created or the first article cannot be located.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation,
                posTagset, getLanguage());

        // Set up XML deserialization
        try {
            context = JAXBContext.newInstance(TueppToken.class);
            unmarshaller = context.createUnmarshaller();
            // NOTE(review): the factory is used with its default settings. If corpus files may
            // come from untrusted sources, consider restricting DTD / external entity
            // processing - verify first that the corpus files do not rely on a DTD.
            xmlInputFactory = XMLInputFactory.newInstance();
        }
        catch (JAXBException e) {
            throw new ResourceInitializationException(e);
        }

        // Seek first article
        try {
            step();
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Closes the event reader and the input stream of the current file (ignoring errors while
     * closing) and resets the per-file state, so that the next call to {@link #step()} opens the
     * next resource.
     */
    private void closeAll()
    {
        closeQuietly(xmlEventReader);
        xmlEventReader = null;
        IOUtils.closeQuietly(is);
        is = null;
        res = null;
    }

    /**
     * Releases the currently open file, if any, before destroying the reader.
     */
    @Override
    public void destroy()
    {
        closeAll();
        super.destroy();
    }

    /**
     * @return {@code true} if another article is waiting to be read.
     */
    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        // If there is still a reader, then there is still an article. This requires that we call
        // step() already during initialization.
        return xmlEventReader != null;
    }

    /**
     * Seek the next article. Stops once the start of an article element has been found, without
     * consuming it. If the current file contains no further article, it is closed and the search
     * continues in the next resource. When no resources are left, the method returns with no
     * file open, which makes {@link #hasNext()} return {@code false}.
     *
     * @throws IOException
     *             if a resource cannot be opened or its XML cannot be read.
     */
    private void step()
        throws IOException
    {
        try {
            while (true) {
                if (res == null && !openNextResource()) {
                    // No more files to read
                    return;
                }

                // Seek article in file. Stop once article element has been found without
                // reading it
                XMLEvent event;
                while ((event = xmlEventReader.peek()) != null) {
                    if (isStartElement(event, TAG_ART)) {
                        return;
                    }
                    xmlEventReader.next();
                }

                // End of file reached - release it and try the next resource
                closeAll();
            }
        }
        catch (XMLStreamException ex) {
            throw new IOException(ex);
        }
    }

    /**
     * Opens the next resource, if there is one, and creates an XML event reader on it. If
     * opening fails half-way, everything that has been opened so far is closed again and the
     * per-file state is reset, so the reader is never left with a resource but no event reader.
     *
     * @return {@code false} if there are no more resources, {@code true} if one has been opened.
     */
    private boolean openNextResource()
        throws IOException, XMLStreamException
    {
        // Ask the resource iterator here because we want to know about the resources, not the
        // articles
        if (!getResourceIterator().hasNext()) {
            return false;
        }

        boolean opened = false;
        try {
            res = nextFile();
            // Keep a handle on the raw stream first so it gets closed if wrapping it fails
            is = res.getInputStream();
            is = CompressionUtils.getInputStream(res.getLocation(), is);
            xmlEventReader = xmlInputFactory.createXMLEventReader(is, encoding);
            opened = true;
        }
        finally {
            if (!opened) {
                closeAll();
            }
        }
        return true;
    }

    /**
     * Reads the article at the current position into the given CAS and then seeks the next
     * article. The content of the {@code TP} element is used as document ID; if an article has
     * no such element, the CAS is initialized from the resource alone.
     *
     * @param aJCas
     *            the CAS to fill.
     * @throws IOException
     *             if the XML cannot be read or unmarshalled, or if the part-of-speech mapping
     *             cannot be configured.
     * @throws IllegalStateException
     *             if there is no article left to read, i.e. {@link #hasNext()} is {@code false}.
     */
    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        if (xmlEventReader == null) {
            throw new IllegalStateException("No more articles to read");
        }

        try {
            posMappingProvider.configure(aJCas.getCas());

            JCasBuilder jb = new JCasBuilder(aJCas);

            boolean casInitialized = false;
            int sentenceStart = 0;
            XMLEvent event;
            article: while ((event = xmlEventReader.peek()) != null) {
                if (isStartElement(event, TAG_TP)) {
                    xmlEventReader.next(); // Read start element
                    String id = xmlEventReader.getElementText().trim();
                    initCas(aJCas, res, id);
                    DocumentMetaData meta = DocumentMetaData.get(aJCas);
                    meta.setDocumentId(id);
                    casInitialized = true;
                }
                else if (isStartElement(event, TAG_SENTENCE)) {
                    sentenceStart = jb.getPosition();
                    xmlEventReader.next();
                }
                else if (isEndElement(event, TAG_SENTENCE)) {
                    // The line break is added before the annotation is created, so the
                    // sentence span covers it as well as the blank following the last token.
                    jb.add("\n");
                    new Sentence(aJCas, sentenceStart, jb.getPosition()).addToIndexes();
                    xmlEventReader.next();
                }
                else if (isStartElement(event, TAG_TOKEN)) {
                    // Unmarshalling consumes the complete token element from the reader
                    TueppToken token = unmarshaller.unmarshal(xmlEventReader, TueppToken.class)
                            .getValue();
                    readToken(jb, token);
                }
                else if (isStartElement(event, TAG_BR)) {
                    jb.add("\n");
                    xmlEventReader.next();
                }
                else if (isEndElement(event, TAG_ART)) {
                    // End of article
                    xmlEventReader.next();
                    break article;
                }
                else {
                    xmlEventReader.next();
                }
            }

            if (!casInitialized) {
                // Article without an ID element: still provide document meta data and the
                // document language, just without an article-specific qualifier.
                initCas(aJCas, res, (String) null);
            }

            jb.close();
        }
        catch (XMLStreamException ex1) {
            throw new IOException(ex1);
        }
        catch (JAXBException ex2) {
            throw new IOException(ex2);
        }
        catch (AnalysisEngineProcessException ex3) {
            throw new IOException(ex3);
        }

        // Seek next article so we know what to return on hasNext()
        step();
    }

    /**
     * Appends a token followed by a single blank to the document text and annotates it. If the
     * token has a primary part-of-speech tag, a POS annotation is created and attached to the
     * token; if that tag has a primary baseform, a lemma is created and attached as well.
     *
     * @param aBuilder
     *            the builder collecting the document text and annotations.
     * @param aToken
     *            the token as read from the XML file.
     */
    protected void readToken(JCasBuilder aBuilder, TueppToken aToken)
    {
        Token token = aBuilder.add(aToken.form, Token.class);
        aBuilder.add(" ");

        TueppPos pos = aToken.getPrimaryTag();
        if (pos == null) {
            // Neither a tag nor a baseform available for this token
            return;
        }

        Type posType = posMappingProvider.getTagType(pos.tag);
        POS posAnno = (POS) aBuilder.getJCas().getCas()
                .createAnnotation(posType, token.getBegin(), token.getEnd());
        posAnno.setPosValue(pos.tag.intern());
        // Only mapped (more specific) POS types carry a coarse value
        posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                : posAnno.getType().getShortName().intern());
        posAnno.addToIndexes();
        token.setPos(posAnno);

        TueppBaseform baseform = pos.getPrimaryBaseForm();
        if (baseform != null) {
            Lemma lemma = new Lemma(aBuilder.getJCas(), token.getBegin(), token.getEnd());
            lemma.setValue(baseform.form);
            lemma.addToIndexes();
            token.setLemma(lemma);
        }
    }

    /**
     * Checks whether the event is the start of an element with the given local name.
     *
     * @param aEvent
     *            the XML event to inspect.
     * @param aElement
     *            the expected local name of the element.
     * @return {@code true} if the event is a start element with that local name.
     */
    public static boolean isStartElement(XMLEvent aEvent, String aElement)
    {
        return aEvent.isStartElement()
                && ((StartElement) aEvent).getName().getLocalPart().equals(aElement);
    }

    /**
     * Checks whether the event is the end of an element with the given local name.
     *
     * @param aEvent
     *            the XML event to inspect.
     * @param aElement
     *            the expected local name of the element.
     * @return {@code true} if the event is an end element with that local name.
     */
    public static boolean isEndElement(XMLEvent aEvent, String aElement)
    {
        return aEvent.isEndElement()
                && ((EndElement) aEvent).getName().getLocalPart().equals(aElement);
    }

    /**
     * Closes the given event reader, if any, ignoring errors. Used during cleanup where a
     * failure to close must not mask the actual outcome.
     *
     * @param aRes
     *            the reader to close, may be {@code null}.
     */
    private static void closeQuietly(XMLEventReader aRes)
    {
        if (aRes != null) {
            try {
                aRes.close();
            }
            catch (XMLStreamException e) {
                // Ignore - best-effort cleanup
            }
        }
    }
}