/*
* Copyright 2015
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.tei;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_BODY;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_FILE_DESC;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_HEADER;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEI;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TEXT;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.E_TEI_TITLE_STMT;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_RS;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD;
import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TEI_NS;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Stack;
import java.util.regex.Pattern;
import javanet.staxutils.IndentingXMLEventWriter;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.Attribute;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
/**
 * UIMA CAS consumer writing the CAS document text in TEI format.
 * <p>
 * The document title from the {@link DocumentMetaData} is rendered into the TEI header. The
 * document text is rendered into the TEI body, with paragraphs, sentences, tokens and (optionally)
 * constituents and named entities rendered as nested XML elements. Annotations that only partially
 * overlap an already opened element cannot be represented as nested XML and are skipped.
 */
@MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML})
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
                "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"})
public class TeiWriter
    extends JCasFileWriter_ImplBase
{
    /**
     * Specify the suffix of output files. Default value <code>.xml</code>. If the suffix is not
     * needed, provide an empty string as value.
     */
    public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION;
    @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml")
    private String filenameSuffix;

    /**
     * A token matching this pattern is rendered as a TEI "c" element instead of a "w" element.
     */
    public static final String PARAM_C_TEXT_PATTERN = "cTextPattern";
    @ConfigurationParameter(name = PARAM_C_TEXT_PATTERN, mandatory = true, defaultValue = "[,.:;()]|(``)|('')|(--)")
    private Pattern cTextPattern;

    /**
     * Write constituent annotations to the TEI output. Disabled by default because it requires
     * type priorities to be set up (Constituents must have a higher prio than Tokens).
     */
    public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT;
    @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "false")
    private boolean writeConstituent;

    /**
     * Write named entity annotations to the TEI output. Overlapping named entities are not
     * supported.
     */
    public static final String PARAM_WRITE_NAMED_ENTITY = ComponentParameters.PARAM_WRITE_NAMED_ENTITY;
    @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true")
    private boolean writeNamedEntity;

    /**
     * Indent the XML.
     */
    public static final String PARAM_INDENT = "indent";
    @ConfigurationParameter(name = PARAM_INDENT, mandatory = true, defaultValue = "false")
    private boolean indent;

    private final XMLEventFactory xmlef = XMLEventFactory.newInstance();

    /**
     * Renders the given CAS as a TEI document and writes it to the output stream obtained from
     * the base class for this CAS.
     *
     * @param aJCas
     *            the CAS to render.
     * @throws AnalysisEngineProcessException
     *             wrapping any exception raised while obtaining the output stream or writing
     *             the XML.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        String text = aJCas.getDocumentText();

        try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
            XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
            xmlOutputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true);

            XMLEventWriter xmlEventWriter = xmlOutputFactory.createXMLEventWriter(docOS);
            if (indent) {
                xmlEventWriter = new IndentingXMLEventWriter(xmlEventWriter);
            }

            try {
                xmlEventWriter.add(xmlef.createStartDocument());
                xmlEventWriter.setDefaultNamespace(TEI_NS);
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_TEI, null, null));

                // Render header
                DocumentMetaData meta = DocumentMetaData.get(aJCas);
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_HEADER, null, null));
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_FILE_DESC, null, null));
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_TITLE_STMT, null, null));
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_TITLE, null, null));
                // The title is optional metadata; render an empty title element if it is not set
                String title = meta.getDocumentTitle();
                if (title != null) {
                    xmlEventWriter.add(xmlef.createCharacters(title));
                }
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_TITLE, null));
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_TITLE_STMT, null));
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_FILE_DESC, null));
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_HEADER, null));

                // Render text
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_TEXT, null, null));
                xmlEventWriter.add(xmlef.createStartElement(E_TEI_BODY, null, null));

                FSIterator<Annotation> iterator = aJCas.getAnnotationIndex().iterator();

                // The stack mirrors the currently open XML elements: each time an element is
                // opened, the previously innermost annotation (or null at the top level) is
                // pushed and the new annotation becomes "cur".
                Stack<Annotation> stack = new Stack<>();
                // Offset up to which the document text has already been written
                int pos = 0;
                // Annotation of the innermost currently open element, null if none is open
                Annotation cur = null;

                while (iterator.isValid()) {
                    Annotation nextAnnot = iterator.get();

                    // Ignore unmapped elements
                    Optional<String> teiElement = getTeiTag(nextAnnot);
                    if (!teiElement.isPresent()) {
                        iterator.moveToNext();
                        continue;
                    }

                    // Check if next annotation is potentially nested
                    if (cur == null || nextAnnot.getBegin() < cur.getEnd()) {
                        // Check if next annotation is fully nested
                        if (cur == null || nextAnnot.getEnd() <= cur.getEnd()) {
                            // Text between current and next annotation
                            xmlEventWriter.add(xmlef.createCharacters(
                                    text.substring(pos, nextAnnot.getBegin())));
                            // Next annotation
                            xmlEventWriter.add(xmlef.createStartElement(
                                    new QName(TEI_NS, teiElement.get()),
                                    getAttributes(nextAnnot), null));

                            stack.push(cur);
                            cur = nextAnnot;
                            pos = nextAnnot.getBegin();
                        }
                        else {
                            // Overlapping annotations are ignored
                            getLogger().debug("Unable to render overlapping annotation");
                        }
                        iterator.moveToNext();
                    }
                    // Next annotation is following, not nested
                    else {
                        // Remaining text of the current annotation, then close the element of
                        // the CURRENT annotation (not of the next one). The iterator is not
                        // advanced, so the next annotation is re-examined against the parent.
                        xmlEventWriter.add(
                                xmlef.createCharacters(text.substring(pos, cur.getEnd())));
                        xmlEventWriter.add(xmlef.createEndElement(
                                new QName(TEI_NS, getTeiTag(cur).get()), null));
                        pos = cur.getEnd();
                        cur = stack.pop();
                    }
                }

                // End of text, end all elements that are still open. The bottom of the stack is
                // the null that was pushed when the outermost element was opened, so unwinding
                // stops once that null has been popped.
                while (cur != null) {
                    xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd())));
                    xmlEventWriter.add(xmlef.createEndElement(
                            new QName(TEI_NS, getTeiTag(cur).get()), null));
                    pos = cur.getEnd();
                    cur = stack.pop();
                }

                // Trailing text not covered by any rendered annotation
                if (pos < text.length()) {
                    xmlEventWriter.add(
                            xmlef.createCharacters(text.substring(pos, text.length())));
                }

                xmlEventWriter.add(xmlef.createEndElement(E_TEI_BODY, null));
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_TEXT, null));
                xmlEventWriter.add(xmlef.createEndElement(E_TEI_TEI, null));
                xmlEventWriter.add(xmlef.createEndDocument());
            }
            finally {
                // Release the writer (and flush anything it buffered) before the underlying
                // stream is closed by the enclosing try-with-resources.
                xmlEventWriter.close();
            }
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Collects the XML attributes to render on the element of the given annotation.
     * <ul>
     * <li>Tokens: POS value as "type" and lemma value as "lemma" attribute.</li>
     * <li>Named entities: entity value as "type" attribute.</li>
     * <li>Constituents: constituent type as "type" and syntactic function as "function"
     * attribute.</li>
     * </ul>
     * Attributes whose value is not set are omitted.
     *
     * @param aAnnotation
     *            the annotation being rendered.
     * @return an iterator over the attributes, possibly empty.
     */
    private Iterator<Attribute> getAttributes(Annotation aAnnotation)
    {
        List<Attribute> attributes = new ArrayList<>();

        if (aAnnotation instanceof Token) {
            Token t = (Token) aAnnotation;
            if (t.getPos() != null && t.getPos().getPosValue() != null) {
                attributes.add(xmlef.createAttribute(ATTR_TYPE, t.getPos().getPosValue()));
            }
            if (t.getLemma() != null && t.getLemma().getValue() != null) {
                attributes.add(xmlef.createAttribute(ATTR_LEMMA, t.getLemma().getValue()));
            }
        }
        else if (aAnnotation instanceof NamedEntity) {
            NamedEntity ne = (NamedEntity) aAnnotation;
            if (ne.getValue() != null) {
                attributes.add(xmlef.createAttribute(ATTR_TYPE, ne.getValue()));
            }
        }
        else if (aAnnotation instanceof Constituent) {
            Constituent c = (Constituent) aAnnotation;
            if (c.getConstituentType() != null) {
                attributes.add(xmlef.createAttribute(ATTR_TYPE, c.getConstituentType()));
            }
            if (c.getSyntacticFunction() != null) {
                attributes.add(xmlef.createAttribute(ATTR_FUNCTION, c.getSyntacticFunction()));
            }
        }

        return attributes.iterator();
    }

    /**
     * Maps an annotation to the local name of the TEI element it is rendered as.
     *
     * @param aAnnotation
     *            the annotation to map.
     * @return the TEI element name, or an empty optional if the annotation is not rendered
     *         (unmapped type, ROOT constituent, or a type whose rendering is disabled).
     */
    private Optional<String> getTeiTag(Annotation aAnnotation)
    {
        if (aAnnotation.getTypeIndexID() == Token.type) {
            // Tokens matching the configured pattern are rendered as characters, not words
            if (cTextPattern.matcher(aAnnotation.getCoveredText()).matches()) {
                return Optional.of(TAG_CHARACTER);
            }
            return Optional.of(TAG_WORD);
        }
        else if (aAnnotation.getTypeIndexID() == Sentence.type) {
            return Optional.of(TAG_SUNIT);
        }
        else if (aAnnotation.getTypeIndexID() == Paragraph.type) {
            return Optional.of(TAG_PARAGRAPH);
        }
        else if (writeConstituent && (aAnnotation instanceof ROOT)) {
            // We do not render ROOT nodes; this check must precede the generic Constituent
            // check below, which would otherwise also match ROOT
            return Optional.empty();
        }
        else if (writeConstituent && (aAnnotation instanceof Constituent)) {
            return Optional.of(TAG_PHRASE);
        }
        else if (writeNamedEntity && (aAnnotation instanceof NamedEntity)) {
            return Optional.of(TAG_RS);
        }
        else {
            return Optional.empty();
        }
    }
}