/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.io.xces;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.ValidationEvent;
import javax.xml.bind.ValidationEventHandler;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.dkpro.core.io.xces.models.XcesBody;
import org.dkpro.core.io.xces.models.XcesPara;
import org.dkpro.core.io.xces.models.XcesSentence;
import org.dkpro.core.io.xces.models.XcesToken;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * Collection reader for XCES XML documents.
 * <p>
 * Every {@code body} element found in the input is unmarshalled via JAXB into an
 * {@link XcesBody} and converted into {@link Paragraph}, {@link Sentence}, {@link Token},
 * {@link Lemma} and {@link POS} annotations. The document text is rebuilt from the token
 * {@code word} attributes: tokens are separated by a single space, every sentence is followed
 * by a line break and every paragraph by an additional line break.
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class XcesXmlReader
    extends JCasResourceCollectionReader_ImplBase
{
    /**
     * Reads the next resource into the given CAS.
     *
     * @param aJCas
     *            the CAS to fill with text and annotations.
     * @throws IOException
     *             if the resource cannot be read, the XML stream is malformed, or the JAXB
     *             context/unmarshaller cannot be set up.
     * @throws CollectionException
     *             declared by the reader contract; not thrown directly by this method.
     */
    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas, res);

        InputStream is = null;
        XMLEventReader xmlEventReader = null;
        try {
            is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream());

            XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
            // Input files are external data: do not resolve external entities (XXE).
            xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
                    Boolean.FALSE);
            xmlEventReader = xmlInputFactory.createXMLEventReader(is);

            JAXBContext context = JAXBContext.newInstance(XcesBody.class);
            Unmarshaller unmarshaller = context.createUnmarshaller();
            // Turn every validation event into an exception so that a broken body is reported
            // by the catch block below instead of being silently accepted by JAXB.
            unmarshaller.setEventHandler(new ValidationEventHandler()
            {
                @Override
                public boolean handleEvent(ValidationEvent event)
                {
                    throw new RuntimeException(event.getMessage(), event.getLinkedException());
                }
            });

            JCasBuilder jb = new JCasBuilder(aJCas);

            XMLEvent e = null;
            while ((e = xmlEventReader.peek()) != null) {
                if (isStartElement(e, "body")) {
                    try {
                        XcesBody paras = (XcesBody) unmarshaller
                                .unmarshal(xmlEventReader, XcesBody.class).getValue();
                        readPara(jb, paras);
                    }
                    catch (RuntimeException ex) {
                        // Best effort: report the problem (with its cause) and keep scanning
                        // the rest of the document. getLogger() is inherited from the UIMA
                        // reader base class.
                        getLogger().warn("Unable to parse XCES format: " + ex, ex);
                    }
                }
                else {
                    // Not a body start tag: skip the event.
                    xmlEventReader.next();
                }
            }

            jb.close();
        }
        catch (XMLStreamException ex1) {
            throw new IOException(ex1);
        }
        catch (JAXBException e1) {
            throw new IOException(e1);
        }
        finally {
            if (xmlEventReader != null) {
                try {
                    xmlEventReader.close();
                }
                catch (XMLStreamException ignored) {
                    // Nothing useful can be done if releasing the reader fails.
                }
            }
            // Closing the event reader does not close the underlying stream.
            closeQuietly(is);
        }
    }

    /**
     * Converts an unmarshalled XCES body into text and annotations.
     * <p>
     * Sample paragraph format:
     *
     * <pre>
     * &lt;p id="p1"&gt;
     *   &lt;s id="s1"&gt;
     *     &lt;t id="t1" word="Αυτή" tag="PnDmFe03SgNmXx" lemma="αυτός" /&gt;
     *     &lt;t id="t2" word="είναι" tag="VbMnIdPr03SgXxIpPvXx" lemma="είμαι" /&gt;
     *     &lt;t id="t3" word="η" tag="AtDfFeSgNm" lemma="ο" /&gt;
     *     &lt;t id="t4" word="πρώτη" tag="NmOdFeSgNmAj" lemma="πρώτος" /&gt;
     *     &lt;t id="t5" word="γραμμή" tag="NoCmFeSgNm" lemma="γραμμή" /&gt;
     *     &lt;t id="t6" word="." tag="PTERM_P" lemma="." /&gt;
     *   &lt;/s&gt;
     * &lt;/p&gt;
     * </pre>
     *
     * @param jb
     *            builder used to append text and create annotations.
     * @param bodyObj
     *            the unmarshalled body; anything that is not an {@link XcesBody} is ignored.
     */
    private void readPara(JCasBuilder jb, Object bodyObj)
    {
        if (!(bodyObj instanceof XcesBody)) {
            return;
        }

        XcesBody body = (XcesBody) bodyObj;
        if (body.p == null) {
            // A missing paragraph list is treated like an empty one.
            return;
        }

        for (XcesPara para : body.p) {
            int paraStart = jb.getPosition();
            int paraEnd = paraStart;

            if (para.s != null) {
                for (XcesSentence sentence : para.s) {
                    paraEnd = addSentence(jb, sentence);
                }
            }

            // The paragraph ends with its last sentence, i.e. before the trailing line breaks.
            new Paragraph(jb.getJCas(), paraStart, paraEnd).addToIndexes();
            jb.add("\n");
        }
    }

    /**
     * Appends the tokens of one sentence and indexes the covering {@link Sentence}.
     *
     * @return the end offset of the sentence (end of its last token, excluding the line break).
     */
    private int addSentence(JCasBuilder jb, XcesSentence aSentence)
    {
        int sentStart = jb.getPosition();
        int sentEnd = sentStart;

        // A missing token list is treated like an empty one (zero-length sentence).
        int tokenCount = aSentence.xcesTokens == null ? 0 : aSentence.xcesTokens.size();
        for (int i = 0; i < tokenCount; i++) {
            addToken(jb, aSentence.xcesTokens.get(i));
            sentEnd = jb.getPosition();

            // Separate tokens by a blank and terminate the sentence with a line break.
            boolean lastToken = i + 1 == tokenCount;
            jb.add(lastToken ? "\n" : " ");
        }

        Sentence sent = new Sentence(jb.getJCas(), sentStart, sentEnd);
        sent.addToIndexes();
        return sent.getEnd();
    }

    /**
     * Appends one token and attaches a {@link Lemma} and/or {@link POS} when the corresponding
     * attribute is present.
     *
     * @return the token that was added.
     */
    private Token addToken(JCasBuilder jb, XcesToken aToken)
    {
        Token token = jb.add(aToken.word, Token.class);

        if (aToken.lemma != null) {
            Lemma lemma = new Lemma(jb.getJCas(), token.getBegin(), token.getEnd());
            lemma.setValue(aToken.lemma);
            lemma.addToIndexes();
            token.setLemma(lemma);
        }

        if (aToken.tag != null) {
            POS pos = new POS(jb.getJCas(), token.getBegin(), token.getEnd());
            pos.setPosValue(aToken.tag);
            pos.addToIndexes();
            token.setPos(pos);
        }

        return token;
    }

    /**
     * Tests whether an event is the start tag of an element with the given local name.
     *
     * @param aEvent
     *            the event to inspect.
     * @param aElement
     *            the expected local name (namespace is not considered).
     * @return {@code true} if the event is a start element with that local name.
     */
    public static boolean isStartElement(XMLEvent aEvent, String aElement)
    {
        return aEvent.isStartElement()
                && ((StartElement) aEvent).getName().getLocalPart().equals(aElement);
    }
}