/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.io.xces;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.ValidationEvent;
import javax.xml.bind.ValidationEventHandler;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.dkpro.core.io.xces.models.XcesBodyBasic;
import org.dkpro.core.io.xces.models.XcesParaBasic;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph"})
public class XcesBasicXmlReader
extends JCasResourceCollectionReader_ImplBase
{
@Override
public void getNext(JCas aJCas)
throws IOException, CollectionException
{
Resource res = nextFile();
initCas(aJCas, res);
InputStream is = null;
try {
is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream());
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReaderBasic = xmlInputFactory.createXMLEventReader(is);
//JAXB context for XCES body with basic type
JAXBContext contextBasic = JAXBContext.newInstance(XcesBodyBasic.class);
Unmarshaller unmarshallerBasic = contextBasic.createUnmarshaller();
unmarshallerBasic.setEventHandler(new ValidationEventHandler()
{
public boolean handleEvent(ValidationEvent event)
{
throw new RuntimeException(event.getMessage(), event.getLinkedException());
}
});
JCasBuilder jb = new JCasBuilder(aJCas);
XMLEvent eBasic = null;
while ((eBasic = xmlEventReaderBasic.peek()) != null) {
if (isStartElement(eBasic, "body")) {
try {
XcesBodyBasic parasBasic = (XcesBodyBasic) unmarshallerBasic
.unmarshal(xmlEventReaderBasic, XcesBodyBasic.class).getValue();
readPara(jb, parasBasic);
}
catch (RuntimeException ex) {
getLogger().warn(
"Input is not in basic xces format.");
}
}
else {
xmlEventReaderBasic.next();
}
}
jb.close();
}
catch (XMLStreamException ex1) {
throw new IOException(ex1);
}
catch (JAXBException e1) {
throw new IOException(e1);
}
finally {
closeQuietly(is);
}
}
private void readPara(JCasBuilder jb, Object bodyObj)
{
//Below is the sample paragraph format
//<p id="p1">Αυτή είναι η πρώτη γραμμή.</p>
if (bodyObj instanceof XcesBodyBasic) {
for (XcesParaBasic p : ((XcesBodyBasic) bodyObj).p) {
int start = jb.getPosition();
int end = start + p.s.length();
Paragraph para = new Paragraph(jb.getJCas(), start,end);
para.addToIndexes(jb.getJCas());
jb.add(p.s);
jb.add("\n\n");
}
}
}
public static boolean isStartElement(XMLEvent aEvent, String aElement)
{
return aEvent.isStartElement()
&& ((StartElement) aEvent).getName().getLocalPart().equals(aElement);
}
}