/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.xml;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.IOException;
import java.io.InputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.Logger;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;

/**
 * Collection reader that runs every resource through a SAX parser and forwards the parse events
 * to a {@link Handler}. With the default handler ({@link TextExtractor}) the character data of
 * the XML document becomes the document text of the CAS.
 * <p>
 * Subclasses can plug in a different handler via {@link #newSaxHandler()} and a differently
 * configured parser factory via {@link #newSaxParserFactory()}.
 *
 * @since 1.1.0
 */
@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML})
@TypeCapability(
        outputs={
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})
public class XmlTextReader
    extends ResourceCollectionReaderBase
{
    /**
     * Parser features that make the parser reach out to resources other than the document being
     * read. All of them are switched off by {@link #newSaxParserFactory()}.
     */
    private static final String[] EXTERNAL_ACCESS_FEATURES = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd" };

    /**
     * Reads the next resource, parses it as XML and lets the SAX handler fill the CAS.
     *
     * @param aCAS
     *            the CAS to populate.
     * @throws IOException
     *             if the resource cannot be read or the parser reports a {@link SAXException}
     *             (which is attached as the cause).
     * @throws CollectionException
     *             if the JCas cannot be obtained from the CAS or the SAX parser cannot be
     *             configured.
     */
    @Override
    public void getNext(CAS aCAS)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        // NOTE(review): initCas is inherited; presumably it prepares the CAS for this resource
        // (document metadata etc.) - confirm against ResourceCollectionReaderBase.
        initCas(aCAS, res);

        InputStream is = null;
        try {
            JCas jcas = aCAS.getJCas();
            is = res.getInputStream();

            // Create handler
            Handler handler = newSaxHandler();
            handler.setJCas(jcas);
            handler.setLogger(getLogger());

            // Parse XML
            SAXParser parser = newSaxParserFactory().newSAXParser();

            InputSource source = new InputSource(is);
            // The resource location is used for both identifiers so that parser messages can
            // name the document they refer to.
            source.setPublicId(res.getLocation());
            source.setSystemId(res.getLocation());
            parser.parse(source, handler);

            // Set up language (only if the parameter is configured)
            Object language = getConfigParameterValue(PARAM_LANGUAGE);
            if (language != null) {
                aCAS.setDocumentLanguage((String) language);
            }
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }
        catch (ParserConfigurationException e) {
            throw new CollectionException(e);
        }
        catch (SAXException e) {
            throw new IOException(e);
        }
        finally {
            closeQuietly(is);
        }
    }

    /**
     * Creates the SAX handler that receives the parse events of one document. Called once per
     * document by {@link #getNext(CAS)}.
     *
     * @return a new {@link TextExtractor}; subclasses may return a different handler.
     */
    protected Handler newSaxHandler()
    {
        return new TextExtractor();
    }

    /**
     * Creates the factory for the SAX parser used by {@link #getNext(CAS)}.
     * <p>
     * The factory is hardened against XML external entity (XXE) attacks: resolution of external
     * general and parameter entities as well as loading of external DTDs is switched off, so a
     * document cannot make the reader fetch local files or remote URLs. Documents carrying a
     * DOCTYPE declaration are still accepted; references to external entities are simply not
     * expanded. Subclasses that must process trusted documents relying on external entities can
     * override this method.
     *
     * @return a new parser factory with external access disabled as far as the underlying parser
     *         implementation supports it.
     */
    protected SAXParserFactory newSaxParserFactory()
    {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        for (String feature : EXTERNAL_ACCESS_FEATURES) {
            try {
                factory.setFeature(feature, false);
            }
            catch (ParserConfigurationException ignored) {
                // Best effort: a parser that does not know this feature keeps its default, which
                // is what this reader used before the hardening was introduced.
            }
            catch (SAXException ignored) {
                // Same as above (feature not recognized or not supported by this parser).
            }
        }
        return factory;
    }

    /**
     * Base class for the SAX handlers used by this reader. It carries the {@link JCas} that is
     * being filled and the logger, both of which are injected by the reader before parsing
     * starts.
     */
    protected abstract static class Handler
        extends DefaultHandler
    {
        private JCas jcas;
        private Logger logger;

        /**
         * @param aJCas
         *            the JCas that this handler writes to.
         */
        public void setJCas(final JCas aJCas)
        {
            jcas = aJCas;
        }

        /**
         * @return the JCas set via {@link #setJCas(JCas)}, or {@code null} if none was set.
         */
        protected JCas getJCas()
        {
            return jcas;
        }

        /**
         * @param aLogger
         *            the logger this handler may use for messages.
         */
        public void setLogger(Logger aLogger)
        {
            logger = aLogger;
        }

        /**
         * @return the logger set via {@link #setLogger(Logger)}, or {@code null} if none was set.
         */
        public Logger getLogger()
        {
            return logger;
        }
    }

    /**
     * Handler that collects all character data and ignorable whitespace of a document in a buffer
     * and stores the buffer content as document text when the end of the document is reached.
     * Elements and attributes are not evaluated.
     */
    public static class TextExtractor
        extends Handler
    {
        private final StringBuilder buffer = new StringBuilder();

        /**
         * Appends the reported character data to the buffer.
         */
        @Override
        public void characters(char[] aCh, int aStart, int aLength)
            throws SAXException
        {
            buffer.append(aCh, aStart, aLength);
        }

        /**
         * Appends ignorable whitespace to the buffer as well, i.e. it is treated like regular
         * character data.
         */
        @Override
        public void ignorableWhitespace(char[] aCh, int aStart, int aLength)
            throws SAXException
        {
            buffer.append(aCh, aStart, aLength);
        }

        /**
         * Stores the collected text as the document text of the JCas.
         */
        @Override
        public void endDocument()
            throws SAXException
        {
            getJCas().setDocumentText(buffer.toString());
        }

        /**
         * @return the buffer holding the text collected so far (the live instance, not a copy).
         */
        protected StringBuilder getBuffer()
        {
            return buffer;
        }
    }
}