/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.rdfxml.xmlinput.impl; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.io.UTFDataFormatException; import org.apache.jena.rdfxml.xmlinput.FatalParsingErrorException ; import org.apache.jena.rdfxml.xmlinput.SAX2RDF ; import org.apache.jena.util.CharEncoding ; import org.apache.xerces.parsers.SAXParser; import org.apache.xerces.parsers.StandardParserConfiguration; import org.apache.xerces.xni.Augmentations; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; /** * * The main parser, other variants of XMLHandler are for more specialized purposes. */ public class RDFXMLParser extends XMLHandler { private SAXParser saxParser; private String readerXMLEncoding = null; private String xmlEncoding = null; /** * This is protected rather than private to allow subclassing, * however, reimplementors should be aware that the default configuration * via {@link #create()} includes functionality that is not simply * included. The most important is to do with character encoding checking. * A common user error is to not have correct XML encoding, or to open * files with the wrong encodings on their reader. The {@link #setEncoding(String)} * method does what it can to try and detect these user errors, and is worth the effort. * Consider using {@link SAXParserWithEncodingCheck} * @param rdr */ protected RDFXMLParser(SAXParser rdr) { super(); saxParser = rdr; try { SAX2RDF.installHandlers(rdr, this); } catch (SAXException e) { throw new RuntimeException("Supposedly impossible:", e); } } public SAXParser getSAXParser() { return saxParser; } /** * This works with an {@link RDFXMLParser} and catches and reports several * common errors to do with character encoding. * */ static protected class SAXParserWithEncodingCheck extends SAXParser { protected SAXParserWithEncodingCheck(StandardParserConfiguration c) { super(c); // try { // setFeature("http://xml.org/sax/features/string-interning", // false); // } catch (SAXException e) { // // Not supported - aggh // // TO DO ask on xerces list why not? // // e.printStackTrace(); // } } private RDFXMLParser rdfXmlParser; @Override public void xmlDecl(String version, String encoding, String standalone, Augmentations augs) { try { getRdfXmlParser().setEncoding(encoding == null ? "UTF" : encoding); } catch (SAXParseException e) { throw new WrappedException(e); } super.xmlDecl(version, encoding, standalone, augs); } /** * This must be called as part of the initialization process. * @param rdfXmlParser the rdfXmlParser to set */ public void setRdfXmlParser(RDFXMLParser rdfXmlParser) { this.rdfXmlParser = rdfXmlParser; } /** * @return the rdfXmlParser */ public RDFXMLParser getRdfXmlParser() { if (rdfXmlParser == null) { throw new IllegalStateException("setRdfXmlParser must be called as part of the initialization process"); } return rdfXmlParser; } } public static RDFXMLParser create() { StandardParserConfiguration c = new StandardParserConfiguration(); SAXParserWithEncodingCheck msp = new SAXParserWithEncodingCheck(c); RDFXMLParser a = new RDFXMLParser(msp); msp.setRdfXmlParser(a); return a; } public void parse(InputSource input) throws IOException, SAXException { parse(input, input.getSystemId()); } synchronized public void parse(InputSource input, String base) throws IOException, SAXException { // Make sure we have a sane state for // Namespace processing. initParse(base,""); SAX2RDF.installHandlers(saxParser, this); saxParser.reset(); initEncodingChecks(input); try { saxParser.parse(input); } catch (UTFDataFormatException e) { generalError(ERR_UTF_ENCODING, e); } catch (IOException e) { generalError(ERR_GENERIC_IO, e); } catch (WrappedException wrapped) { wrapped.throwMe(); } catch (FatalParsingErrorException e) { // ignore this. } finally { afterParse(); } } private void initEncodingChecks(InputSource in) { Reader rdr = in.getCharacterStream(); readerXMLEncoding = null; encodingProblems = false; if (rdr != null && rdr instanceof InputStreamReader) { String javaEnc = ((InputStreamReader) rdr).getEncoding(); readerXMLEncoding = CharEncoding.create(javaEnc).name(); } } protected void setEncoding(String original) throws SAXParseException { CharEncoding encodingInfo = CharEncoding.create(original); String e = encodingInfo.name(); if (xmlEncoding == null) { // special case UTF-8 or UTF-16? if (e.equals("UTF") && readerXMLEncoding != null && readerXMLEncoding.startsWith("UTF")) { xmlEncoding = readerXMLEncoding; return; } xmlEncoding = e; if (readerXMLEncoding != null && !readerXMLEncoding.equalsIgnoreCase(e)) { warning(null, WARN_ENCODING_MISMATCH, "Encoding on InputStreamReader or FileReader does not match that of XML document. Use FileInputStream. [" + readerXMLEncoding + " != " + e + "]"); encodingProblems = true; } if (e.equals("UTF")) return; if (!encodingInfo.isIANA()) { warning(null,encodingInfo.isInNIO() ? WARN_NON_IANA_ENCODING : WARN_UNSUPPORTED_ENCODING, encodingInfo .warningMessage()); } else if (!original.equalsIgnoreCase(e)) { warning(null,WARN_NONCANONICAL_IANA_NAME, "The encoding \"" + original + "\" is not the canonical name at IANA, suggest \"" + e + "\" would give more interoperability."); } } } }