/* * reserved comment block * DO NOT REMOVE OR ALTER! */ /* * Copyright 2003-2005 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.sun.org.apache.xerces.internal.xinclude; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.Iterator; import java.util.Locale; import java.util.Map; import com.sun.org.apache.xerces.internal.impl.XMLEntityManager; import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter; import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader; import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader; import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter; import com.sun.org.apache.xerces.internal.util.EncodingMap; import com.sun.org.apache.xerces.internal.util.HTTPInputSource; import com.sun.org.apache.xerces.internal.util.MessageFormatter; import com.sun.org.apache.xerces.internal.util.XMLChar; import com.sun.org.apache.xerces.internal.xni.XMLString; import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource; /** * This class is used for reading resources requested in <include> elements, * when the parse attribute of the <include> element is "text". Using this * class will open the location, detect the encoding, and discard the byte order * mark, if applicable. * * REVISIT: * Much of the code in this class is taken from XMLEntityManager. It would be nice * if this code could be shared in some way. However, since XMLEntityManager is used * for reading files as XML, and this needs to read files as text, there would need * to be some refactoring done. * * @author Michael Glavassevich, IBM * @author Peter McCracken, IBM * @author Ankit Pasricha, IBM * @author Arun Yadav, Sun Microsystems Inc. * * * @see XIncludeHandler */ public class XIncludeTextReader { private Reader fReader; private XIncludeHandler fHandler; private XMLInputSource fSource; private XMLErrorReporter fErrorReporter; private XMLString fTempString = new XMLString(); /** * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler. * * @param source The XMLInputSource to use. * @param handler The XIncludeHandler to use. * @param bufferSize The size of this text reader's buffer. */ public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize) throws IOException { fHandler = handler; fSource = source; fTempString = new XMLString(new char[bufferSize + 1], 0, 0); } /** * Sets the XMLErrorReporter used for reporting errors while * reading the text include. * * @param errorReporter the XMLErrorReporter to be used for * reporting errors. */ public void setErrorReporter(XMLErrorReporter errorReporter) { fErrorReporter = errorReporter; } /** * Return the Reader for given XMLInputSource. * * @param source The XMLInputSource to use. */ protected Reader getReader(XMLInputSource source) throws IOException { if (source.getCharacterStream() != null) { return source.getCharacterStream(); } else { InputStream stream = null; String encoding = source.getEncoding(); if (encoding == null) { encoding = "UTF-8"; } if (source.getByteStream() != null) { stream = source.getByteStream(); // Wrap the InputStream so that it is possible to rewind it. if (!(stream instanceof BufferedInputStream)) { stream = new BufferedInputStream(stream, fTempString.ch.length); } } else { String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false); URL url = new URL(expandedSystemId); URLConnection urlCon = url.openConnection(); // If this is an HTTP connection attach any request properties to the request. if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) { final HttpURLConnection urlConnection = (HttpURLConnection) urlCon; final HTTPInputSource httpInputSource = (HTTPInputSource) source; // set request properties Iterator propIter = httpInputSource.getHTTPRequestProperties(); while (propIter.hasNext()) { Map.Entry entry = (Map.Entry) propIter.next(); urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue()); } // set preference for redirection boolean followRedirects = httpInputSource.getFollowHTTPRedirects(); if (!followRedirects) { urlConnection.setInstanceFollowRedirects(followRedirects); } } // Wrap the InputStream so that it is possible to rewind it. stream = new BufferedInputStream(urlCon.getInputStream()); // content type will be string like "text/xml; charset=UTF-8" or "text/xml" String rawContentType = urlCon.getContentType(); // text/xml and application/xml offer only one optional parameter int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1; String contentType = null; String charset = null; if (index != -1) { // this should be something like "text/xml" contentType = rawContentType.substring(0, index).trim(); // this should be something like "charset=UTF-8", but we want to // strip it down to just "UTF-8" charset = rawContentType.substring(index + 1).trim(); if (charset.startsWith("charset=")) { // 8 is the length of "charset=" charset = charset.substring(8).trim(); // strip quotes, if present if ((charset.charAt(0) == '"' && charset.charAt(charset.length() - 1) == '"') || (charset.charAt(0) == '\'' && charset.charAt(charset.length() - 1) == '\'')) { charset = charset.substring(1, charset.length() - 1); } } else { charset = null; } } else { contentType = rawContentType.trim(); } String detectedEncoding = null; /** The encoding of such a resource is determined by: 1 external encoding information, if available, otherwise -- the most common type of external information is the "charset" parameter of a MIME package 2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise 3 the value of the encoding attribute if one exists, otherwise 4 UTF-8. **/ if (contentType.equals("text/xml")) { if (charset != null) { detectedEncoding = charset; } else { // see RFC2376 or 3023, section 3.1 detectedEncoding = "US-ASCII"; } } else if (contentType.equals("application/xml")) { if (charset != null) { detectedEncoding = charset; } else { // see RFC2376 or 3023, section 3.2 detectedEncoding = getEncodingName(stream); } } else if (contentType.endsWith("+xml")) { detectedEncoding = getEncodingName(stream); } if (detectedEncoding != null) { encoding = detectedEncoding; } // else 3 or 4. } encoding = encoding.toUpperCase(Locale.ENGLISH); // eat the Byte Order Mark encoding = consumeBOM(stream, encoding); // If the document is UTF-8 or US-ASCII use // the Xerces readers for these encodings. For // US-ASCII consult the encoding map since // this encoding has many aliases. if (encoding.equals("UTF-8")) { return new UTF8Reader(stream, fTempString.ch.length, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() ); } // Try to use a Java reader. String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding); // If the specified encoding wasn't a recognized IANA encoding throw an IOException. // The XIncludeHandler will report this as a ResourceError and then will // attempt to include a fallback if there is one. if (javaEncoding == null) { MessageFormatter aFormatter = fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN); Locale aLocale = fErrorReporter.getLocale(); throw new IOException( aFormatter.formatMessage( aLocale, "EncodingDeclInvalid", new Object[] {encoding} ) ); } else if (javaEncoding.equals("ASCII")) { return new ASCIIReader(stream, fTempString.ch.length, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() ); } return new InputStreamReader(stream, javaEncoding); } } /** * XMLEntityManager cares about endian-ness, since it creates its own optimized * readers. Since we're just using generic Java readers for now, we're not caring * about endian-ness. If this changes, even more code needs to be copied from * XMLEntity manager. -- PJM */ protected String getEncodingName(InputStream stream) throws IOException { final byte[] b4 = new byte[4]; String encoding = null; // this has the potential to throw an exception // it will be fixed when we ensure the stream is rewindable (see note above) stream.mark(4); int count = stream.read(b4, 0, 4); stream.reset(); if (count == 4) { encoding = getEncodingName(b4); } return encoding; } /** * Removes the byte order mark from the stream, if * it exists and returns the encoding name. * * @param stream * @param encoding * @throws IOException */ protected String consumeBOM(InputStream stream, String encoding) throws IOException { byte[] b = new byte[3]; int count = 0; stream.mark(3); if (encoding.equals("UTF-8")) { count = stream.read(b, 0, 3); if (count == 3) { final int b0 = b[0] & 0xFF; final int b1 = b[1] & 0xFF; final int b2 = b[2] & 0xFF; if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) { // First three bytes are not BOM, so reset. stream.reset(); } } else { stream.reset(); } } else if (encoding.startsWith("UTF-16")) { count = stream.read(b, 0, 2); if (count == 2) { final int b0 = b[0] & 0xFF; final int b1 = b[1] & 0xFF; if (b0 == 0xFE && b1 == 0xFF) { return "UTF-16BE"; } else if (b0 == 0xFF && b1 == 0xFE) { return "UTF-16LE"; } } // First two bytes are not BOM, so reset. stream.reset(); } // We could do UTF-32, but since the getEncodingName() doesn't support that // we won't support it here. // To implement UTF-32, look for: 00 00 FE FF for big-endian // or FF FE 00 00 for little-endian return encoding; } /** * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager. * Is there any way we can share the code, without having it implemented twice? * I think we should make it public and static in XMLEntityManager. --PJM * * Returns the IANA encoding name that is auto-detected from * the bytes specified, with the endian-ness of that encoding where appropriate. * * @param b4 The first four bytes of the input. * @return the encoding name, or null if no encoding could be detected */ protected String getEncodingName(byte[] b4) { // UTF-16, with BOM int b0 = b4[0] & 0xFF; int b1 = b4[1] & 0xFF; if (b0 == 0xFE && b1 == 0xFF) { // UTF-16, big-endian return "UTF-16BE"; } if (b0 == 0xFF && b1 == 0xFE) { // UTF-16, little-endian return "UTF-16LE"; } // UTF-8 with a BOM int b2 = b4[2] & 0xFF; if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { return "UTF-8"; } // other encodings int b3 = b4[3] & 0xFF; if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { // UCS-4, big endian (1234) return "ISO-10646-UCS-4"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { // UCS-4, little endian (4321) return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { // UCS-4, unusual octet order (2143) return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { // UCS-4, unusual octect order (3412) return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { // UTF-16, big-endian, no BOM // (or could turn out to be UCS-2... return "UTF-16BE"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { // UTF-16, little-endian, no BOM // (or could turn out to be UCS-2... return "UTF-16LE"; } if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { // EBCDIC // a la xerces1, return CP037 instead of EBCDIC here return "CP037"; } // this signals us to use the value from the encoding attribute return null; } // getEncodingName(byte[]):Object[] /** * Read the input stream as text, and pass the text on to the XIncludeHandler * using calls to characters(). This will read all of the text it can from the * resource. * * @throws IOException */ public void parse() throws IOException { fReader = getReader(fSource); fSource = null; int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); while (readSize != -1) { for (int i = 0; i < readSize; ++i) { char ch = fTempString.ch[i]; if (!isValid(ch)) { if (XMLChar.isHighSurrogate(ch)) { int ch2; // retrieve next character if (++i < readSize) { ch2 = fTempString.ch[i]; } // handle rare boundary case else { ch2 = fReader.read(); if (ch2 != -1) { fTempString.ch[readSize++] = (char) ch2; } } if (XMLChar.isLowSurrogate(ch2)) { // convert surrogates to a supplemental character int sup = XMLChar.supplemental(ch, (char)ch2); if (!isValid(sup)) { fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, "InvalidCharInContent", new Object[] { Integer.toString(sup, 16) }, XMLErrorReporter.SEVERITY_FATAL_ERROR); } } else { fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, "InvalidCharInContent", new Object[] { Integer.toString(ch2, 16) }, XMLErrorReporter.SEVERITY_FATAL_ERROR); } } else { fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, "InvalidCharInContent", new Object[] { Integer.toString(ch, 16) }, XMLErrorReporter.SEVERITY_FATAL_ERROR); } } } if (fHandler != null && readSize > 0) { fTempString.offset = 0; fTempString.length = readSize; fHandler.characters( fTempString, fHandler.modifyAugmentations(null, true)); } readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); } } /** * Sets the input source on this text reader. * * @param source The XMLInputSource to use. */ public void setInputSource(XMLInputSource source) { fSource = source; } /** * Closes the stream. Call this after parse(), or when there is no longer any need * for this object. * * @throws IOException */ public void close() throws IOException { if (fReader != null) { fReader.close(); fReader = null; } } /** * Returns true if the specified character is a valid XML character * as per the rules of XML 1.0. * * @param ch The character to check. */ protected boolean isValid(int ch) { return XMLChar.isValid(ch); } /** * Sets the buffer size property for the reader which decides the chunk sizes that are parsed * by the reader at a time and passed to the handler * * @param bufferSize The size of the buffer desired */ protected void setBufferSize(int bufferSize) { if (fTempString.ch.length != ++bufferSize) { fTempString.ch = new char[bufferSize]; } } }