/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.wicket.util.io; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.wicket.util.lang.Args; import org.apache.wicket.util.string.Strings; /** * This is a simple XmlReader. Its only purpose is to read the xml decl string from the input and * apply proper character encoding to all subsequent characters. The xml decl string itself is * removed from the output. * * @author Juergen Donnerstag */ public final class XmlReader extends Reader { /** Regex to find <?xml encoding ... ?> */ private static final Pattern xmlDecl = Pattern.compile("[\\s\\n\\r]*<\\?xml(\\s+.*)?\\?>"); /** Regex to find <?xml encoding ... ?> */ private static final Pattern encodingPattern = Pattern.compile("\\s+encoding\\s*=\\s*([\"\'](.*?)[\"\']|(\\S*)).*\\?>"); /** Null, if JVM default. Else from <?xml encoding=""> */ private String encoding; /** The input stream to read the data from */ private final InputStream inputStream; /** The reader which does the character encoding */ private Reader reader; /** * Construct. * * @param inputStream * The InputStream to read the xml data from * @param defaultEncoding * Default character encoding to use when not specified in XML declaration, specify * null to use JVM default * @throws IOException * In case something went wrong while reading the data */ public XmlReader(final InputStream inputStream, final String defaultEncoding) throws IOException { Args.notNull(inputStream, "inputStream"); if (!inputStream.markSupported()) { this.inputStream = new BufferedInputStream(new BOMInputStream(inputStream)); } else { this.inputStream = new BOMInputStream(inputStream); } encoding = defaultEncoding; init(); } /** * Return the encoding used while reading the markup file. * * @return if null, then JVM default */ public final String getEncoding() { return encoding; } /** * Reads and parses markup from a resource such as file. * * @throws IOException */ public void init() throws IOException { // read ahead buffer required for the first line of the markup (encoding) final int readAheadSize = 80; inputStream.mark(readAheadSize); // read-ahead the input stream and check if it starts with <?xml..?>. String xmlDeclaration = getXmlDeclaration(inputStream, readAheadSize); if (!Strings.isEmpty(xmlDeclaration)) { // If yes than determine the encoding from the xml decl encoding = determineEncoding(xmlDeclaration); } else { // If not, reset the input stream to the beginning of the file inputStream.reset(); } if (encoding == null) { // Use JVM default reader = new InputStreamReader(inputStream); } else { // Use the encoding provided reader = new InputStreamReader(inputStream, encoding); } } /** * Determine the encoding from the xml decl. * * @param string * The xmlDecl string * @return The encoding. Null, if not found */ private String determineEncoding(final CharSequence string) { // Does the string match the <?xml .. ?> pattern final Matcher matcher = encodingPattern.matcher(string); if (!matcher.find()) { // No return null; } // Extract the encoding String encoding = matcher.group(2); if ((encoding == null) || (encoding.length() == 0)) { encoding = matcher.group(3); } if (encoding != null) { encoding = encoding.trim(); } return encoding; } /** * Read-ahead the input stream (markup file). If the first line contains <?xml...?>, than * remember the xml decl for later to determine the encoding. * <p> * The xml decl will not be forwarded to the user. * * @param in * The markup file * @param readAheadSize * The read ahead buffer available to read the xml encoding information * @return true, if <?xml ..?> has been found * @throws IOException */ private String getXmlDeclaration(final InputStream in, final int readAheadSize) throws IOException { // Max one line final StringBuilder pushBack = new StringBuilder(readAheadSize); // The current char from the markup file int value; while ((value = in.read()) != -1) { pushBack.append((char)value); // Stop at the end of the first tag or end of line. If it is HTML // without newlines, stop after X bytes (= characters) if ((value == '>') || (value == '\n') || (value == '\r') || (pushBack.length() >= (readAheadSize - 1))) { break; } } // Does the string match the <?xml .. ?> pattern final Matcher matcher = xmlDecl.matcher(pushBack); if (!matcher.matches()) { // No return null; } // Save the whole <?xml ..> string for later return pushBack.toString().trim(); } /** * @see java.io.Reader#close() */ @Override public void close() throws IOException { try { reader.close(); } finally { inputStream.close(); } } /** * @see java.io.Reader#read(char[], int, int) */ @Override public int read(final char[] buf, final int from, final int to) throws IOException { return reader.read(buf, from, to); } /** * @return The markup to be parsed */ @Override public String toString() { return inputStream.toString() + " (" + encoding + ")"; } }