/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jasper.compiler; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; /* * The BoM detection is derived from: * http://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248 * * The prolog is always at least as specific as the BOM therefore any encoding * specified in the prolog should take priority over the BOM. */ class EncodingDetector { private static final XMLInputFactory XML_INPUT_FACTORY; static { XML_INPUT_FACTORY = XMLInputFactory.newInstance(); } private final String encoding; private final int skip; private final boolean encodingSpecifiedInProlog; /* * TODO: Refactor Jasper InputStream creation and handling so the * InputStream passed to this method is buffered and therefore saves * on multiple opening and re-opening of the same file. */ EncodingDetector(InputStream is) throws IOException { // Keep buffer size to a minimum here. BoM will be no more than 4 bytes // so that is the maximum we need to buffer BufferedInputStream bis = new BufferedInputStream(is, 4); bis.mark(4); BomResult bomResult = processBom(bis); // Reset the stream back to the start to allow the XML prolog detection // to work. Skip any BoM we discovered. bis.reset(); for (int i = 0; i < bomResult.skip; i++) { bis.read(); } String prologEncoding = getPrologEncoding(bis); if (prologEncoding == null) { encodingSpecifiedInProlog = false; encoding = bomResult.encoding; } else { encodingSpecifiedInProlog = true; encoding = prologEncoding; } skip = bomResult.skip; } String getEncoding() { return encoding; } int getSkip() { return skip; } boolean isEncodingSpecifiedInProlog() { return encodingSpecifiedInProlog; } private String getPrologEncoding(InputStream stream) { String encoding = null; try { XMLStreamReader xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream); encoding = xmlStreamReader.getCharacterEncodingScheme(); } catch (XMLStreamException e) { // Ignore } return encoding; } private BomResult processBom(InputStream stream) { // Read first four bytes (or as many are available) and determine // encoding try { final byte[] b4 = new byte[4]; int count = 0; int singleByteRead; while (count < 4) { singleByteRead = stream.read(); if (singleByteRead == -1) { break; } b4[count] = (byte) singleByteRead; count++; } return parseBom(b4, count); } catch (IOException ioe) { // Failed. return new BomResult("UTF-8", 0); } } private BomResult parseBom(byte[] b4, int count) { if (count < 2) { return new BomResult("UTF-8", 0); } // UTF-16, with BOM int b0 = b4[0] & 0xFF; int b1 = b4[1] & 0xFF; if (b0 == 0xFE && b1 == 0xFF) { // UTF-16, big-endian return new BomResult("UTF-16BE", 2); } if (b0 == 0xFF && b1 == 0xFE) { // UTF-16, little-endian return new BomResult("UTF-16LE", 2); } // default to UTF-8 if we don't have enough bytes to make a // good determination of the encoding if (count < 3) { return new BomResult("UTF-8", 0); } // UTF-8 with a BOM int b2 = b4[2] & 0xFF; if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { return new BomResult("UTF-8", 3); } // default to UTF-8 if we don't have enough bytes to make a // good determination of the encoding if (count < 4) { return new BomResult("UTF-8", 0); } // Other encodings. No BOM. Try and ID encoding. int b3 = b4[3] & 0xFF; if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { // UCS-4, big endian (1234) return new BomResult("ISO-10646-UCS-4", 0); } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { // UCS-4, little endian (4321) return new BomResult("ISO-10646-UCS-4", 0); } if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { // UCS-4, unusual octet order (2143) // REVISIT: What should this be? return new BomResult("ISO-10646-UCS-4", 0); } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { // UCS-4, unusual octect order (3412) // REVISIT: What should this be? return new BomResult("ISO-10646-UCS-4", 0); } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { // UTF-16, big-endian, no BOM // (or could turn out to be UCS-2... // REVISIT: What should this be? return new BomResult("UTF-16BE", 0); } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { // UTF-16, little-endian, no BOM // (or could turn out to be UCS-2... return new BomResult("UTF-16LE", 0); } if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { // EBCDIC // a la xerces1, return CP037 instead of EBCDIC here return new BomResult("CP037", 0); } // default encoding return new BomResult("UTF-8", 0); } private static class BomResult { public final String encoding; public final int skip; public BomResult(String encoding, int skip) { this.encoding = encoding; this.skip = skip; } } }