/* * Copyright 2013 Skynav, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SKYNAV, INC. AND ITS CONTRIBUTORS “AS IS” AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL SKYNAV, INC. OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.skynav.xml.helpers; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.Collections; import java.util.List; public class Sniffer { private static final List<Object[]> bomList; static { List<Object[]> l = new java.util.ArrayList<Object[]>(); try { l.add(new Object[] { new int[] { 0x00, 0x00, 0xFE, 0xFF }, Charset.forName("UTF-32BE") }); l.add(new Object[] { new int[] { 0xFF, 0xFE, 0x00, 0x00 }, Charset.forName("UTF-32LE") }); l.add(new Object[] { new int[] { 0xEF, 0xBB, 0xBF }, Charset.forName("UTF-8") }); l.add(new Object[] { new int[] { 0xFE, 0xFF }, Charset.forName("UTF-16BE") }); l.add(new Object[] { new int[] { 0xFF, 0xFE }, Charset.forName("UTF-16LE") }); } catch (RuntimeException e) { } bomList = Collections.unmodifiableList(l); } private Sniffer() { } public static Charset sniff(ByteBuffer bb, Charset defaultCharset) { return sniff(bb, defaultCharset, null); } public static Charset sniff(ByteBuffer bb, Charset defaultCharset, Object[] outputParameters) { int restore = bb.position(); Charset sniffedCharset; Charset bomCharset = checkForBOMCharset(bb, outputParameters); Charset encodingCharset = checkForXMLEncodingCharset(bb, bomCharset, outputParameters); if (bomCharset != null) { if (encodingCharset != null) { if (encodingCharset.equals(bomCharset)) sniffedCharset = encodingCharset; else sniffedCharset = bomCharset; } else sniffedCharset = bomCharset; } else { if (encodingCharset != null) sniffedCharset = encodingCharset; else sniffedCharset = defaultCharset; } bb.position(restore); return sniffedCharset; } public static Charset checkForBOMCharset(ByteBuffer bb, Object[] outputParameters) { for (Object[] bomEntry : bomList) { int[] bom = (int[]) bomEntry[0]; if (bom.length > bb.limit()) return null; } int restore = bb.position(); for (Object[] bomEntry : bomList) { int[] bom = (int[]) bomEntry[0]; bb.rewind(); assert bb.limit() >= bom.length; int i = 0; while (i < bom.length) if ((bb.get() & 0xFF) != bom[i++]) break; if (i == bom.length) { if (outputParameters != null) { if (outputParameters.length > 0) outputParameters[0] = Integer.valueOf(bom.length); } return (Charset) bomEntry[1]; } } bb.position(restore); return null; } private static boolean isXMLEncodingNameInitial(int c) { return ((c >= (int) 'A') && (c <= (int) 'Z')) || ((c >= (int) 'a') && (c <= (int) 'z')); } private static boolean isXMLEncodingNameFollowing(int c) { return isXMLEncodingNameInitial(c) || ((c >= (int) '0') && (c <= (int) '9')) || (c == '.') || (c == '_') || (c == '-'); } private static String checkXMLEncodingName(String encoding) { if (encoding.length() == 0) return null; else { if (!isXMLEncodingNameInitial(encoding.charAt(0))) return null; for (int i = 1; i < encoding.length(); ++i) { if (!isXMLEncodingNameFollowing(encoding.charAt(i))) return null; } return encoding; } } private static String extractXMLEncoding(CharBuffer cb, int start, int end) { StringBuffer sb = new StringBuffer(); cb.position(start); while (cb.position() < end) sb.append(cb.get()); return checkXMLEncodingName(sb.toString()); } private static String parseXMLEncoding(CharBuffer cb) { int restore = cb.position(); skipSpace(cb); if (!match(cb, "=")) { cb.position(restore); return null; } skipSpace(cb); char quote; if (match(cb, "\"")) quote = '"'; else if (match(cb, "\'")) quote = '\''; else { cb.position(restore); return null; } int encodingStart = cb.position(); if (!find(cb, new String(new char[]{quote}))) { cb.position(restore); return null; } int encodingEnd = cb.position(); return extractXMLEncoding(cb, encodingStart, encodingEnd); } private static String findXMLEncoding(CharBuffer cb) { String encoding; int restore = cb.position(); if (find(cb, "encoding")) encoding = parseXMLEncoding(cb); else encoding = null; if (encoding == null) cb.position(restore); return encoding; } private static final int[] encoding8 = new int[] { 0x65, // 'e' 0x6E, // 'n' 0x63, // 'c' 0x6F, // 'o' 0x64, // 'd' 0x69, // 'i' 0x6E, // 'n' 0x67 // 'g' }; private static final int[] encoding16be = new int[] { 0x00, 0x65, // 'e' 0x00, 0x6E, // 'n' 0x00, 0x63, // 'c' 0x00, 0x6F, // 'o' 0x00, 0x64, // 'd' 0x00, 0x69, // 'i' 0x00, 0x6E, // 'n' 0x00, 0x67 // 'g' }; private static final int[] encoding16le = new int[] { 0x65, // 'e' 0x00, 0x6E, // 'n' 0x00, 0x63, // 'c' 0x00, 0x6F, // 'o' 0x00, 0x64, // 'd' 0x00, 0x69, // 'i' 0x00, 0x6E, // 'n' 0x00, 0x67, // 'g' 0x00, }; private static final int[] encoding32be = new int[] { 0x00, 0x00, 0x00, 0x65, // 'e' 0x00, 0x00, 0x00, 0x6E, // 'n' 0x00, 0x00, 0x00, 0x63, // 'c' 0x00, 0x00, 0x00, 0x6F, // 'o' 0x00, 0x00, 0x00, 0x64, // 'd' 0x00, 0x00, 0x00, 0x69, // 'i' 0x00, 0x00, 0x00, 0x6E, // 'n' 0x00, 0x00, 0x00, 0x67 // 'g' }; private static final int[] encoding32le = new int[] { 0x65, // 'e' 0x00, 0x00, 0x00, 0x6E, // 'n' 0x00, 0x00, 0x00, 0x63, // 'c' 0x00, 0x00, 0x00, 0x6F, // 'o' 0x00, 0x00, 0x00, 0x64, // 'd' 0x00, 0x00, 0x00, 0x69, // 'i' 0x00, 0x00, 0x00, 0x6E, // 'n' 0x00, 0x00, 0x00, 0x67, // 'g' 0x00, 0x00, 0x00 }; private static String extractXMLEncoding(ByteBuffer bb, int start, int end, int codeLength, boolean bigEndian) { StringBuffer sb = new StringBuffer(); int restore = bb.position(); bb.position(start); while (bb.position() < end) { if (bigEndian) if (!matchByte(bb, 0, codeLength - 1)) break; sb.append((char) bb.get()); if (!bigEndian) if (!matchByte(bb, 0, codeLength - 1)) break; } if (bb.position() == end) return checkXMLEncodingName(sb.toString()); else { bb.position(restore); return null; } } private static String parseXMLEncoding(ByteBuffer bb, int codeLength, boolean bigEndian) { int restore = bb.position(); skipSpace(bb, codeLength, bigEndian); if (!matchExtended(bb, '=', codeLength, bigEndian)) { bb.position(restore); return null; } skipSpace(bb, codeLength, bigEndian); char quote; if (matchExtended(bb, '"', codeLength, bigEndian)) quote = '"'; else if (matchExtended(bb, '\'', codeLength, bigEndian)) quote = '\''; else { bb.position(restore); return null; } int encodingStart = bb.position(); if (!findExtended(bb, quote, codeLength, bigEndian)) { bb.position(restore); return null; } int encodingEnd = bb.position() - codeLength; return extractXMLEncoding(bb, encodingStart, encodingEnd, codeLength, bigEndian); } // Either no BOM is present or it wasn't recognized. private static String findXMLEncoding(ByteBuffer bb) { String encoding; int restore = bb.position(); if (findFrom(bb, 0, encoding8)) encoding = parseXMLEncoding(bb, 1, true); else { byte[] bytes; if (bb.hasArray()) bytes = bb.array(); else { bytes = new byte[4]; bb.position(0); bb.get(bytes); bb.position(0); } if (bytes.length < 4) { encoding = null; } else if ((bytes[0] == '<') && (bytes[1] == 0) && (bytes[2] == 0) && (bytes[3] == 0)) { if (findFrom(bb, 0, encoding32le)) encoding = parseXMLEncoding(bb, 4, false); else encoding = null; } else if ((bytes[0] == '<') && (bytes[1] == 0)) { if (findFrom(bb, 0, encoding16le)) encoding = parseXMLEncoding(bb, 2, false); else encoding = null; } else if ((bytes[0] == 0) && (bytes[1] == 0) && (bytes[2] == 0) && (bytes[3] == '<')) { if (findFrom(bb, 0, encoding32be)) encoding = parseXMLEncoding(bb, 4, true); else encoding = null; } else if ((bytes[0] == 0) && (bytes[1] == '<')) { if (findFrom(bb, 0, encoding16be)) encoding = parseXMLEncoding(bb, 2, true); else encoding = null; } else encoding = null; } if (encoding == null) bb.position(restore); return encoding; } private static Charset checkForXMLEncodingCharset(ByteBuffer bb, Charset bomCharset, Object[] outputParameters) { String encoding = null; if (bomCharset != null) { // Decode using bomCharset, then search for encoding. // If a BOM was present and recognized, then bb.position() should be immediately // following the BOM. We will create a new ByteBuffer containing the bytes that // follow the BOM, if present, up to a maximum of 256 bytes from which we scan // for the XML encoding. int limitOld = bb.limit(); int limitNew = Math.min(limitOld,256); ByteBuffer bbNew = ByteBuffer.allocate(limitNew - bb.position()); if (limitNew < bb.limit()) bb.limit(limitNew); bbNew.put(bb); bbNew.rewind(); CharBuffer cb = bomCharset.decode(bbNew); encoding = findXMLEncoding(cb); // Restore prior limit. bb.limit(limitOld); } else encoding = findXMLEncoding(bb); if (encoding != null) { try { return Charset.forName(encoding); } catch (IllegalCharsetNameException e) { } catch (UnsupportedCharsetException e) { } } return null; } private static boolean match(ByteBuffer bb, int[] matchBytes) { int restore = bb.position(); int i = 0; int n = matchBytes.length; for (; i < n; ++i) { if (bb.get() != matchBytes[i]) break; } if (i == n) return true; else { bb.position(restore); return false; } } private static boolean matchExtended(ByteBuffer bb, int byteValue, int codeLength, boolean bigEndian) { int[] matchBytes = new int[codeLength]; if (bigEndian) matchBytes[matchBytes.length - 1] = byteValue; else matchBytes[0] = byteValue; return match(bb, matchBytes); } private static boolean find(ByteBuffer bb, int[] matchBytes) { if (matchBytes.length == 0) return true; for (; bb.position() < bb.limit(); bb.get()) { if (match(bb, matchBytes)) return true; } return false; } private static boolean findFrom(ByteBuffer bb, int offset, int[] matchBytes) { if (offset < 0) offset = 0; if (offset + matchBytes.length > bb.limit()) return false; bb.position(offset); return find(bb, matchBytes); } private static boolean findExtended(ByteBuffer bb, int byteValue, int codeLength, boolean bigEndian) { int[] matchBytes = new int[codeLength]; if (bigEndian) matchBytes[matchBytes.length - 1] = byteValue; else matchBytes[0] = byteValue; return find(bb, matchBytes); } private static boolean match(CharBuffer cb, String matchString) { int restore = cb.position(); int i = 0; int n = matchString.length(); for (; i < n; ++i) { if (cb.get() != matchString.charAt(i)) break; } if (i == n) return true; else { cb.position(restore); return false; } } private static boolean find(CharBuffer cb, String matchString) { if (matchString.length() == 0) return true; for (; cb.position() < cb.limit(); cb.get()) { if (match(cb, matchString)) return true; } return false; } private static boolean isXMLSpace(char c) { return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); } private static boolean isXMLSpace(byte b) { return (b == 0x20) || (b == 0x09) || (b == 0x0A) || (b == 0x0D); } private static boolean matchByte(ByteBuffer bb, int byteValue, int count) { if (count < 1) return true; int restore = bb.position(); int i = 0; int n = count; for (; i < n; ++i) { if (bb.get() != byteValue) break; } if (i == n) return true; else { bb.position(restore); return false; } } private static void skipSpace(ByteBuffer bb, int codeLength, boolean bigEndian) { while (bb.position() < bb.limit()) { int restore = bb.position(); if (bigEndian && !matchByte(bb, 0, codeLength - 1)) { bb.position(restore); break; } if (!isXMLSpace(bb.get())) { bb.position(restore); break; } if (!bigEndian && !matchByte(bb, 0, codeLength - 1)) { bb.position(restore); break; } } } private static void skipSpace(CharBuffer cb) { while (cb.position() < cb.limit()) { int restore = cb.position(); char c = cb.get(); if (!isXMLSpace(c)) { cb.position(restore); break; } } } }