/*
* Copyright 2004 Outerthought bvba and Schaubroeck nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.xpn.xwiki.plugin.lucene.textextraction.xmlutil;
import java.io.UnsupportedEncodingException;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Utility code to detect the encoding of XML provided as a byte array. This code is based on the
 * class com.sun.syndication.io.XmlReader from the Rome project (https://rome.dev.java.net/), which
 * is licensed under the Apache V2 license (and doesn't include a NOTICE file).
 * <p>
 * Detection combines three hints: a byte order mark (BOM), the byte pattern of a leading
 * {@code <?xml} and the {@code encoding} pseudo-attribute of the XML declaration. Only the UTF-8,
 * UTF-16BE and UTF-16LE byte patterns are recognised; any other document is only classified through
 * the UTF-8 default.
 */
public class XmlEncodingDetector
{
    private static final String UTF_8 = "UTF-8";

    private static final String UTF_16BE = "UTF-16BE";

    private static final String UTF_16LE = "UTF-16LE";

    private static final String UTF_16 = "UTF-16";

    /** Unsigned byte values of the supported byte order marks. */
    private static final int[] BOM_UTF_16BE = {0xFE, 0xFF};

    private static final int[] BOM_UTF_16LE = {0xFF, 0xFE};

    private static final int[] BOM_UTF_8 = {0xEF, 0xBB, 0xBF};

    /** Unsigned byte values of the characters {@code <?} / {@code <?xm} in the supported encodings. */
    private static final int[] PROLOG_UTF_16BE = {0x00, 0x3C, 0x00, 0x3F};

    private static final int[] PROLOG_UTF_16LE = {0x3C, 0x00, 0x3F, 0x00};

    private static final int[] PROLOG_UTF_8 = {0x3C, 0x3F, 0x78, 0x6D};

    /** Maximum number of bytes (after the BOM) that are decoded to look for the XML declaration. */
    private static final int MAX_PROLOG_BYTES = 1024;

    /**
     * Matches the encoding pseudo-attribute of an XML declaration. The value may be enclosed in double
     * quotes (group 1) or single quotes (group 2); the match stops at the closing quote so that
     * following pseudo-attributes such as {@code standalone} are not captured.
     */
    private static final Pattern ENCODING_PATTERN =
        Pattern.compile("^<\\?xml\\b.*?\\bencoding\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')");

    // Kept as pattern strings and formatted per call: MessageFormat instances are not thread-safe.
    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /**
     * Detects the character encoding of an XML document.
     *
     * @param data the raw bytes of the document, must not be null; may be empty or shorter than a BOM
     * @return the detected encoding name; "UTF-8" when no hint is available. When no BOM is present
     *         and the XML declaration names an encoding, that name is returned upper-cased
     * @throws RuntimeException if a BOM is present and contradicts the encoding guessed from the first
     *             bytes or the one declared in the XML declaration
     */
    public static String detectEncoding(byte[] data)
    {
        String bomEnc = getBOMEncoding(data);
        // The remaining hints are located right after the BOM, if there is one.
        int offset = getBOMLength(bomEnc);
        String xmlGuessEnc = getXMLGuessEncoding(data, offset);
        String xmlEnc = getXMLPrologEncoding(data, offset, xmlGuessEnc);
        return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
    }

    /**
     * Tells whether {@code data} contains the given unsigned byte values starting at {@code offset}.
     * Bytes are masked with 0xFF before comparing because Java bytes are signed; arrays that are too
     * short simply do not match.
     */
    private static boolean startsWith(byte[] data, int offset, int[] expected)
    {
        if (data.length - offset < expected.length) {
            return false;
        }
        for (int i = 0; i < expected.length; i++) {
            if ((data[offset + i] & 0xFF) != expected[i]) {
                return false;
            }
        }
        return true;
    }

    // returns the encoding indicated by the BOM at the start of the data, null if there is no BOM
    private static String getBOMEncoding(byte[] bytes)
    {
        if (startsWith(bytes, 0, BOM_UTF_16BE)) {
            return UTF_16BE;
        }
        if (startsWith(bytes, 0, BOM_UTF_16LE)) {
            return UTF_16LE;
        }
        if (startsWith(bytes, 0, BOM_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    // returns the number of bytes occupied by the BOM of the given encoding, 0 if there is no BOM
    private static int getBOMLength(String bomEnc)
    {
        if (bomEnc == null) {
            return 0;
        }
        return UTF_8.equals(bomEnc) ? BOM_UTF_8.length : BOM_UTF_16BE.length;
    }

    // returns the best guess for the encoding by looking at the first bytes after the BOM ('<?'),
    // null if they do not look like the start of an XML declaration
    private static String getXMLGuessEncoding(byte[] bytes, int offset)
    {
        if (startsWith(bytes, offset, PROLOG_UTF_16BE)) {
            return UTF_16BE;
        }
        if (startsWith(bytes, offset, PROLOG_UTF_16LE)) {
            return UTF_16LE;
        }
        if (startsWith(bytes, offset, PROLOG_UTF_8)) {
            return UTF_8;
        }
        return null;
    }

    // returns the upper-cased encoding declared in the <?xml encoding=...?>, null if none.
    // Only the first line (at most MAX_PROLOG_BYTES bytes) of the document is inspected.
    private static String getXMLPrologEncoding(byte[] data, int offset, String guessedEnc)
    {
        if (guessedEnc == null) {
            return null;
        }

        int length = Math.min(data.length - offset, MAX_PROLOG_BYTES);
        if (!UTF_8.equals(guessedEnc)) {
            // UTF-16 code units are two bytes wide, do not decode half a code unit.
            length -= length % 2;
        }

        String prolog;
        try {
            prolog = new String(data, offset, length, guessedEnc);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }

        // Cut at the first line break on the decoded text rather than on the raw bytes, so that the
        // cut can never fall in the middle of a multi-byte code unit.
        int endFirstLinePos = prolog.length();
        for (int i = 0; i < prolog.length(); i++) {
            char c = prolog.charAt(i);
            if (c == '\n' || c == '\r') {
                endFirstLinePos = i;
                break;
            }
        }

        Matcher m = ENCODING_PATTERN.matcher(prolog.substring(0, endFirstLinePos));
        if (!m.find()) {
            return null;
        }
        String declared = (m.group(1) != null) ? m.group(1) : m.group(2);
        if (declared.length() == 0) {
            // encoding="" carries no information
            return null;
        }
        // Encoding names are ASCII; use a fixed locale so that e.g. a Turkish default locale does not
        // turn "utf-8" into something that no longer equals "UTF-8".
        return declared.toUpperCase(Locale.ENGLISH);
    }

    // combines the three hints; the BOM, when present, is authoritative and the other hints must agree
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc)
    {
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return UTF_8;
            }
            if (xmlEnc.equals(UTF_16)
                && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                // "UTF-16" does not tell the byte order, the byte pattern does.
                return xmlGuessEnc;
            }
            return xmlEnc;
        }

        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw invalidEncoding(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw invalidEncoding(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return UTF_8;
        }

        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                throw invalidEncoding(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw invalidEncoding(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        throw invalidEncoding(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
    }

    // builds the exception reporting the three (possibly null) hints with the given message pattern
    private static RuntimeException invalidEncoding(String pattern, String bomEnc,
        String xmlGuessEnc, String xmlEnc)
    {
        return new RuntimeException(MessageFormat.format(pattern,
            new Object[] {bomEnc, xmlGuessEnc, xmlEnc}));
    }
}