/*******************************************************************************
* Copyright (c) 2009, 2010 Progress Software Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
******************************************************************************/
// Copyright (c) 2009 Progress Software Corporation.
package org.fusesource.tools.core.message.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
public class EncodingReader {
public static final String UTF_8 = "UTF-8";
public static final String UTF_16 = "UTF-16";
public static final String UTF_16BE = "UTF-16BE";
public static final String UTF_16LE = "UTF-16LE";
public static final String UCS_4 = "UCS-4";
public static final String UCS_4BE = "UCS-4BE";
public static final String UCS_4LE = "UCS-4LE";
public static final String UCS_4_3412 = "UCS-4-3412";
public static final String UCS_4_2143 = "UCS-4-2143";
public static final int UTF_8_BOM_LENGTH = 3;
private static final String LATIN_1 = "8859_1";
private static int FROM_BOM = 1;
private static int FROM_DECL = 2;
private static int FROM_SYS = 3;
private static EncodingReader instance = new EncodingReader();
public static EncodingReader getInstance() {
return instance;
}
public EncodingInfo readCharset(byte[] bytes) {
return readCharset(bytes, true);
}
/**
*
* @param bytes
* @param doDefault
* if true, returns the default charset of the platform if the charset could not be
* determined from the content
* @return
*/
public EncodingInfo readCharset(byte[] bytes, boolean doDefault) {
int len = Math.min(bytes.length, 4);
int first4[] = new int[4];
for (int i = 0; i < len; i++) {
first4[i] = bytes[i] & 0xFF;
}
EncodingInfo charset = checkfirst4bytes(first4, len);
if (charset != null) {
return charset;
}
if (startsWithDecalaration(first4)) {
try {
charset = findEncodingDeclaration(new String(bytes, 0, Math.min(bytes.length, 1024), LATIN_1));
if (charset != null) {
return charset;
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
return doDefault ? defaultEncoding() : null;
}
public EncodingInfo readCharset(InputStream stream) throws IOException {
return readCharset(stream, true);
}
/**
* If the stream supports <b>mark/reset</b>, the stream is reset to where it was before the
* method invocation.
*
* @param stream
* stream is not closed by the EncodingReader
* @param doDefault
* if true, returns the default charset of the platform if the charset could not be
* determined from the content
* @throws IOException
*/
public EncodingInfo readCharset(InputStream stream, boolean doDefault) throws IOException {
boolean mark = stream.markSupported();
if (mark) {
stream.mark(1024);
}
try {
int first4[] = new int[4];
int len = 0;
for (int i = 0; i < first4.length; i++) {
int r = stream.read();
if (r == -1) {
break;
}
len++;
first4[i] = r;
}
EncodingInfo charset = checkfirst4bytes(first4, len);
if (charset != null) {
return charset;
}
if (startsWithDecalaration(first4)) {
try {
byte[] bytes = new byte[1024];
for (int i = 0; i < len; i++) {
bytes[i] = ((byte) first4[i]);
}
int i = stream.read(bytes, len, bytes.length - len);
if (i == -1) {
return defaultEncoding();
}
charset = findEncodingDeclaration(new String(bytes, 0, i, LATIN_1));
if (charset != null) {
return charset;
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
return doDefault ? defaultEncoding() : null;
} finally {
try {
if (mark) {
stream.reset();
}
} catch (IOException e) {
}
}
}
private EncodingInfo checkfirst4bytes(int[] bytes, int length) {
if (length < 3) {
return null;
}
int byte1 = bytes[0];
int byte2 = bytes[1];
int byte3 = bytes[2];
if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
return bomEncoding(UTF_8);
}
if (length < 4) {
return null;
}
int byte4 = bytes[3];
if (byte1 == 0xFE && byte2 == 0xFF) {
if (byte3 == 0x00 && byte4 == 0x00) {
return bomEncoding(UCS_4_3412);
} else if (byte3 != 0x00 || byte4 != 0x00) {
return bomEncoding(UTF_16);
}
} else if (byte1 == 0xFF && byte2 == 0xFE) {
if (byte3 == 0x00 && byte4 == 0x00) {
return bomEncoding(UCS_4);
} else if (byte3 != 0x00 || byte4 != 0x00) {
return bomEncoding(UTF_16);
}
} else if (byte1 == 0x00 && byte2 == 0x00) {
if (byte3 == 0xFE && byte4 == 0xFF) {
return bomEncoding(UCS_4);
} else if (byte3 == 0xFF && byte4 != 0xFE) {
return bomEncoding(UCS_4_2143);
}
}
// no BOM present; try to guess from the way version declaration is encoded
// this does not comply fully with the suggestions in xml spec (which are non-normative
// btw.)
if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0x00 && byte4 == '<') {
return declEncoding(UCS_4BE);
}
if (byte1 == '<' && byte2 == 0x00 && byte3 == 0x00 && byte4 == 0x00) {
return declEncoding(UCS_4LE);
}
if (byte1 == 0x00 && byte2 == 0x00 && byte3 == '<' && byte4 == 0x00) {
return declEncoding(UCS_4_2143);
}
if (byte1 == 0x00 && byte2 == '<' && byte3 == 0x00 && byte4 == 0x00) {
return declEncoding(UCS_4_3412);
}
if (byte1 == 0x00 && byte2 == '<' && byte3 == 0x00 && byte4 == '?') {
return declEncoding(UTF_16BE);
} else if (byte1 == '<' && byte2 == 0x00 && byte3 == '?' && byte4 == 0x00) {
return declEncoding(UTF_16LE);
}
return null;
}
private boolean startsWithDecalaration(int[] first4) {
return first4[0] == '<' && first4[1] == '?' && first4[2] == 'x' && first4[3] == 'm';
}
private EncodingInfo findEncodingDeclaration(String declaration) {
String encoding = "encoding";
int position = declaration.indexOf(encoding) + encoding.length();
if (position == -1) {
return null;
}
char c = 0;
// get rid of white space before equals sign
while (position < declaration.length()) {
c = declaration.charAt(position++);
if (!Character.isSpace(c)) {
break;
}
}
if (c != '=') { // malformed
return null;
}
// get rid of white space after equals sign
while (position < declaration.length()) {
c = declaration.charAt(position++);
if (!Character.isSpace(c)) {
break;
}
}
char delimiter = c;
if (delimiter != '\'' && delimiter != '"') { // malformed
return null;
}
// now positioned to read encoding name
StringBuffer encodingName = new StringBuffer();
while (position < declaration.length()) {
c = declaration.charAt(position++);
if (c == delimiter) {
break;
}
encodingName.append(c);
}
if (c != delimiter) {
return null;
}
return declEncoding(encodingName.toString());
}
public static void main(String[] args) throws IOException {
EncodingInfo s = getInstance().readCharset(new byte[0]);
System.out.println("s = " + s);
}
private static EncodingInfo bomEncoding(String name) {
return new EncodingInfo(name, FROM_BOM);
}
private static EncodingInfo declEncoding(String name) {
return new EncodingInfo(name, FROM_DECL);
}
private static EncodingInfo defaultEncoding() {
return new EncodingInfo(System.getProperty("file.encoding"), FROM_SYS);
}
public static class EncodingInfo {
private String encoding;
private int type;
private EncodingInfo(String encoding, int type) {
this.encoding = encoding;
this.type = type;
}
public String name() {
return encoding;
}
public boolean isFromBOM() {
return type == FROM_BOM;
}
public boolean isFromDeclaration() {
return type == FROM_DECL;
}
public boolean isSystemDefault() {
return type == FROM_SYS;
}
@Override
public String toString() {
return encoding;
}
}
}