/*
* Copyright (c) 2006 Henri Sivonen
* Copyright (c) 2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.htmlparser.io;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderMalfunctionError;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
/**
 * A character encoding backed by a {@link Charset} of the running JVM,
 * together with classification flags computed once at class-initialization
 * time.
 *
 * <p>Instances are created only by the static initializer, one per
 * non-banned charset reported by {@link Charset#availableCharsets()}. They
 * are looked up with {@link #forName(String)}, which matches labels after
 * normalizing them with {@link #toNameKey(String)} (ASCII case-insensitive,
 * ASCII punctuation and whitespace ignored).
 */
public class Encoding {

    public static final Encoding UTF8;

    public static final Encoding UTF16;

    public static final Encoding UTF16LE;

    public static final Encoding UTF16BE;

    public static final Encoding WINDOWS1252;

    /** Number of bytes probed by the ASCII-superset check (0x00..0x7E). */
    private static final int ASCII_PROBE_LENGTH = 0x7F;

    /*
     * The three tables below hold name keys (see toNameKey). They are hash
     * sets rather than arrays searched with Arrays.binarySearch so that
     * membership tests do not depend on the literal order of the entries.
     */

    /** Name keys for which {@link #isShouldNot()} reports {@code true}. */
    private static final Set<String> SHOULD_NOT = nameSet("jisx02121990",
            "xjis0208");

    /** Name keys of charsets for which no {@code Encoding} is created. */
    private static final Set<String> BANNED = nameSet("bocu1", "cesu8",
            "compoundtext", "iscii91", "macarabic", "maccentraleurroman",
            "maccroatian", "maccyrillic", "macdevanagari", "macfarsi",
            "macgreek", "macgujarati", "macgurmukhi", "machebrew",
            "macicelandic", "macroman", "macromanian", "macthai",
            "macturkish", "macukranian", "scsu", "utf32", "utf32be",
            "utf32le", "utf7", "ximapmailboxname", "xjisautodetect",
            "xutf16bebom", "xutf16lebom", "xutf16oppositeendian",
            "xutf16platformendian", "xutf32bebom", "xutf32lebom",
            "xutf32oppositeendian", "xutf32platformendian");

    /** Name keys for which {@link #isObscure()} reports {@code false}. */
    private static final Set<String> NOT_OBSCURE = nameSet("big5",
            "big5hkscs", "eucjp", "euckr", "gb18030", "gbk", "iso2022jp",
            "iso2022kr", "iso88591", "iso885913", "iso885915", "iso88592",
            "iso88593", "iso88594", "iso88595", "iso88596", "iso88597",
            "iso88598", "iso88599", "koi8r", "shiftjis", "tis620", "usascii",
            "utf16", "utf16be", "utf16le", "utf8", "windows1250",
            "windows1251", "windows1252", "windows1253", "windows1254",
            "windows1255", "windows1256", "windows1257", "windows1258");

    /** Lookup table from name key (see {@link #toNameKey(String)}) to encoding. */
    private static final Map<String, Encoding> encodingByCookedName = new HashMap<String, Encoding>();

    private final String canonName;

    private final Charset charset;

    private final boolean asciiSuperset;

    private final boolean obscure;

    private final boolean shouldNot;

    private final boolean likelyEbcdic;

    /** Replacement encoding, or {@code null} if none was registered. */
    private Encoding actualHtmlEncoding = null;

    static {
        // Probe buffer: byte i holds i when that code point is checked by the
        // ASCII-superset test and a space (0x20) otherwise.
        byte[] testBuf = new byte[ASCII_PROBE_LENGTH];
        for (int i = 0; i < testBuf.length; i++) {
            testBuf[i] = isAsciiSupersetnessSensitive(i) ? (byte) i
                    : (byte) 0x20;
        }

        Set<Encoding> encodings = new HashSet<Encoding>();
        SortedMap<String, Charset> charsets = Charset.availableCharsets();
        for (Charset cs : charsets.values()) {
            String name = toNameKey(cs.name());
            if (isBanned(name)) {
                continue;
            }
            String canonName = toAsciiLowerCase(cs.name());
            boolean asciiSuperset = asciiMapsToBasicLatin(testBuf, cs);
            Encoding enc = new Encoding(canonName.intern(), cs,
                    asciiSuperset, isObscure(name), isShouldNot(name),
                    isLikelyEbcdic(name, asciiSuperset));
            encodings.add(enc);
            for (String alias : cs.aliases()) {
                encodingByCookedName.put(toNameKey(alias).intern(), enc);
            }
        }
        // Overwrite possible overlapping aliases with the real things--just in
        // case
        for (Encoding encoding : encodings) {
            encodingByCookedName.put(toNameKey(encoding.getCanonName()),
                    encoding);
        }

        // These five are required; a missing charset aborts class
        // initialization with UnsupportedCharsetException.
        UTF8 = forName("utf-8");
        UTF16 = forName("utf-16");
        UTF16BE = forName("utf-16be");
        UTF16LE = forName("utf-16le");
        WINDOWS1252 = forName("windows-1252");

        // Best-effort overrides: each one is skipped when either charset is
        // not available in this JVM.
        overrideActualHtmlEncoding("iso-8859-1", "windows-1252");
        overrideActualHtmlEncoding("iso-8859-9", "windows-1254");
        overrideActualHtmlEncoding("iso-8859-11", "windows-874");
        overrideActualHtmlEncoding("x-iso-8859-11", "windows-874");
        overrideActualHtmlEncoding("tis-620", "windows-874");
        overrideActualHtmlEncoding("gb_2312-80", "gbk");
        overrideActualHtmlEncoding("gb2312", "gbk");

        // The three mappings below used to be inserted into the lookup table
        // under their raw labels ("x-x-big5", "euc-kr", "ks_c_5601-1987").
        // forName() only ever queries normalized keys, so those entries could
        // never be found. "x-x-big5" is now registered under its normalized
        // key. The windows-949 mappings are expressed as actual-encoding
        // overrides, like the ones above, so that forName() keeps returning
        // the same Encoding (canonical name and flags) for those labels.
        registerAlias("x-x-big5", "big5");
        overrideActualHtmlEncoding("euc-kr", "windows-949");
        overrideActualHtmlEncoding("ks_c_5601-1987", "windows-949");
    }

    /**
     * Builds a set from the given name keys.
     *
     * @param names the name keys
     * @return a new set containing exactly {@code names}
     */
    private static Set<String> nameSet(String... names) {
        return new HashSet<String>(Arrays.asList(names));
    }

    /**
     * Sets the actual HTML encoding of the encoding labelled
     * {@code declared} to the encoding labelled {@code actual}. Does nothing
     * if either label is unknown.
     *
     * @param declared label of the encoding to modify
     * @param actual label of the replacement encoding
     */
    private static void overrideActualHtmlEncoding(String declared,
            String actual) {
        try {
            forName(declared).actualHtmlEncoding = forName(actual);
        } catch (UnsupportedCharsetException ignored) {
            // Deliberately ignored: the override only applies when both
            // charsets exist in this JVM.
        }
    }

    /**
     * Makes {@code alias} resolve to the encoding labelled {@code target} in
     * {@link #forName(String)}. Does nothing if {@code target} is unknown.
     *
     * @param alias the additional label
     * @param target label of an existing encoding
     */
    private static void registerAlias(String alias, String target) {
        try {
            // The key must be normalized because forName() normalizes its
            // argument before the lookup.
            encodingByCookedName.put(toNameKey(alias), forName(target));
        } catch (UnsupportedCharsetException ignored) {
            // Deliberately ignored: no alias without a target charset.
        }
    }

    /**
     * Tells whether code point {@code c} takes part in the ASCII-superset
     * probe: 0x09-0x0D, 0x20-0x22, 0x26-0x27, 0x2C-0x3F, A-Z and a-z.
     *
     * @param c the code point
     * @return {@code true} if {@code c} is in one of the probed ranges
     */
    private static boolean isAsciiSupersetnessSensitive(int c) {
        return (c >= 0x09 && c <= 0x0D) || (c >= 0x20 && c <= 0x22)
                || (c >= 0x26 && c <= 0x27) || (c >= 0x2C && c <= 0x3F)
                || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A);
    }

    /**
     * @param nameKey a name key as produced by {@link #toNameKey(String)}
     * @return {@code true} unless the key is listed as not obscure
     */
    private static boolean isObscure(String nameKey) {
        return !NOT_OBSCURE.contains(nameKey);
    }

    /**
     * @param nameKey a name key as produced by {@link #toNameKey(String)}
     * @return {@code true} if the key starts with {@code "xibm"} or is
     *         listed as banned
     */
    private static boolean isBanned(String nameKey) {
        return nameKey.startsWith("xibm") || BANNED.contains(nameKey);
    }

    /**
     * @param nameKey a name key as produced by {@link #toNameKey(String)}
     * @return {@code true} if the key is listed in the should-not table
     */
    private static boolean isShouldNot(String nameKey) {
        return SHOULD_NOT.contains(nameKey);
    }

    /**
     * Decodes {@code testBuf} with {@code cs} and checks that every byte
     * decodes to the character with the same numeric value.
     *
     * @param testBuf the probe bytes; each is expected to decode to one char
     * @param cs the charset under test
     * @return {@code true} if all bytes decoded to themselves; {@code false}
     *         on any mismatch or if decoding failed
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        CharsetDecoder dec = cs.newDecoder();
        dec.onMalformedInput(CodingErrorAction.REPORT);
        dec.onUnmappableCharacter(CodingErrorAction.REPORT);
        Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf),
                dec);
        try {
            for (int i = 0; i < testBuf.length; i++) {
                if (r.read() != (testBuf[i] & 0xFF)) {
                    return false;
                }
            }
            return true;
        } catch (Exception e) {
            // Broad on purpose: any failure of the decoder (I/O or runtime)
            // counts as "not an ASCII superset".
            return false;
        } catch (CoderMalfunctionError e) {
            return false;
        }
    }

    /**
     * @param canonName the name to inspect (the static initializer passes
     *        the name key)
     * @param asciiSuperset result of the ASCII-superset probe
     * @return {@code true} if the charset is not an ASCII superset and the
     *         name starts with {@code "cp"}, {@code "ibm"} or {@code "xibm"}
     */
    private static boolean isLikelyEbcdic(String canonName,
            boolean asciiSuperset) {
        if (asciiSuperset) {
            return false;
        }
        return canonName.startsWith("cp") || canonName.startsWith("ibm")
                || canonName.startsWith("xibm");
    }

    /**
     * Looks up an encoding by label. The label is normalized with
     * {@link #toNameKey(String)} before the lookup.
     *
     * @param name the encoding label
     * @return the matching encoding, never {@code null}
     * @throws UnsupportedCharsetException if no encoding is registered for
     *         the label
     */
    public static Encoding forName(String name) {
        Encoding rv = encodingByCookedName.get(toNameKey(name));
        if (rv == null) {
            throw new UnsupportedCharsetException(name);
        }
        return rv;
    }

    /**
     * Normalizes an encoding label into a lookup key: ASCII upper-case
     * letters are lower-cased, and ASCII whitespace (U+0009-U+000D, U+0020)
     * and ASCII punctuation (U+0021-U+002F, U+003A-U+0040, U+005B-U+0060,
     * U+007B-U+007E) are removed. All other characters are kept.
     *
     * @param str the label, may be {@code null}
     * @return the key, or {@code null} if {@code str} is {@code null}
     */
    public static String toNameKey(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = new char[str.length()];
        int len = 0;
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
            }
            if (!isIgnoredInNameKey(c)) {
                buf[len++] = c;
            }
        }
        return new String(buf, 0, len);
    }

    /**
     * @param c the character (already lower-cased)
     * @return {@code true} if {@link #toNameKey(String)} drops {@code c}
     */
    private static boolean isIgnoredInNameKey(char c) {
        return (c >= '\t' && c <= '\r') || (c >= '\u0020' && c <= '\u002F')
                || (c >= '\u003A' && c <= '\u0040')
                || (c >= '\u005B' && c <= '\u0060')
                || (c >= '\u007B' && c <= '\u007E');
    }

    /**
     * Lower-cases the ASCII letters A-Z of a string; all other characters
     * are left unchanged.
     *
     * @param str the string, may be {@code null}
     * @return the converted string, or {@code null} if {@code str} is
     *         {@code null}
     */
    public static String toAsciiLowerCase(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = str.toCharArray();
        for (int i = 0; i < buf.length; i++) {
            if (buf[i] >= 'A' && buf[i] <= 'Z') {
                buf[i] += 0x20;
            }
        }
        return new String(buf);
    }

    /**
     * @param canonName the ASCII-lower-cased canonical charset name
     * @param charset the backing charset
     * @param asciiSuperset whether the ASCII-superset probe succeeded
     * @param obscure whether the encoding is considered obscure
     * @param shouldNot whether the encoding is on the should-not list
     * @param likelyEbcdic whether the encoding is likely EBCDIC
     */
    private Encoding(final String canonName, final Charset charset,
            final boolean asciiSuperset, final boolean obscure,
            final boolean shouldNot, final boolean likelyEbcdic) {
        this.canonName = canonName;
        this.charset = charset;
        this.asciiSuperset = asciiSuperset;
        this.obscure = obscure;
        this.shouldNot = shouldNot;
        this.likelyEbcdic = likelyEbcdic;
    }

    /**
     * Returns whether the probed ASCII bytes decode to the same-valued
     * characters in this encoding.
     *
     * @return the asciiSuperset flag
     */
    public boolean isAsciiSuperset() {
        return asciiSuperset;
    }

    /**
     * Returns the canonical charset name with ASCII letters lower-cased.
     *
     * @return the canonName
     */
    public String getCanonName() {
        return canonName;
    }

    /**
     * Returns whether this encoding is not an ASCII superset and has a name
     * starting with "cp", "ibm" or "xibm".
     *
     * @return the likelyEbcdic flag
     */
    public boolean isLikelyEbcdic() {
        return likelyEbcdic;
    }

    /**
     * Returns whether this encoding is absent from the not-obscure list.
     *
     * @return the obscure flag
     */
    public boolean isObscure() {
        return obscure;
    }

    /**
     * Returns whether this encoding is on the should-not list.
     *
     * @return the shouldNot flag
     */
    public boolean isShouldNot() {
        return shouldNot;
    }

    /**
     * Returns whether the canonical name lacks the {@code "x-"} prefix.
     *
     * @return {@code false} if the canonical name starts with {@code "x-"}
     */
    public boolean isRegistered() {
        return !canonName.startsWith("x-");
    }

    /**
     * @return whether the backing charset supports encoding
     * @see java.nio.charset.Charset#canEncode()
     */
    public boolean canEncode() {
        return charset.canEncode();
    }

    /**
     * @return a new decoder for the backing charset
     * @see java.nio.charset.Charset#newDecoder()
     */
    public CharsetDecoder newDecoder() {
        return charset.newDecoder();
    }

    /**
     * @return a new encoder for the backing charset
     * @see java.nio.charset.Charset#newEncoder()
     */
    public CharsetEncoder newEncoder() {
        return charset.newEncoder();
    }

    /**
     * Returns the replacement encoding registered for this encoding.
     *
     * @return the actualHtmlEncoding, or {@code null} if none was registered
     */
    public Encoding getActualHtmlEncoding() {
        return actualHtmlEncoding;
    }

    /**
     * Prints every registered name key with the flags of its encoding.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        for (Map.Entry<String, Encoding> entry : encodingByCookedName.entrySet()) {
            String name = entry.getKey();
            Encoding enc = entry.getValue();
            System.out.printf(
                    "%21s: canon %21s, obs %5s, reg %5s, asc %5s, ebc %5s\n",
                    name, enc.getCanonName(), enc.isObscure(),
                    enc.isRegistered(), enc.isAsciiSuperset(),
                    enc.isLikelyEbcdic());
        }
    }
}