/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
 * as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
 * Standard</a>, provides access to each encoding defined in the Encoding
 * Standard via a static constant and provides the
 * "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
 * encoding</a>" algorithm defined in the Encoding Standard.
 *
 * <p>This class inherits from {@link Charset} to allow the Encoding
 * Standard-compliant encodings to be used in contexts that support
 * <code>Charset</code> instances. However, by design, the Encoding
 * Standard-compliant encodings are not supplied via a {@link CharsetProvider}
 * and, therefore, are not available via and do not interfere with the static
 * methods provided by <code>Charset</code>. (This class provides methods of
 * the same name to hide each static method of <code>Charset</code> to help
 * avoid accidental calls to the static methods of the superclass when working
 * with Encoding Standard-compliant encodings.)
 *
 * <p>When an application needs to use a particular encoding, such as utf-8
 * or windows-1252, the corresponding constant, i.e.
 * {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
 * respectively, should be used. However, when the application receives an
 * encoding label from external input, the method {@link #forName(String)
 * forName()} should be used to obtain the object representing the encoding
 * identified by the label. In contexts where labels that map to the
 * <a href="https://encoding.spec.whatwg.org/#replacement">replacement
 * encoding</a> should be treated as unknown, the method {@link
 * #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
 *
 *
 * @author hsivonen
 */
public abstract class Encoding extends Charset {

    /**
     * All known labels in normalized (lower-case, trimmed) form.
     *
     * <p>This array is searched with {@link Arrays#binarySearch(Object[], Object)},
     * so it MUST remain sorted in natural <code>String</code> order, and it
     * MUST stay index-aligned with {@link #ENCODINGS_FOR_LABELS}.
     */
    private static final String[] LABELS = {
        "866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708",
        "big5", "big5-hkscs", "chinese", "cn-big5",
        "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
        "cp1255", "cp1256", "cp1257", "cp1258",
        "cp819", "cp866", "csbig5", "cseuckr", "cseucpkdfmtjapanese",
        "csgb2312", "csibm866", "csiso2022jp", "csiso2022kr",
        "csiso58gb231280", "csiso88596e", "csiso88596i",
        "csiso88598e", "csiso88598i",
        "csisolatin1", "csisolatin2", "csisolatin3", "csisolatin4",
        "csisolatin5", "csisolatin6", "csisolatin9",
        "csisolatinarabic", "csisolatincyrillic", "csisolatingreek",
        "csisolatinhebrew", "cskoi8r", "csksc56011987", "csmacintosh",
        "csshiftjis", "cyrillic", "dos-874", "ecma-114", "ecma-118",
        "elot_928", "euc-jp", "euc-kr",
        "gb18030", "gb2312", "gb_2312", "gb_2312-80", "gbk",
        "greek", "greek8", "hebrew", "hz-gb-2312", "ibm819", "ibm866",
        "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-jp", "iso-2022-kr",
        "iso-8859-1", "iso-8859-10", "iso-8859-11", "iso-8859-13",
        "iso-8859-14", "iso-8859-15", "iso-8859-16",
        "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5",
        "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-8859-7",
        "iso-8859-8", "iso-8859-8-e", "iso-8859-8-i", "iso-8859-9",
        "iso-ir-100", "iso-ir-101", "iso-ir-109", "iso-ir-110",
        "iso-ir-126", "iso-ir-127", "iso-ir-138", "iso-ir-144",
        "iso-ir-148", "iso-ir-149", "iso-ir-157", "iso-ir-58",
        "iso8859-1", "iso8859-10", "iso8859-11", "iso8859-13",
        "iso8859-14", "iso8859-15",
        "iso8859-2", "iso8859-3", "iso8859-4", "iso8859-5",
        "iso8859-6", "iso8859-7", "iso8859-8", "iso8859-9",
        "iso88591", "iso885910", "iso885911", "iso885913",
        "iso885914", "iso885915",
        "iso88592", "iso88593", "iso88594", "iso88595",
        "iso88596", "iso88597", "iso88598", "iso88599",
        "iso_8859-1", "iso_8859-15", "iso_8859-1:1987",
        "iso_8859-2", "iso_8859-2:1987",
        "iso_8859-3", "iso_8859-3:1988",
        "iso_8859-4", "iso_8859-4:1988",
        "iso_8859-5", "iso_8859-5:1988",
        "iso_8859-6", "iso_8859-6:1987",
        "iso_8859-7", "iso_8859-7:1987",
        "iso_8859-8", "iso_8859-8:1988",
        "iso_8859-9", "iso_8859-9:1989",
        "koi", "koi8", "koi8-r", "koi8-ru", "koi8-u", "koi8_r",
        "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601",
        "l1", "l2", "l3", "l4", "l5", "l6", "l9",
        "latin1", "latin2", "latin3", "latin4", "latin5", "latin6",
        "logical", "mac", "macintosh", "ms932", "ms_kanji",
        "shift-jis", "shift_jis", "sjis", "sun_eu_greek", "tis-620",
        "unicode-1-1-utf-8", "us-ascii",
        "utf-16", "utf-16be", "utf-16le", "utf-8", "utf8",
        "visual",
        "windows-1250", "windows-1251", "windows-1252", "windows-1253",
        "windows-1254", "windows-1255", "windows-1256", "windows-1257",
        "windows-1258", "windows-31j", "windows-874", "windows-949",
        "x-cp1250", "x-cp1251", "x-cp1252", "x-cp1253", "x-cp1254",
        "x-cp1255", "x-cp1256", "x-cp1257", "x-cp1258",
        "x-euc-jp", "x-gbk", "x-mac-cyrillic", "x-mac-roman",
        "x-mac-ukrainian", "x-sjis", "x-user-defined", "x-x-big5",
    };

    /**
     * The encoding for each label: the element at index <i>i</i> is the
     * encoding identified by <code>LABELS[i]</code>. Each row below
     * corresponds to the same row of {@link #LABELS}.
     */
    private static final Encoding[] ENCODINGS_FOR_LABELS = {
        Ibm866.INSTANCE, Windows1252.INSTANCE, Iso6.INSTANCE, Windows1252.INSTANCE, Iso6.INSTANCE,
        Big5.INSTANCE, Big5.INSTANCE, Gbk.INSTANCE, Big5.INSTANCE,
        Windows1250.INSTANCE, Windows1251.INSTANCE, Windows1252.INSTANCE, Windows1253.INSTANCE, Windows1254.INSTANCE,
        Windows1255.INSTANCE, Windows1256.INSTANCE, Windows1257.INSTANCE, Windows1258.INSTANCE,
        Windows1252.INSTANCE, Ibm866.INSTANCE, Big5.INSTANCE, EucKr.INSTANCE, EucJp.INSTANCE,
        Gbk.INSTANCE, Ibm866.INSTANCE, Iso2022Jp.INSTANCE, Replacement.INSTANCE,
        Gbk.INSTANCE, Iso6.INSTANCE, Iso6.INSTANCE,
        Iso8.INSTANCE, Iso8I.INSTANCE,
        Windows1252.INSTANCE, Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE,
        Windows1254.INSTANCE, Iso10.INSTANCE, Iso15.INSTANCE,
        Iso6.INSTANCE, Iso5.INSTANCE, Iso7.INSTANCE,
        Iso8.INSTANCE, Koi8R.INSTANCE, EucKr.INSTANCE, Macintosh.INSTANCE,
        ShiftJis.INSTANCE, Iso5.INSTANCE, Windows874.INSTANCE, Iso6.INSTANCE, Iso7.INSTANCE,
        Iso7.INSTANCE, EucJp.INSTANCE, EucKr.INSTANCE,
        Gb18030.INSTANCE, Gbk.INSTANCE, Gbk.INSTANCE, Gbk.INSTANCE, Gbk.INSTANCE,
        Iso7.INSTANCE, Iso7.INSTANCE, Iso8.INSTANCE, Replacement.INSTANCE, Windows1252.INSTANCE, Ibm866.INSTANCE,
        Replacement.INSTANCE, Replacement.INSTANCE, Iso2022Jp.INSTANCE, Replacement.INSTANCE,
        Windows1252.INSTANCE, Iso10.INSTANCE, Windows874.INSTANCE, Iso13.INSTANCE,
        Iso14.INSTANCE, Iso15.INSTANCE, Iso16.INSTANCE,
        Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE, Iso5.INSTANCE,
        Iso6.INSTANCE, Iso6.INSTANCE, Iso6.INSTANCE, Iso7.INSTANCE,
        Iso8.INSTANCE, Iso8.INSTANCE, Iso8I.INSTANCE, Windows1254.INSTANCE,
        Windows1252.INSTANCE, Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE,
        Iso7.INSTANCE, Iso6.INSTANCE, Iso8.INSTANCE, Iso5.INSTANCE,
        Windows1254.INSTANCE, EucKr.INSTANCE, Iso10.INSTANCE, Gbk.INSTANCE,
        Windows1252.INSTANCE, Iso10.INSTANCE, Windows874.INSTANCE, Iso13.INSTANCE,
        Iso14.INSTANCE, Iso15.INSTANCE,
        Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE, Iso5.INSTANCE,
        Iso6.INSTANCE, Iso7.INSTANCE, Iso8.INSTANCE, Windows1254.INSTANCE,
        Windows1252.INSTANCE, Iso10.INSTANCE, Windows874.INSTANCE, Iso13.INSTANCE,
        Iso14.INSTANCE, Iso15.INSTANCE,
        Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE, Iso5.INSTANCE,
        Iso6.INSTANCE, Iso7.INSTANCE, Iso8.INSTANCE, Windows1254.INSTANCE,
        Windows1252.INSTANCE, Iso15.INSTANCE, Windows1252.INSTANCE,
        Iso2.INSTANCE, Iso2.INSTANCE,
        Iso3.INSTANCE, Iso3.INSTANCE,
        Iso4.INSTANCE, Iso4.INSTANCE,
        Iso5.INSTANCE, Iso5.INSTANCE,
        Iso6.INSTANCE, Iso6.INSTANCE,
        Iso7.INSTANCE, Iso7.INSTANCE,
        Iso8.INSTANCE, Iso8.INSTANCE,
        Windows1254.INSTANCE, Windows1254.INSTANCE,
        Koi8R.INSTANCE, Koi8R.INSTANCE, Koi8R.INSTANCE, Koi8U.INSTANCE, Koi8U.INSTANCE, Koi8R.INSTANCE,
        EucKr.INSTANCE, EucKr.INSTANCE, EucKr.INSTANCE, EucKr.INSTANCE, EucKr.INSTANCE,
        Windows1252.INSTANCE, Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE, Windows1254.INSTANCE, Iso10.INSTANCE, Iso15.INSTANCE,
        Windows1252.INSTANCE, Iso2.INSTANCE, Iso3.INSTANCE, Iso4.INSTANCE, Windows1254.INSTANCE, Iso10.INSTANCE,
        Iso8I.INSTANCE, Macintosh.INSTANCE, Macintosh.INSTANCE, ShiftJis.INSTANCE, ShiftJis.INSTANCE,
        ShiftJis.INSTANCE, ShiftJis.INSTANCE, ShiftJis.INSTANCE, Iso7.INSTANCE, Windows874.INSTANCE,
        Utf8.INSTANCE, Windows1252.INSTANCE,
        Utf16Le.INSTANCE, Utf16Be.INSTANCE, Utf16Le.INSTANCE, Utf8.INSTANCE, Utf8.INSTANCE,
        Iso8.INSTANCE,
        Windows1250.INSTANCE, Windows1251.INSTANCE, Windows1252.INSTANCE, Windows1253.INSTANCE,
        Windows1254.INSTANCE, Windows1255.INSTANCE, Windows1256.INSTANCE, Windows1257.INSTANCE,
        Windows1258.INSTANCE, ShiftJis.INSTANCE, Windows874.INSTANCE, EucKr.INSTANCE,
        Windows1250.INSTANCE, Windows1251.INSTANCE, Windows1252.INSTANCE, Windows1253.INSTANCE, Windows1254.INSTANCE,
        Windows1255.INSTANCE, Windows1256.INSTANCE, Windows1257.INSTANCE, Windows1258.INSTANCE,
        EucJp.INSTANCE, Gbk.INSTANCE, MacCyrillic.INSTANCE, Macintosh.INSTANCE,
        MacCyrillic.INSTANCE, ShiftJis.INSTANCE, UserDefined.INSTANCE, Big5.INSTANCE,
    };

    /**
     * Every distinct encoding exposed by this class; used to build the map
     * returned by {@link #availableCharsets()}.
     */
    private static final Encoding[] ENCODINGS = {
        Big5.INSTANCE,
        EucJp.INSTANCE,
        EucKr.INSTANCE,
        Gb18030.INSTANCE,
        Gbk.INSTANCE,
        Ibm866.INSTANCE,
        Iso2022Jp.INSTANCE,
        Iso10.INSTANCE,
        Iso13.INSTANCE,
        Iso14.INSTANCE,
        Iso15.INSTANCE,
        Iso16.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Iso8I.INSTANCE,
        Koi8R.INSTANCE,
        Koi8U.INSTANCE,
        Macintosh.INSTANCE,
        Replacement.INSTANCE,
        ShiftJis.INSTANCE,
        Utf16Be.INSTANCE,
        Utf16Le.INSTANCE,
        Utf8.INSTANCE,
        Windows1250.INSTANCE,
        Windows1251.INSTANCE,
        Windows1252.INSTANCE,
        Windows1253.INSTANCE,
        Windows1254.INSTANCE,
        Windows1255.INSTANCE,
        Windows1256.INSTANCE,
        Windows1257.INSTANCE,
        Windows1258.INSTANCE,
        Windows874.INSTANCE,
        MacCyrillic.INSTANCE,
        UserDefined.INSTANCE,
    };

    /**
     * The big5 encoding.
     */
    public static final Encoding BIG5 = Big5.INSTANCE;

    /**
     * The euc-jp encoding.
     */
    public static final Encoding EUC_JP = EucJp.INSTANCE;

    /**
     * The euc-kr encoding.
     */
    public static final Encoding EUC_KR = EucKr.INSTANCE;

    /**
     * The gb18030 encoding.
     */
    public static final Encoding GB18030 = Gb18030.INSTANCE;

    /**
     * The gbk encoding.
     */
    public static final Encoding GBK = Gbk.INSTANCE;

    /**
     * The ibm866 encoding.
     */
    public static final Encoding IBM866 = Ibm866.INSTANCE;

    /**
     * The iso-2022-jp encoding.
     */
    public static final Encoding ISO_2022_JP = Iso2022Jp.INSTANCE;

    /**
     * The iso-8859-10 encoding.
     */
    public static final Encoding ISO_8859_10 = Iso10.INSTANCE;

    /**
     * The iso-8859-13 encoding.
     */
    public static final Encoding ISO_8859_13 = Iso13.INSTANCE;

    /**
     * The iso-8859-14 encoding.
     */
    public static final Encoding ISO_8859_14 = Iso14.INSTANCE;

    /**
     * The iso-8859-15 encoding.
     */
    public static final Encoding ISO_8859_15 = Iso15.INSTANCE;

    /**
     * The iso-8859-16 encoding.
     */
    public static final Encoding ISO_8859_16 = Iso16.INSTANCE;

    /**
     * The iso-8859-2 encoding.
     */
    public static final Encoding ISO_8859_2 = Iso2.INSTANCE;

    /**
     * The iso-8859-3 encoding.
     */
    public static final Encoding ISO_8859_3 = Iso3.INSTANCE;

    /**
     * The iso-8859-4 encoding.
     */
    public static final Encoding ISO_8859_4 = Iso4.INSTANCE;

    /**
     * The iso-8859-5 encoding.
     */
    public static final Encoding ISO_8859_5 = Iso5.INSTANCE;

    /**
     * The iso-8859-6 encoding.
     */
    public static final Encoding ISO_8859_6 = Iso6.INSTANCE;

    /**
     * The iso-8859-7 encoding.
     */
    public static final Encoding ISO_8859_7 = Iso7.INSTANCE;

    /**
     * The iso-8859-8 encoding.
     */
    public static final Encoding ISO_8859_8 = Iso8.INSTANCE;

    /**
     * The iso-8859-8-i encoding.
     */
    public static final Encoding ISO_8859_8_I = Iso8I.INSTANCE;

    /**
     * The koi8-r encoding.
     */
    public static final Encoding KOI8_R = Koi8R.INSTANCE;

    /**
     * The koi8-u encoding.
     */
    public static final Encoding KOI8_U = Koi8U.INSTANCE;

    /**
     * The macintosh encoding.
     */
    public static final Encoding MACINTOSH = Macintosh.INSTANCE;

    /**
     * The replacement encoding.
     */
    public static final Encoding REPLACEMENT = Replacement.INSTANCE;

    /**
     * The shift_jis encoding.
     */
    public static final Encoding SHIFT_JIS = ShiftJis.INSTANCE;

    /**
     * The utf-16be encoding.
     */
    public static final Encoding UTF_16BE = Utf16Be.INSTANCE;

    /**
     * The utf-16le encoding.
     */
    public static final Encoding UTF_16LE = Utf16Le.INSTANCE;

    /**
     * The utf-8 encoding.
     */
    public static final Encoding UTF_8 = Utf8.INSTANCE;

    /**
     * The windows-1250 encoding.
     */
    public static final Encoding WINDOWS_1250 = Windows1250.INSTANCE;

    /**
     * The windows-1251 encoding.
     */
    public static final Encoding WINDOWS_1251 = Windows1251.INSTANCE;

    /**
     * The windows-1252 encoding.
     */
    public static final Encoding WINDOWS_1252 = Windows1252.INSTANCE;

    /**
     * The windows-1253 encoding.
     */
    public static final Encoding WINDOWS_1253 = Windows1253.INSTANCE;

    /**
     * The windows-1254 encoding.
     */
    public static final Encoding WINDOWS_1254 = Windows1254.INSTANCE;

    /**
     * The windows-1255 encoding.
     */
    public static final Encoding WINDOWS_1255 = Windows1255.INSTANCE;

    /**
     * The windows-1256 encoding.
     */
    public static final Encoding WINDOWS_1256 = Windows1256.INSTANCE;

    /**
     * The windows-1257 encoding.
     */
    public static final Encoding WINDOWS_1257 = Windows1257.INSTANCE;

    /**
     * The windows-1258 encoding.
     */
    public static final Encoding WINDOWS_1258 = Windows1258.INSTANCE;

    /**
     * The windows-874 encoding.
     */
    public static final Encoding WINDOWS_874 = Windows874.INSTANCE;

    /**
     * The x-mac-cyrillic encoding.
     */
    public static final Encoding X_MAC_CYRILLIC = MacCyrillic.INSTANCE;

    /**
     * The x-user-defined encoding.
     */
    public static final Encoding X_USER_DEFINED = UserDefined.INSTANCE;

    /**
     * Lazily builds the map returned by {@link #availableCharsets()}.
     *
     * <p>The JVM initializes this nested class on first access to
     * {@link #MAP}, under the class-initialization lock, so the map is built
     * at most once and is safely published to every thread. This replaces an
     * unsynchronized check-then-set on a plain static field, which read the
     * field twice without any happens-before edge.
     */
    private static final class AvailableCharsetsHolder {
        static final SortedMap<String, Charset> MAP = build();

        private AvailableCharsetsHolder() {
            // Not instantiable; exists only to hold MAP.
        }

        private static SortedMap<String, Charset> build() {
            TreeMap<String, Charset> map = new TreeMap<String, Charset>();
            for (Encoding encoding : ENCODINGS) {
                map.put(encoding.name(), encoding);
            }
            return Collections.unmodifiableSortedMap(map);
        }
    }

    /**
     * Passes the canonical name and aliases straight through to the
     * {@link Charset} constructor.
     *
     * @param canonicalName the canonical name of the encoding
     * @param aliases the aliases handed to <code>Charset</code>
     */
    protected Encoding(String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
    }

    /**
     * Scanner position used by the slow path of {@link #forName(String)}:
     * before the label (leading whitespace), inside the label, or after it
     * (trailing whitespace).
     */
    private enum State {
        HEAD, LABEL, TAIL
    }

    /**
     * Returns the encoding identified by the given label.
     *
     * <p>The label is first looked up as-is. If that fails, leading and
     * trailing ASCII whitespace (space, LF, CR, TAB, FF) is skipped, ASCII
     * upper-case letters are lower-cased, and the result is looked up again.
     *
     * @param label the encoding label
     * @return the encoding the label maps to; this may be
     *         {@link #REPLACEMENT}
     * @throws IllegalArgumentException if <code>label</code> is
     *         <code>null</code>
     * @throws IllegalCharsetNameException if <code>label</code> is empty,
     *         contains a character other than ASCII letters, ASCII digits,
     *         <code>- + . : _</code> and ASCII whitespace, starts (after any
     *         leading whitespace) with one of those punctuation characters,
     *         or has whitespace between label characters
     * @throws UnsupportedCharsetException if the label is syntactically
     *         acceptable but does not identify a known encoding
     */
    public static Encoding forName(String label) {
        if (label == null) {
            throw new IllegalArgumentException("Label must not be null.");
        }
        if (label.length() == 0) {
            throw new IllegalCharsetNameException(label);
        }
        // Fast path: the label is already in normalized form.
        int index = Arrays.binarySearch(LABELS, label);
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        // Slow path: trim ASCII whitespace, lower-case and validate.
        StringBuilder normalized = new StringBuilder(label.length());
        State state = State.HEAD;
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if (isAsciiWhitespace(c)) {
                // Whitespace after label characters ends the label; anything
                // other than more whitespace after that is rejected below.
                if (state == State.LABEL) {
                    state = State.TAIL;
                }
                continue;
            }
            if (c >= 'A' && c <= 'Z') {
                c += 0x20; // ASCII lower-case
            }
            boolean alphanumeric = (c >= 'a' && c <= 'z')
                    || (c >= '0' && c <= '9');
            if (!alphanumeric && !isLabelPunctuation(c)) {
                throw new IllegalCharsetNameException(label);
            }
            if (state == State.TAIL) {
                // Label character after trailing whitespace.
                throw new IllegalCharsetNameException(label);
            }
            if (state == State.HEAD) {
                // The first label character must be a letter or digit.
                if (!alphanumeric) {
                    throw new IllegalCharsetNameException(label);
                }
                state = State.LABEL;
            }
            normalized.append(c);
        }
        index = Arrays.binarySearch(LABELS, normalized.toString());
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        throw new UnsupportedCharsetException(label);
    }

    /**
     * Tells whether the character is one that {@link #forName(String)} skips
     * around a label: space, LF, CR, TAB or FF (U+000C).
     */
    private static boolean isAsciiWhitespace(char c) {
        return (c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')
                || (c == '\u000C');
    }

    /**
     * Tells whether the character is one of the non-alphanumeric characters
     * that {@link #forName(String)} accepts inside a label (but not as its
     * first character).
     */
    private static boolean isLabelPunctuation(char c) {
        return (c == '-') || (c == '+') || (c == '.') || (c == ':')
                || (c == '_');
    }

    /**
     * Like {@link #forName(String)}, except that labels which map to
     * {@link #REPLACEMENT} are treated as unknown.
     *
     * @param label the encoding label
     * @return the encoding the label maps to; never {@link #REPLACEMENT}
     * @throws IllegalArgumentException if <code>label</code> is
     *         <code>null</code>
     * @throws IllegalCharsetNameException if <code>forName</code> rejects
     *         the label as malformed
     * @throws UnsupportedCharsetException if the label does not identify a
     *         known encoding or identifies the replacement encoding
     */
    public static Encoding forNameNoReplacement(String label) {
        Encoding encoding = Encoding.forName(label);
        if (encoding == Encoding.REPLACEMENT) {
            throw new UnsupportedCharsetException(label);
        }
        return encoding;
    }

    /**
     * Tells whether {@link #forName(String)} would return an encoding for
     * the given label.
     *
     * <p>Only {@link UnsupportedCharsetException} is translated to
     * <code>false</code>; the other exceptions thrown by
     * <code>forName</code> propagate to the caller.
     *
     * @param label the encoding label
     * @return <code>true</code> if the label identifies an encoding,
     *         <code>false</code> if it is well-formed but unknown
     * @throws IllegalArgumentException if <code>label</code> is
     *         <code>null</code>
     * @throws IllegalCharsetNameException if the label is malformed
     */
    public static boolean isSupported(String label) {
        try {
            Encoding.forName(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    /**
     * Tells whether {@link #forNameNoReplacement(String)} would return an
     * encoding for the given label.
     *
     * <p>Only {@link UnsupportedCharsetException} is translated to
     * <code>false</code>; the other exceptions thrown by
     * <code>forNameNoReplacement</code> propagate to the caller.
     *
     * @param label the encoding label
     * @return <code>true</code> if the label identifies an encoding other
     *         than the replacement encoding, <code>false</code> if it is
     *         well-formed but unknown or maps to the replacement encoding
     * @throws IllegalArgumentException if <code>label</code> is
     *         <code>null</code>
     * @throws IllegalCharsetNameException if the label is malformed
     */
    public static boolean isSupportedNoReplacement(String label) {
        try {
            Encoding.forNameNoReplacement(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    /**
     * Returns an unmodifiable sorted map from the {@link #name() name} of
     * each encoding provided by this class to the encoding itself.
     *
     * <p>The map is built on first call and the same instance is returned
     * afterwards.
     *
     * @return the unmodifiable name-to-encoding map
     */
    public static SortedMap<String, Charset> availableCharsets() {
        return AvailableCharsetsHolder.MAP;
    }

    /**
     * Returns {@link #WINDOWS_1252}.
     *
     * @return the windows-1252 encoding
     */
    public static Encoding defaultCharset() {
        return WINDOWS_1252;
    }

    /**
     * Always returns <code>false</code>; see {@link #newEncoder()}.
     */
    @Override public boolean canEncode() {
        return false;
    }

    /**
     * Always returns <code>false</code>.
     *
     * <p>NOTE(review): the documentation of {@link Charset#contains(Charset)}
     * states that every charset contains itself, yet this returns
     * <code>false</code> even for <code>this</code>; confirm that this is
     * intentional before relying on it.
     */
    @Override public boolean contains(Charset cs) {
        return false;
    }

    /**
     * Encoding is not implemented by this base class.
     *
     * @throws UnsupportedOperationException always
     */
    @Override public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException("Encoder not implemented.");
    }
}