/* * This file is part of muCommander, http://www.mucommander.com * Copyright (C) 2002-2016 Maxence Bernard * * muCommander is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * muCommander is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.mucommander.commons.io.bom; import java.nio.charset.Charset; /** * BOM represents a Byte-Order Mark, a byte sequence that can be found at the beginning of a Unicode text stream * which indicates the encoding of the text that follows. * * @see BOMInputStream * @author Maxence Bernard */ public class BOM { /** the byte sequence that identifies this BOM */ private byte[] sig; /** the character encoding denoted by this BOM */ private String encoding; /** character encoding aliases that map onto this BOM */ private String aliases[]; /** * Creates a new <code>BOM</code> instance identified by the given signature and denoting the specified * character encoding. * * @param signature the byte sequence that identifies this BOM * @param encoding the character encoding denoted by this BOM * @param aliases character encoding aliases */ BOM(byte signature[], String encoding, String[] aliases) { this.sig = signature; this.encoding = encoding; this.aliases = aliases; } /** * Returns the byte sequence that identifies this BOM at the beginning of a byte stream. * * @return the byte sequence that identifies this BOM at the beginning of a byte stream */ public byte[] getSignature() { return sig; } /** * Returns the character encoding that this BOM denotes. * * @return the character encoding that this BOM denotes */ public String getEncoding() { return encoding; } /** * Returns a set of character encoding aliases that map onto this BOM. * * @return a set of character encoding aliases that map onto this BOM */ public String[] getAliases() { return aliases; } /** * Returns <code>true</code> if this BOM's signature starts with the given byte sequence. * * @param bytes the byte sequence to compare against this BOM's signature * @return true if this BOM's signature starts with the given byte sequence */ public boolean sigStartsWith(byte bytes[]) { int bytesLen = bytes.length; if(bytesLen>sig.length) return false; for(int i=0; i<bytesLen; i++) { if(bytes[i]!= sig[i]) return false; } return true; } /** * Returns <code>true</code> if this BOM's signature matches the given byte sequence. * * @param bytes the byte sequence to compare against this BOM's signature * @return true if this BOM's signature matches the given byte sequence */ public boolean sigEquals(byte bytes[]) { return bytes.length==sig.length && sigStartsWith(bytes); } //////////////////// // Static methods // //////////////////// /** * Returns a {@link BOM} instance for the specified encoding, <code>null</code> if the encoding doesn't * have a corresponding BOM (non-Unicode encoding). The search is case-insensitive. * * <p>All UTF encoding aliases are supported, in a BOM-neutral way: a BOM is always returned, regardless of * whether the particular encoding requires a BOM to be used or not. For instance, * <code>UTF-16LE</code> and <code>UnicodeLittleUnmarked</code> will both return the {@link BOMConstants#UTF16_LE_BOM} * BOM, even though by specification <code>UTF-16LE</code> and <code>UnicodeLittleUnmarked</code> should not * include a BOM in the data stream. Furthermore, when called with <code>UTF-16</code> and <code>UTF-32</code>, * the returned BOM will arbitrarily default to big endian and return {@link BOMConstants#UTF16_BE_BOM} and * {@link BOMConstants#UTF32_BE_BOM} respectively. * * @param encoding name of a character encoding * @return a {@link BOM} instance for the specified encoding, <code>null</code> if the encoding doesn't * have a corresponding BOM (non-Unicode encoding). */ public static BOM getInstance(String encoding) { if(!Charset.isSupported(encoding)) return null; Charset charset = Charset.forName(encoding); // Retrieve the charset's canonical name for aliases we may not know about encoding = charset.name(); String[] aliases; for(int i=0; i<BOMConstants.SUPPORTED_BOMS.length; i++) { if(BOMConstants.SUPPORTED_BOMS[i].getEncoding().equalsIgnoreCase(encoding)) return BOMConstants.SUPPORTED_BOMS[i]; aliases = BOMConstants.SUPPORTED_BOMS[i].getAliases(); for (String alias : aliases) if (alias.equalsIgnoreCase(encoding)) return BOMConstants.SUPPORTED_BOMS[i]; } return null; } //////////////////////// // Overridden methods // //////////////////////// /** * Returns <code>true</code> if and only if the given Object is a <code>BOM</code> instance with the same * signature as this instance. * * * @param o the Object to test for equality * @return true if the specified Object is a BOM instance with the same signature as this instance */ public boolean equals(Object o) { return (o instanceof BOM) && ((BOM)o).sigEquals(sig); } /** * Returns a String representation of this <code>BOM</code>. * * @return returns a String representation of this <code>BOM</code>. */ public String toString() { StringBuilder out; out = new StringBuilder(super.toString()); out.append(", signature="); for(int i=0; i < sig.length; i++) { out.append(0xFF&sig[i]); out.append((i==sig.length-1?"}":", ")); } out.append(", encoding="); out.append(encoding); return out.toString(); } }