/*
* This file is part of muCommander, http://www.mucommander.com
* Copyright (C) 2002-2016 Maxence Bernard
*
* muCommander is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* muCommander is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.mucommander.commons.io;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
/**
* This class attempts to guess the encoding in which an array of bytes is encoded. Detecting an encoding is by no means
* an accurate operation, as it relies on heuristics that are imprecise by nature. However, accuracy improves with the
* quantity of bytes that is supplied: a small amount of data (say 10 bytes) has little chance of being guessed
* correctly, whereas a larger amount of data (say 1000 bytes) is likely to provide a good result. On the other hand,
* providing a very large amount of data will only marginally improve the accuracy, and is not worth the extra effort
* considering that encoding detection is a costly operation which involves many comparisons per byte.
* The {@link #MAX_RECOMMENDED_BYTE_SIZE} field controls that threshold: if a supplied byte array is larger than this
* value, the additional bytes will not be processed by the <code>detectEncoding</code> methods. Therefore, this value
* should be taken into account if bytes are to be fetched specifically for the purpose of detecting the encoding.
*
* <p>
* EncodingDetector uses <i>ICU4J</i> under the hood. Here's a list of encodings that can currently be detected:
* <pre>
* UTF-8
* UTF-16BE
* UTF-16LE
* UTF-32BE
* UTF-32LE
* Shift_JIS
* ISO-2022-JP
* ISO-2022-CN
* ISO-2022-KR
* GB18030
* EUC-JP
* EUC-KR
* Big5
* ISO-8859-1
* ISO-8859-2
* ISO-8859-5
* ISO-8859-6
* ISO-8859-7
* ISO-8859-8
* windows-1251
* windows-1256
* KOI8-R
* ISO-8859-9
* </pre>
*
* @author Maxence Bernard, Nicolas Rinaudo
* @see <a href="http://philip.html5.org/data/charsets.html">ICU charset detection accuracy</a>
*/
public class EncodingDetector {

    private static final Logger LOGGER = LoggerFactory.getLogger(EncodingDetector.class);

    /**
     * Maximum number of bytes that the <code>detectEncoding</code> methods will process.
     * <p>
     * See http://philip.html5.org/data/charsets.html and http://philip.html5.org/data/encoding-detection.svg
     * for why 4096 is the recommended size.
     * </p>
     */
    public final static int MAX_RECOMMENDED_BYTE_SIZE = 4096;

    /**
     * Minimum number of bytes required before detection is attempted. The ICU <code>CharsetDetector</code> class
     * was observed to throw an <code>ArrayIndexOutOfBoundsException</code> when given fewer bytes than this.
     */
    private static final int MIN_DETECTABLE_BYTE_SIZE = 4;

    /**
     * This method is a shorthand for
     * {@link #detectEncoding(byte[], int, int) detectEncoding(bytes, 0, bytes.length)}.
     *
     * @param bytes the bytes for which to detect the encoding, must not be <code>null</code>
     * @return the best guess at the character encoding, null if there is none (not enough data or confidence)
     */
    public static String detectEncoding(byte[] bytes) {
        return detectEncoding(bytes, 0, bytes.length);
    }

    /**
     * Tries to detect the character encoding in which the given bytes are encoded, and returns the best guess or
     * <code>null</code> if there is none (not enough data or confidence).
     * Note that the returned character encoding may not be available on the Java runtime -- use
     * {@link java.nio.charset.Charset#isSupported(String)} to determine if it is available.
     *
     * <p>A maximum of {@link #MAX_RECOMMENDED_BYTE_SIZE} bytes will be read from the array. If the data is larger
     * than this value, all further bytes will be ignored. If <code>len</code> is less than 4, <code>null</code> is
     * returned without looking at the data.</p>
     *
     * <p>The <code>off</code> and <code>len</code> arguments are not validated explicitly: a region that does not
     * fit in the array may cause an <code>IndexOutOfBoundsException</code> to be thrown when the data is copied.</p>
     *
     * @param bytes the bytes for which to detect the encoding
     * @param off the array offset at which the data to process starts
     * @param len length of the data in the array
     * @return the best guess at the encoding, null if there is none (not enough data or confidence)
     */
    public static String detectEncoding(byte[] bytes, int off, int len) {
        // The current ICU CharsetDetector class will throw an ArrayIndexOutOfBoundsException exception if the
        // supplied array is less than 4 bytes long. In that case, return null.
        if (len < MIN_DETECTABLE_BYTE_SIZE) {
            return null;
        }

        // Trim the data if it is too long: detecting the charset is an expensive operation and past a certain
        // point, having more bytes won't help any further.
        if (len > MAX_RECOMMENDED_BYTE_SIZE) {
            len = MAX_RECOMMENDED_BYTE_SIZE;
        }

        // CharsetDetector will process the array fully, so if the data does not start at 0 or ends before the
        // array's length, create a new array that fits the data exactly.
        if (off > 0 || len < bytes.length) {
            byte[] tmp = new byte[len];
            System.arraycopy(bytes, off, tmp, 0, len);
            bytes = tmp;
        }

        CharsetDetector cd = new CharsetDetector();
        cd.setText(bytes);
        CharsetMatch cm = cd.detect();

        // Debug info. Guarded so that the arguments are only computed when trace logging is actually enabled.
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("bestMatch getName()={}, getConfidence()={}", (cm == null ? "null" : cm.getName()),
                    (cm == null ? "null" : Integer.toString(cm.getConfidence())));
        }

        return cm == null ? null : cm.getName();
    }

    /**
     * Tries to detect the character encoding in which the bytes contained by the given <code>InputStream</code> are
     * encoded, and returns the best guess or <code>null</code> if there is none (not enough data or confidence).
     * Note that the returned character encoding may or may not be available on the Java runtime -- use
     * {@link java.nio.charset.Charset#isSupported(String)} to determine if it is available.
     *
     * <p>A maximum of {@link #MAX_RECOMMENDED_BYTE_SIZE} bytes will be read from the <code>InputStream</code>. The
     * stream will not be closed and will not be repositioned after the bytes have been read. It is up to the calling
     * method to use the <code>InputStream#mark()</code> and <code>InputStream#reset()</code> methods (if supported)
     * or reopen the stream if needed.
     * </p>
     *
     * @param in the InputStream that supplies the bytes
     * @return the best guess at the character encoding, null if there is none (not enough data or confidence)
     * @throws IOException if an error occurred while reading the stream
     */
    public static String detectEncoding(InputStream in) throws IOException {
        byte[] buf = BufferPool.getByteArray(MAX_RECOMMENDED_BYTE_SIZE);
        try {
            return detectEncoding(buf, 0, StreamUtils.readUpTo(in, buf));
        }
        finally {
            // Always hand the buffer back to the pool, whether detection succeeded or an exception was thrown.
            BufferPool.releaseByteArray(buf);
        }
    }

    /**
     * Returns an array of encodings that can be detected by the <code>detectEncoding</code> methods.
     * Note that some of the returned character encodings may not be available on the Java runtime.
     *
     * @return an array of encodings that can be detected by the <code>detectEncoding</code> methods.
     */
    public static String[] getDetectableEncodings() {
        return CharsetDetector.getAllDetectableCharsets();
    }

    /**
     * Lists all detectable encodings as returned by {@link #getDetectableEncodings()} to the standard output,
     * one per line.
     *
     * @param args command line arguments (unused).
     */
    public static void main(String[] args) {
        for (String encoding : getDetectableEncodings()) {
            System.out.println(encoding);
        }
    }
}