/*******************************************************************************
 * Copyright (c) 2009-2013 CWI
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *
 * Davy Landman - Davy.Landman@cwi.nl - CWI
 *******************************************************************************/
package org.rascalmpl.unicode;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

/**
 * Heuristics for guessing the Unicode encoding of a byte sample, either from
 * a byte order marker or from the shape of the content itself.
 */
public class UnicodeDetector {
    private static final int MAXIMUM_BOM_LENGTH = 4;

    /** Number of bytes occupied by a single UTF-32 code unit. */
    private static final int UTF32_UNIT_SIZE = 4;

    /** Largest value of the "plane" byte of a code point (U+10FFFF is the maximum). */
    private static final int MAX_PLANE = 0x10;

    private static final ByteOrderMarker[] boms = {
        ByteOrderMarker.UTF8,
        ByteOrderMarker.UTF32BE,
        ByteOrderMarker.UTF32LE, // 32 first, to avoid ambiguity with the 16 LE
        ByteOrderMarker.UTF16BE,
        ByteOrderMarker.UTF16LE
    };

    /**
     * Try to detect an encoding; only UTF8 and UTF32 can be detected this way.
     * Other detections are too intensive.
     * <b>Warning, don't fall back to UTF8: if this method didn't return UTF8 as an encoding
     * it really isn't UTF8! Try Charset.defaultCharset().</b>
     *
     * @param buffer     the sampled bytes
     * @param bufferSize the number of bytes at the start of {@code buffer} to inspect
     * @return either the detected Charset or null
     */
    public static Charset detectByContent(byte[] buffer, int bufferSize) {
        // first, lets see if it might be a valid UTF8 (biggest chance)
        if (looksLikeUTF8(buffer, bufferSize)) {
            return Charset.forName("UTF8");
        }
        // the other thing we can check is whether it might be a UTF32 file:
        // every 4 byte unit must look like 0x00 0x00-0x10 0x.. 0x.. (BE)
        // or 0x.. 0x.. 0x00-0x10 0x00 (LE), since code points stop at U+10FFFF
        if (looksLikeUTF32(buffer, bufferSize, true)) {
            return Charset.forName("UTF-32BE");
        }
        if (looksLikeUTF32(buffer, bufferSize, false)) {
            return Charset.forName("UTF-32LE");
        }
        return null;
    }

    /**
     * Checks whether the sample is shaped like UTF-8.
     *
     * Based on http://www.w3.org/International/questions/qa-forms-utf-8, with the
     * regular expression translated to a loop using the help of
     * http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
     *
     * Only positions that still have 4 bytes available are inspected, so a
     * trailing (possibly truncated) sequence of up to 3 bytes is never looked at
     * and samples shorter than 4 bytes always pass. The lead byte ranges are
     * deliberately kept as lenient as they have always been (0xC0, 0xC1 and
     * 0xF5-0xF7 are accepted as lead bytes).
     */
    private static boolean looksLikeUTF8(byte[] buffer, int bufferSize) {
        int i = 0;
        while (i + 3 < bufferSize) {
            int c0 = buffer[i] & 0xff;
            if (0x01 <= c0 && c0 <= 0x7F) {
                // one byte UTF8 (0x00 is rejected on purpose)
                i++;
                continue;
            }
            int c1 = buffer[i + 1] & 0xff;
            if (0xC0 <= c0 && c0 < 0xE0 && isContinuation(c1)) {
                // two byte UTF8
                i += 2;
                continue;
            }
            int c2 = buffer[i + 2] & 0xff;
            if (0xE0 <= c0 && c0 < 0xF0 && isContinuation(c1) && isContinuation(c2)) {
                if (c0 == 0xED && c1 >= 0xA0) {
                    // this encodes a UTF-16 surrogate, which is not allowed in UTF-8
                    // (c1 <= 0xBF and the range of c2 already follow from the
                    // continuation byte checks above)
                    return false;
                }
                // three byte UTF8
                i += 3;
                continue;
            }
            int c3 = buffer[i + 3] & 0xff;
            if (0xF0 <= c0 && c0 < 0xF8
                    && isContinuation(c1) && isContinuation(c2) && isContinuation(c3)) {
                // four byte UTF8
                i += 4;
                continue;
            }
            return false;
        }
        return true;
    }

    /** A UTF-8 continuation byte has the bit pattern 10xxxxxx. */
    private static boolean isContinuation(int b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Checks whether every complete 4 byte unit in the sample can be a UTF-32
     * code point in the given byte order: the most significant byte must be
     * zero and the next one at most 0x10. Incomplete trailing units are
     * ignored; a sample without a single complete unit never matches.
     */
    private static boolean looksLikeUTF32(byte[] buffer, int bufferSize, boolean bigEndian) {
        if (bufferSize < UTF32_UNIT_SIZE) {
            return false;
        }
        int highOffset = bigEndian ? 0 : 3;
        int planeOffset = bigEndian ? 1 : 2;
        for (int i = 0; i + UTF32_UNIT_SIZE <= bufferSize; i += UTF32_UNIT_SIZE) {
            int high = buffer[i + highOffset] & 0xff;
            int plane = buffer[i + planeOffset] & 0xff;
            if (high != 0 || plane > MAX_PLANE) {
                return false;
            }
        }
        return true;
    }

    /**
     * Looks for a known byte order marker in the sample.
     *
     * @param detectionBuffer the sampled bytes
     * @param bufferSize      the number of bytes of {@code detectionBuffer} to consider
     * @return the first known marker that reports a match, or null if none does
     */
    public static ByteOrderMarker detectBom(byte[] detectionBuffer, int bufferSize) {
        for (ByteOrderMarker b : boms) {
            if (b.matches(detectionBuffer, bufferSize)) {
                return b;
            }
        }
        return null;
    }

    /**
     * Tells whether the two charsets are a pair whose byte order markers can be
     * confused: one of them is UTF-32LE and one of them is UTF-16 or UTF-16LE.
     */
    public static boolean isAmbigiousBOM(Charset a, Charset b) {
        boolean isUTF32LE = isNamed("UTF-32LE", a, b);
        boolean isUTF16LE = isNamed("UTF-16LE", a, b);
        boolean isUTF16 = isNamed("UTF-16", a, b);
        return isUTF32LE && (isUTF16 || isUTF16LE);
    }

    /** True when at least one of the two charsets has exactly the given name. */
    private static boolean isNamed(String name, Charset a, Charset b) {
        return a.name().equals(name) || b.name().equals(name);
    }

    /**
     * @return the length in bytes of the longest supported byte order marker
     */
    public static int getMaximumBOMLength() {
        return MAXIMUM_BOM_LENGTH;
    }

    /**
     * @return the number of bytes {@link #estimateCharset(InputStream)} samples
     */
    public static int getSuggestedDetectionSampleSize() {
        return 32;
    }

    /**
     * Try to estimate if the content might be encoded in UTF-8/16/32.
     * <b>Warning, if this method does not return a charset, it can't be UTF8 or UTF32.
     * It might be UTF-16 (unlikely) or a strange codepoint.
     * </b>
     *
     * The sample is consumed from the stream and not pushed back.
     *
     * @param in the stream to sample from
     * @return the charset of a detected byte order marker, otherwise the result
     *         of {@link #detectByContent(byte[], int)} (which may be null)
     * @throws IOException if reading from the stream fails
     */
    public static Charset estimateCharset(InputStream in) throws IOException {
        byte[] buffer = new byte[getSuggestedDetectionSampleSize()];
        // A single read may legally return fewer bytes than are available (or -1
        // at end of stream), so keep reading until the sample is full or the
        // stream ends; this also guarantees a non-negative size below.
        int totalRead = 0;
        while (totalRead < buffer.length) {
            int read = in.read(buffer, totalRead, buffer.length - totalRead);
            if (read < 0) {
                break;
            }
            totalRead += read;
        }
        ByteOrderMarker bom = detectBom(buffer, totalRead);
        if (bom != null) {
            return bom.getCharset();
        }
        return detectByContent(buffer, totalRead);
    }
}