// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* ******************************************************************************* * Copyright (C) 1996-2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * */ package org.apache.tika.parser.txt; /** * This class matches UTF-16 and UTF-32, both big- and little-endian. The * BOM will be used if it is present. */ abstract class CharsetRecog_Unicode extends CharsetRecognizer { static int codeUnit16FromBytes(byte hi, byte lo) { return ((hi & 0xff) << 8) | (lo & 0xff); } // UTF-16 confidence calculation. Very simple minded, but better than nothing. // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. // NULs should be rare in actual text. static int adjustConfidence(int codeUnit, int confidence) { if (codeUnit == 0) { confidence -= 10; } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { confidence += 10; } if (confidence < 0) { confidence = 0; } else if (confidence > 100) { confidence = 100; } return confidence; } /* (non-Javadoc) * @see com.ibm.icu.text.CharsetRecognizer#getName() */ abstract String getName(); /* (non-Javadoc) * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) */ abstract CharsetMatch match(CharsetDetector det); static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode { String getName() { return "UTF-16BE"; } CharsetMatch match(CharsetDetector det) { byte[] input = det.fRawInput; int confidence = 10; int bytesToCheck = Math.min(input.length, 30); for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) { int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]); if (charIndex == 0 && codeUnit == 0xFEFF) { confidence = 100; break; } confidence = adjustConfidence(codeUnit, confidence); if (confidence == 0 || confidence == 100) { break; } } if (bytesToCheck < 4 && confidence < 100) { confidence = 0; } if (confidence > 0) { return new CharsetMatch(det, this, confidence); } return null; } } static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode { String getName() { return "UTF-16LE"; } CharsetMatch match(CharsetDetector det) { byte[] input = det.fRawInput; int confidence = 10; int bytesToCheck = Math.min(input.length, 30); for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) { int codeUnit = codeUnit16FromBytes(input[charIndex + 1], input[charIndex]); if (charIndex == 0 && codeUnit == 0xFEFF) { confidence = 100; break; } confidence = adjustConfidence(codeUnit, confidence); if (confidence == 0 || confidence == 100) { break; } } if (bytesToCheck < 4 && confidence < 100) { confidence = 0; } if (confidence > 0) { return new CharsetMatch(det, this, confidence); } return null; } } static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode { abstract int getChar(byte[] input, int index); abstract String getName(); CharsetMatch match(CharsetDetector det) { byte[] input = det.fRawInput; int limit = (det.fRawLength / 4) * 4; int numValid = 0; int numInvalid = 0; boolean hasBOM = false; int confidence = 0; if (limit == 0) { return null; } if (getChar(input, 0) == 0x0000FEFF) { hasBOM = true; } for (int i = 0; i < limit; i += 4) { int ch = getChar(input, i); if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { numInvalid += 1; } else { numValid += 1; } } // Cook up some sort of confidence score, based on presence of a BOM // and the existence of valid and/or invalid multi-byte sequences. if (hasBOM && numInvalid == 0) { confidence = 100; } else if (hasBOM && numValid > numInvalid * 10) { confidence = 80; } else if (numValid > 3 && numInvalid == 0) { confidence = 100; } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid > numInvalid * 10) { // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. confidence = 25; } return confidence == 0 ? null : new CharsetMatch(det, this, confidence); } } static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 { int getChar(byte[] input, int index) { return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); } String getName() { return "UTF-32BE"; } } static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 { int getChar(byte[] input, int index) { return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); } String getName() { return "UTF-32LE"; } } }