/* * This is free and unencumbered software released into the public domain. * * Anyone is free to copy, modify, publish, use, compile, sell, or * distribute this software, either in source code form or as a compiled * binary, for any purpose, commercial or non-commercial, and by any * means. * * In jurisdictions that recognize copyright laws, the author or authors * of this software dedicate any and all copyright interest in the * software to the public domain. We make this dedication for the benefit * of the public at large and to the detriment of our heirs and * successors. We intend this dedication to be an overt act of * relinquishment in perpetuity of all present and future rights to this * software under copyright law. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * For more information, please refer to <http://unlicense.org/> */ package jxtn.core.unix; import java.nio.ByteBuffer; /** * UTF-8 related functions * <p> * Rules: * <ul> * <li>Errors are ignored silently unless explicitly stated.</li> * </ul> * </p> * * @author aqd */ public final class FastUTF8 { public static final int UTF8_2B_MIN = 0b11000000; public static final int UTF8_3B_MIN = 0b11100000; public static final int UTF8_4B_MIN = 0b11110000; public static final int UTF8_5B_MIN = 0b11111000; public static final int UTF8_6B_MIN = 0b11111100; public static final int UTF8_6B_MAX = 0b11111101; /** * Encode a single character to UTF-8 bytes * * @param c character to encode * @return UTF-8 bytes representing {@code c} */ public static byte[] encode(char c) { if (c < 0x80) { // Have at most seven bits return new byte[] { (byte) c }; } else if (c < 0x800) { // 2 bytes, 11 bits return new byte[] { (byte) (0xc0 | (c >>> 6)), (byte) (0x80 | (c & 0x3f)) }; } else { // 3 bytes, 16 bits return new byte[] { (byte) (0xe0 | ((c >>> 12))), (byte) (0x80 | ((c >>> 6) & 0x3f)), (byte) (0x80 | (c & 0x3f)) }; } } /** * Encode a single character to UTF-8 bytes and store in specified destination array * * @param c character to encode * @param dstBuffer destination buffer to store the resulting UTF-8 bytes * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes * @return number of UTF-8 bytes encoded from {@code c} */ public static int encode(char c, byte[] dstBuffer, int dstOffset) { if (c < 0x80) { // Have at most seven bits dstBuffer[dstOffset] = (byte) c; return 1; } else if (c < 0x800) { // 2 bytes, 11 bits dstBuffer[dstOffset + 0] = (byte) (0xc0 | (c >>> 6)); dstBuffer[dstOffset + 1] = (byte) (0x80 | (c & 0x3f)); return 2; } else { // 3 bytes, 16 bits dstBuffer[dstOffset + 0] = (byte) (0xe0 | ((c >>> 12))); dstBuffer[dstOffset + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f)); dstBuffer[dstOffset + 2] = (byte) (0x80 | (c & 0x3f)); return 3; } } /** * Encode a {@link String} to UTF-8 C String (NUL-terminated) and store in specified destination array * <p> * If the length or the capacity of {@code dstBuffer} is insufficient, this method shall encode as much as it can * and return the length of encoded UTF-8 bytes. There is no indication about the status of completion. * </p> * * @param s string to encode * @param dstBuffer destination buffer to store the resulting UTF-8 bytes * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end. */ public static int encode(CharSequence s, byte[] dstBuffer) { return encode(s, dstBuffer, 0, dstBuffer.length); } public static int encode(CharSequence s, byte[] dstBuffer, int dstOffset, int dstLength) { int dPos = dstOffset; int dEnd = dstOffset + Math.min(dstBuffer.length - dstOffset, dstLength) - 1 /* NUL */; int sLen = s.length(); for (int i = 0; i < sLen; i++) { char c = s.charAt(i); if (c < 0x80) { if (dPos > dEnd - 1) { break; } // Have at most seven bits dstBuffer[dPos] = (byte) c; dPos += 1; } else if (c < 0x800) { if (dPos > dEnd - 2) { break; } // 2 bytes, 11 bits dstBuffer[dPos + 0] = (byte) (0xc0 | (c >>> 6)); dstBuffer[dPos + 1] = (byte) (0x80 | (c & 0x3f)); dPos += 2; } else { if (dPos > dEnd - 3) { break; } // 3 bytes, 16 bits dstBuffer[dPos + 0] = (byte) (0xe0 | ((c >>> 12))); dstBuffer[dPos + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f)); dstBuffer[dPos + 2] = (byte) (0x80 | (c & 0x3f)); dPos += 3; } } return dPos - dstOffset; } public static int encode(char[] s, byte[] dstBuffer) { return encode(s, 0, s.length, dstBuffer, 0, dstBuffer.length); } public static int encode(char[] srcBuffer, int srcOffset, int srcLength, byte[] dstBuffer, int dstOffset, int dstLength) { int dPos = dstOffset; int dEnd = dstOffset + Math.min(dstBuffer.length - dstOffset, dstLength) - 1 /* NUL */; int sEnd = srcOffset + srcLength; for (int i = srcOffset; i < sEnd; i++) { char c = srcBuffer[i]; if (c < 0x80) { if (dPos > dEnd - 1) { break; } // Have at most seven bits dstBuffer[dPos] = (byte) c; dPos += 1; } else if (c < 0x800) { if (dPos > dEnd - 2) { break; } // 2 bytes, 11 bits dstBuffer[dPos + 0] = (byte) (0xc0 | (c >>> 6)); dstBuffer[dPos + 1] = (byte) (0x80 | (c & 0x3f)); dPos += 2; } else { if (dPos > dEnd - 3) { break; } // 3 bytes, 16 bits dstBuffer[dPos + 0] = (byte) (0xe0 | ((c >>> 12))); dstBuffer[dPos + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f)); dstBuffer[dPos + 2] = (byte) (0x80 | (c & 0x3f)); dPos += 3; } } return dPos - dstOffset; } /** * Encode a {@link String} to UTF-8 C String (NUL-terminated) and store in specified destination array * <p> * If the length or the capacity of {@code dstBuffer} is insufficient, this method shall encode as much as it can * and return the length of encoded UTF-8 bytes. There is no indication about the status of completion. * </p> * * @param s string to encode * @param dstBuffer destination buffer to store the resulting UTF-8 bytes * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end. */ public static int encodeToCString(CharSequence s, byte[] dstBuffer) { return encodeToCString(s, dstBuffer, 0, dstBuffer.length); } /** * Encode a {@link String}r to UTF-8 C String (NUL-terminated) and store in specified destination array * <p> * If the length or the capacity of {@code dstBuffer} is insufficient, this method shall encode as much as it can * and return the length of encoded UTF-8 bytes. There is no indication about the status of completion. * </p> * * @param s string to encode * @param dstBuffer destination buffer to store the resulting UTF-8 bytes * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes * @param dstLength length in destination buffer which may be used to store the results * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end. */ public static int encodeToCString(CharSequence s, byte[] dstBuffer, int dstOffset, int dstLength) { int dPos = dstOffset; int dEnd = dstOffset + Math.min(dstBuffer.length - dstOffset, dstLength) - 1 /* NUL */; int sLen = s.length(); for (int i = 0; i < sLen; i++) { char c = s.charAt(i); if (c < 0x80) { if (dPos > dEnd - 1) { break; } // Have at most seven bits dstBuffer[dPos] = (byte) c; dPos += 1; } else if (c < 0x800) { if (dPos > dEnd - 2) { break; } // 2 bytes, 11 bits dstBuffer[dPos + 0] = (byte) (0xc0 | (c >>> 6)); dstBuffer[dPos + 1] = (byte) (0x80 | (c & 0x3f)); dPos += 2; } else { if (dPos > dEnd - 3) { break; } // 3 bytes, 16 bits dstBuffer[dPos + 0] = (byte) (0xe0 | ((c >>> 12))); dstBuffer[dPos + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f)); dstBuffer[dPos + 2] = (byte) (0x80 | (c & 0x3f)); dPos += 3; } } dstBuffer[dPos] = 0; return dPos - dstOffset; } public static int encodeToCString(char[] s, byte[] dstBuffer) { return encodeToCString(s, 0, s.length, dstBuffer, 0, dstBuffer.length); } public static int encodeToCString(char[] srcBuffer, int srcOffset, int srcLength, byte[] dstBuffer, int dstOffset, int dstLength) { int dPos = dstOffset; int dEnd = dstOffset + Math.min(dstBuffer.length - dstOffset, dstLength) - 1 /* NUL */; int sEnd = srcOffset + srcLength; for (int i = srcOffset; i < sEnd; i++) { char c = srcBuffer[i]; if (c < 0x80) { if (dPos > dEnd - 1) { break; } // Have at most seven bits dstBuffer[dPos] = (byte) c; dPos += 1; } else if (c < 0x800) { if (dPos > dEnd - 2) { break; } // 2 bytes, 11 bits dstBuffer[dPos + 0] = (byte) (0xc0 | (c >>> 6)); dstBuffer[dPos + 1] = (byte) (0x80 | (c & 0x3f)); dPos += 2; } else { if (dPos > dEnd - 3) { break; } // 3 bytes, 16 bits dstBuffer[dPos + 0] = (byte) (0xe0 | ((c >>> 12))); dstBuffer[dPos + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f)); dstBuffer[dPos + 2] = (byte) (0x80 | (c & 0x3f)); dPos += 3; } } dstBuffer[dPos] = 0; return dPos - dstOffset; } public static boolean verify(ByteBuffer buffer) { return verify(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); } public static boolean verify(byte[] buffer) { return verify(buffer, 0, buffer.length); } public static boolean verify(byte[] buffer, int offset, int length) { // simplified from Google Profobuf, see https://en.wikipedia.org/wiki/UTF-8#Description int index = offset; int limit = offset + length; while (true) { byte firstByte; // skip ASCII do { if (index >= limit) { return true; } } while ((firstByte = buffer[index++]) > 0); int firstChar = firstByte & 0xFF; // if (firstChar < UTF8_2B_MIN) { return false; } // two-byte form: rest 1 if (firstChar < UTF8_3B_MIN) { if (index >= limit) { return false; } if (!verifyUTF8Rest(buffer[index++])) { return false; } continue; } // three-byte form: rest 2 if (firstChar < UTF8_4B_MIN) { if (index >= limit - 1) { return false; } if (!verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++])) { return false; } continue; } // four-byte form: rest 3 if (firstChar < UTF8_5B_MIN) { if (index >= limit - 2) { return false; } if (!verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++])) { return false; } continue; } // five-byte form: rest 4 if (firstChar < UTF8_6B_MIN) { if (index >= limit - 3) { return false; } if (!verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++])) { return false; } continue; } // six-byte form: rest 5 if (firstChar <= UTF8_6B_MAX) { if (index >= limit - 4) { return false; } if (!verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++]) || !verifyUTF8Rest(buffer[index++])) { return false; } continue; } return false; } } public static boolean verifyUTF8Rest(byte b) { // see https://en.wikipedia.org/wiki/UTF-8#Description int hi = (b & 0xff) >>> 6; return hi == 0b10; } }