/*
* This is free and unencumbered software released into the public domain.
*
* Anyone is free to copy, modify, publish, use, compile, sell, or
* distribute this software, either in source code form or as a compiled
* binary, for any purpose, commercial or non-commercial, and by any
* means.
*
* In jurisdictions that recognize copyright laws, the author or authors
* of this software dedicate any and all copyright interest in the
* software to the public domain. We make this dedication for the benefit
* of the public at large and to the detriment of our heirs and
* successors. We intend this dedication to be an overt act of
* relinquishment in perpetuity of all present and future rights to this
* software under copyright law.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* For more information, please refer to <http://unlicense.org/>
*/
package jxtn.core.unix;
import java.nio.ByteBuffer;
/**
* UTF-8 related functions
* <p>
* Rules:
* <ul>
* <li>Errors are ignored silently unless explicitly stated.</li>
* </ul>
* </p>
*
* @author aqd
*/
public final class FastUTF8 {
    /** Smallest lead byte (unsigned value) that starts a two-byte sequence. */
    public static final int UTF8_2B_MIN = 0b11000000;
    /** Smallest lead byte (unsigned value) that starts a three-byte sequence. */
    public static final int UTF8_3B_MIN = 0b11100000;
    /** Smallest lead byte (unsigned value) that starts a four-byte sequence. */
    public static final int UTF8_4B_MIN = 0b11110000;
    /** Smallest lead byte (unsigned value) that starts a (legacy) five-byte sequence. */
    public static final int UTF8_5B_MIN = 0b11111000;
    /** Smallest lead byte (unsigned value) that starts a (legacy) six-byte sequence. */
    public static final int UTF8_6B_MIN = 0b11111100;
    /** Largest lead byte (unsigned value) that starts a (legacy) six-byte sequence. */
    public static final int UTF8_6B_MAX = 0b11111101;

    /**
     * Encode a single UTF-16 code unit to UTF-8 bytes.
     * <p>
     * The code unit is encoded on its own: a surrogate passed to this method is written in the
     * three-byte form, because a lone {@code char} cannot carry a supplementary code point.
     * </p>
     *
     * @param c character to encode
     * @return newly allocated array of one, two or three UTF-8 bytes representing {@code c}
     */
    public static byte[] encode(char c) {
        if (c < 0x80) {
            // Have at most seven bits
            return new byte[] { (byte) c };
        } else if (c < 0x800) {
            // 2 bytes, 11 bits
            return new byte[] {
                (byte) (0xc0 | (c >>> 6)),
                (byte) (0x80 | (c & 0x3f))
            };
        } else {
            // 3 bytes, 16 bits
            return new byte[] {
                (byte) (0xe0 | (c >>> 12)),
                (byte) (0x80 | ((c >>> 6) & 0x3f)),
                (byte) (0x80 | (c & 0x3f))
            };
        }
    }

    /**
     * Encode a single UTF-16 code unit to UTF-8 bytes and store them in the destination array.
     * <p>
     * No capacity check is performed by this method; see {@code @throws}.
     * A surrogate is written in the three-byte form, as in {@link #encode(char)}.
     * </p>
     *
     * @param c character to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes
     * @return number of UTF-8 bytes encoded from {@code c} (1, 2 or 3)
     * @throws ArrayIndexOutOfBoundsException if the bytes do not fit at {@code dstOffset}
     */
    public static int encode(char c, byte[] dstBuffer, int dstOffset) {
        if (c < 0x80) {
            // Have at most seven bits
            dstBuffer[dstOffset] = (byte) c;
            return 1;
        } else if (c < 0x800) {
            // 2 bytes, 11 bits
            dstBuffer[dstOffset] = (byte) (0xc0 | (c >>> 6));
            dstBuffer[dstOffset + 1] = (byte) (0x80 | (c & 0x3f));
            return 2;
        } else {
            // 3 bytes, 16 bits
            dstBuffer[dstOffset] = (byte) (0xe0 | (c >>> 12));
            dstBuffer[dstOffset + 1] = (byte) (0x80 | ((c >>> 6) & 0x3f));
            dstBuffer[dstOffset + 2] = (byte) (0x80 | (c & 0x3f));
            return 3;
        }
    }

    /**
     * Encode a character sequence to UTF-8 bytes, using the whole destination array as the window.
     * <p>
     * Equivalent to {@code encode(s, dstBuffer, 0, dstBuffer.length)}; see
     * {@link #encode(CharSequence, byte[], int, int)} for the truncation and capacity rules.
     * </p>
     *
     * @param s string to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @return number of UTF-8 bytes written
     */
    public static int encode(CharSequence s, byte[] dstBuffer) {
        return encode(s, dstBuffer, 0, dstBuffer.length);
    }

    /**
     * Encode a character sequence to UTF-8 bytes and store them in a window of the destination array.
     * <p>
     * No NUL terminator is written. If the window is too small, encoding stops before the first
     * character that does not fit completely; there is no indication about the status of completion.
     * A well-formed surrogate pair is written as one four-byte sequence and is never split;
     * an unpaired surrogate is written in the three-byte form.
     * </p>
     * <p>
     * The last byte of the window is always left unused, so at most
     * {@code min(dstBuffer.length - dstOffset, dstLength) - 1} bytes are written.
     * NOTE(review): this reserve mirrors {@link #encodeToCString(CharSequence, byte[], int, int)}
     * and is kept for backward compatibility; confirm against callers whether it is still wanted.
     * </p>
     *
     * @param s string to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes
     * @param dstLength length in destination buffer which may be used to store the results
     * @return number of UTF-8 bytes written
     */
    public static int encode(CharSequence s, byte[] dstBuffer, int dstOffset, int dstLength) {
        int window = Math.min(dstBuffer.length - dstOffset, dstLength);
        if (window <= 0) {
            return 0;
        }
        return encodeChars(s, dstBuffer, dstOffset, dstOffset + window - 1);
    }

    /**
     * Encode a whole {@code char} array to UTF-8 bytes, using the whole destination array as the window.
     * <p>
     * Equivalent to {@code encode(s, 0, s.length, dstBuffer, 0, dstBuffer.length)}.
     * </p>
     *
     * @param s characters to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @return number of UTF-8 bytes written
     */
    public static int encode(char[] s, byte[] dstBuffer) {
        return encode(s, 0, s.length, dstBuffer, 0, dstBuffer.length);
    }

    /**
     * Encode a range of a {@code char} array to UTF-8 bytes and store them in a window of the
     * destination array.
     * <p>
     * Same rules as {@link #encode(CharSequence, byte[], int, int)}: no NUL terminator is written,
     * output is silently truncated at a character boundary, surrogate pairs become one four-byte
     * sequence, and the last byte of the destination window is left unused.
     * </p>
     *
     * @param srcBuffer characters to encode
     * @param srcOffset index of the first character to encode
     * @param srcLength number of characters to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes
     * @param dstLength length in destination buffer which may be used to store the results
     * @return number of UTF-8 bytes written
     */
    public static int encode(char[] srcBuffer, int srcOffset, int srcLength,
            byte[] dstBuffer, int dstOffset, int dstLength) {
        int window = Math.min(dstBuffer.length - dstOffset, dstLength);
        if (window <= 0) {
            return 0;
        }
        return encodeChars(srcBuffer, srcOffset, srcOffset + srcLength,
                dstBuffer, dstOffset, dstOffset + window - 1);
    }

    /**
     * Encode a character sequence to a UTF-8 C string (NUL-terminated), using the whole destination
     * array as the window.
     * <p>
     * Equivalent to {@code encodeToCString(s, dstBuffer, 0, dstBuffer.length)}.
     * </p>
     *
     * @param s string to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end.
     */
    public static int encodeToCString(CharSequence s, byte[] dstBuffer) {
        return encodeToCString(s, dstBuffer, 0, dstBuffer.length);
    }

    /**
     * Encode a character sequence to a UTF-8 C string (NUL-terminated) and store it in a window of
     * the destination array.
     * <p>
     * If the window is too small, this method encodes as many complete characters as fit in front of
     * the terminator and returns the length of the encoded UTF-8 bytes. There is no indication about
     * the status of completion. A well-formed surrogate pair is written as one four-byte sequence and
     * is never split; an unpaired surrogate is written in the three-byte form.
     * </p>
     * <p>
     * If the window has no room at all (not even for the terminator), nothing is written and
     * {@code 0} is returned.
     * </p>
     *
     * @param s string to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes
     * @param dstLength length in destination buffer which may be used to store the results
     * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end.
     */
    public static int encodeToCString(CharSequence s, byte[] dstBuffer, int dstOffset, int dstLength) {
        int window = Math.min(dstBuffer.length - dstOffset, dstLength);
        if (window <= 0) {
            // Not even the terminator fits; writing it would land outside the permitted window.
            return 0;
        }
        int written = encodeChars(s, dstBuffer, dstOffset, dstOffset + window - 1);
        dstBuffer[dstOffset + written] = 0;
        return written;
    }

    /**
     * Encode a whole {@code char} array to a UTF-8 C string (NUL-terminated), using the whole
     * destination array as the window.
     * <p>
     * Equivalent to {@code encodeToCString(s, 0, s.length, dstBuffer, 0, dstBuffer.length)}.
     * </p>
     *
     * @param s characters to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @return number of UTF-8 bytes encoded from {@code s}, not including the NUL termination at the end.
     */
    public static int encodeToCString(char[] s, byte[] dstBuffer) {
        return encodeToCString(s, 0, s.length, dstBuffer, 0, dstBuffer.length);
    }

    /**
     * Encode a range of a {@code char} array to a UTF-8 C string (NUL-terminated) and store it in a
     * window of the destination array.
     * <p>
     * Same rules as {@link #encodeToCString(CharSequence, byte[], int, int)}.
     * </p>
     *
     * @param srcBuffer characters to encode
     * @param srcOffset index of the first character to encode
     * @param srcLength number of characters to encode
     * @param dstBuffer destination buffer to store the resulting UTF-8 bytes
     * @param dstOffset offset in destination buffer for storing of the resulting UTF-8 bytes
     * @param dstLength length in destination buffer which may be used to store the results
     * @return number of UTF-8 bytes encoded, not including the NUL termination at the end.
     */
    public static int encodeToCString(char[] srcBuffer, int srcOffset, int srcLength,
            byte[] dstBuffer, int dstOffset, int dstLength) {
        int window = Math.min(dstBuffer.length - dstOffset, dstLength);
        if (window <= 0) {
            // Not even the terminator fits; writing it would land outside the permitted window.
            return 0;
        }
        int written = encodeChars(srcBuffer, srcOffset, srcOffset + srcLength,
                dstBuffer, dstOffset, dstOffset + window - 1);
        dstBuffer[dstOffset + written] = 0;
        return written;
    }

    /**
     * Check whether the remaining bytes of a buffer form a structurally valid UTF-8 sequence.
     * <p>
     * The position of {@code buffer} is not modified. Buffers without an accessible backing array
     * (for example direct or read-only buffers) are checked through a temporary copy of the
     * remaining bytes. See {@link #verify(byte[], int, int)} for the exact rules applied.
     * </p>
     *
     * @param buffer buffer whose remaining bytes are checked
     * @return {@code true} if the remaining bytes pass the check
     */
    public static boolean verify(ByteBuffer buffer) {
        if (buffer.hasArray()) {
            return verify(buffer.array(),
                    buffer.arrayOffset() + buffer.position(),
                    buffer.remaining());
        }
        // No accessible backing array: read through a duplicate so the caller's position is kept.
        ByteBuffer view = buffer.duplicate();
        byte[] copy = new byte[view.remaining()];
        view.get(copy);
        return verify(copy, 0, copy.length);
    }

    /**
     * Check whether a whole byte array forms a structurally valid UTF-8 sequence.
     * <p>
     * Equivalent to {@code verify(buffer, 0, buffer.length)}.
     * </p>
     *
     * @param buffer bytes to check
     * @return {@code true} if the bytes pass the check
     */
    public static boolean verify(byte[] buffer) {
        return verify(buffer, 0, buffer.length);
    }

    /**
     * Check whether a range of a byte array forms a structurally valid UTF-8 sequence.
     * <p>
     * This is a simplified, structural check: every lead byte must be followed, inside the range, by
     * the number of continuation bytes ({@code 10xxxxxx}) that its bit pattern announces. Legacy
     * five- and six-byte forms are accepted; overlong encodings, encoded surrogates and code points
     * above U+10FFFF are not rejected.
     * </p>
     * <p>
     * A byte with value {@code 0x00} makes the check fail.
     * NOTE(review): NUL is valid UTF-8 in general; rejecting it looks tailored to C strings - confirm
     * against callers before relaxing.
     * </p>
     *
     * @param buffer bytes to check
     * @param offset index of the first byte to check
     * @param length number of bytes to check; an empty or negative range passes
     * @return {@code true} if the range passes the check
     */
    public static boolean verify(byte[] buffer, int offset, int length) {
        // simplified from Google Protobuf, see https://en.wikipedia.org/wiki/UTF-8#Description
        int index = offset;
        final int limit = offset + length;
        while (index < limit) {
            byte first = buffer[index++];
            if (first > 0) {
                // ASCII other than NUL: nothing more to check for this byte
                continue;
            }
            int rest = continuationCount(first & 0xFF);
            if (rest == 0) {
                // NUL, a stray continuation byte, or 0xFE / 0xFF
                return false;
            }
            if (limit - index < rest) {
                // sequence is cut off by the end of the range
                return false;
            }
            for (int k = 0; k < rest; k++) {
                if (!verifyUTF8Rest(buffer[index++])) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Check whether a byte is a UTF-8 continuation byte, i.e. has the bit pattern {@code 10xxxxxx}.
     *
     * @param b byte to check
     * @return {@code true} if the two most significant bits of {@code b} are {@code 10}
     */
    public static boolean verifyUTF8Rest(byte b) {
        // see https://en.wikipedia.org/wiki/UTF-8#Description
        int hi = (b & 0xff) >>> 6;
        return hi == 0b10;
    }

    /** Number of continuation bytes announced by a lead byte, or 0 if the value is not a lead byte. */
    private static int continuationCount(int lead) {
        if (lead < UTF8_2B_MIN) {
            return 0;
        }
        if (lead < UTF8_3B_MIN) {
            return 1;
        }
        if (lead < UTF8_4B_MIN) {
            return 2;
        }
        if (lead < UTF8_5B_MIN) {
            return 3;
        }
        if (lead < UTF8_6B_MIN) {
            return 4;
        }
        if (lead <= UTF8_6B_MAX) {
            return 5;
        }
        return 0;
    }

    /** Number of UTF-8 bytes {@link #encode(char, byte[], int)} writes for one code unit. */
    private static int utf8Length(char c) {
        if (c < 0x80) {
            return 1;
        }
        return c < 0x800 ? 2 : 3;
    }

    /** Write a supplementary code point (U+10000 and above) as four UTF-8 bytes at {@code pos}. */
    private static void putSupplementary(int codePoint, byte[] dst, int pos) {
        dst[pos] = (byte) (0xf0 | (codePoint >>> 18));
        dst[pos + 1] = (byte) (0x80 | ((codePoint >>> 12) & 0x3f));
        dst[pos + 2] = (byte) (0x80 | ((codePoint >>> 6) & 0x3f));
        dst[pos + 3] = (byte) (0x80 | (codePoint & 0x3f));
    }

    /** Encode {@code s} into {@code dst[dstOffset, limit)}, stopping before the first character that does not fit. */
    private static int encodeChars(CharSequence s, byte[] dst, int dstOffset, int limit) {
        int pos = dstOffset;
        final int sLen = s.length();
        int i = 0;
        while (i < sLen) {
            char c = s.charAt(i);
            if (Character.isHighSurrogate(c) && i + 1 < sLen
                    && Character.isLowSurrogate(s.charAt(i + 1))) {
                // Well-formed pair: one supplementary code point, four bytes, never split
                if (pos + 4 > limit) {
                    break;
                }
                putSupplementary(Character.toCodePoint(c, s.charAt(i + 1)), dst, pos);
                pos += 4;
                i += 2;
            } else {
                if (pos + utf8Length(c) > limit) {
                    break;
                }
                pos += encode(c, dst, pos);
                i++;
            }
        }
        return pos - dstOffset;
    }

    /** Encode {@code src[srcOffset, sEnd)} into {@code dst[dstOffset, limit)}, stopping before the first character that does not fit. */
    private static int encodeChars(char[] src, int srcOffset, int sEnd,
            byte[] dst, int dstOffset, int limit) {
        int pos = dstOffset;
        int i = srcOffset;
        while (i < sEnd) {
            char c = src[i];
            if (Character.isHighSurrogate(c) && i + 1 < sEnd
                    && Character.isLowSurrogate(src[i + 1])) {
                // Well-formed pair: one supplementary code point, four bytes, never split
                if (pos + 4 > limit) {
                    break;
                }
                putSupplementary(Character.toCodePoint(c, src[i + 1]), dst, pos);
                pos += 4;
                i += 2;
            } else {
                if (pos + utf8Length(c) > limit) {
                    break;
                }
                pos += encode(c, dst, pos);
                i++;
            }
        }
        return pos - dstOffset;
    }
}