/**
* Copyright 2007-2015, Kaazing Corporation. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kaazing.netx.ws.internal.util;
import static java.lang.Character.charCount;
import static java.lang.Character.codePointAt;
import static java.lang.String.format;
import java.io.IOException;
import java.nio.ByteBuffer;
/**
 * Static helpers for measuring, decoding, validating and encoding UTF-8 byte sequences.
 */
public final class Utf8Util {

    /** Returned by {@link #validateUTF8(ByteBuffer, int, int, ErrorHandler)} when the bytes are not valid UTF-8. */
    public static final int INVALID_UTF8 = -1;

    private static final String MSG_INVALID_CODEPOINT = "Invalid UTF-16 codepoint %d";

    private Utf8Util() {
        // utility class, not instantiable
    }

    /**
     * Computes how many bytes the UTF-8 encoding of a range of chars occupies.
     *
     * @param cbuf   the source chars
     * @param offset index of the first char to measure
     * @param length the number of chars to measure, starting at {@code offset}
     * @return the number of UTF-8 bytes needed for the chars in {@code [offset, offset + length)}
     * @throws IOException if a code point cannot be represented in at most 4 bytes
     */
    public static int byteCountUTF8(char[] cbuf, int offset, int length) throws IOException {
        // length is a count, not an end index, so the exclusive end must be derived from the offset
        final int end = offset + length;
        int count = 0;
        int index = offset;
        while (index < end) {
            // the limit-aware overload never pairs a trailing high surrogate with a char outside the range
            int codePoint = codePointAt(cbuf, index, end);
            count += byteCountUTF8(codePoint);
            index += charCount(codePoint);
        }
        return count;
    }

    /**
     * Returns the number of bytes used to encode the given code point in UTF-8.
     *
     * @param codePoint the code point to measure
     * @return 1, 2, 3 or 4
     * @throws IOException if the value does not fit in 21 bits (this includes negative values)
     */
    public static int byteCountUTF8(int codePoint) throws IOException {
        // (value | mask) == mask holds exactly when value has no bits set outside the mask
        if ((codePoint | 0x7f) == 0x7f) {
            return 1;
        }
        else if ((codePoint | 0x07ff) == 0x07ff) {
            return 2;
        }
        else if ((codePoint | 0xffff) == 0xffff) {
            return 3;
        }
        else if ((codePoint | 0x1fffff) == 0x1fffff) {
            return 4;
        }
        else {
            throw new IOException("Invalid UTF-8 code point. UTF-8 code point cannot span for more than 4 bytes.");
        }
    }

    /**
     * Extracts the payload bits carried by the first byte of a UTF-8 sequence.
     *
     * @param remainingWidth the number of continuation bytes that follow the first byte (0 to 3)
     * @param encodedByte    the first byte of the sequence
     * @return the payload bits of {@code encodedByte}
     * @throws IOException if {@code remainingWidth} is not in the range 0 to 3
     */
    public static int initialDecodeUTF8(int remainingWidth, int encodedByte) throws IOException {
        switch (remainingWidth) {
        case 0:
            return encodedByte & 0x7f;
        case 1:
            return encodedByte & 0x1f;
        case 2:
            return encodedByte & 0x0f;
        case 3:
            return encodedByte & 0x07;
        default:
            throw new IOException("Invalid UTF-8 byte sequence. UTF-8 char cannot span for more than 4 bytes.");
        }
    }

    /**
     * Folds one more byte of a UTF-8 sequence into the value decoded so far.
     *
     * @param decodedBytes   the value accumulated from the bytes already decoded
     * @param remainingWidth 1 to 3 to append the low six bits of {@code encodedByte}; 0 to leave the value unchanged
     * @param encodedByte    the byte to fold in
     * @return the updated value
     * @throws IOException if {@code remainingWidth} is not in the range 0 to 3
     */
    public static int remainingDecodeUTF8(int decodedBytes, int remainingWidth, int encodedByte) throws IOException {
        switch (remainingWidth) {
        case 3:
        case 2:
        case 1:
            return (decodedBytes << 6) | (encodedByte & 0x3f);
        case 0:
            return decodedBytes;
        default:
            throw new IOException("Invalid UTF-8 byte sequence. UTF-8 char cannot span for more than 4 bytes.");
        }
    }

    /**
     * Derives the number of bytes that follow a leading byte from its run of high-order one bits.
     * <p>
     * Only bits 7 to 1 are inspected. A byte with bit 7 clear yields 0. Note that a byte of the form
     * {@code 10xxxxxx} (a continuation byte) also yields 0, and the legacy leaders {@code 111110xx} and
     * {@code 1111110x} yield 4 and 5, so callers have to reject those cases themselves
     * ({@link #initialDecodeUTF8(int, int)} throws for widths above 3).
     *
     * @param leadingByte the first byte of a sequence
     * @return the number of following bytes implied by the bit pattern (0 to 5)
     * @throws IllegalStateException if bits 7 to 1 are all set
     */
    public static int remainingBytesUTF8(int leadingByte) {
        if ((leadingByte & 0x80) == 0) {
            return 0;
        }
        // bit 7 is known to be set; look for the first clear bit below it
        for (int i = 1; i < 7; i++) {
            int bitMask = 1 << (7 - i);
            if ((leadingByte & bitMask) == 0) {
                return i - 1;
            }
        }
        throw new IllegalStateException(format("Invalid UTF-8 sequence leader byte: 0x%02x", leadingByte));
    }

    /**
     * Checks that the array consists of structurally well-formed UTF-8 sequences: every leading byte announces
     * at most three continuation bytes and is followed by exactly that many bytes of the form {@code 10xxxxxx}.
     * Overlong encodings, surrogates and values above U+10FFFF are not detected by this check.
     *
     * @param input the bytes to check
     * @return {@code true} if every sequence is complete and well-formed; {@code false} otherwise, including when
     *         the last sequence is truncated
     */
    public static boolean validBytesUTF8(byte[] input) {
        int index = 0;
        while (index < input.length) {
            int remaining = continuationCount(input[index++]);
            // reject bad leaders and sequences that would run past the end of the array
            if (remaining < 0 || remaining > input.length - index) {
                return false;
            }
            for (; remaining > 0; remaining--) {
                if ((input[index++] & 0xc0) != 0x80) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Validates the UTF-8 bytes in {@code [offset, offset + length)} of the buffer using absolute reads.
     * Rejects stray continuation bytes, leading bytes above 0xF4, overlong encodings, encoded UTF-16 surrogates
     * and values above U+10FFFF. Every rejection is reported to {@code errorHandler} before returning.
     *
     * @param buffer       the buffer holding the bytes
     * @param offset       absolute index of the first byte to validate
     * @param length       the number of bytes to validate
     * @param errorHandler receives a description of the first problem found
     * @return {@link #INVALID_UTF8} if the bytes are invalid; 0 if they are valid and end on a character boundary;
     *         otherwise the number of bytes of the incomplete character at the end of the range
     */
    public static int validateUTF8(ByteBuffer buffer, int offset, int length, ErrorHandler errorHandler) {
        for (int index = 0; index < length; index++) {
            byte leadingByte = buffer.get(offset + index);
            if ((leadingByte & 0x80) == 0) {
                continue; // single-byte character
            }
            if ((leadingByte & 0xff) > 0xf4) {
                errorHandler.handleError(format("Invalid leading byte: %x", leadingByte));
                return INVALID_UTF8;
            }

            final int expectedLen;
            int codePoint;
            if ((leadingByte & 0xE0) == 0xC0) {
                expectedLen = 2;
                codePoint = leadingByte & 0x1F;
                if (codePoint < 2) {
                    // 0xC0 and 0xC1 can only encode values that fit in a single byte.
                    // Only quote the following byte when it lies inside the validated range.
                    if (index + 1 < length) {
                        errorHandler.handleError(format("Overlong encoding: %x%x", leadingByte,
                                buffer.get(offset + index + 1)));
                    } else {
                        errorHandler.handleError(format("Overlong encoding: %x", leadingByte));
                    }
                    return INVALID_UTF8;
                }
            } else if ((leadingByte & 0xF0) == 0xE0) {
                expectedLen = 3;
                codePoint = leadingByte & 0x0F;
            } else if ((leadingByte & 0xF8) == 0xF0) {
                expectedLen = 4;
                codePoint = leadingByte & 0x07;
            } else {
                errorHandler.handleError(format("Value exceeds Unicode limit: %x", leadingByte));
                return INVALID_UTF8;
            }

            int characterStartIndex = index;
            int remainingLen = expectedLen;
            while (--remainingLen > 0) {
                if (++index >= length) {
                    // incomplete character at end
                    return length - characterStartIndex;
                }
                byte nextByte = buffer.get(offset + index);
                if ((nextByte & 0xC0) != 0x80) {
                    errorHandler.handleError(format("Invalid continuation byte: %x", nextByte));
                    return INVALID_UTF8;
                }
                codePoint = (codePoint << 6) | (nextByte & 0x3F);
                if (codePoint > 0x10FFFF) { // maximum Unicode code point
                    errorHandler.handleError(format("Value exceeds Unicode limit: %x", codePoint));
                    return INVALID_UTF8;
                }
            }

            try {
                if (expectedLen > byteCountUTF8(codePoint)) {
                    errorHandler.handleError(format("Overlong encoding starting at byte %x postion %d", leadingByte,
                            characterStartIndex));
                    return INVALID_UTF8;
                }
            } catch (IOException e) {
                errorHandler.handleError(e.getMessage());
                return INVALID_UTF8;
            }

            // UTF-16 surrogates are not characters and must not appear in UTF-8
            if (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE) {
                errorHandler.handleError(format("Invalid code point (UTF-16 surrogate): %x", codePoint));
                return INVALID_UTF8;
            }
        }
        return 0;
    }

    /**
     * Checks that the bytes in {@code [offset, limit)} of the buffer consist of structurally well-formed UTF-8
     * sequences: every leading byte announces at most three continuation bytes and is followed by exactly that
     * many bytes of the form {@code 10xxxxxx}. Bytes are read with absolute gets. Overlong encodings, surrogates
     * and values above U+10FFFF are not detected by this check.
     *
     * @param buf    the buffer holding the bytes
     * @param offset absolute index of the first byte to check
     * @param limit  absolute index one past the last byte to check
     * @return {@code true} if every sequence is complete and well-formed; {@code false} otherwise, including when
     *         the last sequence is truncated at {@code limit}
     */
    public static boolean validBytesUTF8(ByteBuffer buf, int offset, int limit) {
        int index = offset;
        while (index < limit) {
            int remaining = continuationCount(buf.get(index++));
            // reject bad leaders and sequences that would run past the caller's limit
            if (remaining < 0 || remaining > limit - index) {
                return false;
            }
            for (; remaining > 0; remaining--) {
                if ((buf.get(index++) & 0xc0) != 0x80) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Custom UTF-8 encoding. Generates UTF-8 byte sequence for the specified char[]. The UTF-8 byte sequence is
     * written into the specified ByteBuffer using absolute puts.
     *
     * @param srcBuf     the source char[] to be encoded as UTF-8 byte sequence
     * @param srcOffset  offset in the char[] from where the conversion to UTF-8 should begin
     * @param srcLength  the number of chars to be encoded as UTF-8 bytes
     * @param dest       the destination ByteBuffer
     * @param destOffset offset in the ByteBuffer starting where the encoded UTF-8 bytes should be copied
     * @return the number of bytes encoded
     * @throws IllegalStateException if the range contains a surrogate that is not part of a well-formed
     *                               high/low pair lying entirely inside the range
     */
    public static int charstoUTF8Bytes(char[] srcBuf, int srcOffset, int srcLength, ByteBuffer dest, int destOffset) {
        // srcLength is a count, so the exclusive end index must include the offset
        final int srcEnd = srcOffset + srcLength;
        int destIndex = destOffset;

        for (int i = srcOffset; i < srcEnd; i++) {
            char ch = srcBuf[i];

            if (ch < 0x0080) {
                dest.put(destIndex++, (byte) ch);
            }
            else if (ch < 0x0800) {
                dest.put(destIndex++, (byte) (0xc0 | (ch >> 6)));
                dest.put(destIndex++, (byte) (0x80 | (ch & 0x3f)));
            }
            else if (ch < Character.MIN_SURROGATE || ch > Character.MAX_SURROGATE) {
                dest.put(destIndex++, (byte) (0xe0 | (ch >> 12)));
                dest.put(destIndex++, (byte) (0x80 | ((ch >> 6) & 0x3F)));
                dest.put(destIndex++, (byte) (0x80 | (ch & 0x3F)));
            }
            else {
                // Surrogate pair: needs a high surrogate followed, inside the range, by a low surrogate
                if (!Character.isHighSurrogate(ch) || i + 1 >= srcEnd || !Character.isLowSurrogate(srcBuf[i + 1])) {
                    // cast to int: %d cannot format a Character argument
                    throw new IllegalStateException(format(MSG_INVALID_CODEPOINT, (int) ch));
                }
                int codePoint = Character.toCodePoint(ch, srcBuf[++i]);
                dest.put(destIndex++, (byte) (0xf0 | (codePoint >> 18)));
                dest.put(destIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3F)));
                dest.put(destIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3F)));
                dest.put(destIndex++, (byte) (0x80 | (codePoint & 0x3F)));
            }
        }
        return destIndex - destOffset;
    }

    /**
     * Classifies a leading byte for the structural checks.
     *
     * @return the number of continuation bytes the leader announces (0 to 3), or -1 if the byte is a
     *         continuation byte or announces a sequence longer than four bytes
     */
    private static int continuationCount(byte leadingByte) {
        if ((leadingByte & 0x80) == 0x00) {
            return 0;
        }
        if ((leadingByte & 0xe0) == 0xc0) {
            return 1;
        }
        if ((leadingByte & 0xf0) == 0xe0) {
            return 2;
        }
        if ((leadingByte & 0xf8) == 0xf0) {
            return 3;
        }
        return -1;
    }
}