package com.cloudhopper.commons.charset;
/*
* #%L
* ch-commons-charset
* %%
* Copyright (C) 2012 Cloudhopper by Twitter
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.cloudhopper.commons.charset.CharSequenceAccessor.CharArrayWrapper;
/**
 * <p>Charset representing "Modified UTF-8". Java originally used 2-byte char
 * primitives to store characters in its Strings. These were originally encoded
 * in UCS2 -- which let Java natively support ~65K characters in Unicode. As of
 * Java 5, UCS2 is no longer used -- UTF-16 is used instead. This lets Java
 * natively support the entire range of Unicode characters, which can be &gt; 65K.
 * For supplementary characters (code points above 0xFFFF, which Java stores as
 * a surrogate pair of two chars in the range 0xD800-0xDFFF), this charset does
 * NOT encode these values to the correct UTF-8 byte sequence: each surrogate
 * char is encoded on its own as 3 bytes rather than the pair being encoded as
 * a single 4-byte sequence.</p>
 *
 * <p>It's usually quite uncommon in most situations to actually use a
 * supplementary character. This is why this charset exists -- it takes
 * advantage of this property to speed up UTF-8 encoding/decoding of byte
 * arrays. If you decide to solely use this charset for serialization, you also
 * don't risk any issues with encoding/decoding, since the resulting Java String
 * will always be the same as if you actually used UTF-8.</p>
 *
 * <p>This charset turns out to be very useful for directly encoding/decoding from
 * byte arrays (especially if the byte array is already allocated), where the
 * default Java classes would force you to create a new byte array. It also
 * is ~30% faster than Java at decoding/encoding in most cases. In some cases
 * it's a little slower. On average though it usually matches Java and has
 * a good chance at being much faster during decoding.</p>
 *
 * <p>This charset is originally based on much of the work in DataOutputStream.java
 * and DataInputStream.java with a few notable tweaks:</p>
 * <ul>
 * <li>0x0000 is encoded as 0x00 rather than DataOutputStream's 2-byte version.
 * This is identical to how UTF-8 is actually supposed to work.</li>
 * <li>CharSequenceAccessor utility class uses reflection to access the
 * private fields in String.java to reduce copying of char arrays.</li>
 * <li>CharSequenceAccessor utility class uses reflection to access the
 * private fields in String.java to directly access the underlying
 * char array vs. ever calling charAt.</li>
 * </ul>
 *
 * @author joelauer (twitter: @jjlauer or <a href="http://twitter.com/jjlauer" target="window">http://twitter.com/jjlauer</a>)
 */
public class ModifiedUTF8Charset extends BaseCharset {

    /**
     * Creates a new charset instance. This class declares no fields of its own.
     */
    public ModifiedUTF8Charset() {
        // do nothing
    }

    /**
     * Returns the number of bytes needed to encode the character sequence.
     * Delegates to {@link #calculateByteLength(CharSequence)}, so the value is
     * the exact encoded length rather than a rough upper bound.
     * @param str0 The character sequence to measure; null yields 0
     * @return The number of modified UTF-8 bytes required
     */
    @Override
    public int estimateEncodeByteLength(CharSequence str0) {
        return calculateByteLength(str0);
    }

    /**
     * Returns an upper bound on the number of chars produced by decoding the
     * byte array. Every decoded char consumes at least one byte, so the byte
     * count is always sufficient.
     * @param bytes The bytes that would be decoded; null yields 0
     * @return The length of the byte array, or 0 if it is null
     */
    @Override
    public int estimateDecodeCharLength(byte[] bytes) {
        if (bytes == null) {
            return 0;
        }
        return bytes.length; // decoding will never be more than 1 char = 1 byte
    }

    /**
     * Encodes the character sequence into a newly allocated, exactly sized
     * byte array of modified UTF-8 bytes.
     * @param charSeq The character sequence to encode
     * @return The encoded bytes, or null if charSeq is null
     */
    @Override
    public byte[] encode(CharSequence charSeq) {
        if (charSeq == null) {
            return null;
        }
        CharArrayWrapper wrapper = CharSequenceAccessor.access(charSeq);
        if (wrapper != null) {
            // use more efficient direct access to char array using the wrapper
            int utf8len = calculateByteLength(null, wrapper.value, wrapper.offset, wrapper.length);
            byte[] buf = new byte[utf8len];
            encodeToByteArray(null, wrapper.value, wrapper.offset, wrapper.length, buf, 0);
            return buf;
        }
        // encode based on charAt() method (slightly less efficient)
        int utf8len = calculateByteLength(charSeq, null, 0, 0);
        byte[] buf = new byte[utf8len];
        encodeToByteArray(charSeq, null, 0, 0, buf, 0);
        return buf;
    }

    /**
     * Decodes the bytes and appends the resulting chars to the buffer.
     * @param bytes The modified UTF-8 bytes to decode; null is a no-op
     * @param buffer The buffer to append the decoded chars to
     * @throws IllegalArgumentException if the bytes are malformed
     */
    @Override
    public void decode(byte[] bytes, StringBuilder buffer) {
        if (bytes == null) {
            return;
        }
        // expand buffer as necessary to support all possible UTF-8 bytes
        buffer.ensureCapacity(buffer.length() + bytes.length);
        CharArrayWrapper wrapper = CharSequenceAccessor.access(buffer);
        if (wrapper == null) {
            // encode() treats the accessor result as nullable, so do the same
            // here instead of failing with a NullPointerException: decode into
            // a scratch array and append it through the public API
            char[] scratch = new char[bytes.length];
            int decoded = decodeToCharArray(bytes, 0, bytes.length, scratch, 0);
            buffer.append(scratch, 0, decoded);
            return;
        }
        // since we want to mimic an "append", the "length" of the existing char
        // array represents how much data is currently contained inside it
        // we'll start our "append" at that offset
        int charLength = decodeToCharArray(bytes, 0, bytes.length, wrapper.value, wrapper.length);
        // the "wrapper" is merely prepped for reading
        // we'll need to do a final update of its values
        CharSequenceAccessor.updateStringBuilder(buffer, charLength + wrapper.length);
    }

    /**
     * Decodes the entire byte array into a new String.
     * @param bytes The modified UTF-8 bytes to decode
     * @return The decoded String, or null if bytes is null
     * @throws IllegalArgumentException if the bytes are malformed
     */
    @Override
    public String decode(byte[] bytes) {
        if (bytes == null) {
            return null;
        }
        char[] charBuffer = new char[bytes.length];
        int charLength = decodeToCharArray(bytes, 0, bytes.length, charBuffer, 0);
        // this prevents re-duplicating a char[] that we know will never change
        // its performance impact is more pronounced when decoding larger strings
        return CharSequenceAccessor.createOptimizedString(charBuffer, 0, charLength);
    }

    /**
     * Decodes a slice of the byte array into a new String.
     * @param bytes The array holding the modified UTF-8 bytes
     * @param offset The index of the first byte to decode
     * @param length The number of bytes to decode starting at offset
     * @return The decoded String, or null if bytes is null
     * @throws IllegalArgumentException if the bytes are malformed
     */
    public String decode(byte[] bytes, int offset, int length) {
        if (bytes == null) {
            return null;
        }
        // at most one char is produced per byte consumed, so the slice length
        // (not the whole array length) bounds the output; clamp so a negative
        // length still results in an empty string
        char[] charBuffer = new char[Math.max(length, 0)];
        int charLength = decodeToCharArray(bytes, offset, length, charBuffer, 0);
        // this prevents re-duplicating a char[] that we know will never change
        // its performance impact is more pronounced when decoding larger strings
        return CharSequenceAccessor.createOptimizedString(charBuffer, 0, charLength);
    }

    /**
     * Highly efficient method for calculating the byte length of
     * a String if it was encoded as modified UTF-8 bytes. Since no byte array
     * is allocated just for calculating the byte length, this method can speed up
     * checks by 90% vs. something like s.getBytes("UTF8").length. This method
     * is adapted from JDK source code for DataOutputStream.java.
     * @param charSeq The character sequence to use for encoding.
     * @return The number of bytes required to represent the String as modified
     *      UTF-8 encoded bytes (0 if charSeq is null).
     */
    public static int calculateByteLength(CharSequence charSeq) {
        return calculateByteLength(charSeq, null, 0, 0);
    }

    /**
     * Highly efficient method for calculating the byte length of
     * a String if it was encoded as modified UTF-8 bytes. Since no byte array
     * is allocated just for calculating the byte length, this method can speed up
     * checks by 90% vs. something like s.getBytes("UTF8").length. This method
     * is adapted from JDK source code for DataOutputStream.java.
     * @param charSeq The optional character sequence to use for encoding rather
     *      than the provided character buffer. It is always higher performance
     *      to supply a char array vs. use a CharSequence. Set to null if the
     *      character array is supplied. When used, the whole sequence is
     *      measured and charOffset/charLength are ignored.
     * @param charBuffer The source char array to encode
     * @param charOffset The offset in the source char array to start encode from
     * @param charLength The length from the offset in the source char array to encode
     * @return The number of bytes required to represent the String as modified
     *      UTF-8 encoded bytes (0 if both charSeq and charBuffer are null).
     */
    public static int calculateByteLength(CharSequence charSeq, char[] charBuffer, int charOffset, int charLength) {
        int charPos = charOffset;                     // start at char offset
        int charAbsLength = charOffset + charLength;
        if (charBuffer == null) {
            if (charSeq == null) {
                return 0;
            }
            // use charSequence rather than charBuffer: always the full sequence,
            // so the cursor itself (not just the parameter) must restart at 0
            charPos = 0;
            charAbsLength = charSeq.length();
        }
        int byteLength = 0;
        for (; charPos < charAbsLength; charPos++) {
            // optimized method for getting char to encode
            int c = (charBuffer != null) ? charBuffer[charPos] : charSeq.charAt(charPos);
            if (c <= 0x007F) {
                byteLength++;       // 0xxxxxxx
            } else if (c > 0x07FF) {
                byteLength += 3;    // 1110xxxx 10xxxxxx 10xxxxxx
            } else {
                byteLength += 2;    // 110xxxxx 10xxxxxx
            }
        }
        return byteLength;
    }

    /**
     * Encode the string to an array of UTF-8 bytes. The buffer must be pre-allocated
     * and have enough space to hold the encoded string.
     * @param charSeq The optional character sequence to use for encoding rather
     *      than the provided character buffer. It is always higher performance
     *      to supply a char array vs. use a CharSequence. Set to null if the
     *      character array is supplied. When used, the whole sequence is
     *      encoded and charOffset/charLength are ignored.
     * @param charBuffer The source char array to encode
     * @param charOffset The offset in the source char array to start encode from
     * @param charLength The length from the offset in the source char array to encode
     * @param byteBuffer The destination byte array to encode to
     * @param byteOffset The offset in the destination byte array to start encode to
     * @return The number of bytes written to the destination byte array
     * @throws IllegalArgumentException if both charSeq and charBuffer are null
     * @see #calculateByteLength(java.lang.CharSequence)
     */
    public static int encodeToByteArray(CharSequence charSeq, char[] charBuffer, int charOffset, int charLength, byte[] byteBuffer, int byteOffset) {
        int bytePos = byteOffset;                     // start at byte offset
        int charPos = charOffset;                     // start at char offset
        int charAbsLength = charOffset + charLength;
        if (charBuffer == null) {
            if (charSeq == null) {
                throw new IllegalArgumentException("Both charSeq and charBuffer cannot be null");
            }
            // use charSequence rather than charBuffer: always the full sequence,
            // so the cursor itself (not just the parameter) must restart at 0
            charPos = 0;
            charAbsLength = charSeq.length();
        }
        int c;
        // fast path: copy the leading run of ascii chars one byte each
        for (; charPos < charAbsLength; charPos++) {
            c = (charBuffer != null) ? charBuffer[charPos] : charSeq.charAt(charPos);
            if (c > 0x007F) {
                break;
            }
            byteBuffer[bytePos++] = (byte) c;
        }
        // general path: continues from the first non-ascii char (if any)
        for (; charPos < charAbsLength; charPos++) {
            c = (charBuffer != null) ? charBuffer[charPos] : charSeq.charAt(charPos);
            if (c <= 0x007F) {
                byteBuffer[bytePos++] = (byte) c;
            } else if (c > 0x07FF) {
                byteBuffer[bytePos++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                byteBuffer[bytePos++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                byteBuffer[bytePos++] = (byte) (0x80 | (c & 0x3F));
            } else {
                byteBuffer[bytePos++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
                byteBuffer[bytePos++] = (byte) (0x80 | (c & 0x3F));
            }
        }
        return (bytePos - byteOffset);
    }

    /**
     * Decode a range of modified UTF-8 bytes into a pre-allocated char array.
     * Only 1, 2 and 3 byte sequences are accepted; a lead byte of the form
     * 1111xxxx (used by standard UTF-8 for 4 byte sequences) is rejected.
     * @param byteBuffer The source byte array to decode
     * @param byteOffset The offset in the source byte array to start decode from
     * @param byteLength The number of bytes from the offset to decode
     * @param charBuffer The destination char array; it must have room for the
     *      decoded chars (never more than byteLength of them)
     * @param charOffset The offset in the destination char array to start writing to
     * @return The number of chars written to the destination char array
     * @throws IllegalArgumentException if a multi-byte sequence is cut off at
     *      the end of the range, a continuation byte is not of the form
     *      10xxxxxx, or a sequence starts with 10xxxxxx or 1111xxxx
     */
    public static int decodeToCharArray(byte[] byteBuffer, int byteOffset, int byteLength, char[] charBuffer, int charOffset) {
        int c = 0, char2 = 0, char3 = 0;
        int bytePos = byteOffset;
        int byteAbsLength = byteOffset + byteLength;
        int charPos = charOffset;
        // optimization - do simple conversion of ascii-only chars
        while (bytePos < byteAbsLength) {
            c = (int) byteBuffer[bytePos] & 0xff;
            if (c > 127) {
                break;
            }
            bytePos++;
            charBuffer[charPos++] = (char) c;
        }
        while (bytePos < byteAbsLength) {
            c = (int) byteBuffer[bytePos] & 0xff;
            // the high nibble of the lead byte selects the sequence length
            switch (c >> 4) {
                // cases 0000 thru 0111
                case 0:
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                    // 0xxxxxxx
                    bytePos++;
                    charBuffer[charPos++] = (char) c;
                    break;
                // cases 8 thru 11 (10xx xxxx) are continuation bytes, which
                // cannot start a sequence; they fall through to default
                case 12:
                case 13:
                    // 110x xxxx then 10xx xxxx
                    bytePos += 2;
                    if (bytePos > byteAbsLength) {
                        throw new IllegalArgumentException("malformed input: partial character at end");
                    }
                    char2 = (int) byteBuffer[bytePos - 1];
                    if ((char2 & 0xC0) != 0x80) {
                        throw new IllegalArgumentException("malformed input around byte " + bytePos);
                    }
                    charBuffer[charPos++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
                    break;
                case 14:
                    // 1110 xxxx then 10xx xxxx then 10xx xxxx
                    bytePos += 3;
                    if (bytePos > byteAbsLength) {
                        throw new IllegalArgumentException("malformed input: partial character at end");
                    }
                    char2 = (int) byteBuffer[bytePos - 2];
                    char3 = (int) byteBuffer[bytePos - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
                        throw new IllegalArgumentException("malformed input around byte " + (bytePos - 1));
                    }
                    charBuffer[charPos++] = (char) (((c & 0x0F) << 12)
                            | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
                    break;
                default:
                    // 10xx xxxx, 1111 xxxx
                    throw new IllegalArgumentException("malformed input around byte " + bytePos);
            }
        }
        return (charPos - charOffset);
    }
}