/**
* Copyright (C) 2011 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package com.cloudhopper.commons.charset;
import org.apache.log4j.Logger;
/**
 * UTF-8 charset that passes the charset name {@code "UTF8"} to
 * {@link JavaCharset} and adds UTF-8 specific length estimates plus an
 * allocation-free byte-length calculation.
 *
 * @author joelauer
 */
public class UTF8Charset extends JavaCharset {

    private static final Logger logger = Logger.getLogger(UTF8Charset.class);

    /**
     * Creates the charset, handing the name {@code "UTF8"} to the superclass.
     */
    public UTF8Charset() {
        super("UTF8");
    }

    /**
     * Estimates how many bytes are needed to encode the given characters.
     *
     * @param str0 The characters to be encoded (may be null)
     * @return 0 if {@code str0} is null, otherwise twice its length
     */
    @Override
    public int estimateEncodeByteLength(CharSequence str0) {
        if (str0 == null) {
            return 0;
        }
        // let's double the estimate
        // NOTE(review): chars above U+07FF need 3 bytes each in UTF-8, so this
        // can under-estimate; presumably callers treat the value as a sizing
        // hint rather than a hard upper bound -- confirm against JavaCharset.
        return str0.length() * 2;
    }

    /**
     * Estimates how many chars decoding the given bytes will produce.
     *
     * @param bytes The bytes to be decoded (may be null)
     * @return 0 if {@code bytes} is null, otherwise the array length
     */
    @Override
    public int estimateDecodeCharLength(byte[] bytes) {
        if (bytes == null) {
            return 0;
        }
        // best guess would be 8-bit chars (one char per byte)
        return bytes.length;
    }

    /**
     * Calculates the number of bytes a String occupies when encoded as UTF-8,
     * without allocating a byte array. The original author reported this to be
     * roughly 90% faster than something like {@code s.getBytes("UTF8").length}.
     * <p>
     * Byte counts per UTF-16 code unit:
     * <ul>
     * <li>U+0000 to U+007F: 1 byte</li>
     * <li>U+0080 to U+07FF: 2 bytes</li>
     * <li>any other non-surrogate char: 3 bytes</li>
     * <li>a high surrogate immediately followed by a low surrogate (one
     *     supplementary code point): 4 bytes for the pair</li>
     * <li>an unpaired surrogate: 1 byte, because the JDK encoder substitutes
     *     its single-byte replacement {@code '?'} for malformed input</li>
     * </ul>
     *
     * @param s The String to calculate the UTF-8 byte length from (may be null)
     * @return The number of bytes required to represent the String, or 0 if
     *      {@code s} is null
     * @see <a href="http://mail-archives.apache.org/mod_mbox/incubator-thrift-commits/201004.mbox/%3C20100425152011.3DFA123888FE@eris.apache.org%3E">Thrift commit that inspired this method</a>
     */
    public static int calculateByteLength(final String s) {
        if (s == null) {
            return 0;
        }
        final int length = s.length();
        int byteLength = 0;
        for (int i = 0; i < length; i++) {
            final char c = s.charAt(i);
            if (c <= 0x007F) {
                byteLength++;
            } else if (c <= 0x07FF) {
                byteLength += 2;
            } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
                // ordinary BMP character above U+07FF
                byteLength += 3;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(s.charAt(i + 1))) {
                // valid surrogate pair: one supplementary code point, encoded
                // as a single 4-byte sequence (not 3 bytes per half)
                byteLength += 4;
                i++; // the low surrogate has been accounted for
            } else {
                // unpaired surrogate: not encodable, replaced by '?' (1 byte)
                byteLength++;
            }
        }
        return byteLength;
    }

    /*
     * DISABLED REFERENCE CODE -- THESE METHODS MAY ALL PROVE TO BE FASTER THAN
     * THE DEFAULT JVM IMPL
     * http://mail-archives.apache.org/mod_mbox/incubator-thrift-commits/201004.mbox/%3C20100425152011.3DFA123888FE@eris.apache.org%3E
     *
     * tested this out -- definitely cuts down conversions by 50%
     *
     * Known limitations if this is ever re-enabled: encode() writes each
     * surrogate half as its own 3-byte sequence instead of one 4-byte
     * sequence per pair, and decode() has no branch for 4-byte lead bytes
     * (c >> 4 == 15) or stray continuation bytes.

    public static byte[] encode(String s) {
        byte[] buf = new byte[calculateByteLength(s)];
        encode(s, buf, 0);
        return buf;
    }

    public static void encode(String s, byte[] buf, int offset) {
        int nextByte = 0;
        int c;
        for (int i = 0; i < s.length(); i++) {
            c = s.charAt(i);
            if (c <= 0x007F) {
                buf[offset + nextByte] = (byte)c;
                nextByte++;
            } else if (c > 0x07FF) {
                buf[offset + nextByte ] = (byte)(0xE0 | c >> 12 & 0x0F);
                buf[offset + nextByte + 1] = (byte)(0x80 | c >> 6 & 0x3F);
                buf[offset + nextByte + 2] = (byte)(0x80 | c & 0x3F);
                nextByte+=3;
            } else {
                buf[offset + nextByte ] = (byte)(0xC0 | c >> 6 & 0x1F);
                buf[offset + nextByte + 1] = (byte)(0x80 | c & 0x3F);
                nextByte+=2;
            }
        }
    }

    public static String decode(byte[] buf) {
        return decode(buf, 0, buf.length);
    }

    public static String decode(byte[] buf, int offset, int byteLength) {
        int charCount = 0;
        char[] chars = new char[byteLength];
        int c;
        int byteIndex = offset;
        int charIndex = 0;
        while (byteIndex < offset + byteLength) {
            c = buf[byteIndex++] & 0xFF;
            switch (c >> 4) {
                case 0:
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                    chars[charIndex++] = (char) c;
                    break;
                case 12:
                case 13:
                    chars[charIndex++] = (char) ((c & 0x1F) << 6 | (buf[byteIndex++] & 0x3F));
                    break;
                case 14:
                    chars[charIndex++] = (char) ((c & 0x0F) << 12 | (buf[byteIndex++] & 0x3F) << 6 | (buf[byteIndex++] & 0x3F) << 0);
                    break;
            }
            charCount++;
        }
        return new String(chars, 0, charCount);
    }
    */
}