/**
* Copyright (C) 2009-2013 FoundationDB, LLC
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.foundationdb.server.rowdata.encoding;
import com.foundationdb.server.error.AkibanInternalException;
import com.foundationdb.server.rowdata.FieldDef;
import java.io.UnsupportedEncodingException;
/**
 * Variable-width encoding that reports a value's width as the field's prefix
 * size plus the number of bytes its text occupies in standard UTF-8
 * (one to four bytes per code point).
 *
 * <p>The class has a private constructor and is reached through {@link #INSTANCE}.
 */
public class UTF8Encoder extends VariableWidthEncoding {

    public static final Encoding INSTANCE = new UTF8Encoder();

    // Largest code point that fits in a UTF-8 sequence of the given length.
    // See https://tools.ietf.org/html/rfc3629
    private static final int MAX_1_BYTE = 0x007F;
    private static final int MAX_2_BYTE = 0x07FF;
    private static final int MAX_3_BYTE = 0xFFFF;
    private static final int MAX_4_BYTE = 0x10FFFF;

    private UTF8Encoder() {
    }

    /**
     * Computes the width of {@code value} for the given field.
     *
     * <p>A {@code byte[]} value is decoded as UTF-8 text first; any other
     * non-null value is measured through its {@code toString()} form.
     *
     * @param fieldDef field whose prefix size is the base of the result
     * @param value    value to measure; may be {@code null}
     * @return the field's prefix size, plus the UTF-8 byte count of the
     *         value's text when {@code value} is not {@code null}
     * @throws AkibanInternalException if the runtime reports "UTF-8" as an
     *         unsupported charset while decoding a {@code byte[]} value
     */
    @Override
    public int widthFromObject(final FieldDef fieldDef, final Object value) {
        int size = fieldDef.getPrefixSize();
        if (value == null) {
            return size;
        }

        final String str;
        if (value instanceof byte[]) {
            try {
                str = new String((byte[]) value, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new AkibanInternalException("while decoding binary", e);
            }
        } else {
            str = value.toString();
        }

        // Assumes consumers want standard UTF8 (e.g. String, nio.charset), not modified.
        // Iterate by code point rather than by char: codePointAt only yields a
        // supplementary code point when a high surrogate is immediately followed
        // by a low surrogate, and charCount then steps over both chars. An
        // unpaired surrogate is returned as itself and lands in the 3-byte range.
        // NOTE(review): confirm 3 bytes is the intended width for unpaired
        // surrogates; it depends on what the code writing the bytes emits.
        int index = 0;
        while (index < str.length()) {
            final int codePoint = str.codePointAt(index);
            size += utf8SequenceLength(codePoint);
            index += Character.charCount(codePoint);
        }
        return size;
    }

    /** Returns how many bytes standard UTF-8 uses for a single code point. */
    private static int utf8SequenceLength(final int codePoint) {
        if (codePoint <= MAX_1_BYTE) {
            return 1;
        }
        if (codePoint <= MAX_2_BYTE) {
            return 2;
        }
        if (codePoint <= MAX_3_BYTE) {
            return 3;
        }
        assert codePoint <= MAX_4_BYTE : "Illegal code point: " + codePoint;
        return 4;
    }
}