package org.apache.cassandra.db.marshal;
/*
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * 
 */


import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;

import com.google.common.base.Charsets;
import org.apache.cassandra.utils.ByteBufferUtil;

/**
 * Marshalling type for UTF-8 encoded strings.
 *
 * Values are compared via {@link BytesType#bytesCompare}, and validated with a
 * small byte-level state machine ({@link UTF8Validator}) that never materializes
 * a java String.
 */
public class UTF8Type extends AbstractType<String>
{
    public static final UTF8Type instance = new UTF8Type();

    UTF8Type() {} // singleton

    /**
     * Decodes the buffer into a String; identical to {@link #getString(ByteBuffer)}.
     *
     * @throws MarshalException if the bytes cannot be decoded
     */
    public String compose(ByteBuffer bytes)
    {
        return getString(bytes);
    }

    /**
     * Encodes the given string as UTF-8 bytes.
     */
    public ByteBuffer decompose(String value)
    {
        return ByteBufferUtil.bytes(value, Charsets.UTF_8);
    }

    /**
     * Compares two serialized values by delegating to {@link BytesType#bytesCompare}.
     */
    public int compare(ByteBuffer o1, ByteBuffer o2)
    {
        return BytesType.bytesCompare(o1, o2);
    }

    /**
     * Decodes the buffer into a String.
     *
     * @throws MarshalException if decoding reports a character coding error; the
     *         message carries the offending bytes in hex
     */
    public String getString(ByteBuffer bytes)
    {
        try
        {
            return ByteBufferUtil.string(bytes);
        }
        catch (CharacterCodingException e)
        {
            throw new MarshalException("invalid UTF8 bytes " + ByteBufferUtil.bytesToHex(bytes));
        }
    }

    /**
     * Converts a String to its serialized form; identical to {@link #decompose(String)}.
     */
    public ByteBuffer fromString(String source)
    {
        return decompose(source);
    }

    /**
     * Checks that the remaining bytes of the buffer form a well-formed byte sequence
     * according to {@link UTF8Validator}.
     *
     * @throws MarshalException if the bytes do not validate
     */
    public void validate(ByteBuffer bytes) throws MarshalException
    {
        // validate a slice so that the caller's position is left untouched.
        if (!UTF8Validator.validate(bytes.slice()))
            throw new MarshalException("String didn't validate.");
    }

    /**
     * Byte-at-a-time UTF-8 well-formedness checker.
     *
     * Each state names the byte range expected next. The only deliberate deviation
     * from strict UTF-8 handled here is the modified-UTF8 null (0xc0 0x80), which is
     * accepted.
     */
    static class UTF8Validator
    {
        enum State
        {
            START,          // expecting the first byte of a character
            TWO,            // expecting 0x80-0xbf, last byte of a 2-byte sequence
            TWO_80,         // expecting exactly 0x80 (after 0xc0: modified utf8 null)
            THREE_a0bf,     // expecting 0xa0-0xbf (after 0xe0), then one more continuation byte
            THREE_80bf_1,   // expecting 0x80-0xbf, then back to START
            THREE_80bf_2,   // expecting 0x80-0xbf twice
            FOUR_90bf,      // expecting 0x90-0xbf (after 0xf0), then two continuation bytes
            FOUR_80bf_3,    // expecting 0x80-0xbf three times (after 0xf1-0xf3)
            FOUR_808f,      // expecting 0x80-0x8f (after 0xf4), then two continuation bytes
        };

        /** True if b (a sign-extended byte) is a continuation byte, i.e. 0x80-0xbf. */
        private static boolean isContinuation(int b)
        {
            return (b & 0xc0) == 0x80;
        }

        /**
         * Consumes all remaining bytes of buf and reports whether they are well formed.
         *
         * Since we're not converting to java strings, we don't need to worry about
         * converting to surrogates. Note that 3-byte sequences starting with 0xed are
         * not given special treatment, so encoded surrogates are not rejected here.
         *
         * @param buf bytes to check; buf has already been sliced/duplicated by the
         *            caller because its position is advanced by this method
         * @return true if every sequence is complete and within the expected ranges
         */
        static boolean validate(ByteBuffer buf)
        {
            State state = State.START;
            while (buf.remaining() > 0)
            {
                // sign-extended: ascii is 0..127, every multi-byte/continuation byte is negative.
                int b = buf.get();
                switch (state)
                {
                    case START:
                        if (b >= 0)
                        {
                            // ascii, state stays start.
                        }
                        else if ((b >> 5) == -2)
                        {
                            // first byte of 2-byte char: 0xc0 (special case below) or 0xc2-0xdf
                            if (b == (byte) 0xc0)
                                // special case: modified utf8 null is 0xc080.
                                state = State.TWO_80;
                            else if ((b & 0x1e) == 0)
                                // 0xc1 could only start an overlong encoding.
                                return false;
                            else
                                state = State.TWO;
                        }
                        else if ((b >> 4) == -2)
                        {
                            // 3 bytes. first byte will be 0xe0 or 0xe1-0xef. handling of second byte will differ.
                            // so 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf.
                            if (b == (byte) 0xe0)
                                state = State.THREE_a0bf;
                            else
                                state = State.THREE_80bf_2;
                        }
                        else if ((b >> 3) == -2)
                        {
                            // 4 bytes. this is where the fun starts.
                            if (b == (byte) 0xf0)
                                // 0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf
                                state = State.FOUR_90bf;
                            else if (b == (byte) 0xf4)
                                // 0xf4, 0x80-0x8f, 0x80-0xbf, 0x80-0xbf (nothing above U+10FFFF)
                                state = State.FOUR_808f;
                            else if (b > (byte) 0xf4)
                                // 0xf5-0xf7 would encode code points above U+10FFFF.
                                return false;
                            else
                                // 0xf1-0xf3, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf
                                state = State.FOUR_80bf_3;
                        }
                        else
                            return false; // malformed: stray continuation byte or 0xf8-0xff.
                        break;
                    case TWO:
                        // validate second byte of 2-byte char, 0x80-0xbf
                        if (!isContinuation(b))
                            return false;
                        state = State.START;
                        break;
                    case TWO_80:
                        if (b != (byte) 0x80)
                            return false;
                        state = State.START;
                        break;
                    case THREE_a0bf:
                        // expecting 0xa0-0xbf; anything lower would be an overlong encoding.
                        if ((b & 0xe0) != 0xa0)
                            return false;
                        state = State.THREE_80bf_1;
                        break;
                    case THREE_80bf_1:
                        // expecting 0x80-0xbf
                        if (!isContinuation(b))
                            return false;
                        state = State.START;
                        break;
                    case THREE_80bf_2:
                        // expecting 0x80-bf and then another of the same.
                        if (!isContinuation(b))
                            return false;
                        state = State.THREE_80bf_1;
                        break;
                    case FOUR_90bf:
                        // expecting 0x90-bf. 2nd byte of 4byte sequence. after that it should degrade to 80-bf,80-bf (like 3byte seq).
                        // must be a continuation byte that is not 0x80-0x8f.
                        if (!isContinuation(b) || (b & 0x30) == 0)
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    case FOUR_80bf_3:
                        // expecting 0x80-bf 3 times. degenerates to THREE_80bf_2.
                        if (!isContinuation(b))
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    case FOUR_808f:
                        // expecting 0x80-0x8f, then 80-bf,80-bf.
                        if ((b & 0xf0) != 0x80)
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    default:
                        return false; // invalid state.
                }
            }
            // if state != start, we've got underflow. that's an error.
            return state == State.START;
        }
    }
}