/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.serializers; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; public class UTF8Serializer extends AbstractTextSerializer { public static final UTF8Serializer instance = new UTF8Serializer(); private UTF8Serializer() { super(StandardCharsets.UTF_8); } public void validate(ByteBuffer bytes) throws MarshalException { if (!UTF8Validator.validate(bytes)) throw new MarshalException("String didn't validate."); } static class UTF8Validator { enum State { START, TWO, TWO_80, THREE_a0bf, THREE_80bf_1, THREE_80bf_2, FOUR_90bf, FOUR_80bf_3, }; // since we're not converting to java strings, we don't need to worry about converting to surrogates. // buf has already been sliced/duplicated. static boolean validate(ByteBuffer buf) { if (buf == null) return false; buf = buf.slice(); int b = 0; State state = State.START; while (buf.remaining() > 0) { b = buf.get(); switch (state) { case START: if (b >= 0) { // ascii, state stays start. if (b > 127) return false; } else if ((b >> 5) == -2) { // validate first byte of 2-byte char, 0xc2-0xdf if (b == (byte) 0xc0) // special case: modified utf8 null is 0xc080. state = State.TWO_80; else if ((b & 0x1e) == 0) return false; else state = State.TWO; } else if ((b >> 4) == -2) { // 3 bytes. first byte will be 0xe0 or 0xe1-0xef. handling of second byte will differ. // so 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf. if (b == (byte)0xe0) state = State.THREE_a0bf; else state = State.THREE_80bf_2; break; } else if ((b >> 3) == -2) { // 4 bytes. this is where the fun starts. if (b == (byte)0xf0) // 0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf state = State.FOUR_90bf; else // 0xf4, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf // 0xf1-0xf3, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf state = State.FOUR_80bf_3; break; } else return false; // malformed. break; case TWO: // validate second byte of 2-byte char, 0x80-0xbf if ((b & 0xc0) != 0x80) return false; state = State.START; break; case TWO_80: if (b != (byte)0x80) return false; state = State.START; break; case THREE_a0bf: if ((b & 0xe0) == 0x80) return false; state = State.THREE_80bf_1; break; case THREE_80bf_1: // expecting 0x80-0xbf if ((b & 0xc0) != 0x80) return false; state = State.START; break; case THREE_80bf_2: // expecting 0x80-bf and then another of the same. if ((b & 0xc0) != 0x80) return false; state = State.THREE_80bf_1; break; case FOUR_90bf: // expecting 0x90-bf. 2nd byte of 4byte sequence. after that it should degrade to 80-bf,80-bf (like 3byte seq). if ((b & 0x30) == 0) return false; state = State.THREE_80bf_2; break; case FOUR_80bf_3: // expecting 0x80-bf 3 times. degenerates to THREE_80bf_2. if ((b & 0xc0) != 0x80) return false; state = State.THREE_80bf_2; break; default: return false; // invalid state. } } // if state != start, we've got underflow. that's an error. return state == State.START; } } }