/****************************************************************************** * * Copyright 2011-2012 Tavendo GmbH * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Implements the algorithm "Flexible and Economical UTF-8 Decoder" by * Bjoern Hoehrmann (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). * ******************************************************************************/ package org.magnum.soda.transport.wamp; /** * Incremental UTF-8 validator. The validator runs with constant memory * consumption (minimal state). Purpose is to validate UTF-8, not to * decode (which could be done easily also, but we rely on Java built in * facilities for that). * * Implements the algorithm "Flexible and Economical UTF-8 Decoder" by * Bjoern Hoehrmann (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). */ public class Utf8Validator { /// DFA state transitions (14 x 32 = 448). private static final int[] DFA = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // s7..s8 }; private static final int ACCEPT = 0; private static final int REJECT = 1; private int mState; private int mPos; /** * Create new incremental UTF-8 validator. The validator is already * resetted and thus immediately usable. */ public Utf8Validator() { reset(); } /** * Reset validator state to begin validation of new * UTF-8 stream. */ public void reset() { mState = ACCEPT; mPos = 0; } /** * Get end of validated position within stream. When validate() * returns false, indicating an UTF-8 error, this function can * be used to get the exact position within the stream upon * which the violation was encountered. * * @return Current position with stream validated. */ public int position() { return mPos; } /** * Check if incremental validation (currently) has ended on * a complete encoded Unicode codepoint. * * @return True, iff currently ended on codepoint. */ public boolean isValid() { return mState == ACCEPT; } /** * Validate a chunk of octets for UTF-8. * * @param data Buffer which contains chunk to validate. * @param off Offset within buffer where to continue with validation. * @param len Length in octets to validate within buffer. * @return False as soon as UTF-8 violation occurs, true otherwise. */ public boolean validate(byte[] data, int off, int len) { for (int i = off; i < off + len; ++i) { mState = DFA[256 + (mState << 4) + DFA[(int) (0xff & data[i])]]; if (mState == REJECT) { mPos += i; return false; } } mPos += len; return true; } /** * Validate a chunk of octets for UTF-8. * * @param data Buffer which contains chunk to validate. * @return False as soon as UTF-8 violation occurs, true otherwise. */ public boolean validate(byte[] data) { return validate(data, 0, data.length); } }