/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.basis.chars; import com.google.common.annotations.Beta; import com.google.common.base.Objects; /** * A CharSequence backed by utf-8 bytes instead of java chars (ie. utf-16 bytes) */ @Beta public abstract class AbstractReadOnlyUtfBuf implements ReadableCharBuf { /** * thread-safe, lightweight, oppertunistic index cache * tracks number of multi-byte chars for a given substring * of bytes starting from 0 to a variable end point. * * it is two numbers packed into one int. The reasoning is * that non-volatile int reads/writes are guaranteed to be * atomic whereas two shorts (or any length numeral) could * sometimes give _wrong_ data instead of just potentially * unhelpful data. Longs are not guaranteed to be safe from * word tearing so we limit ourselves to working well for up * to what an int provides. A subclass/ alt impl. may decide * to make other trade-offs re: index cache maximums, thread * safety etc. if needed. * Their non-volatile nature notably benefits us on two fronts. * a) It helps single-threaded use by preventing any memory fencing * overhead. * b) It helps multi-threaded use by making it more likely for * changes to not be visible to each other and thereby making it * more likely for the cache to remain helpful in that thread's * context. The most likely case being two threads concurrently * iterating over it via the CharSequence interface. * * The first number can only ever be negative. It is the number * of 'extra' bytes needed to represent all non-ascii characters. * We do not know the exact number of non-ascii characters since * each one may use a variable number of bytes. However counting * byte-wise allows us to know the number of charactes up to the * stored index, and this is the most helpful information. Since * in the worst case, this negative number can be up to two times * the character index, we pack it into 16 bits + the negative sign * bit for 17 total bits. This lets it get up to double the remaining * 15 bits for the character index. * * The second number is the character index that the first one is defined * in reference to. If the first number is zero, then this index may * be as large as Integer.MAX_VALUE. Otherwise, it is limited to * Short.MAX_VALUE. So by taking this number, character index, and * subtracting the first number, the byte offset, we end up with the * character index + some positive number, which gives us the byte index * the next character can be found at. * * Finally, it is protected (not private) because here there be * dragons and it might be helpful. Likely we will provide a * 'final' subclass for maximum safe-string-replacing semantics. **/ protected int packedIndexCache; protected static final int MAX_USHORT = 65535; protected static final int MAX_USHORT_LESS_ONE = 65534; protected static final int MAX_USHORT_LESS_FOUR = 65531; // upper half of packedIndexCache; must use a locally stored copy of the cache value to be thread safe // it is a negative value counting down from zero. this lets us do lazier ascii purity queries/ adds protected static int cacheByteOffset(int cacheInstance) { return cacheInstance >> 15; } // lower half of packedIndexCache; must use a locally stored copy of the cache value to be thread safe protected static int cacheCharIndex(int cacheInstance) { return 0x7FFF & cacheInstance; } protected static int packIndexCache(int byteOffset, int charIndex) { byteOffset <<= 15; byteOffset |= charIndex; return byteOffset; } protected static int byteIndex(int byteOffset, int charIndex) { return charIndex - byteOffset; } // returns true if entire string is known to be ascii only protected boolean knownAsciiOnly(int cacheInstance) { // byte lengths can't be negative so this implicitly fails non-asciis as well // as tests for completeness return cacheInstance == getByteLength(); } @Override public int length() { // TODO: experiment with getLong() and masking for flag bits; possibly much faster for off-heap impls final int cacheInstance = packedIndexCache; if ((cacheInstance >= 0) && knownAsciiOnly(tryAsciiScan(cacheInstance, getByteLength()))) { return getByteLength(); } int charIndex = cacheCharIndex(cacheInstance); int byteOffset = cacheByteOffset(cacheInstance); final int byteLength = getByteLength(); int byteIndex = byteIndex(byteOffset, charIndex); if (byteIndex == byteLength) { return charIndex; } for (; byteIndex < byteLength; byteIndex++) { byte b = getByte(byteIndex); if (b < 0) { // check four-byte first for the over-zealous branch removal strategy int continuations = ((int) b & Utf8.FOUR_BYTE_MASK) >> Utf8.FOUR_BYTE_SHIFT; byteOffset += continuations; // reverse later substraction continuations += ((int) b & Utf8.THREE_BYTE_MASK) >> Utf8.THREE_BYTE_SHIFT; // always at least two bytes here; we could just use the cont. mask to remove all branches but... continuations += 1; byteOffset -= continuations; byteIndex += continuations; } } // byteOffset is stored as a negative, so remember to add here charIndex = byteLength + byteOffset; if (charIndex <= Short.MAX_VALUE) { packedIndexCache = packIndexCache(byteOffset, charIndex); } return charIndex; } private int tryAsciiScan(int start, int end) { end = Math.min(end, getByteLength()); for (int i = start; i < end; i++) { byte b = getByte(i); if (b < 0) { packedIndexCache = i; return i; } } packedIndexCache = end; return end; } private void nonAsciiScan(BufferIndex index, int end) { if (index.charIndex > end) { // TODO: reverse scanning from index if guessed to be faster index.charIndex = 0; index.byteOffset = 0; index.byteIndex = 0; } // TODO: branch optimization, bounds check optimizations (maybe?) // scan until next character is the requested one for (; index.charIndex < end; index.charIndex++) { byte b = getByte(index.byteIndex); index.byteIndex += 1; // assume well formed and valid index, so all negatives are seq headers if (b < 0) { // advance past the continuation bytes based on header meta-data // check four-byte first for the over-zealous branch removal strategy int continuations = ((int) b & Utf8.FOUR_BYTE_MASK) >> Utf8.FOUR_BYTE_SHIFT; index.charIndex += continuations; index.byteIndex += continuations; continuations = ((int) b & Utf8.THREE_BYTE_MASK) >> Utf8.THREE_BYTE_SHIFT; // always at least two bytes here; we could just use the cont. mask to remove all branches but... continuations += 1; index.byteOffset -= continuations; index.byteIndex += continuations; } } } // start is inclusive, end is exclusive // TODO: enforce argument bounds @Override public ReadableCharBuf subSequence(int start, int end) { final int cacheInstance = packedIndexCache; // ascii pre-computed short circuit; end must be positive so it enforces ascii only. // the comparison checks if the cache knows about all the bytes up to the end index. if (end <= cacheInstance) { return getSubSequenceForByteBounds(start, end); } else if (cacheInstance >= 0 && end <= tryAsciiScan(cacheInstance, end)) { return getSubSequenceForByteBounds(start, end); } BufferIndex index = new BufferIndex(cacheInstance); nonAsciiScan(index, start); // check to see if we would 'split in half' a surrogate pair -- if java won't stop this madness, we will if (index.charIndex > start) { throw new IllegalArgumentException("first character of the requested subsequence is a low-surrogate"); } int startByte = index.byteIndex; nonAsciiScan(index, end); // check to see if we would 'split in half' a surrogate pair -- if java won't stop this madness, we will if (index.byteIndex > end) { throw new IllegalArgumentException("last character of the requested subsequence is a high-surrogate"); } if (index.charIndex <= Short.MAX_VALUE) { packedIndexCache = packIndexCache(index.byteOffset, index.charIndex); } int endByte = index.byteIndex; return getSubSequenceForByteBounds(startByte, endByte); } private char nextCharForBufferIndex(BufferIndex index, boolean highSurrogate) { index.charIndex += 1; byte b = getByte(index.byteIndex ++); char out; if (b >= 0) { // one-byte out = (char) b; } else if (b < Utf8.MIN_THREE_HEADER) { // two-bytes out = (char) ((b & Utf8.TWO_BYTE_HEADER_MASK) << 6); b = getByte(index.byteIndex ++); out |= (char) (b & Utf8.CONTINUATION_MASK); index.byteOffset -= 1; } else if (b < Utf8.MIN_FOUR_HEADER) { // three-bytes out = (char) ((b & Utf8.THREE_BYTE_HEADER_MASK) << (6 + 6)); b = getByte(index.byteIndex ++); out |= (char) ((b & Utf8.CONTINUATION_MASK) << 6); b = getByte(index.byteIndex ++); out |= (char) (b & Utf8.CONTINUATION_MASK); index.byteOffset -= 2; } else { // four-bytes int codePoint = (b & Utf8.FOUR_BYTE_HEADER_MASK) << (6 + 6 + 6); b = getByte(index.byteIndex ++); codePoint |= (b & Utf8.CONTINUATION_MASK) << (6 + 6); b = getByte(index.byteIndex ++); codePoint |= (b & Utf8.CONTINUATION_MASK) << 6; b = getByte(index.byteIndex ++); codePoint |= b & Utf8.CONTINUATION_MASK; // high or low surrogate if (highSurrogate) { out = Character.highSurrogate(codePoint); } else { out = Character.lowSurrogate(codePoint); } index.charIndex += 1; index.byteOffset -= 2; } if (index.charIndex <= Short.MAX_VALUE) { packedIndexCache = packIndexCache(index.byteOffset, index.charIndex); } return out; } @Override public char charAt(int index) { // ascii short cuts byte b = getByte(index); // unused for non-ascii but oh well final int cacheInstance = packedIndexCache; if (index < cacheInstance) { return (char) b; } else if ((cacheInstance >= 0) && (index < tryAsciiScan(cacheInstance, index + 16))) { return (char) b; } BufferIndex bufferIndex = new BufferIndex(packedIndexCache); nonAsciiScan(bufferIndex, index); // check to see if we were asked for a low-surrogate; if so we just passed the four-byter and must rewind boolean highSurrogate = true; if (bufferIndex.charIndex > index) { bufferIndex.byteIndex -= 4; bufferIndex.byteOffset += 2; bufferIndex.charIndex -= 2; highSurrogate = false; } return nextCharForBufferIndex(bufferIndex, highSurrogate); } /** * Should be the same hash as that of a String representing the same * sequence of characters. This hash code is _not_ cached (for now). * It should be trivial to subclass and implement if really needed. */ @Override public int hashCode() { // TODO: use better iteration int length = length(); int hash = 0; for (int i = 0; i < length; i++) { char c = charAt(i); hash = 31 * hash + c; } return hash; } @Override public int compareTo(ReadableCharBuf o) { return CharSequenceComparator.INSTANCE.compare(this, o); } public String toDebugString() { int cacheInstance = packedIndexCache; return Objects.toStringHelper(this) .add("byteLength", getByteLength()) .add("byteIndex", cacheCharIndex(cacheInstance)) .add("charDelta", cacheByteOffset(cacheInstance)) .toString(); } }