AbstractReadOnlyUtfBuf.java example

Explorer

basis-master
- basis-core
  - src
    - main
      - java
        com
        addthis
        basis
        annotations
        Scaling.java
        collect
        CountMap.java
        HotMap.java
        dbq
        DiskBackedQueue.java
        DiskBackedQueueInternals.java
        Page.java
        SerializableSerializer.java
        Serializer.java
        concurrentlinkedhashmap
        EntryWeigher.java
        EvictionMediator.java
        LinkedDeque.java
        MediatedEvictionConcurrentHashMap.java
        Weigher.java
        Weighers.java
        package-info.java
        io
        GZIPInputStreamX.java
        GZOut.java
        IOWrap.java
        jmx
        DynamicMBeanImpl.java
        FieldBasedDynamicMBean.java
        MBeanListener.java
        MBeanRemotingSupport.java
        MBeanUtils.java
        MapMBean.java
        WrappingDynamicMBean.java
        jvm
        LessSafe.java
        Shutdown.java
        kv
        KAPair.java
        KVCounters.java
        KVPair.java
        KVPairs.java
        KVTable.java
        test
        PerfTest.java
        SlowTest.java
        util
        AdjustableSemaphore.java
        Backoff.java
        Base64.java
        Bench.java
        CUID.java
        Calc.java
        ClosableIterator.java
        CookieSafeBase90.java
        Dict.java
        DictUtil.java
        FileLogger.java
        GCInspector.java
        IteratorClone.java
        JitterClock.java
        LessBytes.java
        LessFiles.java
        LessNumbers.java
        LessPaths.java
        LessStreams.java
        LessStrings.java
        MapHelper.java
        MemoryCounter.java
        Meter.java
        Multidict.java
        MultidictUtil.java
        Parameter.java
        SimpleExec.java
        StringSplitter.java
        TokenReplacer.java
        TokenReplacerOverflowException.java
    - test
      - java
        com
        addthis
        basis
        collect
        dbq
        TestDiskBackedQueue.java
        jmx
        FieldBasedDynamicMBeanTest.java
        MBeanSetTest.java
        MBeanUtilsTest.java
        MapMBeanTest.java
        jvm
        LessSafeTest.java
        ShutdownTest.java
        kv
        KVCountersTest.java
        KVPairTest.java
        KVPairsTest.java
        KVTableTest.java
        util
        AdjustableSemaphoreTest.java
        Base64Test.java
        CookieSafeBase90Test.java
        CuidTest.java
        DictUtilTest.java
        LessBytesTest.java
        LessFilesTest.java
        LessNumbersTest.java
        LessStringsTest.java
        MeterTest.java
        ParameterTest.java
        TokenReplacerTest.java
- basis-kafka8
  - src
    - main
      - java
        com
        addthis
        basis
        kafka
        KafkaConsumerFactory.java
        KafkaConsumerService.java
        MessageHandler.java
- basis-net
  - src
    - main
      - java
        com
        addthis
        basis
        net
        ConstrainedHttpClient.java
        HttpUtil.java
        NetUtil.java
        http
        HttpHeader.java
        HttpMethod.java
        HttpResponse.java
    - test
      - java
        com
        addthis
        basis
        net
        ConstrainedHttpClientTest.java
        HttpUtilTest.java
        NetUtilTest.java
- basis-netty
  - src
    - main
      - java
        com
        addthis
        basis
        chars
        AbstractReadOnlyUtfBuf.java
        AsciiBuf.java
        BufferIndex.java
        ByteArrayReadOnlyAsciiBuf.java
        ByteArrayReadOnlyUtfBuf.java
        CharBuf.java
        CharBufs.java
        CharSequenceComparator.java
        ReadOnlyAsciiBuf.java
        ReadOnlyUtfBuf.java
        ReadableCharBuf.java
        Utf8.java
        util
        Varint.java
    - test
      - java
        com
        addthis
        basis
        chars
        ByteArrayReadOnlyUtfBufTest.java
        CharSequenceTests.java
        ReadOnlyUtfBufTest.java
        SharedCacheTests.java
        util
        VarintTest.java
- basis-time
  - src
    - main
      - java
        com
        addthis
        basis
        time
        DTimeUnit.java
        Dates.java
        util
        RollingLog.java
    - test
      - java
        com
        addthis
        basis
        time
        TestDates.java
- benchy
  - src
    - main
      - java
        com
        addthis
        basis
        chars
        ByteArrayCharBufJol.java
        CharSeqIterAndEncode.java
        CharSeqOnlyIter.java
        util
        BitReversals.java
    - test
      - java
        com
        addthis
        basis
        util
        BitReversalsTest.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.addthis.basis.chars;

import com.google.common.annotations.Beta;
import com.google.common.base.Objects;

/**
 * A CharSequence backed by utf-8 bytes instead of java chars (ie. utf-16 bytes)
 */
@Beta
public abstract class AbstractReadOnlyUtfBuf implements ReadableCharBuf {

    /**
     * thread-safe, lightweight, oppertunistic index cache
     * tracks number of multi-byte chars for a given substring
     * of bytes starting from 0 to a variable end point.
     *
     * it is two numbers packed into one int. The reasoning is
     * that non-volatile int reads/writes are guaranteed to be
     * atomic whereas two shorts (or any length numeral) could
     * sometimes give _wrong_ data instead of just potentially
     * unhelpful data. Longs are not guaranteed to be safe from
     * word tearing so we limit ourselves to working well for up
     * to what an int provides. A subclass/ alt impl. may decide
     * to make other trade-offs re: index cache maximums, thread
     * safety etc. if needed.
     * Their non-volatile nature notably benefits us on two fronts.
     * a) It helps single-threaded use by preventing any memory fencing
     * overhead.
     * b) It helps multi-threaded use by making it more likely for
     * changes to not be visible to each other and thereby making it
     * more likely for the cache to remain helpful in that thread's
     * context. The most likely case being two threads concurrently
     * iterating over it via the CharSequence interface.
     *
     * The first number can only ever be negative. It is the number
     * of 'extra' bytes needed to represent all non-ascii characters.
     * We do not know the exact number of non-ascii characters since
     * each one may use a variable number of bytes. However counting
     * byte-wise allows us to know the number of charactes up to the
     * stored index, and this is the most helpful information. Since
     * in the worst case, this negative number can be up to two times
     * the character index, we pack it into 16 bits + the negative sign
     * bit for 17 total bits. This lets it get up to double the remaining
     * 15 bits for the character index.
     *
     * The second number is the character index that the first one is defined
     * in reference to. If the first number is zero, then this index may
     * be as large as Integer.MAX_VALUE. Otherwise, it is limited to
     * Short.MAX_VALUE. So by taking this number, character index, and
     * subtracting the first number, the byte offset, we end up with the
     * character index + some positive number, which gives us the byte index
     * the next character can be found at.
     *
     * Finally, it is protected (not private) because here there be
     * dragons and it might be helpful. Likely we will provide a
     * 'final' subclass for maximum safe-string-replacing semantics.
     **/
    protected int packedIndexCache;

    protected static final int MAX_USHORT = 65535;
    protected static final int MAX_USHORT_LESS_ONE = 65534;
    protected static final int MAX_USHORT_LESS_FOUR = 65531;

    // upper half of packedIndexCache; must use a locally stored copy of the cache value to be thread safe
    // it is a negative value counting down from zero. this lets us do lazier ascii purity queries/ adds
    protected static int cacheByteOffset(int cacheInstance) {
            return cacheInstance >> 15;
    }

    // lower half of packedIndexCache; must use a locally stored copy of the cache value to be thread safe
    protected static int cacheCharIndex(int cacheInstance) {
        return 0x7FFF & cacheInstance;
    }

    protected static int packIndexCache(int byteOffset, int charIndex) {
        byteOffset <<= 15;
        byteOffset |= charIndex;
        return byteOffset;
    }

    protected static int byteIndex(int byteOffset, int charIndex) {
        return charIndex - byteOffset;
    }

    // returns true if entire string is known to be ascii only
    protected boolean knownAsciiOnly(int cacheInstance) {
        // byte lengths can't be negative so this implicitly fails non-asciis as well
        // as tests for completeness
        return cacheInstance == getByteLength();
    }

    @Override
    public int length() {
        // TODO: experiment with getLong() and masking for flag bits; possibly much faster for off-heap impls
        final int cacheInstance = packedIndexCache;
        if ((cacheInstance >= 0) && knownAsciiOnly(tryAsciiScan(cacheInstance, getByteLength()))) {
            return getByteLength();
        }
        int charIndex = cacheCharIndex(cacheInstance);
        int byteOffset = cacheByteOffset(cacheInstance);
        final int byteLength = getByteLength();
        int byteIndex = byteIndex(byteOffset, charIndex);
        if (byteIndex == byteLength) {
            return charIndex;
        }
        for (; byteIndex < byteLength; byteIndex++) {
            byte b = getByte(byteIndex);
            if (b < 0) {
                // check four-byte first for the over-zealous branch removal strategy
                int continuations = ((int) b & Utf8.FOUR_BYTE_MASK) >> Utf8.FOUR_BYTE_SHIFT;
                byteOffset += continuations; // reverse later substraction
                continuations += ((int) b & Utf8.THREE_BYTE_MASK) >> Utf8.THREE_BYTE_SHIFT;
                // always at least two bytes here; we could just use the cont. mask to remove all branches but...
                continuations += 1;
                byteOffset -= continuations;
                byteIndex += continuations;
            }
        }
        // byteOffset is stored as a negative, so remember to add here
        charIndex = byteLength + byteOffset;
        if (charIndex <= Short.MAX_VALUE) {
            packedIndexCache = packIndexCache(byteOffset, charIndex);
        }
        return charIndex;
    }

    private int tryAsciiScan(int start, int end) {
        end = Math.min(end, getByteLength());
        for (int i = start; i < end; i++) {
            byte b = getByte(i);
            if (b < 0) {
                packedIndexCache = i;
                return i;
            }
        }
        packedIndexCache = end;
        return end;
    }

    private void nonAsciiScan(BufferIndex index, int end) {
        if (index.charIndex > end) {
            // TODO: reverse scanning from index if guessed to be faster
            index.charIndex = 0;
            index.byteOffset = 0;
            index.byteIndex = 0;
        }

        // TODO: branch optimization, bounds check optimizations (maybe?)
        // scan until next character is the requested one
        for (; index.charIndex < end; index.charIndex++) {
            byte b = getByte(index.byteIndex);
            index.byteIndex += 1;
            // assume well formed and valid index, so all negatives are seq headers
            if (b < 0) { // advance past the continuation bytes based on header meta-data
                // check four-byte first for the over-zealous branch removal strategy
                int continuations = ((int) b & Utf8.FOUR_BYTE_MASK) >> Utf8.FOUR_BYTE_SHIFT;
                index.charIndex += continuations;
                index.byteIndex += continuations;

                continuations = ((int) b & Utf8.THREE_BYTE_MASK) >> Utf8.THREE_BYTE_SHIFT;
                // always at least two bytes here; we could just use the cont. mask to remove all branches but...
                continuations += 1;
                index.byteOffset -= continuations;
                index.byteIndex += continuations;
            }
        }
    }

    // start is inclusive, end is exclusive
    // TODO: enforce argument bounds
    @Override
    public ReadableCharBuf subSequence(int start, int end) {
        final int cacheInstance = packedIndexCache;
        // ascii pre-computed short circuit; end must be positive so it enforces ascii only.
        // the comparison checks if the cache knows about all the bytes up to the end index.
        if (end <= cacheInstance) {
            return getSubSequenceForByteBounds(start, end);
        } else if (cacheInstance >= 0 && end <= tryAsciiScan(cacheInstance, end)) {
            return getSubSequenceForByteBounds(start, end);
        }
        BufferIndex index = new BufferIndex(cacheInstance);
        nonAsciiScan(index, start);
        // check to see if we would 'split in half' a surrogate pair -- if java won't stop this madness, we will
        if (index.charIndex > start) {
            throw new IllegalArgumentException("first character of the requested subsequence is a low-surrogate");
        }
        int startByte = index.byteIndex;

        nonAsciiScan(index, end);
        // check to see if we would 'split in half' a surrogate pair -- if java won't stop this madness, we will
        if (index.byteIndex > end) {
            throw new IllegalArgumentException("last character of the requested subsequence is a high-surrogate");
        }
        if (index.charIndex <= Short.MAX_VALUE) {
            packedIndexCache = packIndexCache(index.byteOffset, index.charIndex);
        }
        int endByte = index.byteIndex;
        return getSubSequenceForByteBounds(startByte, endByte);
    }

    private char nextCharForBufferIndex(BufferIndex index, boolean highSurrogate) {
        index.charIndex += 1;
        byte b = getByte(index.byteIndex ++);
        char out;
        if (b >= 0) { // one-byte
            out = (char) b;
        } else if (b < Utf8.MIN_THREE_HEADER) { // two-bytes
            out = (char) ((b & Utf8.TWO_BYTE_HEADER_MASK) << 6);
            b = getByte(index.byteIndex ++);
            out |= (char) (b & Utf8.CONTINUATION_MASK);
            index.byteOffset -= 1;
        } else if (b < Utf8.MIN_FOUR_HEADER) { // three-bytes
            out = (char) ((b & Utf8.THREE_BYTE_HEADER_MASK) << (6 + 6));
            b = getByte(index.byteIndex ++);
            out |= (char) ((b & Utf8.CONTINUATION_MASK) << 6);
            b = getByte(index.byteIndex ++);
            out |= (char) (b & Utf8.CONTINUATION_MASK);
            index.byteOffset -= 2;
        } else { // four-bytes
            int codePoint = (b & Utf8.FOUR_BYTE_HEADER_MASK) << (6 + 6 + 6);
            b = getByte(index.byteIndex ++);
            codePoint |= (b & Utf8.CONTINUATION_MASK) << (6 + 6);
            b = getByte(index.byteIndex ++);
            codePoint |= (b & Utf8.CONTINUATION_MASK) << 6;
            b = getByte(index.byteIndex ++);
            codePoint |= b & Utf8.CONTINUATION_MASK;
            // high or low surrogate
            if (highSurrogate) {
                out = Character.highSurrogate(codePoint);
            } else {
                out = Character.lowSurrogate(codePoint);
            }
            index.charIndex  += 1;
            index.byteOffset -= 2;
        }
        if (index.charIndex <= Short.MAX_VALUE) {
            packedIndexCache = packIndexCache(index.byteOffset, index.charIndex);
        }
        return out;
    }

    @Override
    public char charAt(int index) {
        // ascii short cuts
        byte b = getByte(index); // unused for non-ascii but oh well
        final int cacheInstance = packedIndexCache;
        if (index < cacheInstance) {
            return (char) b;
        } else if ((cacheInstance >= 0) && (index < tryAsciiScan(cacheInstance, index + 16))) {
            return (char) b;
        }

        BufferIndex bufferIndex = new BufferIndex(packedIndexCache);
        nonAsciiScan(bufferIndex, index);
        // check to see if we were asked for a low-surrogate; if so we just passed the four-byter and must rewind
        boolean highSurrogate = true;
        if (bufferIndex.charIndex > index) {
            bufferIndex.byteIndex -= 4;
            bufferIndex.byteOffset += 2;
            bufferIndex.charIndex -= 2;
            highSurrogate = false;
        }
        return nextCharForBufferIndex(bufferIndex, highSurrogate);
    }

    /**
     * Should be the same hash as that of a String representing the same
     * sequence of characters. This hash code is _not_ cached (for now).
     * It should be trivial to subclass and implement if really needed.
     */
    @Override
    public int hashCode() {
        // TODO: use better iteration
        int length = length();
        int hash = 0;
        for (int i = 0; i < length; i++) {
            char c = charAt(i);
            hash = 31 * hash + c;
        }
        return hash;
    }

    @Override
    public int compareTo(ReadableCharBuf o) {
        return CharSequenceComparator.INSTANCE.compare(this, o);
    }

    public String toDebugString() {
        int cacheInstance = packedIndexCache;
        return Objects.toStringHelper(this)
                .add("byteLength", getByteLength())
                .add("byteIndex", cacheCharIndex(cacheInstance))
                .add("charDelta", cacheByteOffset(cacheInstance))
                .toString();
    }
}