/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.types; import org.apache.flink.annotation.Public; import org.apache.flink.core.memory.DataInputView; import org.apache.flink.core.memory.DataOutputView; import org.apache.flink.core.memory.MemorySegment; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.nio.CharBuffer; import static org.apache.flink.util.Preconditions.checkNotNull; /** * Mutable string data type that implements the Key interface. * StringValue encapsulates the basic functionality of a {@link String}, in a serializable and mutable way. * <p> * The mutability allows to reuse the object inside the user code, also across invocations. Reusing a StringValue object * helps to increase the performance, as string objects are rather heavy-weight objects and incur a lot of garbage * collection overhead, if created and destroyed en masse. * * @see org.apache.flink.types.NormalizableKey * @see java.lang.String * @see java.lang.CharSequence */ @Public public class StringValue implements NormalizableKey<StringValue>, CharSequence, ResettableValue<StringValue>, CopyableValue<StringValue>, Appendable { private static final long serialVersionUID = 1L; private static final char[] EMPTY_STRING = new char[0]; private static final int HIGH_BIT = 0x1 << 7; private static final int HIGH_BIT2 = 0x1 << 13; private static final int HIGH_BIT2_MASK = 0x3 << 6; private char[] value; // character value of the string value, not necessarily completely filled private int len; // length of the string value private int hashCode; // cache for the hashCode // -------------------------------------------------------------------------------------------- // Constructors // -------------------------------------------------------------------------------------------- /** * Initializes the encapsulated String object with an empty string. */ public StringValue() { this.value = EMPTY_STRING; } /** * Initializes this StringValue to the value of the given string. * * @param value The string containing the value for this StringValue. */ public StringValue(CharSequence value) { this.value = EMPTY_STRING; setValue(value); } /** * Initializes this StringValue to a copy the given StringValue. * * @param value The initial value. */ public StringValue(StringValue value) { this.value = EMPTY_STRING; setValue(value); } /** * Initializes the StringValue to a sub-string of the given StringValue. * * @param value The string containing the substring. * @param offset The offset of the substring. * @param len The length of the substring. */ public StringValue(StringValue value, int offset, int len) { this.value = EMPTY_STRING; setValue(value, offset, len); } // -------------------------------------------------------------------------------------------- // Getters and Setters // -------------------------------------------------------------------------------------------- /** * Sets a new length for the string. * * @param len The new length. */ public void setLength(int len) { if (len < 0 || len > this.len) { throw new IllegalArgumentException("Length must be between 0 and the current length."); } this.len = len; } /** * Returns this StringValue's internal character data. The array might be larger than the string * which is currently stored in the StringValue. * * @return The character data. */ public char[] getCharArray() { return this.value; } /** * Gets this StringValue as a String. * * @return A String resembling the contents of this StringValue. */ public String getValue() { return toString(); } /** * Sets the value of the StringValue to the given string. * * @param value The new string value. */ public void setValue(CharSequence value) { checkNotNull(value); setValue(value, 0, value.length()); } /** * Sets the value of the StringValue to the given string. * * @param value The new string value. */ @Override public void setValue(StringValue value) { checkNotNull(value); setValue(value.value, 0, value.len); } /** * Sets the value of the StringValue to a substring of the given string. * * @param value The new string value. * @param offset The position to start the substring. * @param len The length of the substring. */ public void setValue(StringValue value, int offset, int len) { checkNotNull(value); setValue(value.value, offset, len); } /** * Sets the value of the StringValue to a substring of the given string. * * @param value The new string value. * @param offset The position to start the substring. * @param len The length of the substring. */ public void setValue(CharSequence value, int offset, int len) { checkNotNull(value); if (offset < 0 || len < 0 || offset > value.length() - len) { throw new IndexOutOfBoundsException("offset: " + offset + " len: " + len + " value.len: " + len); } ensureSize(len); this.len = len; for (int i = 0; i < len; i++) { this.value[i] = value.charAt(offset + i); } this.len = len; this.hashCode = 0; } /** * Sets the contents of this string to the contents of the given <tt>CharBuffer</tt>. * The characters between the buffer's current position (inclusive) and the buffer's * limit (exclusive) will be stored in this string. * * @param buffer The character buffer to read the characters from. */ public void setValue(CharBuffer buffer) { checkNotNull(buffer); final int len = buffer.length(); ensureSize(len); buffer.get(this.value, 0, len); this.len = len; this.hashCode = 0; } /** * Sets the value of the StringValue to a substring of the given value. * * @param chars The new string value (as a character array). * @param offset The position to start the substring. * @param len The length of the substring. */ public void setValue(char[] chars, int offset, int len) { checkNotNull(chars); if (offset < 0 || len < 0 || offset > chars.length - len) { throw new IndexOutOfBoundsException(); } ensureSize(len); System.arraycopy(chars, offset, this.value, 0, len); this.len = len; this.hashCode = 0; } /** * Sets the value of this <code>StringValue</code>, assuming that the binary data is ASCII coded. The n-th character of the * <code>StringValue</code> corresponds directly to the n-th byte in the given array after the offset. * * @param bytes The binary character data. * @param offset The offset in the array. * @param len The number of bytes to read from the array. */ public void setValueAscii(byte[] bytes, int offset, int len) { if (bytes == null) { throw new NullPointerException("Bytes must not be null"); } if (len < 0 | offset < 0 | offset > bytes.length - len) { throw new IndexOutOfBoundsException(); } ensureSize(len); this.len = len; this.hashCode = 0; final char[] chars = this.value; for (int i = 0, limit = offset + len; offset < limit; offset++, i++) { chars[i] = (char) (bytes[offset] & 0xff); } } // -------------------------------------------------------------------------------------------- // String Methods // -------------------------------------------------------------------------------------------- /** * Returns a new <tt>StringValue</tt>string that is a substring of this string. The * substring begins at the given <code>start</code> index and ends at end of the string * * @param start The beginning index, inclusive. * @return The substring. * @exception IndexOutOfBoundsException Thrown, if the start is negative. */ public StringValue substring(int start) { return substring(start, this.len); } /** * Returns a new <tt>StringValue</tt>string that is a substring of this string. The * substring begins at the given <code>start</code> index and ends at <code>end - 1</code>. * * @param start The beginning index, inclusive. * @param end The ending index, exclusive. * @return The substring. * @exception IndexOutOfBoundsException * Thrown, if the start is negative, or the end is larger than the length. */ public StringValue substring(int start, int end) { return new StringValue(this, start, end - start); } /** * Copies a substring of this string into the given target StringValue. The * substring begins at the given <code>start</code> index and ends at end of the string * * @param target The StringValue object to copy the substring to. * @param start The beginning index, inclusive. * @exception IndexOutOfBoundsException Thrown, if the start is negative. */ public void substring(StringValue target, int start) { substring(target, start, this.len); } /** * Copies a substring of this string into the given target StringValue. The * substring begins at the given <code>start</code> index and ends at <code>end - 1</code>. * * @param target The StringValue object to copy the substring to. * @param start The beginning index, inclusive. * @param end The ending index, exclusive. * @exception IndexOutOfBoundsException * Thrown, if the start is negative, or the end is larger than the length. */ public void substring(StringValue target, int start, int end) { target.setValue(this, start, end - start); } /** * Finds any occurrence of the <code>str</code> character sequence in this StringValue. * * @return The position of the first occurrence of the search string in the string value, or <code>-1</code>, if * the character sequence was not found. */ public int find(CharSequence str) { return find(str, 0); } /** * Finds any occurrence of the <code>str</code> character sequence in this StringValue. * The search starts at position <code>start</code>. * * @return The position of the first occurrence of the search string in the string value, or <code>-1</code>, if * the character sequence was not found. */ public int find(CharSequence str, int start) { final int pLen = this.len; final int sLen = str.length(); if (sLen == 0) { throw new IllegalArgumentException("Cannot find empty string."); } int pPos = start; final char first = str.charAt(0); while (pPos < pLen) { if (first == this.value[pPos++]) { // matching first character final int fallBackPosition = pPos; int sPos = 1; boolean found = true; while (sPos < sLen) { if (pPos >= pLen) { // no more characters in string value pPos = fallBackPosition; found = false; break; } if (str.charAt(sPos++) != this.value[pPos++]) { pPos = fallBackPosition; found = false; break; } } if (found) { return fallBackPosition - 1; } } } return -1; } /** * Checks whether the substring, starting at the specified index, starts with the given prefix string. * * @param prefix The prefix character sequence. * @param startIndex The position to start checking for the prefix. * * @return True, if this StringValue substring, starting at position <code>startIndex</code> has <code>prefix</code> * as its prefix. */ public boolean startsWith(CharSequence prefix, int startIndex) { final char[] thisChars = this.value; final int pLen = this.len; final int sLen = prefix.length(); if ((startIndex < 0) || (startIndex > pLen - sLen)) { return false; } int sPos = 0; while (sPos < sLen) { if (thisChars[startIndex++] != prefix.charAt(sPos++)) { return false; } } return true; } /** * Checks whether this StringValue starts with the given prefix string. * * @param prefix The prefix character sequence. * * @return True, if this StringValue has <code>prefix</code> as its prefix. */ public boolean startsWith(CharSequence prefix) { return startsWith(prefix, 0); } // -------------------------------------------------------------------------------------------- // Appendable Methods // -------------------------------------------------------------------------------------------- /* * (non-Javadoc) * @see java.lang.Appendable#append(char) */ @Override public Appendable append(char c) { grow(this.len + 1); this.value[this.len++] = c; return this; } /* * (non-Javadoc) * @see java.lang.Appendable#append(java.lang.CharSequence) */ @Override public Appendable append(CharSequence csq) { append(csq, 0, csq.length()); return this; } /* * (non-Javadoc) * @see java.lang.Appendable#append(java.lang.CharSequence, int, int) */ @Override public Appendable append(CharSequence csq, int start, int end) { final int otherLen = end - start; grow(this.len + otherLen); for (int pos = start; pos < end; pos++) { this.value[this.len + pos] = csq.charAt(pos); } this.len += otherLen; return this; } /* * (non-Javadoc) * @see java.lang.Appendable#append(java.lang.CharSequence) */ public Appendable append(StringValue csq) { append(csq, 0, csq.length()); return this; } /* * (non-Javadoc) * @see java.lang.Appendable#append(java.lang.CharSequence, int, int) */ public Appendable append(StringValue csq, int start, int end) { final int otherLen = end - start; grow(this.len + otherLen); System.arraycopy(csq.value, start, this.value, this.len, otherLen); this.len += otherLen; return this; } // -------------------------------------------------------------------------------------------- // Serialization / De-Serialization // -------------------------------------------------------------------------------------------- @Override public void read(final DataInputView in) throws IOException { int len = in.readUnsignedByte(); if (len >= HIGH_BIT) { int shift = 7; int curr; len = len & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { len |= (curr & 0x7f) << shift; shift += 7; } len |= curr << shift; } this.len = len; this.hashCode = 0; ensureSize(len); final char[] data = this.value; for (int i = 0; i < len; i++) { int c = in.readUnsignedByte(); if (c < HIGH_BIT) { data[i] = (char) c; } else { int shift = 7; int curr; c = c & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { c |= (curr & 0x7f) << shift; shift += 7; } c |= curr << shift; data[i] = (char) c; } } } @Override public void write(final DataOutputView out) throws IOException { int len = this.len; // write the length, variable-length encoded while (len >= HIGH_BIT) { out.write(len | HIGH_BIT); len >>>= 7; } out.write(len); // write the char data, variable length encoded for (int i = 0; i < this.len; i++) { int c = this.value[i]; while (c >= HIGH_BIT) { out.write(c | HIGH_BIT); c >>>= 7; } out.write(c); } } // -------------------------------------------------------------------------------------------- @Override public String toString() { return new String(this.value, 0, this.len); } @Override public int compareTo(StringValue other) { int len1 = this.len; int len2 = other.len; int n = Math.min(len1, len2); char[] v1 = value; char[] v2 = other.value; for (int k = 0; k < n; k++) { char c1 = v1[k]; char c2 = v2[k]; if (c1 != c2) { return c1 - c2; } } return len1 - len2; } @Override public int hashCode() { int h = this.hashCode; if (h == 0 && this.len > 0) { int off = 0; char[] val = this.value; int len = this.len; for (int i = 0; i < len; i++) { h = 31 * h + val[off++]; } this.hashCode = h; } return h; } @Override public boolean equals(final Object obj) { if (this == obj) { return true; } if (obj instanceof StringValue) { final StringValue other = (StringValue) obj; int len = this.len; if (len == other.len) { final char[] tc = this.value; final char[] oc = other.value; int i = 0, j = 0; while (len-- != 0) { if (tc[i++] != oc[j++]) { return false; } } return true; } } return false; } // -------------------------------------------------------------------------------------------- // Char Sequence Implementation // -------------------------------------------------------------------------------------------- @Override public int length() { return this.len; } @Override public char charAt(int index) { if (index < len) { return this.value[index]; } else { throw new IndexOutOfBoundsException(); } } @Override public CharSequence subSequence(int start, int end) { return new StringValue(this, start, end - start); } // -------------------------------------------------------------------------------------------- // Normalized Key // -------------------------------------------------------------------------------------------- @Override public int getMaxNormalizedKeyLen() { return Integer.MAX_VALUE; } @Override public void copyNormalizedKey(MemorySegment target, int offset, int len) { // cache variables on stack, avoid repeated dereferencing of "this" final char[] chars = this.value; final int limit = offset + len; final int end = this.len; int pos = 0; while (pos < end && offset < limit) { char c = chars[pos++]; if (c < HIGH_BIT) { target.put(offset++, (byte) c); } else if (c < HIGH_BIT2) { target.put(offset++, (byte) ((c >>> 7) | HIGH_BIT)); if (offset < limit) { target.put(offset++, (byte) c); } } else { target.put(offset++, (byte) ((c >>> 10) | HIGH_BIT2_MASK)); if (offset < limit) { target.put(offset++, (byte) (c >>> 2)); } if (offset < limit) { target.put(offset++, (byte) c); } } } while (offset < limit) { target.put(offset++, (byte) 0); } } // -------------------------------------------------------------------------------------------- @Override public int getBinaryLength() { return -1; } @Override public void copyTo(StringValue target) { target.len = this.len; target.hashCode = this.hashCode; target.ensureSize(this.len); System.arraycopy(this.value, 0, target.value, 0, this.len); } @Override public StringValue copy() { return new StringValue(this); } @Override public void copy(DataInputView in, DataOutputView target) throws IOException { int len = in.readUnsignedByte(); target.writeByte(len); if (len >= HIGH_BIT) { int shift = 7; int curr; len = len & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { len |= (curr & 0x7f) << shift; shift += 7; target.writeByte(curr); } len |= curr << shift; target.writeByte(curr); } for (int i = 0; i < len; i++) { int c = in.readUnsignedByte(); target.writeByte(c); while (c >= HIGH_BIT) { c = in.readUnsignedByte(); target.writeByte(c); } } } // -------------------------------------------------------------------------------------------- // Utilities // -------------------------------------------------------------------------------------------- private void ensureSize(int size) { if (this.value.length < size) { this.value = new char[size]; } } /** * Grow and retain content. */ private void grow(int size) { if (this.value.length < size) { char[] value = new char[ Math.max(this.value.length * 3 / 2, size)]; System.arraycopy(this.value, 0, value, 0, this.len); this.value = value; } } // -------------------------------------------------------------------------------------------- // Static Helpers for String Serialization // -------------------------------------------------------------------------------------------- public static String readString(DataInput in) throws IOException { // the length we read is offset by one, because a length of zero indicates a null value int len = in.readUnsignedByte(); if (len == 0) { return null; } if (len >= HIGH_BIT) { int shift = 7; int curr; len = len & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { len |= (curr & 0x7f) << shift; shift += 7; } len |= curr << shift; } // subtract one for the null length len -= 1; final char[] data = new char[len]; for (int i = 0; i < len; i++) { int c = in.readUnsignedByte(); if (c < HIGH_BIT) { data[i] = (char) c; } else { int shift = 7; int curr; c = c & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { c |= (curr & 0x7f) << shift; shift += 7; } c |= curr << shift; data[i] = (char) c; } } return new String(data, 0, len); } public static final void writeString(CharSequence cs, DataOutput out) throws IOException { if (cs != null) { // the length we write is offset by one, because a length of zero indicates a null value int lenToWrite = cs.length()+1; if (lenToWrite < 0) { throw new IllegalArgumentException("CharSequence is too long."); } // write the length, variable-length encoded while (lenToWrite >= HIGH_BIT) { out.write(lenToWrite | HIGH_BIT); lenToWrite >>>= 7; } out.write(lenToWrite); // write the char data, variable length encoded for (int i = 0; i < cs.length(); i++) { int c = cs.charAt(i); while (c >= HIGH_BIT) { out.write(c | HIGH_BIT); c >>>= 7; } out.write(c); } } else { out.write(0); } } public static final void copyString(DataInput in, DataOutput out) throws IOException { int len = in.readUnsignedByte(); out.writeByte(len); if (len >= HIGH_BIT) { int shift = 7; int curr; len = len & 0x7f; while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { out.writeByte(curr); len |= (curr & 0x7f) << shift; shift += 7; } out.writeByte(curr); len |= curr << shift; } // note that the length is one larger than the actual length (length 0 is a null string, not a zero length string) len--; for (int i = 0; i < len; i++) { int c = in.readUnsignedByte(); out.writeByte(c); while (c >= HIGH_BIT) { c = in.readUnsignedByte(); out.writeByte(c); } } } }