/*
* Copyright 2012 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.bytes.impl;
import java.util.Arrays;
import org.lilyproject.bytes.api.DataOutput;
/**
* Implementation of {@link DataOutput} which writes and encodes primitive values to a byte[].
* This byte[] can then be used in the constructor of {@link DataInputImpl}.
*
* <p>The position within the underlying byte[] is maintained so that each write
* call will append the next encoded value in the byte[].
*
* <p>The underlying byte[] is resized when it is not large enough to contain the next value to be written.
*
* <p>This implementation (especially #writeUTF()) is based on (and some pieces are copied from) the work
* done by Lucene in the methods <code>UTF16toUTF8</code> and <code>UTF8toUTF16</code>
* in <code>org.apache.lucene.util.UnicodeUtil.java</code> (revision 1030754),
* and combined with the work done by ElasticSearch in
* <code>org.elasticsearch.common.io.stream.BytesStreamInput.java</code>,
* <code>org.elasticsearch.common.io.stream.BytesStreamOutput.java</code>,
* <code>org.elasticsearch.common.io.stream.StreamInput.java</code>,
* <code>org.elasticsearch.common.io.stream.StreamOutput.java</code>.
*
*/
public class DataOutputImpl implements DataOutput {
    /** First code unit of the UTF-16 high (leading) surrogate range. */
    public static final int UNI_SUR_HIGH_START = 0xD800;
    /** First code unit of the UTF-16 low (trailing) surrogate range. */
    public static final int UNI_SUR_LOW_START = 0xDC00;

    /** Last code unit of the UTF-16 low (trailing) surrogate range. */
    private static final int UNI_SUR_LOW_END = 0xDFFF;
    /** Number of payload bits carried by each half of a surrogate pair. */
    private static final int HALF_SHIFT = 10;
    /** Constant folded into <code>(high << 10) + low</code> to obtain the supplementary code point. */
    private static final int SURROGATE_OFFSET =
            Character.MIN_SUPPLEMENTARY_CODE_POINT - (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;

    /** Upper bound of bytes produced by the variable-length encoding of a 32-bit value (ceil(32 / 7)). */
    private static final int MAX_VINT_BYTES = 5;
    /** Upper bound of bytes produced by the variable-length encoding of a 64-bit value (ceil(64 / 7)). */
    private static final int MAX_VLONG_BYTES = 10;

    private static final byte ZERO = 0;
    private static final byte ONE = 1;

    private byte[] buffer;

    /** The position at which the next item will be added. */
    private int pos = 0;

    /**
     * Default constructor, starts with a buffer of 256 bytes.
     * When it is possible to give a good estimate of the number of bytes
     * that will be written, it is better to use {@link #DataOutputImpl(int)}.
     */
    public DataOutputImpl() {
        this(256);
    }

    /**
     * Constructor for <code>DataOutputImpl</code>
     *
     * @param sizeEstimate estimated size for the underlying byte[],
     *        a good estimate can avoid that the byte[] needs to be resized, or that too many bytes are allocated.
     */
    public DataOutputImpl(int sizeEstimate) {
        buffer = new byte[sizeEstimate];
    }

    /**
     * Returns a copy of the bytes written so far (positions <code>0</code> up to, but excluding,
     * the current write position).
     */
    @Override
    public byte[] toByteArray() {
        return Arrays.copyOfRange(buffer, 0, pos);
    }

    /**
     * Checks if the buffer has enough space to put <code>len</code> bytes.
     * If not the buffer is resized to at least twice its current size.
     */
    private void assureSize(int len) {
        int newcount = pos + len;
        if (newcount > buffer.length) {
            buffer = Arrays.copyOf(buffer, Math.max(buffer.length << 1, newcount));
        }
    }

    /**
     * Appends a single byte, growing the buffer when needed.
     */
    @Override
    public void writeByte(byte b) {
        assureSize(1);
        buffer[pos++] = b;
    }

    /**
     * Writes a byte to the byte[] without checking that there is enough space in the byte[].
     * The caller must have reserved the space with {@link #assureSize(int)}.
     *
     * @param b the byte to append
     */
    private void writeByteUnsafe(byte b) {
        buffer[pos++] = b;
    }

    /**
     * Appends all bytes of the given array, growing the buffer when needed.
     * No length prefix is written.
     */
    @Override
    public void writeBytes(byte[] bytes) {
        int length = bytes.length;
        assureSize(length);
        System.arraycopy(bytes, 0, buffer, pos, length);
        pos += length;
    }

    /**
     * Encodes a string to (unmodified) UTF-8 bytes and puts it in the buffer,
     * preceded by its encoded length as a fixed 4-byte int.
     */
    @Override
    public void writeUTF(String string) {
        writeUTF(string, true);
    }

    /**
     * Same as {@link #writeUTF(String)}, but the length prefix is written as a variable-length int.
     */
    @Override
    public void writeVUTF(String string) {
        writeUTF(string, true, true);
    }

    /**
     * Encodes a string to UTF-8 bytes, optionally preceded by its encoded length as a fixed 4-byte int.
     *
     * <p>A <code>null</code> string is always written as the int <code>-1</code>,
     * regardless of <code>includeLength</code>.
     */
    @Override
    public void writeUTF(String string, boolean includeLength) {
        writeUTF(string, includeLength, false);
    }

    /**
     * Shared implementation of the public <code>writeUTF</code>/<code>writeVUTF</code> variants.
     *
     * @param string the string to encode; <code>null</code> is written as length <code>-1</code> with no payload
     * @param includeLength whether the UTF-8 byte count is written before the payload
     * @param useVInt whether the length (or the <code>-1</code> null marker) uses the variable-length int format
     */
    private void writeUTF(String string, boolean includeLength, boolean useVInt) {
        if (string == null) {
            if (useVInt) {
                writeVInt(-1);
            } else {
                writeInt(-1);
            }
            return;
        }

        int utflen = utf8Length(string);

        // Reserve room for the payload plus the largest possible length prefix:
        // a variable-length int can take up to five bytes, one more than a fixed int.
        assureSize(MAX_VINT_BYTES + utflen);

        if (includeLength) {
            if (useVInt) {
                writeVIntUnsafe(utflen);
            } else {
                writeIntUnsafe(utflen);
            }
        }

        encodeUtf8Unsafe(string);
    }

    /**
     * Computes the number of bytes {@link #encodeUtf8Unsafe(String)} produces for the given string.
     * Unpaired surrogates count as three bytes (the size of the substitution character).
     */
    private static int utf8Length(String string) {
        final int strlen = string.length();
        int utflen = 0;
        int i = 0;
        while (i < strlen) {
            final int code = string.charAt(i++);
            if (code < 0x80) {
                utflen++;
            } else if (code < 0x800) {
                utflen += 2;
            } else if (code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END) {
                utflen += 3;
            } else if (code < UNI_SUR_LOW_START && i < strlen && Character.isLowSurrogate(string.charAt(i))) {
                // valid high surrogate followed by a valid low surrogate: one 4-byte sequence
                utflen += 4;
                i++;
            } else {
                // unpaired or out-of-order surrogate, replaced by U+FFFD
                utflen += 3;
            }
        }
        return utflen;
    }

    /**
     * Encodes the string as UTF-8 at the current position, without checking the buffer size.
     * The caller must have reserved at least {@link #utf8Length(String)} bytes.
     */
    private void encodeUtf8Unsafe(String string) {
        final int strlen = string.length();
        int i = 0;

        // Fast path: copy the leading run of characters that encode to a single byte.
        for (; i < strlen; i++) {
            final int ascii = string.charAt(i);
            if (ascii >= 0x80) {
                break;
            }
            buffer[pos++] = (byte) ascii;
        }

        while (i < strlen) {
            final int ch = string.charAt(i++);
            if (ch < 0x80) {
                buffer[pos++] = (byte) ch;
            } else if (ch < 0x800) {
                buffer[pos++] = (byte) (0xC0 | (ch >> 6));
                buffer[pos++] = (byte) (0x80 | (ch & 0x3F));
            } else if (ch < UNI_SUR_HIGH_START || ch > UNI_SUR_LOW_END) {
                buffer[pos++] = (byte) (0xE0 | (ch >> 12));
                buffer[pos++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                buffer[pos++] = (byte) (0x80 | (ch & 0x3F));
            } else if (ch < UNI_SUR_LOW_START && i < strlen && Character.isLowSurrogate(string.charAt(i))) {
                // valid surrogate pair: combine both halves into one code point
                final int codePoint = (ch << HALF_SHIFT) + string.charAt(i) + SURROGATE_OFFSET;
                i++;
                buffer[pos++] = (byte) (0xF0 | (codePoint >> 18));
                buffer[pos++] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
                buffer[pos++] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
                buffer[pos++] = (byte) (0x80 | (codePoint & 0x3F));
            } else {
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character (U+FFFD)
                buffer[pos++] = (byte) 0xEF;
                buffer[pos++] = (byte) 0xBF;
                buffer[pos++] = (byte) 0xBD;
            }
        }
    }

    /**
     * Appends an int as four bytes, most significant byte first.
     */
    @Override
    public void writeInt(int integer) {
        assureSize(4); // Make sure the buffer has enough space
        writeIntUnsafe(integer);
    }

    /**
     * Writes the int without checking if there is enough space for it
     */
    private void writeIntUnsafe(int integer) {
        buffer[pos++] = (byte) (integer >> 24);
        buffer[pos++] = (byte) (integer >> 16);
        buffer[pos++] = (byte) (integer >> 8);
        buffer[pos++] = (byte) (integer);
    }

    /**
     * Appends a boolean as a single byte: <code>1</code> for true, <code>0</code> for false.
     */
    @Override
    public void writeBoolean(boolean b) {
        assureSize(1);
        writeByteUnsafe(b ? ONE : ZERO);
    }

    /**
     * Appends a double as the eight bytes of {@link Double#doubleToLongBits(double)}.
     */
    @Override
    public void writeDouble(double value) {
        writeLong(Double.doubleToLongBits(value));
    }

    /**
     * Appends a long as eight bytes, most significant byte first.
     */
    @Override
    public void writeLong(long value) {
        assureSize(8);
        writeIntUnsafe((int) (value >> 32));
        writeIntUnsafe((int) value);
    }

    /**
     * Appends the low 16 bits of the given value as two bytes, most significant byte first.
     */
    @Override
    public void writeShort(int value) {
        assureSize(2);
        writeByteUnsafe((byte) (value >> 8));
        writeByteUnsafe((byte) value);
    }

    /**
     * Appends a float as the four bytes of {@link Float#floatToIntBits(float)}.
     */
    @Override
    public void writeFloat(float v) {
        writeInt(Float.floatToIntBits(v));
    }

    /**
     * Writes an int in a variable-length format. Writes between one and
     * five bytes. Smaller values take fewer bytes. Negative values are
     * encoded as their unsigned 32-bit pattern and therefore always take five bytes.
     */
    @Override
    public void writeVInt(int i) {
        assureSize(MAX_VINT_BYTES);
        writeVIntUnsafe(i);
    }

    /**
     * Same as writeVInt(), but without checking that there is enough space for it.
     * The caller must have reserved up to five bytes.
     */
    private void writeVIntUnsafe(int i) {
        // Emit 7 bits at a time, low bits first; the high bit flags that more bytes follow.
        while ((i & ~0x7F) != 0) {
            writeByteUnsafe((byte) ((i & 0x7f) | 0x80));
            i >>>= 7;
        }
        writeByteUnsafe((byte) i);
    }

    /**
     * Writes a long in a variable-length format. Writes between one and nine
     * bytes for non-negative values. Smaller values take fewer bytes. Negative values are
     * encoded as their unsigned 64-bit pattern and therefore always take ten bytes.
     */
    @Override
    public void writeVLong(long i) {
        // Reserve the worst case up front so the unchecked writes below can never overrun the buffer.
        assureSize(MAX_VLONG_BYTES);
        writeVLongUnsafe(i);
    }

    /**
     * Same as writeVLong(), but without checking that there is enough space for it.
     * The caller must have reserved up to ten bytes.
     */
    private void writeVLongUnsafe(long i) {
        // Emit 7 bits at a time, low bits first; the high bit flags that more bytes follow.
        while ((i & ~0x7F) != 0) {
            writeByteUnsafe((byte) ((i & 0x7f) | 0x80));
            i >>>= 7;
        }
        writeByteUnsafe((byte) i);
    }

    /**
     * Returns the number of bytes written so far.
     */
    @Override
    public int getSize() {
        return pos;
    }
}