/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.compat.hadoop; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.nio.charset.StandardCharsets; /** * Serializes Java Strings similar to a Hadoop Text without depending on the underlying * Hadoop libraries. Code is mostly taken from Hadoop 2.3's WritableUtils. */ public class TextSerializer { /** * Serialize a String using the same logic as a Hadoop Text object */ public static void writeStringAsText(DataOutput stream, String str) throws IOException { byte[] utf8Encoded = str.getBytes(StandardCharsets.UTF_8); writeVLong(stream, utf8Encoded.length); stream.write(utf8Encoded); } /** * Deserialize a Hadoop Text object into a String */ public static String readTextAsString(DataInput in) throws IOException { int bufLen = (int)readVLong(in); byte[] buf = new byte[bufLen]; in.readFully(buf); return new String(buf, StandardCharsets.UTF_8); } /** * From org.apache.hadoop.io.WritableUtis * * Serializes a long to a binary stream with zero-compressed encoding. * For -112 <= i <= 127, only one byte is used with the actual value. * For other values of i, the first byte value indicates whether the * long is positive or negative, and the number of bytes that follow. * If the first byte value v is between -113 and -120, the following long * is positive, with number of bytes that follow are -(v+112). * If the first byte value v is between -121 and -128, the following long * is negative, with number of bytes that follow are -(v+120). Bytes are * stored in the high-non-zero-byte-first order. * * @param stream Binary output stream * @param i Long to be serialized * @throws java.io.IOException */ private static void writeVLong(DataOutput stream, long i) throws IOException { if (i >= -112 && i <= 127) { stream.writeByte((byte)i); return; } int len = -112; if (i < 0) { i ^= -1L; // take one's complement' len = -120; } long tmp = i; while (tmp != 0) { tmp = tmp >> 8; len--; } stream.writeByte((byte)len); len = (len < -120) ? -(len + 120) : -(len + 112); for (int idx = len; idx != 0; idx--) { int shiftbits = (idx - 1) * 8; long mask = 0xFFL << shiftbits; stream.writeByte((byte)((i & mask) >> shiftbits)); } } /** * Reads a zero-compressed encoded long from input stream and returns it. * @param stream Binary input stream * @throws java.io.IOException * @return deserialized long from stream. */ private static long readVLong(DataInput stream) throws IOException { byte firstByte = stream.readByte(); int len = decodeVIntSize(firstByte); if (len == 1) { return firstByte; } long i = 0; for (int idx = 0; idx < len-1; idx++) { byte b = stream.readByte(); i = i << 8; i = i | (b & 0xFF); } return (isNegativeVInt(firstByte) ? (i ^ -1L) : i); } /** * Parse the first byte of a vint/vlong to determine the number of bytes * @param value the first byte of the vint/vlong * @return the total number of bytes (1 to 9) */ private static int decodeVIntSize(byte value) { if (value >= -112) { return 1; } else if (value < -120) { return -119 - value; } return -111 - value; } /** * Given the first byte of a vint/vlong, determine the sign * @param value the first byte * @return is the value negative */ private static boolean isNegativeVInt(byte value) { return value < -120 || (value >= -112 && value < 0); } }