/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.record; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.io.WritableUtils; /** * Various utility functions for Hadooop record I/O runtime. * * @deprecated Replaced by <a href="http://hadoop.apache.org/avro/">Avro</a>. */ @Deprecated @InterfaceAudience.Public @InterfaceStability.Stable public class Utils { /** Cannot create a new instance of Utils */ private Utils() { } public static final char[] hexchars = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; /** * * @param s * @return */ static String toXMLString(String s) { StringBuilder sb = new StringBuilder(); for (int idx = 0; idx < s.length(); idx++) { char ch = s.charAt(idx); if (ch == '<') { sb.append("<"); } else if (ch == '&') { sb.append("&"); } else if (ch == '%') { sb.append("%0025"); } else if (ch < 0x20 || (ch > 0xD7FF && ch < 0xE000) || (ch > 0xFFFD)) { sb.append("%"); sb.append(hexchars[(ch & 0xF000) >> 12]); sb.append(hexchars[(ch & 0x0F00) >> 8]); sb.append(hexchars[(ch & 0x00F0) >> 4]); sb.append(hexchars[(ch & 0x000F)]); } else { sb.append(ch); } } return sb.toString(); } static private int h2c(char ch) { if (ch >= '0' && ch <= '9') { return ch - '0'; } else if (ch >= 'A' && ch <= 'F') { return ch - 'A' + 10; } else if (ch >= 'a' && ch <= 'f') { return ch - 'a' + 10; } return 0; } /** * * @param s * @return */ static String fromXMLString(String s) { StringBuilder sb = new StringBuilder(); for (int idx = 0; idx < s.length();) { char ch = s.charAt(idx++); if (ch == '%') { int ch1 = h2c(s.charAt(idx++)) << 12; int ch2 = h2c(s.charAt(idx++)) << 8; int ch3 = h2c(s.charAt(idx++)) << 4; int ch4 = h2c(s.charAt(idx++)); char res = (char)(ch1 | ch2 | ch3 | ch4); sb.append(res); } else { sb.append(ch); } } return sb.toString(); } /** * * @param s * @return */ static String toCSVString(String s) { StringBuilder sb = new StringBuilder(s.length()+1); sb.append('\''); int len = s.length(); for (int i = 0; i < len; i++) { char c = s.charAt(i); switch(c) { case '\0': sb.append("%00"); break; case '\n': sb.append("%0A"); break; case '\r': sb.append("%0D"); break; case ',': sb.append("%2C"); break; case '}': sb.append("%7D"); break; case '%': sb.append("%25"); break; default: sb.append(c); } } return sb.toString(); } /** * * @param s * @throws java.io.IOException * @return */ static String fromCSVString(String s) throws IOException { if (s.charAt(0) != '\'') { throw new IOException("Error deserializing string."); } int len = s.length(); StringBuilder sb = new StringBuilder(len-1); for (int i = 1; i < len; i++) { char c = s.charAt(i); if (c == '%') { char ch1 = s.charAt(i+1); char ch2 = s.charAt(i+2); i += 2; if (ch1 == '0' && ch2 == '0') { sb.append('\0'); } else if (ch1 == '0' && ch2 == 'A') { sb.append('\n'); } else if (ch1 == '0' && ch2 == 'D') { sb.append('\r'); } else if (ch1 == '2' && ch2 == 'C') { sb.append(','); } else if (ch1 == '7' && ch2 == 'D') { sb.append('}'); } else if (ch1 == '2' && ch2 == '5') { sb.append('%'); } else { throw new IOException("Error deserializing string."); } } else { sb.append(c); } } return sb.toString(); } /** * * @param s * @return */ static String toXMLBuffer(Buffer s) { return s.toString(); } /** * * @param s * @throws java.io.IOException * @return */ static Buffer fromXMLBuffer(String s) throws IOException { if (s.length() == 0) { return new Buffer(); } int blen = s.length()/2; byte[] barr = new byte[blen]; for (int idx = 0; idx < blen; idx++) { char c1 = s.charAt(2*idx); char c2 = s.charAt(2*idx+1); barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); } return new Buffer(barr); } /** * * @param buf * @return */ static String toCSVBuffer(Buffer buf) { StringBuilder sb = new StringBuilder("#"); sb.append(buf.toString()); return sb.toString(); } /** * Converts a CSV-serialized representation of buffer to a new * Buffer * @param s CSV-serialized representation of buffer * @throws java.io.IOException * @return Deserialized Buffer */ static Buffer fromCSVBuffer(String s) throws IOException { if (s.charAt(0) != '#') { throw new IOException("Error deserializing buffer."); } if (s.length() == 1) { return new Buffer(); } int blen = (s.length()-1)/2; byte[] barr = new byte[blen]; for (int idx = 0; idx < blen; idx++) { char c1 = s.charAt(2*idx+1); char c2 = s.charAt(2*idx+2); barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); } return new Buffer(barr); } private static int utf8LenForCodePoint(final int cpt) throws IOException { if (cpt >=0 && cpt <= 0x7F) { return 1; } if (cpt >= 0x80 && cpt <= 0x07FF) { return 2; } if ((cpt >= 0x0800 && cpt < 0xD800) || (cpt > 0xDFFF && cpt <= 0xFFFD)) { return 3; } if (cpt >= 0x10000 && cpt <= 0x10FFFF) { return 4; } throw new IOException("Illegal Unicode Codepoint "+ Integer.toHexString(cpt)+" in string."); } private static final int B10 = Integer.parseInt("10000000", 2); private static final int B110 = Integer.parseInt("11000000", 2); private static final int B1110 = Integer.parseInt("11100000", 2); private static final int B11110 = Integer.parseInt("11110000", 2); private static final int B11 = Integer.parseInt("11000000", 2); private static final int B111 = Integer.parseInt("11100000", 2); private static final int B1111 = Integer.parseInt("11110000", 2); private static final int B11111 = Integer.parseInt("11111000", 2); private static int writeUtf8(int cpt, final byte[] bytes, final int offset) throws IOException { if (cpt >=0 && cpt <= 0x7F) { bytes[offset] = (byte) cpt; return 1; } if (cpt >= 0x80 && cpt <= 0x07FF) { bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset] = (byte) (B110 | (cpt & 0x1F)); return 2; } if ((cpt >= 0x0800 && cpt < 0xD800) || (cpt > 0xDFFF && cpt <= 0xFFFD)) { bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset] = (byte) (B1110 | (cpt & 0x0F)); return 3; } if (cpt >= 0x10000 && cpt <= 0x10FFFF) { bytes[offset+3] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); cpt = cpt >> 6; bytes[offset] = (byte) (B11110 | (cpt & 0x07)); return 4; } throw new IOException("Illegal Unicode Codepoint "+ Integer.toHexString(cpt)+" in string."); } static void toBinaryString(final DataOutput out, final String str) throws IOException { final int strlen = str.length(); byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max int utf8Len = 0; int idx = 0; while(idx < strlen) { final int cpt = str.codePointAt(idx); idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1; utf8Len += writeUtf8(cpt, bytes, utf8Len); } writeVInt(out, utf8Len); out.write(bytes, 0, utf8Len); } static boolean isValidCodePoint(int cpt) { return !((cpt > 0x10FFFF) || (cpt >= 0xD800 && cpt <= 0xDFFF) || (cpt >= 0xFFFE && cpt <=0xFFFF)); } private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) { int cpt = 0; cpt = (((b1 & ~B11111) << 18) | ((b2 & ~B11) << 12) | ((b3 & ~B11) << 6) | (b4 & ~B11)); return cpt; } private static int utf8ToCodePoint(int b1, int b2, int b3) { int cpt = 0; cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11)); return cpt; } private static int utf8ToCodePoint(int b1, int b2) { int cpt = 0; cpt = (((b1 & ~B111) << 6) | (b2 & ~B11)); return cpt; } private static void checkB10(int b) throws IOException { if ((b & B11) != B10) { throw new IOException("Invalid UTF-8 representation."); } } static String fromBinaryString(final DataInput din) throws IOException { final int utf8Len = readVInt(din); final byte[] bytes = new byte[utf8Len]; din.readFully(bytes); int len = 0; // For the most commmon case, i.e. ascii, numChars = utf8Len StringBuilder sb = new StringBuilder(utf8Len); while(len < utf8Len) { int cpt = 0; final int b1 = bytes[len++] & 0xFF; if (b1 <= 0x7F) { cpt = b1; } else if ((b1 & B11111) == B11110) { int b2 = bytes[len++] & 0xFF; checkB10(b2); int b3 = bytes[len++] & 0xFF; checkB10(b3); int b4 = bytes[len++] & 0xFF; checkB10(b4); cpt = utf8ToCodePoint(b1, b2, b3, b4); } else if ((b1 & B1111) == B1110) { int b2 = bytes[len++] & 0xFF; checkB10(b2); int b3 = bytes[len++] & 0xFF; checkB10(b3); cpt = utf8ToCodePoint(b1, b2, b3); } else if ((b1 & B111) == B110) { int b2 = bytes[len++] & 0xFF; checkB10(b2); cpt = utf8ToCodePoint(b1, b2); } else { throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+ " at offset "+(len-1)+" in length of "+utf8Len); } if (!isValidCodePoint(cpt)) { throw new IOException("Illegal Unicode Codepoint "+ Integer.toHexString(cpt)+" in stream."); } sb.appendCodePoint(cpt); } return sb.toString(); } /** Parse a float from a byte array. */ public static float readFloat(byte[] bytes, int start) { return WritableComparator.readFloat(bytes, start); } /** Parse a double from a byte array. */ public static double readDouble(byte[] bytes, int start) { return WritableComparator.readDouble(bytes, start); } /** * Reads a zero-compressed encoded long from a byte array and returns it. * @param bytes byte array with decode long * @param start starting index * @throws java.io.IOException * @return deserialized long */ public static long readVLong(byte[] bytes, int start) throws IOException { return WritableComparator.readVLong(bytes, start); } /** * Reads a zero-compressed encoded integer from a byte array and returns it. * @param bytes byte array with the encoded integer * @param start start index * @throws java.io.IOException * @return deserialized integer */ public static int readVInt(byte[] bytes, int start) throws IOException { return WritableComparator.readVInt(bytes, start); } /** * Reads a zero-compressed encoded long from a stream and return it. * @param in input stream * @throws java.io.IOException * @return deserialized long */ public static long readVLong(DataInput in) throws IOException { return WritableUtils.readVLong(in); } /** * Reads a zero-compressed encoded integer from a stream and returns it. * @param in input stream * @throws java.io.IOException * @return deserialized integer */ public static int readVInt(DataInput in) throws IOException { return WritableUtils.readVInt(in); } /** * Get the encoded length if an integer is stored in a variable-length format * @return the encoded length */ public static int getVIntSize(long i) { return WritableUtils.getVIntSize(i); } /** * Serializes a long to a binary stream with zero-compressed encoding. * For -112 <= i <= 127, only one byte is used with the actual value. * For other values of i, the first byte value indicates whether the * long is positive or negative, and the number of bytes that follow. * If the first byte value v is between -113 and -120, the following long * is positive, with number of bytes that follow are -(v+112). * If the first byte value v is between -121 and -128, the following long * is negative, with number of bytes that follow are -(v+120). Bytes are * stored in the high-non-zero-byte-first order. * * @param stream Binary output stream * @param i Long to be serialized * @throws java.io.IOException */ public static void writeVLong(DataOutput stream, long i) throws IOException { WritableUtils.writeVLong(stream, i); } /** * Serializes an int to a binary stream with zero-compressed encoding. * * @param stream Binary output stream * @param i int to be serialized * @throws java.io.IOException */ public static void writeVInt(DataOutput stream, int i) throws IOException { WritableUtils.writeVInt(stream, i); } /** Lexicographic order of binary data. */ public static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2); } }