/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

/**
 * Basically a clone of the hadoop Text class with a FlexBuffer as the backing
 * store and a cacheable String object that is lazily populated by the call to
 * the toString API.
 *
 * From Text.java:
 *
 * This class stores text using standard UTF8 encoding. It provides methods to
 * serialize, deserialize, and compare texts at byte level. The type of length
 * is integer and is serialized using zero-compressed format.
 * <p>
 * In addition, it provides methods for string traversal without converting the
 * byte array to a string.
 * <p>
 * Also includes utilities for serializing/deserializing a string,
 * coding/decoding a string, checking if a byte array contains valid UTF8 code,
 * calculating the length of an encoded string.
 * <p>
 * The valid bytes of an instance are the range
 * <code>[getOffset(), getOffset() + getLength())</code> of the array returned
 * by {@link #getBytes()}.
 *
 * @author rana
 *
 */
public class TextBytes extends BinaryComparableWithOffset implements
    WritableComparable<BinaryComparable> {

  private static final Log LOG = LogFactory.getLog(TextBytes.class);

  /** Per-thread UTF-8 encoder; its resting state is REPORT for both actions. */
  private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY = new ThreadLocal<CharsetEncoder>() {
    @Override
    protected CharsetEncoder initialValue() {
      return Charset.forName("UTF-8").newEncoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  };

  /** Per-thread UTF-8 decoder; its resting state is REPORT for both actions. */
  private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY = new ThreadLocal<CharsetDecoder>() {
    @Override
    protected CharsetDecoder initialValue() {
      return Charset.forName("UTF-8").newDecoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  };

  private static final byte[] EMPTY_BYTES = new byte[0];
  private static final String EMPTY_STRING = "";

  /** Backing store holding the UTF-8 encoded bytes. */
  private FlexBuffer bytes = new FlexBuffer();

  /** Lazily decoded String form of {@link #bytes}; null when not yet decoded. */
  private String cachedUTF8 = null;

  /** Construct an empty instance. */
  public TextBytes() {
  }

  /**
   * Construct from a string.
   */
  public TextBytes(String string) {
    set(string);
  }

  /** Construct from another textbytes. */
  public TextBytes(TextBytes utf8) {
    bytes = new FlexBuffer(utf8.bytes);
    cachedUTF8 = utf8.cachedUTF8;
  }

  /** Construct from another text. */
  public TextBytes(Text utf8) {
    set(utf8);
  }

  /**
   * Construct from a byte array.
   */
  public TextBytes(byte[] utf8) {
    set(utf8);
  }

  /**
   * Get the underlying buffer object.
   * <p>
   * NOTE(review): this is the live internal buffer, not a copy. Changes made
   * through it do not pass through this class and therefore do not reset the
   * cached String.
   */
  public FlexBuffer getBuffer() {
    return bytes;
  }

  /**
   * Returns the raw bytes; however, only the {@link #getLength()} bytes
   * starting at {@link #getOffset()} are valid.
   */
  public byte[] getBytes() {
    return bytes.get();
  }

  /** Returns the number of bytes in the byte array */
  public int getLength() {
    return bytes.getCount();
  }

  /** Get offset (if set) **/
  public int getOffset() {
    return bytes.getOffset();
  }

  /**
   * Sets the number of valid bytes, growing the buffer (and keeping its
   * current content) if <code>newLength</code> exceeds the capacity.
   *
   * @param newLength
   *          the new byte length
   */
  public void setLength(int newLength) {
    if (newLength > getCapacity()) {
      setCapacity(newLength, true);
    }
    bytes.setCount(newLength);
    // the byte range changed, so a previously decoded String is stale
    cachedUTF8 = null;
  }

  /**
   * Returns the Unicode Scalar Value (32-bit integer value) for the character
   * at <code>position</code>. Note that this method avoids using the converter
   * or doing String instantiation.
   *
   * @param position
   *          byte position, relative to the start of this text (not to the
   *          start of the backing array)
   * @return the Unicode scalar value at position or -1 if the position is
   *         invalid or points to a trailing byte
   */
  public int charAt(int position) {
    final int count = bytes.getCount();
    if (position < 0 || position >= count) {
      return -1;
    }
    // ByteBuffer.wrap(array, offset, length) yields a buffer whose positions
    // are absolute array indexes, so the logical position must be shifted by
    // the buffer offset.
    final int base = bytes.getOffset();
    ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes.get(), base, count)
        .position(base + position);
    return bytesToCodePoint(bb.slice());
  }

  /**
   * Finds the first occurrence of <code>what</code>, searching from the start
   * of the text.
   *
   * @see #find(String, int)
   */
  public int find(String what) {
    return find(what, 0);
  }

  /**
   * Finds any occurrence of <code>what</code> in the backing buffer, starting
   * at position <code>start</code>. The starting position is measured in bytes
   * and the return value is in terms of byte position in the buffer. Both are
   * relative to the start of this text. The backing buffer is not converted
   * to a string for this operation.
   *
   * @param what
   *          the string to search for; an empty string matches at
   *          <code>start</code>
   * @param start
   *          byte position to start searching from
   * @return byte position of the first occurrence of the search string in the
   *         UTF-8 buffer or -1 if not found
   * @throws IllegalArgumentException
   *           if <code>start</code> is negative or greater than the length
   */
  public int find(String what, int start) {
    final int base = bytes.getOffset();
    final int count = bytes.getCount();
    if (start < 0 || start > count) {
      throw new IllegalArgumentException("start " + start
          + " is outside of [0, " + count + "]");
    }
    try {
      ByteBuffer src = ByteBuffer.wrap(bytes.get(), base, count);
      ByteBuffer tgt = encode(what);
      if (!tgt.hasRemaining()) {
        // nothing to match against; mirror String.indexOf("")
        return start;
      }
      byte b = tgt.get();
      // buffer positions are absolute array indexes, hence the offset shift
      src.position(base + start);

      while (src.hasRemaining()) {
        if (b == src.get()) { // matching first byte
          src.mark(); // save position in loop
          tgt.mark(); // save position in target
          boolean found = true;
          // translate back into a position relative to the text start
          int pos = src.position() - 1 - base;
          while (tgt.hasRemaining()) {
            // either src expired first or the bytes differ: no match here
            if (!src.hasRemaining() || tgt.get() != src.get()) {
              tgt.reset();
              src.reset();
              found = false;
              break;
            }
          }
          if (found) {
            return pos;
          }
        }
      }
      return -1; // not found
    } catch (CharacterCodingException e) {
      // can't get here: encode(String) replaces instead of reporting
      LOG.error("Unexpected UTF-8 encoding failure in find()", e);
      return -1;
    }
  }

  /**
   * Set to contain the contents of a string.
   */
  public void set(String string) {
    try {
      ByteBuffer bb = encode(string, true);
      set(bb.array(), 0, bb.limit());
      // set(byte[],int,int) cleared the cache; the source string is reused
      cachedUTF8 = string;
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened " + e.toString(),
          e);
    }
  }

  /**
   * Set to a utf8 byte array
   */
  public void set(byte[] utf8) {
    set(utf8, 0, utf8.length);
  }

  /** copy a text. */
  public void set(Text other) {
    set(other.getBytes(), 0, other.getLength());
  }

  /** copy a textbytes. */
  public void set(TextBytes other) {
    set(other.getBytes(), other.getOffset(), other.getLength());
  }

  /**
   * Set the Text to range of bytes
   *
   * @param utf8
   *          the data to copy from
   * @param start
   *          the first position of the new string
   * @param len
   *          the number of bytes of the new string
   */
  public void set(byte[] utf8, int start, int len) {
    bytes.set(utf8, start, len);
    // reset string cache ...
    cachedUTF8 = null;
  }

  /**
   * Set TextBytes equal to the encoded contents of another TextBytes
   * instance. Reads the vint length prefix from <code>inputBuffer</code> and
   * then takes that many bytes from the buffer's data array at its current
   * position. The input buffer is not advanced past those bytes.
   *
   * @param inputBuffer
   *          buffer positioned at a serialized TextBytes
   * @throws IOException
   *           if the length prefix cannot be read
   */
  public void setFromRawTextBytes(DataInputBuffer inputBuffer)
      throws IOException {
    int length = WritableUtils.readVInt(inputBuffer);
    set(inputBuffer.getData(), inputBuffer.getPosition(), length);
  }

  /**
   * Append a range of bytes to the end of the given text
   *
   * @param utf8
   *          the data to copy from
   * @param start
   *          the first position to append from utf8
   * @param len
   *          the number of bytes to append
   */
  public void append(byte[] utf8, int start, int len) {
    setCapacity(bytes.getCount() + len, true);
    if (bytes.isShared()) {
      bytes.copyOnWrite();
    }
    // valid data begins at getOffset(), so the append position is
    // offset + count (same addressing as readFields/write/hashCode)
    System.arraycopy(utf8, start, bytes.get(), bytes.getOffset()
        + bytes.getCount(), len);
    bytes.setCount(bytes.getCount() + len);
    cachedUTF8 = null;
  }

  /**
   * Clear the string to empty.
   */
  public void clear() {
    bytes.setCount(0);
    cachedUTF8 = null;
  }

  /*
   * Sets the capacity of this Text object to <em>at least</em> <code>len</code>
   * bytes. If the current buffer is longer, then the capacity and existing
   * content of the buffer are unchanged. If <code>len</code> is larger than the
   * current capacity, the Text object's capacity is increased to match.
   *
   * @param len the number of bytes we need
   *
   * @param keepData should the old data be kept
   */
  private void setCapacity(int len, boolean keepData) {
    if (!keepData) {
      bytes.setCount(0);
    }
    bytes.setCapacity(len);
  }

  private int getCapacity() {
    return bytes.getCapacity();
  }

  /**
   * Convert text back to string. The decoded value is cached until the
   * content is changed through this class.
   *
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    if (cachedUTF8 == null) {
      if (bytes.getCount() == 0) {
        cachedUTF8 = EMPTY_STRING;
      } else {
        try {
          cachedUTF8 = decode(bytes.get(), bytes.getOffset(), bytes
              .getCount());
        } catch (CharacterCodingException e) {
          throw new RuntimeException("Should not have happened "
              + e.toString(), e);
        }
      }
    }
    return cachedUTF8;
  }

  /**
   * deserialize
   */
  public void readFields(DataInput in) throws IOException {
    int newLength = WritableUtils.readVInt(in);
    // ensure capacity
    setCapacity(newLength, false);
    // in case we need to, ensure we have a private copy of the underlying
    // array
    bytes.copyOnWrite();
    // read into the array
    in.readFully(bytes.get(), bytes.getOffset(), newLength);
    // reset count variable
    bytes.setCount(newLength);
    // clear cached String pointer
    cachedUTF8 = null;
  }

  /** Skips over one Text in the input. */
  public static void skip(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    WritableUtils.skipFully(in, length);
  }

  /**
   * serialize write this object to out length uses zero-compressed encoding
   *
   * @see Writable#write(DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, bytes.getCount());
    if (bytes.getCount() != 0) {
      out.write(bytes.get(), bytes.getOffset(), bytes.getCount());
    }
  }

  /** Returns true iff <code>o</code> is a TextBytes with the same contents. */
  @Override
  public boolean equals(Object o) {
    if (o instanceof TextBytes) {
      return super.equals(o);
    }
    return false;
  }

  /** Hash over the valid byte range only (31-based polynomial hash). */
  @Override
  public int hashCode() {
    int hash = 1;
    int offset = getOffset();
    int length = getLength();
    byte bytesArray[] = bytes.get();
    for (int i = offset; i < offset + length; i++) {
      hash = (31 * hash) + (int) bytesArray[i];
    }
    return hash;
  }

  /**
   * A WritableComparator optimized for TextBytes keys: it compares the
   * serialized form directly, skipping the vint length prefix of each side.
   */
  public static class Comparator extends WritableComparator {
    public Comparator() {
      super(TextBytes.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
      return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }
  }

  static {
    // register this comparator for TextBytes. It was previously registered
    // under Text.class, which replaced Hadoop's own Text comparator and left
    // TextBytes without a raw comparator.
    WritableComparator.define(TextBytes.class, new Comparator());
  }

  // / STATIC UTILITIES FROM HERE DOWN
  /**
   * Converts the provided byte array to a String using the UTF-8 encoding. If
   * the input is malformed, replace by a default value.
   */
  public static String decode(byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }

  /**
   * Converts <code>length</code> bytes of the array, starting at
   * <code>start</code>, to a String using the UTF-8 encoding. Malformed input
   * is replaced rather than reported.
   */
  public static String decode(byte[] utf8, int start, int length)
      throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }

  /**
   * Converts the provided byte array to a String using the UTF-8 encoding. If
   * <code>replace</code> is true, then malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the method throws a
   * MalformedInputException.
   */
  public static String decode(byte[] utf8, int start, int length,
      boolean replace) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }

  private static String decode(ByteBuffer utf8, boolean replace)
      throws CharacterCodingException {
    CharsetDecoder decoder = DECODER_FACTORY.get();
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPLACE);
      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return decoder.decode(utf8).toString();
    } finally {
      // always put the shared per-thread decoder back to its default: REPORT
      if (replace) {
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  /**
   * Converts the provided String to bytes using the UTF-8 encoding. If the
   * input is malformed, invalid chars are replaced by a default value.
   *
   * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
   *         ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string)
      throws CharacterCodingException {
    return encode(string, true);
  }

  /**
   * Converts the provided String to bytes using the UTF-8 encoding. If
   * <code>replace</code> is true, then malformed input is replaced with the
   * encoder's replacement value. Otherwise the method throws a
   * MalformedInputException.
   *
   * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
   *         ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string, boolean replace)
      throws CharacterCodingException {
    CharsetEncoder encoder = ENCODER_FACTORY.get();
    if (replace) {
      encoder.onMalformedInput(CodingErrorAction.REPLACE);
      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return encoder.encode(CharBuffer.wrap(string.toCharArray()));
    } finally {
      // always put the shared per-thread encoder back to its default: REPORT
      if (replace) {
        encoder.onMalformedInput(CodingErrorAction.REPORT);
        encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  /**
   * Read a UTF8 encoded string from in
   */
  public static String readString(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    byte[] bytes = new byte[length];
    in.readFully(bytes, 0, length);
    return decode(bytes);
  }

  /**
   * Write a UTF8 encoded string to out
   *
   * @return the number of encoded bytes written, excluding the length prefix
   */
  public static int writeString(DataOutput out, String s) throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  // //// states for validateUTF8

  private static final int LEAD_BYTE = 0;

  private static final int TRAIL_BYTE_1 = 1;

  private static final int TRAIL_BYTE = 2;

  /**
   * Check if a byte array contains valid utf-8
   *
   * @param utf8
   *          byte array
   * @throws MalformedInputException
   *           if the byte array contains invalid utf-8
   */
  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }

  /**
   * Check to see if a byte array is valid utf-8
   *
   * @param utf8
   *          the array of bytes
   * @param start
   *          the offset of the first byte in the array
   * @param len
   *          the length of the byte sequence
   * @throws MalformedInputException
   *           if the byte array contains invalid bytes, or ends in the middle
   *           of a multi-byte sequence
   */
  public static void validateUTF8(byte[] utf8, int start, int len)
      throws MalformedInputException {
    int count = start;
    int leadByte = 0;
    int length = 0;
    int state = LEAD_BYTE;
    while (count < start + len) {
      int aByte = ((int) utf8[count] & 0xFF);

      switch (state) {
        case LEAD_BYTE:
          leadByte = aByte;
          length = bytesFromUTF8[aByte];

          switch (length) {
            case 0: // check for ASCII
              if (leadByte > 0x7F)
                throw new MalformedInputException(count);
              break;
            case 1:
              if (leadByte < 0xC2 || leadByte > 0xDF)
                throw new MalformedInputException(count);
              state = TRAIL_BYTE_1;
              break;
            case 2:
              if (leadByte < 0xE0 || leadByte > 0xEF)
                throw new MalformedInputException(count);
              state = TRAIL_BYTE_1;
              break;
            case 3:
              if (leadByte < 0xF0 || leadByte > 0xF4)
                throw new MalformedInputException(count);
              state = TRAIL_BYTE_1;
              break;
            default:
              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
              // or if < 0 we got a trail byte in the lead byte position
              throw new MalformedInputException(count);
          } // switch (length)
          break;

        case TRAIL_BYTE_1:
          if (leadByte == 0xF0 && aByte < 0x90)
            throw new MalformedInputException(count);
          if (leadByte == 0xF4 && aByte > 0x8F)
            throw new MalformedInputException(count);
          if (leadByte == 0xE0 && aByte < 0xA0)
            throw new MalformedInputException(count);
          if (leadByte == 0xED && aByte > 0x9F)
            throw new MalformedInputException(count);
          // falls through to regular trail-byte test!!
        case TRAIL_BYTE:
          if (aByte < 0x80 || aByte > 0xBF)
            throw new MalformedInputException(count);
          if (--length == 0) {
            state = LEAD_BYTE;
          } else {
            state = TRAIL_BYTE;
          }
          break;
      } // switch (state)
      count++;
    }
    // input ended while trail bytes were still expected: truncated sequence
    if (state != LEAD_BYTE) {
      throw new MalformedInputException(count);
    }
  }

  /**
   * Magic numbers for UTF-8. These are the number of bytes that <em>follow</em>
   * a given lead byte. Trailing bytes have the value -1. The values 4 and 5 are
   * presented in this table, even though valid UTF-8 cannot include the five
   * and six byte sequences.
   */
  static final int[] bytesFromUTF8 = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // trail bytes
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

  /**
   * Returns the next code point at the current position in the buffer. The
   * buffer's position will be incremented. Any mark set on this buffer will be
   * changed by this method!
   * <p>
   * If the buffer holds fewer bytes than the lead byte announces, the
   * underlying relative <code>get()</code> calls fail with a
   * BufferUnderflowException.
   *
   * @return the code point, or -1 if the current byte is a trailing byte
   */
  public static int bytesToCodePoint(ByteBuffer bytes) {
    // peek at the lead byte without consuming it
    bytes.mark();
    byte b = bytes.get();
    bytes.reset();
    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
    if (extraBytesToRead < 0)
      return -1; // trailing byte!
    int ch = 0;

    // every case deliberately falls through to consume one more byte
    switch (extraBytesToRead) {
      case 5:
        ch += (bytes.get() & 0xFF);
        ch <<= 6; /* remember, illegal UTF-8 */
      case 4:
        ch += (bytes.get() & 0xFF);
        ch <<= 6; /* remember, illegal UTF-8 */
      case 3:
        ch += (bytes.get() & 0xFF);
        ch <<= 6;
      case 2:
        ch += (bytes.get() & 0xFF);
        ch <<= 6;
      case 1:
        ch += (bytes.get() & 0xFF);
        ch <<= 6;
      case 0:
        ch += (bytes.get() & 0xFF);
    }
    ch -= offsetsFromUTF8[extraBytesToRead];

    return ch;
  }

  /**
   * Per-sequence-length constants subtracted by {@link #bytesToCodePoint}
   * after accumulating the raw bytes, indexed by the number of extra bytes.
   */
  static final int offsetsFromUTF8[] = { 0x00000000, 0x00003080, 0x000E2080,
      0x03C82080, 0xFA082080, 0x82082080 };

  /**
   * For the given string, returns the number of UTF-8 bytes required to encode
   * the string. A high surrogate that is not followed by a low surrogate, and
   * a low surrogate on its own, are each counted as 3 bytes.
   *
   * @param string
   *          text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(String string) {
    // Index-based scan: a CharacterIterator loop would stop early at a
    // U+FFFF character because CharacterIterator.DONE has that same value.
    final int charCount = string.length();
    int size = 0;
    for (int i = 0; i < charCount; i++) {
      char ch = string.charAt(i);
      if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // high surrogate: is it followed by a low surrogate?
        if (i + 1 < charCount) {
          char trail = string.charAt(i + 1);
          if ((trail > 0xDBFF) && (trail < 0xE000)) {
            // valid pair, consume the trail char as well
            size += 4;
            i++;
            continue;
          }
        }
        // invalid pair
        size += 3;
      } else if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
    }
    return size;
  }

  /**
   * Ad-hoc self test: round-trips a string through a TextBytes that is backed
   * by a sub-range of a larger array, then through write/readFields.
   */
  public static void main(String[] args) {
    // run some tests on the new code
    String aTestString = "A Test Strnig";
    // convert it to bytes
    byte bytes[] = aTestString.getBytes(Charset.forName("UTF-8"));
    // over allocate an array
    byte overAllocated[] = new byte[bytes.length * 2];
    // copy source into the second half, so the text starts at an offset
    System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
    // now allocate a TextBytes
    TextBytes textBytes = new TextBytes();
    // set the overallocated buffer as the backing store
    textBytes.set(overAllocated, bytes.length, bytes.length);
    // convert it to string first
    String toString = textBytes.toString();
    // validate equal to original
    Assert.assertTrue(aTestString.equals(toString));
    // ok now write it to output buffer
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    // write string
    try {
      textBytes.write(outputBuffer);
      // read length
      DataInputBuffer inputBuffer = new DataInputBuffer();
      inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
      int encodedLength = WritableUtils.readVInt(inputBuffer);
      // validate arrays match ...
      Assert.assertTrue(encodedLength == bytes.length);
      Assert.assertEquals(WritableComparator.compareBytes(bytes, 0,
          bytes.length, outputBuffer.getData(), inputBuffer.getPosition(),
          outputBuffer.getLength() - inputBuffer.getPosition()), 0);
      // ok reset input buffer again ...
      inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
      // read in fields
      textBytes.readFields(inputBuffer);
      // ok see if we are not using the original backing store ...
      Assert.assertTrue(textBytes.getBytes() != overAllocated);
      // validate buffers match to original
      Assert.assertEquals(WritableComparator.compareBytes(bytes, 0,
          bytes.length, textBytes.getBytes(), textBytes.getOffset(),
          textBytes.getLength()), 0);
    } catch (IOException e) {
      // surface the failure instead of printing and exiting normally
      throw new RuntimeException(e);
    }
  }

  /** ArrayWritable whose elements are declared to be TextBytes. */
  static class TextBytesArray extends ArrayWritable {
    public TextBytesArray() {
      super(TextBytes.class);
    }
  }
}