/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.zebra.pig.comparator;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.io.BytesWritable;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
/**
* Extended from ByteArrayOutputStream with direct access to the underlying byte
* array and explicit capacity expansion.
*
* Also adding the capability of escaping:
*
* <code>
* el - escape level for 0x00, valid value 0-252
* cel - escape level for 0xff, valid value 0-252
* escaped
* el 0x00 0x01 others
* 0 AS-IS AS-IS AS-IS
* >0 0x01 0x01+el 0x01FD AS-IS
* escaped
* cel 0xFF 0xFE others
* 0 AS-IS AS-IS AS-IS
* >0 0xFE 0xFE-cel 0xFE02 AS-IS
* </code>
*/
class EncodingOutputStream extends ByteArrayOutputStream {
int escapeLevel = 0;
int comescLevel = 0;
boolean complement = false;
public EncodingOutputStream() {
super();
}
public EncodingOutputStream(int size) {
super(size);
}
public byte[] get() {
return buf;
}
public void ensureAvailable(int len) {
int newcount = count + len;
if (newcount > buf.length) {
buf = Arrays.copyOf(buf, Math.max(buf.length << 1, newcount));
}
}
public void setEscapeParams(int el, int cel, boolean c) {
escapeLevel = el;
comescLevel = cel;
complement = c;
}
public int getEscapeLevel() {
return escapeLevel;
}
public int getComescLevel() {
return comescLevel;
}
public boolean getComplement() {
return complement;
}
void writeEscaped(int v, boolean c) {
ensureAvailable(2);
buf[count] = 0x01;
buf[count + 1] = (byte) v;
if (c) {
buf[count] = (byte) (~buf[count]);
buf[count + 1] = (byte) (~buf[count + 1]);
}
count += 2;
}
/**
* Write an escaped 0x00.
*/
void escape00() {
writeEscaped(escapeLevel + 1, complement);
}
/**
* Write an escaped 0x01.
*/
void escape01() {
writeEscaped(0xFD, complement);
}
/**
* Write an escaped 0xFE
*/
void escapeFE() {
writeEscaped(0xFD, !complement);
}
/**
* write an escaped 0xFF
*/
void escapeFF() {
writeEscaped(comescLevel + 1, !complement);
}
void complement(byte b[], int begin, int end) {
if (begin >= end) return;
ensureAvailable(end - begin);
if (!complement) {
System.arraycopy(b, begin, buf, count, end - begin);
count += (end - begin);
} else {
for (int i = begin; i < end; ++i) {
buf[count++] = (byte) ~b[i];
}
}
}
void escape(int b) {
switch (b) {
case 0:
escape00();
break;
case 1:
escape01();
break;
case 0xfe:
escapeFE();
break;
case 0xff:
escapeFF();
break;
}
}
public void write(int b) {
if (!shouldEscape(b, escapeLevel > 0, comescLevel > 0)) {
ensureAvailable(1);
if (complement) {
buf[count++] = (byte) ~b;
} else {
buf[count++] = (byte) b;
}
} else {
escape(b);
}
}
public void write(byte b[]) {
write(b, 0, b.length);
}
static boolean shouldEscape(int b, boolean checkLow, boolean checkHigh) {
if (checkLow && b < 0x2) return true;
if (checkHigh && b > 0xfd) return true;
return false;
}
public void write(byte b[], int off, int len) {
if ((escapeLevel > 0) || (comescLevel > 0)) {
ensureAvailable(len);
int begin = off;
int next = begin;
int end = off + len;
for (; begin < end; begin = next) {
while ((next < end)
&& (!shouldEscape(b[next] & 0xff, escapeLevel > 0, comescLevel > 0))) {
++next;
}
complement(b, begin, next);
if (next < end) {
escape(b[next] & 0xff);
++next;
}
}
} else {
complement(b, off, off + len);
}
}
}
/**
* Generating binary keys for algorithmic comparators. A user may construct an
* algorithmic comparator by creating a ComparatorExpr object (through various
* static methods in this class). She could then create a KeyGenerator object
* and use it to create binary keys for tuple. The KeyGenerator object can be
* reused for different tuples that conform to the same schema. Sorting the
* tuples by the binary key yields the same ordering as sorting by the
* algorithmic comparator.
*
* Basic idea (without optimization):
* <ul>
* <li>define two operations: escape and complement, that takes in a byte array,
* and outputs a byte array:
*
* <pre>
* escape(byte[] bytes) {
* for (byte b : bytes) {
* if (b == 0)
* emit(0x1, 0x0);
* else if (b == 1)
* emit(0x1, 0x2);
* else emit(b);
* }
* }
*
* complement(byte[] bytes) {
* for (byte b : bytes) {
* emit(˜b);
* }
* }
* </pre>
*
* <li>find ways to convert primitive types to bytes that compares in the same
* order as those objects.
* <li>operations:
* <ul>
* <li>negate(byte[] bytes) == complement(escape(bytes) + 0x0);
* <li>tuple(byte[] bytes1, byte[] bytes2) == escape(bytes1) + 0x0 +
* escape(bytes2)
* <li>bag(byte[] bytes1, byte[] bytes2, ... ) = escape(bytes1) + 0x0 +
* escape(bytes2) + ...
* </ul>
* <li>optimizations:
* <ul>
* <li>negate(negate(bytes)) == bytes;
* <li>tuple(a) == a;
* <li>tuple(a, tuple(b, c)) == tuple(a, b, c)
* <li>the actual output would be a concatenation of f1(o1), f2(o2), ..., where
* o1, o2, are leaf datums in the tuple or 0x0, and fi(oi) is a nested function
* of escape() and complement() calls.
* <li>The invariance we want to preserve is that escape(0x1) >
* escape(escape(0x0)) > escape(0x0) > 0x0. In the basic algorithm, these are
* escaped as 0x0102, 0x010100, 0x0100, 0x00, and are thus variable length. We
* can actually collapse nested consecutive calls of
* escape(escape(...escape(0))...) to escape(i, 0), where i is the level of
* nesting, and fi may be represented as nested inter-leaved calling of
* complement(bytes) and escape(i, bytes), where escape (i, 0x0) == 0x01 +
* (0x01+i) and escape(i, 1) == 0x010xFD. We do limit the total nesting depth by
* 252, which should be plenty.
* <li>we can further optimize fi as either escape(i, j, bytes) or
* complement(escape(i, j, bytes), where i is the level of nesting for escaping
* 0x0 and 0x1, and j the level of nesting for escaping 0xff and 0xfe.
* <li>If the binary keys being generated from a certain comparator either
* compare equal or differ at some byte position, but never the case where one
* is a prefix of another, then we do not need to add padding 0x0 for negate()
* or tuple(). This is captured by the method implicitBound() in ComparatorExpr.
* <li>We figure out how datums should be extracted from a tuple and being
* escaped only once for any expression, and write to a modified
* ByteArrayOutputStream in one pass.
* </ul>
* </ul>
*
* TODO Remove the strong dependency with Pig by adding a DatumExtractor
* interface that allow applications to extract leaf datum from user objects,
* something like the following:
*
* <pre>
* interface DatumExtractor {
* Object extract(Object o);
* }
* </pre>
*
* And user may do something like this:
*
* <pre>
* class MyObject {
* int a;
* String b;
* }
*
* ComparatorExpr expr = KeyBuilder.createLeafExpr(new DatumExtractor {
* Object extract(Object o) {
* MyObject obj = (MyObject)o;
* return obj.b;
* } }, DataType.CHARARRAY);
* </pre>
*
* TODO Change BagExpr to IteratorExpr, so that it may be used in more general
* context (any Java collection).
*
* TODO Add an ArrayExpr (for Java []).
*/
public class KeyGenerator {
private EncodingOutputStream out;
private List<LeafGenerator> list;
/**
* Create a key builder that can generate binary keys for the input key
* expression.
*
* @param expr
* comparator expression
*/
public KeyGenerator(ComparatorExpr expr) {
out = new EncodingOutputStream();
list = new ArrayList<LeafGenerator>();
expr.appendLeafGenerator(list, 0, 0, false, false);
// illustrate(System.out);
}
/**
* Reset the key builder for a new expression.
*
* @param expr
* comparator expression
*/
public void reset(ComparatorExpr expr) {
list.clear();
expr.appendLeafGenerator(list, 0, 0, false, false);
}
/**
* Generate the binary key for the input tuple
*
* @param t
* input tuple
* @return A {@link BytesWritable} containing the binary sorting key for the
* input tuple.
* @throws ExecException
*/
public BytesWritable generateKey(Tuple t) throws ExecException {
out.reset();
for (Iterator<LeafGenerator> it = list.iterator(); it.hasNext();) {
LeafGenerator e = it.next();
e.append(out, t);
}
BytesWritable ret = new BytesWritable();
ret.set(out.get(), 0, out.size());
return ret;
}
/**
* Illustrate how the key would be generated from source.
*
* @param ps
* The output print stream.
*/
public void illustrate(PrintStream ps) {
for (Iterator<LeafGenerator> it = list.iterator(); it.hasNext();) {
LeafGenerator e = it.next();
e.illustrate(ps);
}
ps.print("\n");
}
}