TupleUtil.java example

Explorer
incubator-tajo-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.engine.utils;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.fs.Path;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.SortSpec;
import org.apache.tajo.catalog.statistics.ColumnStats;
import org.apache.tajo.datum.DatumFactory;
import org.apache.tajo.datum.NullDatum;
import org.apache.tajo.engine.eval.EvalNode;
import org.apache.tajo.storage.RowStoreUtil;
import org.apache.tajo.storage.Tuple;
import org.apache.tajo.storage.TupleRange;
import org.apache.tajo.storage.VTuple;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class TupleUtil {

  /**
   * Serializes a tuple range into a URL query string of the form
   * {@code start=<key>&end=<key>}, optionally followed by {@code &final=true}.
   *
   * Each boundary tuple is encoded to bytes with RowStoreUtil.RowStoreEncoder,
   * then Base64-encoded, and finally URL-encoded.
   *
   * @param schema The schema used to encode the boundary tuples
   * @param range The range whose start and end tuples are serialized
   * @param last If true, {@code &final=true} is appended to the query string
   * @return The query string
   * @throws UnsupportedEncodingException If the named charset is not supported
   */
  public static String rangeToQuery(Schema schema, TupleRange range, boolean last)
      throws UnsupportedEncodingException {
    final String charset = "utf-8";

    byte [] firstKeyBytes = RowStoreUtil.RowStoreEncoder
        .toBytes(schema, range.getStart());
    byte [] endKeyBytes = RowStoreUtil.RowStoreEncoder
        .toBytes(schema, range.getEnd());

    // Decode the Base64 bytes with an explicit charset; the single-argument
    // String constructor would silently use the platform default charset.
    String firstKeyBase64 = new String(Base64.encodeBase64(firstKeyBytes), charset);
    String lastKeyBase64 = new String(Base64.encodeBase64(endKeyBytes), charset);

    StringBuilder sb = new StringBuilder();
    sb.append("start=")
        .append(URLEncoder.encode(firstKeyBase64, charset))
        .append("&")
        .append("end=")
        .append(URLEncoder.encode(lastKeyBase64, charset));

    if (last) {
      sb.append("&final=true");
    }

    return sb.toString();
  }

  /**
   * Builds a tuple range from column statistics.
   *
   * For an ascending sort key the range starts at the column's min value and ends
   * at its max value; for a descending sort key the two bounds are swapped.
   * A missing (null) min or max value is replaced with a NULL datum.
   *
   * @param sortSpecs The sort specs, indexed in the same order as the target columns
   * @param target The schema whose columns form the range tuples
   * @param colStats The column statistics; every target column must have an entry
   * @return The range covering the min/max values of the target columns
   * @throws IllegalStateException If a target column has no statistics
   */
  public static TupleRange columnStatToRange(SortSpec [] sortSpecs, Schema target, List<ColumnStats> colStats) {

    Map<Column, ColumnStats> statSet = Maps.newHashMap();
    for (ColumnStats stat : colStats) {
      statSet.put(stat.getColumn(), stat);
    }

    for (Column col : target.getColumns()) {
      // The template form formats the message only when the check fails.
      Preconditions.checkState(statSet.containsKey(col),
          "ERROR: Invalid Column Stats (column stats: %s, there exists not target %s)", colStats, col);
    }

    Tuple startTuple = new VTuple(target.size());
    Tuple endTuple = new VTuple(target.size());
    int i = 0;

    // In outer join, empty table could be searched.
    // As a result, min value and max value would be null.
    // So, we should put NullDatum for this case.
    for (Column col : target.getColumns()) {
      ColumnStats stat = statSet.get(col);

      if (sortSpecs[i].isAscending()) {
        if (stat.getMinValue() != null) {
          startTuple.put(i, stat.getMinValue());
        } else {
          startTuple.put(i, DatumFactory.createNullDatum());
        }

        if (stat.getMaxValue() != null) {
          endTuple.put(i, stat.getMaxValue());
        } else {
          endTuple.put(i, DatumFactory.createNullDatum());
        }
      } else {
        // Descending order: the range runs from the max value down to the min value.
        if (stat.getMaxValue() != null) {
          startTuple.put(i, stat.getMaxValue());
        } else {
          startTuple.put(i, DatumFactory.createNullDatum());
        }

        if (stat.getMinValue() != null) {
          endTuple.put(i, stat.getMinValue());
        } else {
          endTuple.put(i, DatumFactory.createNullDatum());
        }
      }
      i++;
    }
    return new TupleRange(sortSpecs, startTuple, endTuple);
  }

  /**
   * Builds a tuple with the requested number of fields, each one set to a NULL datum.
   * Outer join algorithms typically use it to pad the non-matching side.
   *
   * @param size The number of fields of the tuple to create
   * @return A tuple whose fields are all NULL
   */
  public static Tuple createNullPaddedTuple(int size){
    VTuple padded = new VTuple(size);
    for (int fieldId = 0; fieldId < size; fieldId++) {
      padded.put(fieldId, DatumFactory.createNullDatum());
    }
    return padded;
  }

  /**
   * Returns the tuples of a block for which the filter condition evaluates to true.
   *
   * @param schema The schema passed to the condition when it is evaluated
   * @param tupleBlock The tuples to filter
   * @param filterCondition The condition evaluated against each tuple
   * @return The tuples that satisfy the condition, in iteration order
   */
  @SuppressWarnings("unused")
  public static Collection<Tuple> filterTuple(Schema schema, Collection<Tuple> tupleBlock, EvalNode filterCondition) {
    return new TupleBlockFilterScanner(schema, tupleBlock, filterCondition).nextBlock();
  }

  /**
   * Walks over a collection of tuples once and collects those that satisfy a qualifier.
   */
  private static class TupleBlockFilterScanner {
    private final Schema schema;
    private final EvalNode qual;
    private final Iterator<Tuple> iterator;

    public TupleBlockFilterScanner(Schema schema, Collection<Tuple> tuples, EvalNode qual) {
      this.iterator = tuples.iterator();
      this.qual = qual;
      this.schema = schema;
    }

    /**
     * Drains the remaining tuples and returns those for which the qualifier is true.
     * Once the underlying iterator is exhausted, further calls return an empty list.
     *
     * @return The matching tuples, in iteration order
     */
    public List<Tuple> nextBlock() {
      List<Tuple> matched = Lists.newArrayList();

      while (iterator.hasNext()) {
        Tuple candidate = iterator.next();
        if (!qual.eval(schema, candidate).isTrue()) {
          continue;
        }
        matched.add(candidate);
      }
      return matched;
    }
  }

  /**
   * Take a look at a column partition path. A partition path consists
   * of a table path part and column values part. This method transforms
   * a partition path into a tuple with a given partition column schema.
   *
   * hdfs://192.168.0.1/tajo/warehouse/table1/col1=abc/col2=def/col3=ghi
   *                   ^^^^^^^^^^^^^^^^^^^^^  ^^^^^^^^^^^^^^^^^^^^^^^^^^
   *                      table path part        column values part
   *
   * When a file path is given, it can perform two ways depending on beNullIfFile flag.
   * If it is true, it returns NULL when a given path is a file.
   * Otherwise, it returns a built tuple regardless of file or directory.
   *
   * Partition columns that do not appear in the path are set to a NULL datum.
   *
   * @param partitionColumnSchema The partition column schema
   * @param partitionPath The partition path
   * @param beNullIfFile If true, this method returns NULL when a given path is a file.
   * @return The tuple transformed from a column values part, or NULL if the path has no
   *         column values part or one of its segments is not a valid 'name=value' pair.
   */
  public static Tuple buildTupleFromPartitionPath(Schema partitionColumnSchema, Path partitionPath,
                                                  boolean beNullIfFile) {
    String pathString = partitionPath.toString();
    int startIdx = pathString.indexOf(getColumnPartitionPathPrefix(partitionColumnSchema));

    if (startIdx == -1) { // if there is no partition column in the path
      return null;
    }

    String [] columnValues = pathString.substring(startIdx).split("/");
    int numPartitionColumns = partitionColumnSchema.size();

    // More path segments than partition columns means the path points to a file.
    if (beNullIfFile && numPartitionColumns < columnValues.length) {
      return null;
    }

    // Start from an all-NULL tuple. Parsed values are stored by column id rather than
    // by segment position, so padding after parsing could overwrite a parsed value
    // and leave another field unset when the path skips or reorders columns.
    Tuple tuple = new VTuple(numPartitionColumns);
    for (int fieldId = 0; fieldId < numPartitionColumns; fieldId++) {
      tuple.put(fieldId, NullDatum.get());
    }

    int numSegments = Math.min(columnValues.length, numPartitionColumns);
    for (int i = 0; i < numSegments; i++) {
      String [] parts = columnValues[i].split("=");
      if (parts.length != 2) {
        return null;
      }
      int columnId = partitionColumnSchema.getColumnIdByName(parts[0]);
      // NOTE(review): presumably an unknown column name yields a negative id - confirm
      // against Schema. Such a segment is treated like any other malformed segment.
      if (columnId < 0) {
        return null;
      }
      Column keyColumn = partitionColumnSchema.getColumn(columnId);
      tuple.put(columnId, DatumFactory.createFromString(keyColumn.getDataType(), parts[1]));
    }
    return tuple;
  }

  /**
   * Returns the string that starts the column values part of a partition path:
   * the simple name of the first partition column followed by '='.
   * For a column partition (col1, col2), the result is 'col1='.
   *
   * @param partitionColumn the schema of column partition
   * @return The first part string of column partition path.
   */
  private static String getColumnPartitionPathPrefix(Schema partitionColumn) {
    String firstColumnName = partitionColumn.getColumn(0).getSimpleName();
    return firstColumnName + "=";
  }
}