KeyDistribution.java example

Explorer
spork-streaming-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.io;

import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.zebra.tfile.ByteArray;

/**
 * Class used to convey the information of how on-disk data are distributed
 * among key-partitioned buckets. This class is used by the MapReduce layer to
 * calculate intelligent splits.
 */
public class KeyDistribution {
  private long uniqueBytes;
  private long minStepSize = -1;
  private SortedMap<RawComparable, BlockDistribution> data;

  KeyDistribution(Comparator<? super RawComparable> comparator) {
    data = new TreeMap<RawComparable, BlockDistribution>(comparator);
  }

  void add(RawComparable key) {
    data.put(key, null);
  }
  
  void add(RawComparable key, BlockDistribution bucket)
  {
    uniqueBytes += bucket.getLength();
    data.put(key, BlockDistribution.sum(data.get(key), bucket));
  }
  
  void setMinStepSize(long minStepSize)
  {
    this.minStepSize = minStepSize;
  }

  /**
   * Get the total unique bytes contained in the key-partitioned buckets.
   * 
   * @return The total number of bytes contained in the key-partitioned buckets.
   */
  public long length() {
    return uniqueBytes;
  }

  /**
   * Get the size of the key sampling.
   * 
   * @return Number of key samples.
   */
  public int size() {
    return data.size();
  }

  /**
   * Get the minimum split step size from all tables in union
   */
  public long getMinStepSize() {
    return minStepSize;
  }
 
  /** Get the list of sampling keys
   * 
   * @return A list of sampling keys
   */
  public RawComparable[] getKeys() {
    RawComparable[] ret = new RawComparable[data.size()];
    return data.keySet().toArray(ret);
  }
  
  public BlockDistribution getBlockDistribution(RawComparable key) {
    return data.get(key);
  }
  
  /**
   * Merge the key samples
   * 
   * Algorithm: select the smallest key from all clean source ranges and ranges subsequent to
   *            respective dirty ranges. A dirty range is a range that has been partially needed
   *            by one or more of the previous final ranges.  
   *
   * @param sourceKeys
   *          key samples to be merged
   * @return the merged key samples
   */
  public static KeyDistribution merge(KeyDistribution[] sourceKeys) throws IOException {
    if (sourceKeys == null || sourceKeys.length == 0)
      return null;
    int srcSize = sourceKeys.length;
    if (srcSize == 1)
      return sourceKeys[0];
    
    Comparator<? super RawComparable> comp = sourceKeys[0].data.comparator();
    // TODO check the identical comparators used in the source keys
    /*
    for (int i = 1; i < srcSize; i++)
      if (!comp.equals(sourceKeys[i].data.comparator()))
        throw new IOException("Incompatible sort keys found:" + comp.toString() + " vs. "+ sourceKeys[i].data.comparator().toString());
     */
    
    KeyDistribution result = new KeyDistribution(comp);
    
    result.minStepSize = sourceKeys[0].minStepSize;
    for (int i = 1; i < srcSize; i++)
      if (result.minStepSize > sourceKeys[i].minStepSize)
        result.minStepSize = sourceKeys[i].minStepSize;
    
    RawComparable[][] its = new RawComparable[srcSize][];
    for (int i = 0; i < srcSize; i++)
      its[i] = sourceKeys[i].getKeys();
    RawComparable min, current;
    int minIndex = -1;
    int[] index = new int[srcSize];
    boolean[] dirty = new boolean[srcSize];
    while (true)
    {
      min = null;
      BlockDistribution bd = new BlockDistribution();
      for (int i = 0; i < srcSize; i++)
      {
        if (index[i] >= its[i].length)
          continue;
        current = its[i][index[i]];
        bd.add(sourceKeys[i].getBlockDistribution(current));
        if (min == null || comp.compare(min, current) > 0)
        {
          min = current;
          minIndex = i;
        }
      }
      if (min == null)
        break;

      result.add(min, bd);
      for (int i = 0; i < srcSize; i++)
      {
        if (index[i] >= its[i].length)
          continue;
        current = its[i][index[i]];
        if (i != minIndex)
        {
          if (comp.compare(min, current) != 0)
          {
            if (!dirty[i])
            {
              dirty[i] = true;
              index[i]++;
            } else if (comp.compare(min, its[i][index[i] - 1]) > 0 )
              index[i]++;
          } else {
            if (dirty[i])
              dirty[i] = false;
            index[i]++;
          }
        } else {
          if (dirty[i])
            dirty[i] = false;
          index[i]++;
        }
      }
    }
    return result;
  }
  
  public int resize(BlockDistribution lastBd)
  {
    Iterator<Map.Entry<RawComparable, BlockDistribution>> it =
      data.entrySet().iterator();
    KeyDistribution adjusted = new KeyDistribution(data.comparator());
    long realSize = 0, mySize = 0;
    RawComparable key = null;
    BlockDistribution bd = null, bd0 = null;
    while (it.hasNext())
    {
      Map.Entry<RawComparable, BlockDistribution> mapEntry = it.next();
      bd0 = mapEntry.getValue();
      mySize = bd0.getLength();
      if (realSize >= minStepSize/2 ||
          (realSize + mySize >= minStepSize*ColumnGroup.SPLIT_SLOP && 
              realSize >= minStepSize * (ColumnGroup.SPLIT_SLOP-1)))
      {
        adjusted.add(key, bd);
        bd = null;
        realSize = 0;
      }
      key = mapEntry.getKey();
      realSize += mySize;
      bd = BlockDistribution.sum(bd, bd0);
    }
    if (bd != null)
    {
      realSize += lastBd.getLength();
      if (realSize >= minStepSize/2 || adjusted.size() == 0)
      {
         // the last plus would contain more than liked, don't merge them.
        adjusted.add(key, bd);
      } else
        BlockDistribution.sum(lastBd, bd);
    }
    swap(adjusted);
    return data.size();
  }
  
  private void swap(KeyDistribution other) {
    long tmp = minStepSize;
    minStepSize = other.minStepSize;
    other.minStepSize = tmp;
    SortedMap<RawComparable, BlockDistribution> tmp2 = data;
    data = other.data;
    other.data = tmp2;
  }
}