package edu.washington.escience.myria.util;

import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import javax.annotation.Nonnull;

import org.apache.commons.lang.ArrayUtils;
import org.junit.Assert;
import org.junit.Assume;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;

import edu.washington.escience.myria.MyriaConstants;
import edu.washington.escience.myria.RelationKey;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.operator.DbInsert;
import edu.washington.escience.myria.operator.EOSSource;
import edu.washington.escience.myria.operator.EmptySink;
import edu.washington.escience.myria.operator.Operator;
import edu.washington.escience.myria.operator.failures.InitFailureInjector;
import edu.washington.escience.myria.operator.network.Consumer;
import edu.washington.escience.myria.operator.network.GenericShuffleProducer;
import edu.washington.escience.myria.operator.network.distribute.DistributeFunction;
import edu.washington.escience.myria.parallel.ExchangePairID;
import edu.washington.escience.myria.parallel.SubQuery;
import edu.washington.escience.myria.parallel.SubQueryPlan;
import edu.washington.escience.myria.storage.TupleBatchBuffer;

public final class TestUtils {

  public static class EntryComparator implements Comparator<Entry<Long, String>> {
    @Override
    public int compare(final Entry<Long, String> o1, final Entry<Long, String> o2) {
      int res = o1.getKey().compareTo(o2.getKey());
      if (res != 0) {
        return res;
      }
      return o1.getValue().compareTo(o2.getValue());
    }
  }

  private static Random random = null;

  /**
   * See http://docs.travis-ci.com/user/ci-environment/#Environment-variables
   *
   * @return <code>true</code> if the system is currently in a Travis CI build.
   */
  public static boolean inTravis() {
    String travis = System.getenv("TRAVIS");
    return (travis != null) && travis.equals("true");
  }

  /** Only run this test in Travis. */
  public static void requireTravis() {
    Assume.assumeTrue(inTravis());
  }

  /** Skip this test if in Travis. */
  public static void skipIfInTravis() {
    Assume.assumeFalse(inTravis());
  }
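
  /**
   * Illustrative sketch only (not part of the original utilities): the guards above are intended
   * to be the first call in a JUnit test method. A failed {@link Assume} check marks the test as
   * skipped rather than failed. The method below is a hypothetical test body.
   */
  public static void exampleTravisGuardUsage() {
    // e.g. a test that relies on local services unavailable in the Travis sandbox:
    skipIfInTravis();
    // ... the rest of the test body runs only outside Travis ...
  }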

  private static synchronized Random getRandom() {
    if (random == null) {
      random = new Random();
    }
    return random;
  }

  public static void setSeed(final long seed) {
    getRandom().setSeed(seed);
  }

  public static void resetRandom() {
    random = null;
  }

  public static void assertEqualsToStringBuilder(
      final StringBuilder errorMessageHolder,
      final String currentEM,
      final Object expected,
      final Object actual) {
    if (expected == null) {
      if (actual != null) {
        errorMessageHolder.append(currentEM);
        errorMessageHolder.append(": ");
        errorMessageHolder.append("expected: <null>");
        errorMessageHolder.append(" but was: <");
        errorMessageHolder.append(actual);
        errorMessageHolder.append(">\n");
      }
    } else {
      if (!expected.equals(actual)) {
        errorMessageHolder.append(currentEM);
        errorMessageHolder.append(": ");
        errorMessageHolder.append("expected: <");
        errorMessageHolder.append(expected);
        errorMessageHolder.append('>');
        errorMessageHolder.append(" but was: <");
        errorMessageHolder.append(actual);
        errorMessageHolder.append(">\n");
      }
    }
  }

  public static void assertTupleBagEqual(
      final HashMap<Tuple, Integer> expectedResult, final HashMap<Tuple, Integer> actualResult) {
    final StringBuilder errorMessageHolder = new StringBuilder();
    assertEqualsToStringBuilder(
        errorMessageHolder, "Number of unique tuples", expectedResult.size(), actualResult.size());
    final HashSet<Tuple> keySet = new HashSet<Tuple>();
    keySet.addAll(expectedResult.keySet());
    keySet.addAll(actualResult.keySet());
    for (final Tuple k : keySet) {
      Integer expected = expectedResult.get(k);
      Integer actual = actualResult.get(k);
      if (expected == null) {
        expected = 0;
      }
      if (actual == null) {
        actual = 0;
      }
      assertEqualsToStringBuilder(errorMessageHolder, "Tuple entry{" + k + "}", expected, actual);
    }
    if (errorMessageHolder.length() != 0) {
      Assert.fail(errorMessageHolder.toString());
    }
  }

  /** Left-pads the decimal representation of {@code v} with zeros to the given length. */
  public static String intToString(final long v, final int length) {
    final StringBuilder sb = new StringBuilder(String.valueOf(v));
    while (sb.length() < length) {
      sb.insert(0, "0");
    }
    return sb.toString();
  }

  /** Merges a list of tuple bags into a single bag, summing per-tuple occurrence counts. */
  public static HashMap<Tuple, Integer> mergeBags(final List<HashMap<Tuple, Integer>> bags) {
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    result.putAll(bags.get(0));
    for (int i = 1; i < bags.size(); i++) {
      for (final Map.Entry<Tuple, Integer> e : bags.get(i).entrySet()) {
        final Tuple t = e.getKey();
        final Integer occ = e.getValue();
        final Integer existingOcc = result.get(t);
        if (existingOcc == null) {
          result.put(t, occ);
        } else {
          result.put(t, occ + existingOcc);
        }
      }
    }
    return result;
  }
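
  /**
   * Illustrative sketch only (not part of the original utilities): the bag helpers above in
   * action. Merging two bags that each contain one occurrence of the same tuple yields a bag in
   * which that tuple occurs twice. Relies on {@link Tuple} having value-based equals/hashCode,
   * which the helpers above already assume.
   */
  public static void exampleBagMerge() {
    final Tuple t = new Tuple(1);
    t.set(0, 42L);
    final HashMap<Tuple, Integer> bag1 = new HashMap<Tuple, Integer>();
    bag1.put(t, 1);
    final HashMap<Tuple, Integer> bag2 = new HashMap<Tuple, Integer>();
    bag2.put(t, 1);
    final HashMap<Tuple, Integer> expected = new HashMap<Tuple, Integer>();
    expected.put(t, 2);
    assertTupleBagEqual(expected, mergeBags(ImmutableList.of(bag1, bag2)));
  }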

  /** Computes a reference natural join of two tuple buffers, returned as a bag with counts. */
  @SuppressWarnings("rawtypes")
  public static HashMap<Tuple, Integer> naturalJoin(
      final TupleBatchBuffer child1,
      final TupleBatchBuffer child2,
      final int child1JoinColumn,
      final int child2JoinColumn) {
    /* Build phase: hash child1 as join key -> {tuple -> num occur}. */
    final HashMap<Comparable, HashMap<Tuple, Integer>> child1Hash =
        new HashMap<Comparable, HashMap<Tuple, Integer>>();
    int numChild1Column = 0;
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    final List<List<? extends Column<?>>> child1TBIt = child1.getAllAsRawColumn();
    for (final List<? extends Column<?>> child1RawData : child1TBIt) {
      final int numRow = child1RawData.get(0).size();
      final int numColumn = child1RawData.size();
      numChild1Column = numColumn;
      for (int i = 0; i < numRow; i++) {
        final Tuple t = new Tuple(numColumn);
        for (int j = 0; j < numColumn; j++) {
          t.set(j, child1RawData.get(j).getObject(i));
        }
        final Object joinKey = t.get(child1JoinColumn);
        HashMap<Tuple, Integer> tupleOccur = child1Hash.get(joinKey);
        if (tupleOccur == null) {
          tupleOccur = new HashMap<Tuple, Integer>();
          tupleOccur.put(t, 1);
          child1Hash.put((Comparable<?>) joinKey, tupleOccur);
        } else {
          Integer occur = tupleOccur.get(t);
          if (occur == null) {
            occur = 0;
          }
          tupleOccur.put(t, occur + 1);
        }
      }
    }
    /* Probe phase: concatenate each child2 tuple with every matching child1 tuple. */
    final Iterator<List<? extends Column<?>>> child2TBIt = child2.getAllAsRawColumn().iterator();
    while (child2TBIt.hasNext()) {
      final List<? extends Column<?>> child2Columns = child2TBIt.next();
      final int numRow = child2Columns.get(0).size();
      final int numChild2Column = child2Columns.size();
      for (int i = 0; i < numRow; i++) {
        final Object joinKey = child2Columns.get(child2JoinColumn).getObject(i);
        final HashMap<Tuple, Integer> matchedTuples = child1Hash.get(joinKey);
        if (matchedTuples != null) {
          final Tuple child2Tuple = new Tuple(numChild2Column);
          for (int j = 0; j < numChild2Column; j++) {
            child2Tuple.set(j, child2Columns.get(j).getObject(i));
          }
          for (final Entry<Tuple, Integer> entry : matchedTuples.entrySet()) {
            final Tuple child1Tuple = entry.getKey();
            final int numChild1Occur = entry.getValue();
            final Tuple t = new Tuple(numChild1Column + numChild2Column);
            t.setAll(0, child1Tuple);
            t.setAll(numChild1Column, child2Tuple);
            final Integer occur = result.get(t);
            if (occur == null) {
              result.put(t, numChild1Occur);
            } else {
              result.put(t, occur + numChild1Occur);
            }
          }
        }
      }
    }
    return result;
  }

  /** Computes a reference group-by average over a long column, as a bag of (group, avg) tuples. */
  public static HashMap<Tuple, Integer> groupByAvgLongColumn(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, Long> sum = new HashMap<Object, Long>();
    final HashMap<Object, Integer> count = new HashMap<Object, Integer>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        final Long aggValue = (Long) rawData.get(aggColumn).getObject(i);
        Long currentSum = sum.get(groupByValue);
        if (currentSum == null) {
          currentSum = 0L;
          count.put(groupByValue, 1);
        } else {
          count.put(groupByValue, count.get(groupByValue) + 1);
        }
        sum.put(groupByValue, currentSum + aggValue);
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, Long> e : sum.entrySet()) {
      final Object gValue = e.getKey();
      final Long sumV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, sumV * 1.0 / count.get(gValue));
      result.put(t, 1);
    }
    return result;
  }
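
  /**
   * Illustrative sketch only (not part of the original utilities): computing a reference join with
   * {@link #naturalJoin}. Two single-column relations are joined on their only column; the value
   * common to both sides (2) is the sole survivor, emitted as the concatenated tuple (2, 2).
   */
  public static HashMap<Tuple, Integer> exampleNaturalJoin() {
    final Schema schema = new Schema(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of("id"));
    final TupleBatchBuffer left = new TupleBatchBuffer(schema);
    left.putLong(0, 1L);
    left.putLong(0, 2L);
    final TupleBatchBuffer right = new TupleBatchBuffer(schema);
    right.putLong(0, 2L);
    right.putLong(0, 3L);
    return naturalJoin(left, right, 0, 0);
  }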

  /** Computes a reference group-by maximum, as a bag of (group, max) tuples. */
  public static <T extends Comparable<T>> HashMap<Tuple, Integer> groupByMax(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, T> max = new HashMap<Object, T>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        @SuppressWarnings("unchecked")
        final T aggValue = (T) rawData.get(aggColumn).getObject(i);
        final T currentMax = max.get(groupByValue);
        if (currentMax == null) {
          max.put(groupByValue, aggValue);
        } else if (aggValue.compareTo(currentMax) > 0) {
          max.put(groupByValue, aggValue);
        }
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, T> e : max.entrySet()) {
      final Object gValue = e.getKey();
      final T maxV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, maxV);
      result.put(t, 1);
    }
    return result;
  }

  /** Computes a reference group-by minimum, as a bag of (group, min) tuples. */
  public static <T extends Comparable<T>> HashMap<Tuple, Integer> groupByMin(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, T> min = new HashMap<Object, T>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        @SuppressWarnings("unchecked")
        final T aggValue = (T) rawData.get(aggColumn).getObject(i);
        final T currentMin = min.get(groupByValue);
        if (currentMin == null) {
          min.put(groupByValue, aggValue);
        } else if (aggValue.compareTo(currentMin) < 0) {
          min.put(groupByValue, aggValue);
        }
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, T> e : min.entrySet()) {
      final Object gValue = e.getKey();
      final T minV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, minV);
      result.put(t, 1);
    }
    return result;
  }
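
  /**
   * Illustrative sketch only (not part of the original utilities): the min/max reference
   * aggregators above are generic in the aggregate column's type, so callers pin {@code T}
   * explicitly. This hypothetical helper assumes column 1 of {@code source} holds {@code Long}
   * values.
   */
  public static HashMap<Tuple, Integer> exampleGroupByMax(final TupleBatchBuffer source) {
    // Group by column 0, taking the maximum Long in column 1 of each group.
    return TestUtils.<Long>groupByMax(source, 0, 1);
  }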

  /** Computes a reference group-by sum over a long column, as a bag of (group, sum) tuples. */
  public static HashMap<Tuple, Integer> groupBySumLongColumn(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, Long> sum = new HashMap<Object, Long>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        final Long aggValue = (Long) rawData.get(aggColumn).getObject(i);
        Long currentSum = sum.get(groupByValue);
        if (currentSum == null) {
          currentSum = 0L;
        }
        sum.put(groupByValue, currentSum + aggValue);
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, Long> e : sum.entrySet()) {
      final Object gValue = e.getKey();
      final Long sumV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, sumV);
      result.put(t, 1);
    }
    return result;
  }

  /**
   * Generates random numeric strings, zero-padded to a fixed length.
   *
   * @param min the minimum numeric value (inclusive).
   * @param max the maximum numeric value (inclusive).
   * @param size how many strings to generate.
   * @param length the length to which each string is zero-padded.
   * @return the generated strings.
   */
  public static String[] randomFixedLengthNumericString(
      final int min, final int max, final int size, final int length) {
    final String[] result = new String[size];
    final long[] intV = randomLong(min, max, size);
    for (int i = 0; i < size; i++) {
      result[i] = intToString(intV[i], length);
    }
    return result;
  }

  /** Generates random longs in {@code [min, max]}. Assumes the range size fits in an int. */
  public static long[] randomLong(final long min, final long max, final int size) {
    final long[] result = new long[size];
    final long top = max - min + 1;
    for (int i = 0; i < size; i++) {
      result[i] = getRandom().nextInt((int) top) + min;
    }
    return result;
  }

  /** Converts the contents of a {@link TupleBatchBuffer} into a bag of tuples with counts. */
  public static HashMap<Tuple, Integer> tupleBatchToTupleBag(final TupleBatchBuffer tbb) {
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    final Iterator<List<? extends Column<?>>> it = tbb.getAllAsRawColumn().iterator();
    while (it.hasNext()) {
      final List<? extends Column<?>> columns = it.next();
      final int numColumn = columns.size();
      final int numRow = columns.get(0).size();
      for (int row = 0; row < numRow; row++) {
        final Tuple t = new Tuple(numColumn);
        for (int column = 0; column < numColumn; column++) {
          t.set(column, columns.get(column).getObject(row));
        }
        final Integer numOccur = result.get(t);
        if (numOccur == null) {
          result.put(t, 1);
        } else {
          result.put(t, numOccur + 1);
        }
      }
    }
    return result;
  }

  /**
   * @param numTuples how many tuples in output
   * @param sampleSize how many different values should be created at random (around
   *     numTuples/sampleSize duplicates)
   * @param sorted generate sorted tuples, sorted by id
   * @return a buffer filled with {@code numTuples} random (id, name) tuples.
   */
  public static TupleBatchBuffer generateRandomTuples(
      final int numTuples, final int sampleSize, final boolean sorted) {
    final ArrayList<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    final long[] ids = randomLong(0, sampleSize, numTuples);
    final String[] names = randomFixedLengthNumericString(0, sampleSize, numTuples, 20);
    for (int i = 0; i < numTuples; i++) {
      entries.add(new SimpleEntry<Long, String>(ids[i], names[i]));
    }
    Comparator<Entry<Long, String>> comparator = new EntryComparator();
    if (sorted) {
      Collections.sort(entries, comparator);
    }
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    final TupleBatchBuffer tbb = new TupleBatchBuffer(schema);
    for (Entry<Long, String> entry : entries) {
      tbb.putLong(0, entry.getKey());
      tbb.putString(1, entry.getValue());
    }
    return tbb;
  }
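
  /**
   * Illustrative sketch only (not part of the original utilities): the generators above pair
   * naturally with {@link #tupleBatchToTupleBag} to build a reference bag for assertions. Seeding
   * first makes the "random" data reproducible across runs.
   */
  public static HashMap<Tuple, Integer> exampleRandomReferenceBag() {
    setSeed(42);
    final TupleBatchBuffer tbb = generateRandomTuples(1000, 100, true);
    return tupleBatchToTupleBag(tbb);
  }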

  /**
   * Construct a SubQuery that will insert the given tuples (starting on the master) on the
   * specified workers using the specified relation key and distribute function.
   *
   * @param masterSource the source of tuples, from the master.
   * @param dest the name of the relation into which tuples will be inserted (using overwrite!).
   * @param df how tuples will be distributed on the cluster.
   * @param workers the set of workers on which the data will be stored.
   * @return a SubQuery that will insert the given tuples (starting on the master) on the specified
   *     workers using the specified relation key and distribute function.
   */
  public static final SubQuery insertRelation(
      @Nonnull final Operator masterSource,
      @Nonnull final RelationKey dest,
      @Nonnull final DistributeFunction df,
      @Nonnull final Set<Integer> workers) {
    return insertRelation(
        masterSource,
        dest,
        df,
        ArrayUtils.toPrimitive(workers.toArray(new Integer[workers.size()])));
  }

  /**
   * Construct a SubQuery that will insert the given tuples (starting on the master) on the
   * specified workers using the specified relation key and distribute function.
   *
   * @param masterSource the source of tuples, from the master.
   * @param dest the name of the relation into which tuples will be inserted (using overwrite!).
   * @param df how tuples will be distributed on the cluster.
   * @param workers the workers on which the data will be stored.
   * @return a SubQuery that will insert the given tuples (starting on the master) on the specified
   *     workers using the specified relation key and distribute function.
   */
  public static final SubQuery insertRelation(
      @Nonnull final Operator masterSource,
      @Nonnull final RelationKey dest,
      @Nonnull final DistributeFunction df,
      @Nonnull final int[] workers) {
    final ExchangePairID id = ExchangePairID.newID();

    /* Master plan: shuffle the master's tuples out to the workers. */
    GenericShuffleProducer sp =
        new GenericShuffleProducer(masterSource, new ExchangePairID[] {id}, workers, df);
    SubQueryPlan masterPlan = new SubQueryPlan(sp);

    /* Worker plan: receive tuples from the master and insert them (overwriting) into dest. */
    Consumer sc =
        new Consumer(masterSource.getSchema(), id, ImmutableSet.of(MyriaConstants.MASTER_ID));
    DbInsert insert = new DbInsert(sc, dest, true);
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();
    for (int i : workers) {
      workerPlans.put(i, new SubQueryPlan(insert));
    }
    return new SubQuery(masterPlan, workerPlans);
  }

  /**
   * Construct a SubQuery that will fail on the master during initialization. Useful for testing
   * failures.
   */
  public static final SubQuery failOnMasterInit() {
    /* Master plan */
    EOSSource src = new EOSSource();
    Operator fail = new InitFailureInjector(src);
    EmptySink root = new EmptySink(fail);
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();
    return new SubQuery(new SubQueryPlan(root), workerPlans);
  }

  /**
   * Construct a SubQuery that will fail on one worker during initialization. Useful for testing
   * failures.
   */
  public static final SubQuery failOnFirstWorkerInit(@Nonnull final int[] workers) {
    /* Requires at least two workers, so the failure is confined to the first one. */
    Preconditions.checkElementIndex(1, workers.length);

    /* Master plan */
    SubQueryPlan masterPlan = new SubQueryPlan(new EmptySink(new EOSSource()));

    /* Worker plans */
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();

    /* First worker */
    workerPlans.put(
        workers[0], new SubQueryPlan(new EmptySink(new InitFailureInjector(new EOSSource()))));
    return new SubQuery(masterPlan, workerPlans);
  }

  /**
   * Returns a {@link TupleBatchBuffer} containing the values 0 to {@code n-1}. The column is of
   * type {@link Type#INT_TYPE} and the column name is {@code "val"}.
   *
   * @param n the number of values in the buffer.
   * @return a {@link TupleBatchBuffer} containing the values 0 to {@code n-1}
   */
  public static TupleBatchBuffer range(final int n) {
    TupleBatchBuffer sourceBuffer = new TupleBatchBuffer(Schema.ofFields(Type.INT_TYPE, "val"));
    for (int i = 0; i < n; ++i) {
      sourceBuffer.putInt(0, i);
    }
    return sourceBuffer;
  }
}
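
/**
 * Illustrative sketch only, added for exposition; not part of the original file. Shows a minimal
 * way to drive {@link TestUtils#insertRelation}: the relation name below is a hypothetical
 * placeholder, and the {@link DistributeFunction} is taken as a parameter because its concrete
 * subclasses vary across Myria versions.
 */
final class TestUtilsInsertExample {
  /** Utility class; not instantiable. */
  private TestUtilsInsertExample() {}

  /** Builds a plan that creates (or overwrites) an empty relation on the given workers. */
  static SubQuery buildEmptyInsert(final DistributeFunction df, final int[] workers) {
    // An EOSSource emits no tuples, so only the (empty) relation itself is created.
    return TestUtils.insertRelation(
        new EOSSource(), RelationKey.of("test", "test", "empty_relation"), df, workers);
  }
}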