package edu.washington.escience.myria.util;

import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import javax.annotation.Nonnull;

import org.apache.commons.lang.ArrayUtils;
import org.junit.Assert;
import org.junit.Assume;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;

import edu.washington.escience.myria.MyriaConstants;
import edu.washington.escience.myria.RelationKey;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.operator.DbInsert;
import edu.washington.escience.myria.operator.EOSSource;
import edu.washington.escience.myria.operator.EmptySink;
import edu.washington.escience.myria.operator.Operator;
import edu.washington.escience.myria.operator.failures.InitFailureInjector;
import edu.washington.escience.myria.operator.network.Consumer;
import edu.washington.escience.myria.operator.network.GenericShuffleProducer;
import edu.washington.escience.myria.operator.network.distribute.DistributeFunction;
import edu.washington.escience.myria.parallel.ExchangePairID;
import edu.washington.escience.myria.parallel.SubQuery;
import edu.washington.escience.myria.parallel.SubQueryPlan;
import edu.washington.escience.myria.storage.TupleBatchBuffer;

public final class TestUtils {

  public static class EntryComparator implements Comparator<Entry<Long, String>> {
    @Override
    public int compare(final Entry<Long, String> o1, final Entry<Long, String> o2) {
      int res = o1.getKey().compareTo(o2.getKey());
      if (res != 0) {
        return res;
      }
      return o1.getValue().compareTo(o2.getValue());
    }
  }

  private static Random random = null;

  /**
   * See http://docs.travis-ci.com/user/ci-environment/#Environment-variables
   *
   * @return <code>true</code> if the system is currently in a Travis CI build.
   */
  public static boolean inTravis() {
    String travis = System.getenv("TRAVIS");
    return (travis != null) && travis.equals("true");
  }

  /** Only run this test in Travis. */
  public static void requireTravis() {
    Assume.assumeTrue(inTravis());
  }

  /** Skip this test if in Travis. */
  public static void skipIfInTravis() {
    Assume.assumeFalse(inTravis());
  }
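
  /**
   * Illustrative sketch only (not part of the original utilities): the guards above are intended
   * to be the first call in a JUnit test method. A failed {@link Assume} check marks the test as
   * skipped rather than failed. The method below is a hypothetical test body.
   */
  public static void exampleTravisGuardUsage() {
    // e.g. a test that relies on local services unavailable in the Travis sandbox:
    skipIfInTravis();
    // ... the rest of the test body runs only outside Travis ...
  }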

  private static synchronized Random getRandom() {
    if (random == null) {
      random = new Random();
    }
    return random;
  }

  public static void setSeed(final long seed) {
    getRandom().setSeed(seed);
  }

  public static void resetRandom() {
    random = null;
  }

  public static void assertEqualsToStringBuilder(
      final StringBuilder errorMessageHolder,
      final String currentEM,
      final Object expected,
      final Object actual) {
    if (expected == null) {
      if (actual != null) {
        errorMessageHolder.append(currentEM);
        errorMessageHolder.append(": ");
        errorMessageHolder.append("expected: <null>");
        errorMessageHolder.append(" but was: <");
        errorMessageHolder.append(actual);
        errorMessageHolder.append(">\n");
      }
    } else {
      if (!expected.equals(actual)) {
        errorMessageHolder.append(currentEM);
        errorMessageHolder.append(": ");
        errorMessageHolder.append("expected: <");
        errorMessageHolder.append(expected);
        errorMessageHolder.append('>');
        errorMessageHolder.append(" but was: <");
        errorMessageHolder.append(actual);
        errorMessageHolder.append(">\n");
      }
    }
  }

  public static void assertTupleBagEqual(
      final HashMap<Tuple, Integer> expectedResult, final HashMap<Tuple, Integer> actualResult) {
    final StringBuilder errorMessageHolder = new StringBuilder();
    assertEqualsToStringBuilder(
        errorMessageHolder, "Number of unique tuples", expectedResult.size(), actualResult.size());
    final HashSet<Tuple> keySet = new HashSet<Tuple>();
    keySet.addAll(expectedResult.keySet());
    keySet.addAll(actualResult.keySet());
    for (final Tuple k : keySet) {
      Integer expected = expectedResult.get(k);
      Integer actual = actualResult.get(k);
      if (expected == null) {
        expected = 0;
      }
      if (actual == null) {
        actual = 0;
      }
      assertEqualsToStringBuilder(errorMessageHolder, "Tuple entry{" + k + "}", expected, actual);
    }
    if (errorMessageHolder.length() != 0) {
      Assert.fail(errorMessageHolder.toString());
    }
  }

  /** Left-pads the decimal representation of {@code v} with zeros to the given length. */
  public static String intToString(final long v, final int length) {
    final StringBuilder sb = new StringBuilder(String.valueOf(v));
    while (sb.length() < length) {
      sb.insert(0, "0");
    }
    return sb.toString();
  }

  /** Merges a list of tuple bags into a single bag, summing per-tuple occurrence counts. */
  public static HashMap<Tuple, Integer> mergeBags(final List<HashMap<Tuple, Integer>> bags) {
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    result.putAll(bags.get(0));
    for (int i = 1; i < bags.size(); i++) {
      for (final Map.Entry<Tuple, Integer> e : bags.get(i).entrySet()) {
        final Tuple t = e.getKey();
        final Integer occ = e.getValue();
        final Integer existingOcc = result.get(t);
        if (existingOcc == null) {
          result.put(t, occ);
        } else {
          result.put(t, occ + existingOcc);
        }
      }
    }
    return result;
  }
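
  /**
   * Illustrative sketch only (not part of the original utilities): the bag helpers above in
   * action. Merging two bags that each contain one occurrence of the same tuple yields a bag in
   * which that tuple occurs twice. Relies on {@link Tuple} having value-based equals/hashCode,
   * which the helpers above already assume.
   */
  public static void exampleBagMerge() {
    final Tuple t = new Tuple(1);
    t.set(0, 42L);
    final HashMap<Tuple, Integer> bag1 = new HashMap<Tuple, Integer>();
    bag1.put(t, 1);
    final HashMap<Tuple, Integer> bag2 = new HashMap<Tuple, Integer>();
    bag2.put(t, 1);
    final HashMap<Tuple, Integer> expected = new HashMap<Tuple, Integer>();
    expected.put(t, 2);
    assertTupleBagEqual(expected, mergeBags(ImmutableList.of(bag1, bag2)));
  }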

  /** Computes a reference natural join of two tuple buffers, returned as a bag with counts. */
  @SuppressWarnings("rawtypes")
  public static HashMap<Tuple, Integer> naturalJoin(
      final TupleBatchBuffer child1,
      final TupleBatchBuffer child2,
      final int child1JoinColumn,
      final int child2JoinColumn) {
    /* Build phase: hash child1 as join key -> {tuple -> num occur}. */
    final HashMap<Comparable, HashMap<Tuple, Integer>> child1Hash =
        new HashMap<Comparable, HashMap<Tuple, Integer>>();
    int numChild1Column = 0;
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    final List<List<? extends Column<?>>> child1TBIt = child1.getAllAsRawColumn();
    for (final List<? extends Column<?>> child1RawData : child1TBIt) {
      final int numRow = child1RawData.get(0).size();
      final int numColumn = child1RawData.size();
      numChild1Column = numColumn;
      for (int i = 0; i < numRow; i++) {
        final Tuple t = new Tuple(numColumn);
        for (int j = 0; j < numColumn; j++) {
          t.set(j, child1RawData.get(j).getObject(i));
        }
        final Object joinKey = t.get(child1JoinColumn);
        HashMap<Tuple, Integer> tupleOccur = child1Hash.get(joinKey);
        if (tupleOccur == null) {
          tupleOccur = new HashMap<Tuple, Integer>();
          tupleOccur.put(t, 1);
          child1Hash.put((Comparable<?>) joinKey, tupleOccur);
        } else {
          Integer occur = tupleOccur.get(t);
          if (occur == null) {
            occur = 0;
          }
          tupleOccur.put(t, occur + 1);
        }
      }
    }
    /* Probe phase: concatenate each child2 tuple with every matching child1 tuple. */
    final Iterator<List<? extends Column<?>>> child2TBIt = child2.getAllAsRawColumn().iterator();
    while (child2TBIt.hasNext()) {
      final List<? extends Column<?>> child2Columns = child2TBIt.next();
      final int numRow = child2Columns.get(0).size();
      final int numChild2Column = child2Columns.size();
      for (int i = 0; i < numRow; i++) {
        final Object joinKey = child2Columns.get(child2JoinColumn).getObject(i);
        final HashMap<Tuple, Integer> matchedTuples = child1Hash.get(joinKey);
        if (matchedTuples != null) {
          final Tuple child2Tuple = new Tuple(numChild2Column);
          for (int j = 0; j < numChild2Column; j++) {
            child2Tuple.set(j, child2Columns.get(j).getObject(i));
          }
          for (final Entry<Tuple, Integer> entry : matchedTuples.entrySet()) {
            final Tuple child1Tuple = entry.getKey();
            final int numChild1Occur = entry.getValue();
            final Tuple t = new Tuple(numChild1Column + numChild2Column);
            t.setAll(0, child1Tuple);
            t.setAll(numChild1Column, child2Tuple);
            final Integer occur = result.get(t);
            if (occur == null) {
              result.put(t, numChild1Occur);
            } else {
              result.put(t, occur + numChild1Occur);
            }
          }
        }
      }
    }
    return result;
  }

  /** Computes a reference group-by average over a long column, as a bag of (group, avg) tuples. */
  public static HashMap<Tuple, Integer> groupByAvgLongColumn(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, Long> sum = new HashMap<Object, Long>();
    final HashMap<Object, Integer> count = new HashMap<Object, Integer>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        final Long aggValue = (Long) rawData.get(aggColumn).getObject(i);
        Long currentSum = sum.get(groupByValue);
        if (currentSum == null) {
          currentSum = 0L;
          count.put(groupByValue, 1);
        } else {
          count.put(groupByValue, count.get(groupByValue) + 1);
        }
        sum.put(groupByValue, currentSum + aggValue);
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, Long> e : sum.entrySet()) {
      final Object gValue = e.getKey();
      final Long sumV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, sumV * 1.0 / count.get(gValue));
      result.put(t, 1);
    }
    return result;
  }
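
  /**
   * Illustrative sketch only (not part of the original utilities): computing a reference join with
   * {@link #naturalJoin}. Two single-column relations are joined on their only column; the value
   * common to both sides (2) is the sole survivor, emitted as the concatenated tuple (2, 2).
   */
  public static HashMap<Tuple, Integer> exampleNaturalJoin() {
    final Schema schema = new Schema(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of("id"));
    final TupleBatchBuffer left = new TupleBatchBuffer(schema);
    left.putLong(0, 1L);
    left.putLong(0, 2L);
    final TupleBatchBuffer right = new TupleBatchBuffer(schema);
    right.putLong(0, 2L);
    right.putLong(0, 3L);
    return naturalJoin(left, right, 0, 0);
  }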

  /** Computes a reference group-by maximum, as a bag of (group, max) tuples. */
  public static <T extends Comparable<T>> HashMap<Tuple, Integer> groupByMax(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, T> max = new HashMap<Object, T>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        @SuppressWarnings("unchecked")
        final T aggValue = (T) rawData.get(aggColumn).getObject(i);
        final T currentMax = max.get(groupByValue);
        if (currentMax == null) {
          max.put(groupByValue, aggValue);
        } else if (aggValue.compareTo(currentMax) > 0) {
          max.put(groupByValue, aggValue);
        }
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, T> e : max.entrySet()) {
      final Object gValue = e.getKey();
      final T maxV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, maxV);
      result.put(t, 1);
    }
    return result;
  }

  /** Computes a reference group-by minimum, as a bag of (group, min) tuples. */
  public static <T extends Comparable<T>> HashMap<Tuple, Integer> groupByMin(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, T> min = new HashMap<Object, T>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        @SuppressWarnings("unchecked")
        final T aggValue = (T) rawData.get(aggColumn).getObject(i);
        final T currentMin = min.get(groupByValue);
        if (currentMin == null) {
          min.put(groupByValue, aggValue);
        } else if (aggValue.compareTo(currentMin) < 0) {
          min.put(groupByValue, aggValue);
        }
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, T> e : min.entrySet()) {
      final Object gValue = e.getKey();
      final T minV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, minV);
      result.put(t, 1);
    }
    return result;
  }
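
  /**
   * Illustrative sketch only (not part of the original utilities): the min/max reference
   * aggregators above are generic in the aggregate column's type, so callers pin {@code T}
   * explicitly. This hypothetical helper assumes column 1 of {@code source} holds {@code Long}
   * values.
   */
  public static HashMap<Tuple, Integer> exampleGroupByMax(final TupleBatchBuffer source) {
    // Group by column 0, taking the maximum Long in column 1 of each group.
    return TestUtils.<Long>groupByMax(source, 0, 1);
  }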

  /** Computes a reference group-by sum over a long column, as a bag of (group, sum) tuples. */
  public static HashMap<Tuple, Integer> groupBySumLongColumn(
      final TupleBatchBuffer source, final int groupByColumn, final int aggColumn) {
    final List<List<? extends Column<?>>> tbs = source.getAllAsRawColumn();
    final HashMap<Object, Long> sum = new HashMap<Object, Long>();
    for (final List<? extends Column<?>> rawData : tbs) {
      final int numTuples = rawData.get(0).size();
      for (int i = 0; i < numTuples; i++) {
        final Object groupByValue = rawData.get(groupByColumn).getObject(i);
        final Long aggValue = (Long) rawData.get(aggColumn).getObject(i);
        Long currentSum = sum.get(groupByValue);
        if (currentSum == null) {
          currentSum = 0L;
        }
        sum.put(groupByValue, currentSum + aggValue);
      }
    }
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    for (final Map.Entry<Object, Long> e : sum.entrySet()) {
      final Object gValue = e.getKey();
      final Long sumV = e.getValue();
      final Tuple t = new Tuple(2);
      t.set(0, (Comparable<?>) gValue);
      t.set(1, sumV);
      result.put(t, 1);
    }
    return result;
  }

  /**
   * Generates random numeric strings, zero-padded to a fixed length.
   *
   * @param min the minimum numeric value (inclusive).
   * @param max the maximum numeric value (inclusive).
   * @param size how many strings to generate.
   * @param length the length to which each string is zero-padded.
   * @return the generated strings.
   */
  public static String[] randomFixedLengthNumericString(
      final int min, final int max, final int size, final int length) {
    final String[] result = new String[size];
    final long[] intV = randomLong(min, max, size);
    for (int i = 0; i < size; i++) {
      result[i] = intToString(intV[i], length);
    }
    return result;
  }

  /** Generates random longs in {@code [min, max]}. Assumes the range size fits in an int. */
  public static long[] randomLong(final long min, final long max, final int size) {
    final long[] result = new long[size];
    final long top = max - min + 1;
    for (int i = 0; i < size; i++) {
      result[i] = getRandom().nextInt((int) top) + min;
    }
    return result;
  }

  /** Converts the contents of a {@link TupleBatchBuffer} into a bag of tuples with counts. */
  public static HashMap<Tuple, Integer> tupleBatchToTupleBag(final TupleBatchBuffer tbb) {
    final HashMap<Tuple, Integer> result = new HashMap<Tuple, Integer>();
    final Iterator<List<? extends Column<?>>> it = tbb.getAllAsRawColumn().iterator();
    while (it.hasNext()) {
      final List<? extends Column<?>> columns = it.next();
      final int numColumn = columns.size();
      final int numRow = columns.get(0).size();
      for (int row = 0; row < numRow; row++) {
        final Tuple t = new Tuple(numColumn);
        for (int column = 0; column < numColumn; column++) {
          t.set(column, columns.get(column).getObject(row));
        }
        final Integer numOccur = result.get(t);
        if (numOccur == null) {
          result.put(t, 1);
        } else {
          result.put(t, numOccur + 1);
        }
      }
    }
    return result;
  }

  /**
   * @param numTuples how many tuples in output
   * @param sampleSize how many different values should be created at random (around
   *     numTuples/sampleSize duplicates)
   * @param sorted generate sorted tuples, sorted by id
   * @return a buffer filled with {@code numTuples} random (id, name) tuples.
   */
  public static TupleBatchBuffer generateRandomTuples(
      final int numTuples, final int sampleSize, final boolean sorted) {
    final ArrayList<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    final long[] ids = randomLong(0, sampleSize, numTuples);
    final String[] names = randomFixedLengthNumericString(0, sampleSize, numTuples, 20);
    for (int i = 0; i < numTuples; i++) {
      entries.add(new SimpleEntry<Long, String>(ids[i], names[i]));
    }
    Comparator<Entry<Long, String>> comparator = new EntryComparator();
    if (sorted) {
      Collections.sort(entries, comparator);
    }
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    final TupleBatchBuffer tbb = new TupleBatchBuffer(schema);
    for (Entry<Long, String> entry : entries) {
      tbb.putLong(0, entry.getKey());
      tbb.putString(1, entry.getValue());
    }
    return tbb;
  }
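
  /**
   * Illustrative sketch only (not part of the original utilities): the generators above pair
   * naturally with {@link #tupleBatchToTupleBag} to build a reference bag for assertions. Seeding
   * first makes the "random" data reproducible across runs.
   */
  public static HashMap<Tuple, Integer> exampleRandomReferenceBag() {
    setSeed(42);
    final TupleBatchBuffer tbb = generateRandomTuples(1000, 100, true);
    return tupleBatchToTupleBag(tbb);
  }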

  /**
   * Construct a SubQuery that will insert the given tuples (starting on the master) on the
   * specified workers using the specified relation key and distribute function.
   *
   * @param masterSource the source of tuples, from the master.
   * @param dest the name of the relation into which tuples will be inserted (using overwrite!).
   * @param df how tuples will be distributed on the cluster.
   * @param workers the set of workers on which the data will be stored.
   * @return a SubQuery that will insert the given tuples (starting on the master) on the specified
   *     workers using the specified relation key and distribute function.
   */
  public static final SubQuery insertRelation(
      @Nonnull final Operator masterSource,
      @Nonnull final RelationKey dest,
      @Nonnull final DistributeFunction df,
      @Nonnull final Set<Integer> workers) {
    return insertRelation(
        masterSource,
        dest,
        df,
        ArrayUtils.toPrimitive(workers.toArray(new Integer[workers.size()])));
  }

  /**
   * Construct a SubQuery that will insert the given tuples (starting on the master) on the
   * specified workers using the specified relation key and distribute function.
   *
   * @param masterSource the source of tuples, from the master.
   * @param dest the name of the relation into which tuples will be inserted (using overwrite!).
   * @param df how tuples will be distributed on the cluster.
   * @param workers the workers on which the data will be stored.
   * @return a SubQuery that will insert the given tuples (starting on the master) on the specified
   *     workers using the specified relation key and distribute function.
   */
  public static final SubQuery insertRelation(
      @Nonnull final Operator masterSource,
      @Nonnull final RelationKey dest,
      @Nonnull final DistributeFunction df,
      @Nonnull final int[] workers) {
    final ExchangePairID id = ExchangePairID.newID();

    /* Master plan: shuffle the master's tuples out to the workers. */
    GenericShuffleProducer sp =
        new GenericShuffleProducer(masterSource, new ExchangePairID[] {id}, workers, df);
    SubQueryPlan masterPlan = new SubQueryPlan(sp);

    /* Worker plan: receive tuples from the master and insert them (overwriting) into dest. */
    Consumer sc =
        new Consumer(masterSource.getSchema(), id, ImmutableSet.of(MyriaConstants.MASTER_ID));
    DbInsert insert = new DbInsert(sc, dest, true);
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();
    for (int i : workers) {
      workerPlans.put(i, new SubQueryPlan(insert));
    }
    return new SubQuery(masterPlan, workerPlans);
  }

  /**
   * Construct a SubQuery that will fail on the master during initialization. Useful for testing
   * failures.
   */
  public static final SubQuery failOnMasterInit() {
    /* Master plan */
    EOSSource src = new EOSSource();
    Operator fail = new InitFailureInjector(src);
    EmptySink root = new EmptySink(fail);
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();
    return new SubQuery(new SubQueryPlan(root), workerPlans);
  }

  /**
   * Construct a SubQuery that will fail on one worker during initialization. Useful for testing
   * failures.
   */
  public static final SubQuery failOnFirstWorkerInit(@Nonnull final int[] workers) {
    /* Requires at least two workers, so the failure is confined to the first one. */
    Preconditions.checkElementIndex(1, workers.length);

    /* Master plan */
    SubQueryPlan masterPlan = new SubQueryPlan(new EmptySink(new EOSSource()));

    /* Worker plans */
    Map<Integer, SubQueryPlan> workerPlans = Maps.newHashMap();

    /* First worker */
    workerPlans.put(
        workers[0], new SubQueryPlan(new EmptySink(new InitFailureInjector(new EOSSource()))));
    return new SubQuery(masterPlan, workerPlans);
  }

  /**
   * Returns a {@link TupleBatchBuffer} containing the values 0 to {@code n-1}. The column is of
   * type {@link Type#INT_TYPE} and the column name is {@code "val"}.
   *
   * @param n the number of values in the buffer.
   * @return a {@link TupleBatchBuffer} containing the values 0 to {@code n-1}
   */
  public static TupleBatchBuffer range(final int n) {
    TupleBatchBuffer sourceBuffer = new TupleBatchBuffer(Schema.ofFields(Type.INT_TYPE, "val"));
    for (int i = 0; i < n; ++i) {
      sourceBuffer.putInt(0, i);
    }
    return sourceBuffer;
  }
}
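
/**
 * Illustrative sketch only, added for exposition; not part of the original file. Shows a minimal
 * way to drive {@link TestUtils#insertRelation}: the relation name below is a hypothetical
 * placeholder, and the {@link DistributeFunction} is taken as a parameter because its concrete
 * subclasses vary across Myria versions.
 */
final class TestUtilsInsertExample {
  /** Utility class; not instantiable. */
  private TestUtilsInsertExample() {}

  /** Builds a plan that creates (or overwrites) an empty relation on the given workers. */
  static SubQuery buildEmptyInsert(final DistributeFunction df, final int[] workers) {
    // An EOSSource emits no tuples, so only the (empty) relation itself is created.
    return TestUtils.insertRelation(
        new EOSSource(), RelationKey.of("test", "test", "empty_relation"), df, workers);
  }
}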