/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.tez;
import static org.junit.Assert.*;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcSplit;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TestHostAffinitySplitLocationProvider {
/** Logger shared by all tests; static so it is looked up once rather than per test instance. */
private static final Logger LOG = LoggerFactory.getLogger(TestHostAffinitySplitLocationProvider.class);
/** Host names reported by the mocked splits themselves: "location0" .. "location4". */
private static final List<String> locations = new ArrayList<>();
/** Same content as {@link #locations}, kept as a set for membership checks. */
private static final Set<String> locationsSet = new HashSet<>();
/** Host names handed to the provider under test: "execLocation0" .. "execLocation8". */
private static final List<String> executorLocations = new ArrayList<>();
/** Same content as {@link #executorLocations}, kept as a set for membership checks. */
private static final Set<String> executorLocationsSet = new HashSet<>();
static {
  // The two name families use different prefixes, so the sets never overlap.
  for (int i = 0; i < 5; i++) {
    String location = "location" + i;
    locations.add(location);
    locationsSet.add(location);
  }
  for (int i = 0; i < 9; i++) {
    String executorLocation = "execLocation" + i;
    executorLocations.add(executorLocation);
    executorLocationsSet.add(executorLocation);
  }
}
/**
 * Checks that a plain mocked {@link InputSplit} (not a file split) gets back exactly the
 * locations it reports itself, even though the provider was built with the executor locations.
 */
@Test (timeout = 5000)
public void testNonFileSplits() throws IOException {
  HostAffinitySplitLocationProvider provider = new HostAffinitySplitLocationProvider(executorLocations);

  String[] firstHosts = {locations.get(0), locations.get(1)};
  String[] secondHosts = {locations.get(2), locations.get(3)};
  // The mocks get their own copies so the expected arrays are distinct objects.
  InputSplit firstSplit = createMockInputSplit(firstHosts.clone());
  InputSplit secondSplit = createMockInputSplit(secondHosts.clone());

  assertArrayEquals(firstHosts, provider.getLocations(firstSplit));
  assertArrayEquals(secondHosts, provider.getLocations(secondSplit));
}
/**
 * Verifies that every mocked ORC split is assigned exactly one location, and that this location
 * comes from the executor locations given to the provider rather than from the locations the
 * split reports.
 */
@Test (timeout = 5000)
public void testOrcSplitsBasic() throws IOException {
  HostAffinitySplitLocationProvider provider = new HostAffinitySplitLocationProvider(executorLocations);
  InputSplit[] orcSplits = {
      createMockFileSplit(true, "path1", 0, 1000, new String[] {locations.get(0), locations.get(1)}),
      createMockFileSplit(true, "path2", 0, 2000, new String[] {locations.get(2), locations.get(3)}),
      createMockFileSplit(true, "path3", 1000, 2000, new String[] {locations.get(0), locations.get(3)})
  };

  // Resolve all splits up front; the checks below only inspect the results.
  String[][] assigned = new String[orcSplits.length][];
  for (int i = 0; i < orcSplits.length; i++) {
    assigned[i] = provider.getLocations(orcSplits[i]);
  }

  for (String[] hosts : assigned) {
    assertEquals(1, hosts.length);
    assertFalse(locationsSet.contains(hosts[0]));
    assertTrue(executorLocationsSet.contains(hosts[0]));
  }
}
/**
 * Grows the node list one entry at a time (from MIN_LOC_COUNT up to LOC_COUNT) and, for every
 * step, counts how many splits changed location and how many landed on the newly added node.
 * The test fails when the averaged or worst-case ratios leave the tolerated range.
 */
@Test (timeout = 10000)
public void testConsistentHashing() throws IOException {
  final int LOC_COUNT = 20, MIN_LOC_COUNT = 4, SPLIT_COUNT = 100;
  List<String> allNodes = createLocations(LOC_COUNT);
  InputSplit[] splits = createSplits(SPLIT_COUNT);
  StringBuilder failBuilder = new StringBuilder("\n");
  // Location each split had in the previous iteration.
  String[] previousAssignment = new String[splits.length];
  double movedRatioSum = 0;
  double newRatioSum = 0;
  double movedRatioWorst = 0;
  double newRatioWorst = Double.MAX_VALUE;

  for (int locs = MIN_LOC_COUNT; locs <= allNodes.size(); ++locs) {
    List<String> activeNodes = allNodes.subList(0, locs);
    HostAffinitySplitLocationProvider lp = new HostAffinitySplitLocationProvider(activeNodes);
    // The first pass only records a baseline; there is no earlier assignment to compare against.
    final boolean isBaseline = (locs == MIN_LOC_COUNT);
    String newestNode = activeNodes.get(locs - 1);
    int moved = 0;
    int newLoc = 0;

    for (int splitIx = 0; splitIx < splits.length; ++splitIx) {
      String[] splitLocations = lp.getLocations(splits[splitIx]);
      assertEquals(1, splitLocations.length);
      String current = splitLocations[0];
      if (!isBaseline && !current.equals(previousAssignment[splitIx])) {
        ++moved;
      }
      if (newestNode.equals(current)) {
        ++newLoc;
      }
      previousAssignment[splitIx] = current;
    }

    if (!isBaseline) {
      String msgTail = " when going to " + locs + " locations";
      String movedMsg = moved + " splits moved";
      String newMsg = newLoc + " splits went to the new node";
      LOG.info(movedMsg + " and " + newMsg + msgTail);

      // An even share of the splits per node; computed in float and widened afterwards.
      final float evenShare = 1.0f * splits.length / locs;
      double maxMoved = evenShare;
      double minNew = evenShare;
      double movedRatio = moved / maxMoved;
      double newRatio = newLoc / minNew;

      movedRatioSum += movedRatio;
      movedRatioWorst = Math.max(movedRatio, movedRatioWorst);
      newRatioSum += newRatio;
      newRatioWorst = Math.min(newRatio, newRatioWorst);
      logBadRatios(failBuilder, moved, newLoc, msgTail, movedMsg, newMsg, maxMoved, minNew);
    }
  }

  // Number of growth steps that were actually measured (the baseline pass is excluded).
  int count = allNodes.size() - MIN_LOC_COUNT;
  double moveRatioAvg = movedRatioSum / count;
  double newRatioAvg = newRatioSum / count;
  String errorMsg = "Move counts: average " + moveRatioAvg + ", worst " + movedRatioWorst
      + "; assigned to new node: average " + newRatioAvg + ", worst " + newRatioWorst;
  LOG.info(errorMsg);

  // The thresholds are deliberately generous: with this few nodes and splits, consistent
  // hashing is very imprecise.
  boolean averageOutOfRange = moveRatioAvg > 1.2f || newRatioAvg < 0.8f;
  boolean worstOutOfRange = movedRatioWorst > 1.67f || newRatioWorst < 0.5f;
  if (averageOutOfRange || worstOutOfRange) {
    fail(errorMsg + "; example failures: " + failBuilder.toString());
  }
}
/**
 * Builds {@code splitCount} mocked ORC file splits. Split {@code i} uses the path
 * {@code "path" + i}, start offset 0, length 1000 and an empty location array.
 *
 * @param splitCount number of splits to create
 * @return the mocked splits, in index order
 */
public FileSplit[] createSplits(final int splitCount) throws IOException {
  final FileSplit[] result = new FileSplit[splitCount];
  int index = 0;
  while (index < result.length) {
    result[index] = createMockFileSplit(true, "path" + index, 0, 1000, new String[0]);
    index++;
  }
  return result;
}
/**
 * Creates a list of node names that are simply the decimal numbers {@code 0 .. locCount - 1}.
 *
 * @param locCount number of names to generate
 * @return a new mutable list holding the names in ascending order
 */
public List<String> createLocations(final int locCount) {
  final List<String> names = new ArrayList<>(locCount);
  int next = 0;
  while (next < locCount) {
    names.add(Integer.toString(next));
    next++;
  }
  return names;
}
/**
 * For every location count from LOC_COUNT_FROM to LOC_COUNT_TO, runs
 * {@code testHashDistribution} with 0 .. MAX_MISS_COUNT misses and checks two things: that only
 * a few individual distributions were flagged, and that the coefficient of variation does not
 * keep exceeding its running average as the miss count grows.
 */
@Test (timeout = 20000)
public void testConsistentHashingFallback() throws IOException {
  final int LOC_COUNT_TO = 20, SPLIT_COUNT = 500, MAX_MISS_COUNT = 4,
      LOC_COUNT_FROM = MAX_MISS_COUNT + 1;
  FileSplit[] splits = createSplits(SPLIT_COUNT);
  // Incremented by testHashDistribution for each distribution it rejects.
  AtomicInteger errorCount = new AtomicInteger(0);
  int cvErrorCount = 0;

  for (int locs = LOC_COUNT_FROM; locs <= LOC_COUNT_TO; ++locs) {
    double[] cvs = new double[MAX_MISS_COUNT + 1];
    double runningSum = 0;
    int aboveAvgCount = 0;
    for (int missCount = 0; missCount <= MAX_MISS_COUNT; ++missCount) {
      double cv = testHashDistribution(locs, missCount, splits, errorCount);
      cvs[missCount] = cv;
      runningSum += cv;
      // The running mean includes the current value; the zero-miss run is never counted.
      double runningMean = runningSum / (missCount + 1);
      if (missCount > 0 && cv > runningMean) {
        ++aboveAvgCount;
      }
    }
    if (aboveAvgCount > 2) {
      LOG.info("CVs for " + locs + " locations aren't to our liking: " + Arrays.toString(cvs));
      ++cvErrorCount;
    }
  }

  int distributionErrors = errorCount.get();
  assertTrue("Found " + distributionErrors + " abnormalities", distributionErrors < 3);
  // TODO: the way hash functions are added does exhibit some irregularities.
  // The 3rd iteration seems to have a better distribution in many cases, even better than
  // the original hash. That trips the "above moving average" criterion even if the rest is flat.
  assertTrue("Found " + cvErrorCount + " abnormalities", cvErrorCount < 7);
}
/**
 * Runs {@code HostAffinitySplitLocationProvider.determineLocation} for every split against a
 * mocked location list and evaluates how evenly the returned indexes are spread.
 *
 * @param locs size reported by the mocked location list
 * @param missCount zero-based ordinal of the only {@code get()} call per split that returns a
 *        non-null value; all other calls return null
 * @param splits splits whose path and start offset are fed to {@code determineLocation}
 * @param errorCount incremented once if the distribution is rejected
 * @return the coefficient of variation (stdev / mean) of the per-index hit counts
 */
private double testHashDistribution(int locs, final int missCount, FileSplit[] splits,
    AtomicInteger errorCount) {
  // Only size() and get() are stubbed, so this depends on determineLocation using nothing else
  // from the list. A wrapper exposing just those two methods would make that explicit.
  @SuppressWarnings("unchecked")
  List<String> partLocs = (List<String>)Mockito.mock(List.class);
  Mockito.when(partLocs.size()).thenReturn(locs);
  // Counts get() calls made for the current split.
  final AtomicInteger lookupsSoFar = new AtomicInteger(0);
  Mockito.when(partLocs.get(Mockito.anyInt())).thenAnswer(new Answer<String>() {
    @Override
    public String answer(InvocationOnMock invocation) throws Throwable {
      int lookup = lookupsSoFar.getAndIncrement();
      if (lookup == missCount) {
        return "not-null";
      }
      return null;
    }
  });

  int[] hitCounts = new int[locs];
  for (FileSplit split : splits) {
    lookupsSoFar.set(0);
    int index = HostAffinitySplitLocationProvider.determineLocation(partLocs,
        split.getPath().toString(), split.getStart(), null);
    ++hitCounts[index];
  }

  SummaryStatistics ss = new SummaryStatistics();
  for (int hits : hitCounts) {
    ss.addValue(hits);
  }

  // The acceptance limits below were chosen by eyeballing the output until it looked
  // reasonable, so they work much like a golden file. That the spread does not grow with
  // missCount is checked by the caller, not here.
  double avg = ss.getSum()/ss.getN();
  double stdev = ss.getStandardDeviation();
  double cv = stdev/avg;
  double allowedMin = avg - 2.5 * stdev;
  double allowedMax = avg + 2.5 * stdev;
  boolean hasOutlier = ss.getMin() < allowedMin || ss.getMax() > allowedMax;
  boolean tooSpreadOut = cv > 0.22;
  if (hasOutlier || tooSpreadOut) {
    LOG.info("The distribution for " + locs + " locations, " + missCount + " misses isn't to "
        + "our liking: avg " + avg + ", stdev " + stdev + ", cv " + cv + ", min " + ss.getMin()
        + ", max " + ss.getMax());
    errorCount.incrementAndGet();
  }
  return cv;
}
/**
 * Appends a note to {@code failBuilder} when a growth step moved noticeably more splits than
 * {@code maxMoved} (over 1.33x) or sent noticeably fewer splits to the new node than
 * {@code minNew} (under 0.75x). Nothing is appended when both values are within bounds.
 *
 * @param failBuilder collector for the example failures
 * @param moved number of splits that changed location
 * @param newLoc number of splits assigned to the new node
 * @param msgTail text appended once after any notes written for this step
 * @param movedMsg pre-built description of {@code moved}
 * @param newMsg pre-built description of {@code newLoc}
 * @param maxMoved reference value for {@code moved}
 * @param minNew reference value for {@code newLoc}
 */
private void logBadRatios(StringBuilder failBuilder, int moved, int newLoc, String msgTail,
    String movedMsg, String newMsg, double maxMoved, double minNew) {
  final boolean tooManyMoved = moved > maxMoved * 1.33f;
  final boolean tooFewNew = newLoc < minNew * 0.75f;
  if (!tooManyMoved && !tooFewNew) {
    return;
  }
  if (tooManyMoved) {
    failBuilder.append(movedMsg + " (threshold " + maxMoved + ") ");
  }
  if (tooFewNew) {
    failBuilder.append(newMsg + " (threshold " + minNew + ") ");
  }
  failBuilder.append(msgTail + ";\n");
}
/**
 * Checks how path, start offset and length influence the location chosen for ORC splits:
 * two splits of the same file with the same start but different lengths must share a location,
 * a split of that file with a different start must get another one, and repeated lookups must
 * return the same answer.
 */
@Test (timeout = 5000)
public void testOrcSplitsLocationAffinity() throws IOException {
  HostAffinitySplitLocationProvider provider = new HostAffinitySplitLocationProvider(executorLocations);

  InputSplit[] splits = {
      // Same file and start offset, different lengths.
      createMockFileSplit(true, "path1", 0, 15000, new String[] {locations.get(0), locations.get(1)}),
      createMockFileSplit(true, "path1", 0, 30000, new String[] {locations.get(0), locations.get(1)}),
      // Same file, different start offset.
      createMockFileSplit(true, "path1", 15000, 30000, new String[] {locations.get(0), locations.get(1)})
  };

  String[][] firstPass = new String[splits.length][];
  for (int i = 0; i < splits.length; i++) {
    firstPass[i] = provider.getLocations(splits[i]);
  }
  for (String[] hosts : firstPass) {
    assertEquals(1, hosts.length);
    assertFalse(locationsSet.contains(hosts[0]));
    assertTrue(executorLocationsSet.contains(hosts[0]));
  }

  // Check the concrete hosts. The third split is expected to hash to a different host than the
  // first two. Assignments are supposed to be consistent across JVMs, so if this ever fails
  // because the hosts coincide, that cross-JVM consistency assumption is probably wrong.
  assertEquals(firstPass[0][0], firstPass[1][0]);
  assertNotEquals(firstPass[0][0], firstPass[2][0]);

  // Ask again and make sure the answers have not changed.
  String[][] secondPass = new String[splits.length][];
  for (int i = 0; i < splits.length; i++) {
    secondPass[i] = provider.getLocations(splits[i]);
  }
  for (int i = 0; i < splits.length; i++) {
    assertArrayEquals(firstPass[i], secondPass[i]);
  }
}
/**
 * Creates a Mockito mock of {@link InputSplit} whose {@code getLocations()} returns the given
 * array.
 *
 * @param locations value to return from {@code getLocations()}
 * @return the stubbed mock
 */
private InputSplit createMockInputSplit(String[] locations) throws IOException {
  final InputSplit split = mock(InputSplit.class);
  Mockito.when(split.getLocations()).thenReturn(locations);
  return split;
}
/**
 * Creates a Mockito mock of a file split with its start, length, path and locations stubbed.
 *
 * @param createOrcSplit when true the mock is an {@link OrcSplit}, otherwise a plain
 *        {@link FileSplit}
 * @param fakePathString string used to build the {@link Path} returned by {@code getPath()}
 * @param start value returned by {@code getStart()}
 * @param length value returned by {@code getLength()}
 * @param locations value returned by {@code getLocations()}
 * @return the stubbed mock
 */
private FileSplit createMockFileSplit(boolean createOrcSplit, String fakePathString, long start,
    long length, String[] locations) throws IOException {
  FileSplit fileSplit = createOrcSplit ? mock(OrcSplit.class) : mock(FileSplit.class);

  doReturn(start).when(fileSplit).getStart();
  doReturn(length).when(fileSplit).getLength();
  doReturn(new Path(fakePathString)).when(fileSplit).getPath();
  // Stubbed once; repeating the identical stubbing adds nothing.
  doReturn(locations).when(fileSplit).getLocations();
  return fileSplit;
}
}