/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.test.mapreduce;
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static java.lang.System.currentTimeMillis;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.ClientConfiguration;
import org.apache.accumulo.core.client.ClientConfiguration.ClientProperty;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.admin.NewTableConfiguration;
import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.RangeInputSplit;
import org.apache.accumulo.core.client.mapreduce.impl.BatchInputSplit;
import org.apache.accumulo.core.client.sample.RowSampler;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.ConfigurationCopy;
import org.apache.accumulo.core.conf.DefaultConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.harness.AccumuloClusterHarness;
import org.apache.accumulo.minicluster.impl.MiniAccumuloConfigImpl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
public class AccumuloInputFormatIT extends AccumuloClusterHarness {
// Input format instance used by testGetSplits(); replaced with a new instance by before() ahead of each test.
AccumuloInputFormat inputFormat;
/**
 * Returns 240, i.e. four minutes expressed in seconds.
 */
@Override
protected int defaultTimeoutSeconds() {
  return (int) TimeUnit.MINUTES.toSeconds(4);
}
/**
 * Configures the mini cluster with a single tablet server.
 *
 * @param cfg
 *          mini cluster configuration to adjust
 * @param hadoopCoreSite
 *          Hadoop configuration; not modified by this override
 */
@Override
public void configureMiniCluster(MiniAccumuloConfigImpl cfg, Configuration hadoopCoreSite) {
  final int tabletServers = 1;
  cfg.setNumTservers(tabletServers);
}
/**
 * Creates a fresh {@link AccumuloInputFormat} ahead of every test method.
 */
@Before
public void before() {
  this.inputFormat = new AccumuloInputFormat();
}
/**
 * Tests several different paths through the getSplits() method by setting different properties and verifying the results.
 * <p>
 * Covered in order: no ranges set, one range per split point, offline scans (before and after the table is actually taken offline), auto-adjusted versus
 * unadjusted overlapping ranges, and the option combinations that must be rejected once batch scanning is enabled (offline scan, scan isolation, local
 * iterators). Finally checks that batch scanning yields {@link BatchInputSplit} instances.
 */
@Test
public void testGetSplits() throws Exception {
  Connector conn = getConnector();
  String table = getUniqueNames(1)[0];
  conn.tableOperations().create(table);
  insertData(table, currentTimeMillis());

  ClientConfiguration clientConf = cluster.getClientConfig();
  AccumuloConfiguration clusterClientConf = new ConfigurationCopy(DefaultConfiguration.getInstance());

  // Pass SSL and CredentialProvider options into the ClientConfiguration given to AccumuloInputFormat
  boolean sslEnabled = Boolean.parseBoolean(clusterClientConf.get(Property.INSTANCE_RPC_SSL_ENABLED));
  if (sslEnabled) {
    ClientProperty[] sslProperties = new ClientProperty[] {ClientProperty.INSTANCE_RPC_SSL_ENABLED, ClientProperty.INSTANCE_RPC_SSL_CLIENT_AUTH,
        ClientProperty.RPC_SSL_KEYSTORE_PATH, ClientProperty.RPC_SSL_KEYSTORE_TYPE, ClientProperty.RPC_SSL_KEYSTORE_PASSWORD,
        ClientProperty.RPC_SSL_TRUSTSTORE_PATH, ClientProperty.RPC_SSL_TRUSTSTORE_TYPE, ClientProperty.RPC_SSL_TRUSTSTORE_PASSWORD,
        ClientProperty.RPC_USE_JSSE, ClientProperty.GENERAL_SECURITY_CREDENTIAL_PROVIDER_PATHS};

    for (ClientProperty prop : sslProperties) {
      // The default property is returned if it's not in the ClientConfiguration so we don't have to check if the value is actually defined
      clientConf.setProperty(prop, clusterClientConf.get(prop.getKey()));
    }
  }

  Job job = Job.getInstance();
  AccumuloInputFormat.setInputTableName(job, table);
  AccumuloInputFormat.setZooKeeperInstance(job, clientConf);
  AccumuloInputFormat.setConnectorInfo(job, getAdminPrincipal(), getAdminToken());

  // split table
  TreeSet<Text> splitsToAdd = new TreeSet<>();
  for (int i = 0; i < 10000; i += 1000)
    splitsToAdd.add(new Text(String.format("%09d", i)));
  conn.tableOperations().addSplits(table, splitsToAdd);
  sleepUninterruptibly(500, TimeUnit.MILLISECONDS); // wait for splits to be propagated

  // get splits without setting any range
  Collection<Text> actualSplits = conn.tableOperations().listSplits(table);
  List<InputSplit> splits = inputFormat.getSplits(job);
  assertEquals(actualSplits.size() + 1, splits.size()); // No ranges set on the job so it'll start with -inf

  // set ranges and get splits
  List<Range> ranges = new ArrayList<>();
  for (Text text : actualSplits)
    ranges.add(new Range(text));
  AccumuloInputFormat.setRanges(job, ranges);
  splits = inputFormat.getSplits(job);
  assertEquals(actualSplits.size(), splits.size());

  // offline mode
  AccumuloInputFormat.setOfflineTableScan(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IOException expected) {
    // expected: an offline scan was requested but the table has not been taken offline yet
  }

  conn.tableOperations().offline(table, true);
  splits = inputFormat.getSplits(job);
  assertEquals(actualSplits.size(), splits.size());

  // auto adjust ranges
  ranges = new ArrayList<>();
  for (int i = 0; i < 5; i++)
    // overlapping ranges
    ranges.add(new Range(String.format("%09d", i), String.format("%09d", i + 2)));
  AccumuloInputFormat.setRanges(job, ranges);
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  AccumuloInputFormat.setAutoAdjustRanges(job, false);
  splits = inputFormat.getSplits(job);
  assertEquals(ranges.size(), splits.size());

  // BatchScan not available for offline scans
  AccumuloInputFormat.setBatchScan(job, true);
  // Reset auto-adjust ranges too
  AccumuloInputFormat.setAutoAdjustRanges(job, true);

  AccumuloInputFormat.setOfflineTableScan(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan combined with offline scan is rejected
  }

  conn.tableOperations().online(table, true);
  AccumuloInputFormat.setOfflineTableScan(job, false);

  // test for resumption of success
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  // BatchScan not available with isolated iterators
  AccumuloInputFormat.setScanIsolation(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan combined with scan isolation is rejected
  }
  AccumuloInputFormat.setScanIsolation(job, false);

  // test for resumption of success
  splits = inputFormat.getSplits(job);
  assertEquals(2, splits.size());

  // BatchScan not available with local iterators
  AccumuloInputFormat.setLocalIterators(job, true);
  try {
    inputFormat.getSplits(job);
    fail("An exception should have been thrown");
  } catch (IllegalArgumentException expected) {
    // expected: batch scan combined with local iterators is rejected
  }
  AccumuloInputFormat.setLocalIterators(job, false);

  // Check we are getting back correct type of split
  conn.tableOperations().online(table);
  splits = inputFormat.getSplits(job);
  for (InputSplit split : splits) {
    // Use a JUnit assertion rather than the Java `assert` keyword, which is silently skipped unless the JVM runs with -ea
    Assert.assertTrue("Expected a BatchInputSplit but got " + split.getClass().getName(), split instanceof BatchInputSplit);
  }

  // We should divide along the tablet lines similar to when using `setAutoAdjustRanges(job, true)`
  assertEquals(2, splits.size());
}
/**
 * Writes 10,000 rows to the given table. Row {@code i} uses {@code i} zero-padded to nine decimal digits as its row id and holds a single entry in column
 * {@code cf1:cq1} with the supplied timestamp and the decimal string of {@code i} as its value.
 *
 * @param tableName
 *          table to write to
 * @param ts
 *          timestamp applied to every entry
 * @throws AccumuloException
 *           if obtaining the connector, writing or closing the writer fails
 * @throws AccumuloSecurityException
 *           if the connector cannot be obtained with the current credentials
 * @throws TableNotFoundException
 *           if the table does not exist
 */
private void insertData(String tableName, long ts) throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
  // Use an explicit default config, as the other tests in this class do, instead of passing null
  BatchWriter bw = getConnector().createBatchWriter(tableName, new BatchWriterConfig());
  try {
    for (int i = 0; i < 10000; i++) {
      String row = String.format("%09d", i);
      Mutation m = new Mutation(new Text(row));
      m.put(new Text("cf1"), new Text("cq1"), ts, new Value(("" + i).getBytes()));
      bw.addMutation(m);
    }
  } finally {
    // Close the writer even when a mutation fails so it is not leaked for the rest of the test run
    bw.close();
  }
}
// Tracks assertion errors raised inside the map reduce job, keyed by "<table>_map" or "<table>_cleanup".
// MRTester.run() inserts a dummy error for the map and cleanup tasks (to ensure test correctness), so error tests
// should check to see if there is at least one error (could be more depending on the test) rather than zero.
private static Multimap<String,AssertionError> assertionErrors = ArrayListMultimap.create();
/**
 * Map-only job driver used by the tests in this class. It reads a table through the requested input format class and records any assertion failures seen
 * by the mapper in {@code assertionErrors}.
 */
private static class MRTester extends Configured implements Tool {

  // Job configuration key under which the name of the table being read is published to the mapper
  private static final String TABLE_NAME_KEY = "MRTester_tableName";

  /**
   * Mapper that checks each entry against the pattern written by the tests: the n-th entry (counting from zero) has row {@code %09x} of n + 1 and value
   * {@code %09x} of n. Failures are stored in {@code assertionErrors} instead of being thrown.
   */
  private static class TestMapper extends Mapper<Key,Value,Key,Value> {
    Key previousKey = null;
    int seen = 0;

    @Override
    protected void map(Key k, Value v, Context context) throws IOException, InterruptedException {
      final String tableName = context.getConfiguration().get(TABLE_NAME_KEY);
      assertNotNull(tableName);
      try {
        if (previousKey != null) {
          // the value of the current entry must match the row of the previous one
          assertEquals(previousKey.getRow().toString(), new String(v.get()));
        }
        final Text expectedRow = new Text(String.format("%09x", seen + 1));
        assertEquals(k.getRow(), expectedRow);
        final String expectedValue = String.format("%09x", seen);
        assertEquals(new String(v.get()), expectedValue);
      } catch (AssertionError failure) {
        assertionErrors.put(tableName + "_map", failure);
      }
      previousKey = new Key(k);
      seen++;
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      final String tableName = context.getConfiguration().get(TABLE_NAME_KEY);
      assertNotNull(tableName);
      try {
        // every test writes exactly 100 entries
        assertEquals(100, seen);
      } catch (AssertionError failure) {
        assertionErrors.put(tableName + "_cleanup", failure);
      }
    }
  }

  /**
   * Configures and runs the job.
   *
   * @param args
   *          {@code <table> <inputFormatClass>} optionally followed by {@code <batchScan> <scan sample>}
   * @return 0 if the job was successful, 1 otherwise
   * @throws IllegalArgumentException
   *           if the number of arguments is neither two nor four
   */
  @Override
  public int run(String[] args) throws Exception {
    final boolean hasOptionalArgs = args.length == 4;
    if (args.length != 2 && !hasOptionalArgs) {
      throw new IllegalArgumentException("Usage : " + MRTester.class.getName() + " <table> <inputFormatClass> [<batchScan> <scan sample>]");
    }

    final String table = args[0];
    final String inputFormatClassName = args[1];
    // both flags default to false when the optional arguments are absent
    final boolean batchScan = hasOptionalArgs && Boolean.parseBoolean(args[2]);
    final boolean sample = hasOptionalArgs && Boolean.parseBoolean(args[3]);

    // seed one dummy error per key so that tests can tell "no errors recorded" apart from "job never ran"
    assertionErrors.put(table + "_map", new AssertionError("Dummy_map"));
    assertionErrors.put(table + "_cleanup", new AssertionError("Dummy_cleanup"));

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat<?,?>> inputFormatClass = (Class<? extends InputFormat<?,?>>) Class.forName(inputFormatClassName);

    final String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();
    final Job job = Job.getInstance(getConf(), jobName);
    job.setJarByClass(this.getClass());
    job.getConfiguration().set(TABLE_NAME_KEY, table);

    job.setInputFormatClass(inputFormatClass);
    AccumuloInputFormat.setZooKeeperInstance(job, cluster.getClientConfig());
    AccumuloInputFormat.setConnectorInfo(job, getAdminPrincipal(), getAdminToken());
    AccumuloInputFormat.setInputTableName(job, table);
    AccumuloInputFormat.setBatchScan(job, batchScan);
    if (sample) {
      AccumuloInputFormat.setSamplerConfiguration(job, SAMPLER_CONFIG);
    }

    job.setMapperClass(TestMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
      return 0;
    }
    return 1;
  }

  /**
   * Runs the tester through {@link ToolRunner} with the "local" map reduce framework and a local directory under {@code target/mapreduce-tmp}.
   *
   * @param args
   *          arguments forwarded to {@link #run(String[])}
   * @return the exit code returned by {@link ToolRunner#run}
   */
  public static int main(String[] args) throws Exception {
    final File localDir = new File(System.getProperty("user.dir"), "target/mapreduce-tmp");
    final Configuration conf = new Configuration();
    conf.set("mapreduce.framework.name", "local");
    conf.set("mapreduce.cluster.local.dir", localDir.getAbsolutePath());
    return ToolRunner.run(conf, new MRTester(), args);
  }
}
/**
 * Writes 100 entries to a new table, runs the MRTester job over it with the default (two-argument) settings and expects only the dummy errors to be
 * recorded.
 */
@Test
public void testMap() throws Exception {
  final String tableName = getUniqueNames(1)[0];
  final Connector connector = getConnector();
  connector.tableOperations().create(tableName);

  // entry n (1..100) has row %09x of n and value %09x of n - 1
  final BatchWriter writer = connector.createBatchWriter(tableName, new BatchWriterConfig());
  for (int n = 1; n <= 100; n++) {
    final Mutation mutation = new Mutation(new Text(String.format("%09x", n)));
    mutation.put(new Text(), new Text(), new Value(String.format("%09x", n - 1).getBytes()));
    writer.addMutation(mutation);
  }
  writer.close();

  final String[] jobArgs = {tableName, AccumuloInputFormat.class.getName()};
  final int exitCode = MRTester.main(jobArgs);
  assertEquals(0, exitCode);

  // only the dummy error seeded by MRTester.run() is expected under each key
  assertEquals(1, assertionErrors.get(tableName + "_map").size());
  assertEquals(1, assertionErrors.get(tableName + "_cleanup").size());
}
/**
 * Sampler configuration used when creating the table in testSample() and when MRTester enables sampling: a {@link RowSampler} with the "hasher" option set
 * to "murmur3_32" and the "modulus" option set to "3".
 */
private static final SamplerConfiguration SAMPLER_CONFIG = new SamplerConfiguration(RowSampler.class.getName()).addOption("hasher", "murmur3_32").addOption(
"modulus", "3");
/**
 * Runs the MRTester job three times over a table created with sampling enabled: sampled without batch scan, unsampled without batch scan, and sampled with
 * batch scan. The recorded errors are cleared between runs.
 */
@Test
public void testSample() throws Exception {
  final String tableName = getUniqueNames(1)[0];
  final Connector connector = getConnector();
  connector.tableOperations().create(tableName, new NewTableConfiguration().enableSampling(SAMPLER_CONFIG));

  // entry n (1..100) has row %09x of n and value %09x of n - 1
  final BatchWriter writer = connector.createBatchWriter(tableName, new BatchWriterConfig());
  for (int n = 1; n <= 100; n++) {
    final Mutation mutation = new Mutation(new Text(String.format("%09x", n)));
    mutation.put(new Text(), new Text(), new Value(String.format("%09x", n - 1).getBytes()));
    writer.addMutation(mutation);
  }
  writer.close();

  final String inputFormatName = AccumuloInputFormat.class.getName();
  final String mapKey = tableName + "_map";
  final String cleanupKey = tableName + "_cleanup";

  // sampled scan, no batch scan: the mapper's checks fail for many entries
  // (presumably because the sample only contains a subset of the rows - the exact counts are pinned below)
  final String[] sampledArgs = {tableName, inputFormatName, "False", "True"};
  assertEquals(0, MRTester.main(sampledArgs));
  assertEquals(39, assertionErrors.get(mapKey).size());
  assertEquals(2, assertionErrors.get(cleanupKey).size());

  assertionErrors.clear();

  // regular scan, no batch scan: only the dummy errors are expected
  final String[] plainArgs = {tableName, inputFormatName, "False", "False"};
  assertEquals(0, MRTester.main(plainArgs));
  assertEquals(1, assertionErrors.get(mapKey).size());
  assertEquals(1, assertionErrors.get(cleanupKey).size());

  assertionErrors.clear();

  // sampled scan with batch scan: same expectations as the first run
  final String[] sampledBatchArgs = {tableName, inputFormatName, "True", "True"};
  assertEquals(0, MRTester.main(sampledBatchArgs));
  assertEquals(39, assertionErrors.get(mapKey).size());
  assertEquals(2, assertionErrors.get(cleanupKey).size());
}
/**
 * Same scenario as a plain map run, but with the batch scan flag passed to MRTester set to "True" and sampling set to "False". Only the dummy errors are
 * expected.
 */
@Test
public void testMapWithBatchScanner() throws Exception {
  final String tableName = getUniqueNames(1)[0];
  final Connector connector = getConnector();
  connector.tableOperations().create(tableName);

  // entry n (1..100) has row %09x of n and value %09x of n - 1
  final BatchWriter writer = connector.createBatchWriter(tableName, new BatchWriterConfig());
  for (int n = 1; n <= 100; n++) {
    final Mutation mutation = new Mutation(new Text(String.format("%09x", n)));
    mutation.put(new Text(), new Text(), new Value(String.format("%09x", n - 1).getBytes()));
    writer.addMutation(mutation);
  }
  writer.close();

  final String[] jobArgs = {tableName, AccumuloInputFormat.class.getName(), "True", "False"};
  final int exitCode = MRTester.main(jobArgs);
  assertEquals(0, exitCode);

  // only the dummy error seeded by MRTester.run() is expected under each key
  assertEquals(1, assertionErrors.get(tableName + "_map").size());
  assertEquals(1, assertionErrors.get(tableName + "_cleanup").size());
}
/**
 * Configures a job with authorizations, scan isolation, local iterators, fetched columns and a log level, then verifies that the single
 * {@link RangeInputSplit} returned by getSplits() reports those same settings along with the principal, token, table and instance name.
 */
@Test
public void testCorrectRangeInputSplits() throws Exception {
  final Job job = Job.getInstance();

  final String table = getUniqueNames(1)[0];
  final Authorizations auths = new Authorizations("foo");
  final Pair<Text,Text> column = new Pair<>(new Text("foo"), new Text("bar"));
  final Collection<Pair<Text,Text>> fetchColumns = Collections.singleton(column);
  final boolean isolated = true;
  final boolean localIters = true;
  final Level level = Level.WARN;

  final Connector connector = getConnector();
  connector.tableOperations().create(table);

  // connection settings
  AccumuloInputFormat.setZooKeeperInstance(job, cluster.getClientConfig());
  AccumuloInputFormat.setConnectorInfo(job, getAdminPrincipal(), getAdminToken());

  // scan settings that are expected to show up on the split
  AccumuloInputFormat.setInputTableName(job, table);
  AccumuloInputFormat.setScanAuthorizations(job, auths);
  AccumuloInputFormat.setScanIsolation(job, isolated);
  AccumuloInputFormat.setLocalIterators(job, localIters);
  AccumuloInputFormat.fetchColumns(job, fetchColumns);
  AccumuloInputFormat.setLogLevel(job, level);

  final AccumuloInputFormat aif = new AccumuloInputFormat();
  final List<InputSplit> splits = aif.getSplits(job);
  Assert.assertEquals(1, splits.size());

  final InputSplit onlySplit = splits.get(0);
  Assert.assertEquals(RangeInputSplit.class, onlySplit.getClass());
  final RangeInputSplit rangeSplit = (RangeInputSplit) onlySplit;

  Assert.assertEquals(getAdminPrincipal(), rangeSplit.getPrincipal());
  Assert.assertEquals(table, rangeSplit.getTableName());
  Assert.assertEquals(getAdminToken(), rangeSplit.getToken());
  Assert.assertEquals(auths, rangeSplit.getAuths());
  Assert.assertEquals(getConnector().getInstance().getInstanceName(), rangeSplit.getInstanceName());
  Assert.assertEquals(isolated, rangeSplit.isIsolatedScan());
  Assert.assertEquals(localIters, rangeSplit.usesLocalIterators());
  Assert.assertEquals(fetchColumns, rangeSplit.getFetchedColumns());
  Assert.assertEquals(level, rangeSplit.getLogLevel());
}
/**
 * Runs the MRTester job with {@link EmptySplitsAccumuloInputFormat}, whose splits are copies produced by the RangeInputSplit copy constructor, and expects
 * the job to succeed with only the dummy errors recorded.
 */
@Test
public void testPartialInputSplitDelegationToConfiguration() throws Exception {
  final String tableName = getUniqueNames(1)[0];
  final Connector connector = getConnector();
  connector.tableOperations().create(tableName);

  // entry n (1..100) has row %09x of n and value %09x of n - 1
  final BatchWriter writer = connector.createBatchWriter(tableName, new BatchWriterConfig());
  for (int n = 1; n <= 100; n++) {
    final Mutation mutation = new Mutation(new Text(String.format("%09x", n)));
    mutation.put(new Text(), new Text(), new Value(String.format("%09x", n - 1).getBytes()));
    writer.addMutation(mutation);
  }
  writer.close();

  final String[] jobArgs = {tableName, EmptySplitsAccumuloInputFormat.class.getName()};
  final int exitCode = MRTester.main(jobArgs);
  assertEquals(0, exitCode);

  // only the dummy error seeded by MRTester.run() is expected under each key
  assertEquals(1, assertionErrors.get(tableName + "_map").size());
  assertEquals(1, assertionErrors.get(tableName + "_cleanup").size());
}
/**
 * Runs the MRTester job with {@link BadPasswordSplitsAccumuloInputFormat}, which replaces the token on every split, and expects the job to report failure
 * (exit code 1).
 */
@Test
public void testPartialFailedInputSplitDelegationToConfiguration() throws Exception {
  final String tableName = getUniqueNames(1)[0];
  final Connector connector = getConnector();
  connector.tableOperations().create(tableName);

  // entry n (1..100) has row %09x of n and value %09x of n - 1
  final BatchWriter writer = connector.createBatchWriter(tableName, new BatchWriterConfig());
  for (int n = 1; n <= 100; n++) {
    final Mutation mutation = new Mutation(new Text(String.format("%09x", n)));
    mutation.put(new Text(), new Text(), new Value(String.format("%09x", n - 1).getBytes()));
    writer.addMutation(mutation);
  }
  writer.close();

  final String[] jobArgs = {tableName, BadPasswordSplitsAccumuloInputFormat.class.getName()};
  final int exitCode = MRTester.main(jobArgs);
  assertEquals(1, exitCode);

  // the map task records nothing beyond the dummy error
  assertEquals(1, assertionErrors.get(tableName + "_map").size());
  // We should fail when the RecordReader fails to get the next key/value pair, because the record reader is set up with a clientcontext, rather than a
  // connector, so it doesn't do fast-fail on bad credentials
  assertEquals(2, assertionErrors.get(tableName + "_cleanup").size());
}
/**
 * AccumuloInputFormat which overwrites the token on every split it returns with a {@link PasswordToken} built from the password "anythingelse"
 * (presumably not the real password, so that reading the split fails on authentication).
 */
public static class BadPasswordSplitsAccumuloInputFormat extends AccumuloInputFormat {

  /**
   * Returns the splits computed by the parent class after replacing the token on each of them.
   *
   * @param context
   *          job context passed through to the parent implementation
   * @return the parent's splits, each carrying the replacement token
   * @throws IOException
   *           if the parent implementation fails to compute the splits
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    final List<InputSplit> computed = super.getSplits(context);
    for (InputSplit each : computed) {
      // every split is expected to be a RangeInputSplit; anything else fails the cast
      ((RangeInputSplit) each).setToken(new PasswordToken("anythingelse"));
    }
    return computed;
  }
}
/**
 * AccumuloInputFormat which returns an "empty" RangeInputSplit: every split from the parent class is replaced by a new instance created through the
 * RangeInputSplit copy constructor.
 */
public static class EmptySplitsAccumuloInputFormat extends AccumuloInputFormat {

  /**
   * Returns a new list holding one copied split per split computed by the parent class.
   *
   * @param context
   *          job context passed through to the parent implementation
   * @return copies of the parent's splits, in the same order
   * @throws IOException
   *           if the parent implementation or the copy constructor fails
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    final List<InputSplit> originals = super.getSplits(context);
    final List<InputSplit> copies = new ArrayList<>(originals.size());

    // Copy only the necessary information
    for (InputSplit original : originals) {
      copies.add(new RangeInputSplit((RangeInputSplit) original));
    }

    return copies;
  }
}
}