/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.internal.processors.hadoop.impl;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.net.URI;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.terasort.TeraGen;
import org.apache.hadoop.examples.terasort.TeraInputFormat;
import org.apache.hadoop.examples.terasort.TeraOutputFormat;
import org.apache.hadoop.examples.terasort.TeraSort;
import org.apache.hadoop.examples.terasort.TeraValidate;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.ignite.IgniteException;
import org.apache.ignite.configuration.HadoopConfiguration;
import org.apache.ignite.configuration.IgniteConfiguration;
import org.apache.ignite.hadoop.io.TextPartiallyRawComparator;
import org.apache.ignite.internal.IgniteInternalFuture;
import org.apache.ignite.internal.processors.hadoop.HadoopJobId;
import org.apache.ignite.internal.processors.hadoop.HadoopJobProperty;
import static org.apache.ignite.internal.processors.hadoop.impl.HadoopUtils.createJobInfo;
/**
* Implements TeraSort Hadoop sample as a unit test.
*/
public class HadoopTeraSortTest extends HadoopAbstractSelfTest {
/**
* Copy of Hadoop constant of package-private visibility.
* The value is obtained reflectively, see {@link #getPartitionFileNameConstant()}.
*/
public static final String PARTITION_FILENAME = getPartitionFileNameConstant();
// NOTE: the three directories below are computed from getFsBase() while the instance is being
// initialized, so an overridden getFsBase()/getUser() is invoked before subclass fields are set.
/** Directory the data generation stage writes to; it is also the input of the sort job. */
protected final String generateOutDir = getFsBase() + "/tera-generated";
/** Directory the sort job writes to; it is also the input of the validation stage. */
protected final String sortOutDir = getFsBase() + "/tera-sorted";
/** Directory the validation stage writes its result to. */
protected final String validateOutDir = getFsBase() + "/tera-validated";
/**
 * Reads the package-private Hadoop constant {@code TeraInputFormat.PARTITION_FILENAME} via reflection.
 *
 * @return Value of {@code TeraInputFormat.PARTITION_FILENAME}.
 * @throws IgniteException If the field cannot be found, accessed or cast to a string.
 */
private static String getPartitionFileNameConstant() {
    try {
        final Field partFileNameFld = TeraInputFormat.class.getDeclaredField("PARTITION_FILENAME");

        partFileNameFld.setAccessible(true);

        // Null target: the field is read without an instance.
        final Object val = partFileNameFld.get(null);

        return (String)val;
    }
    catch (Exception e) {
        throw new IgniteException(e);
    }
}
/**
 * Gets the base directory all test data is placed under.
 * Note that this directory is deleted recursively both before and after each test.
 *
 * @return The base directory as a {@code file://} URI string.
 */
protected String getFsBase() {
    final String userTmpDir = "file:///tmp/" + getUser();

    return userTmpDir + "/hadoop-terasort-test";
}
/**
 * Gets the total amount of input data to generate and sort.
 *
 * @return Full input data size, in bytes.
 */
protected long dataSizeBytes() {
    final long million = 1_000_000L;

    // 100 million bytes in total.
    return 100 * million;
}
/**
 * Desired number of maps in TeraSort job: ten times {@code gridCount()}.
 *
 * @return The number of maps.
 */
protected int numMaps() {
    final int mapsPerGrid = 10;

    return mapsPerGrid * gridCount();
}
/**
 * Desired number of reduces in TeraSort job: eight times {@code gridCount()}.
 *
 * @return The number of reduces.
 */
protected int numReduces() {
    final int reducesPerGrid = 8;

    return reducesPerGrid * gridCount();
}
/**
* The user to run Hadoop job on behalf of.
* The value is read from the {@code user.name} system property; it is set on the job configuration
* and is also part of the path returned by {@link #getFsBase()}.
*
* @return The user to run Hadoop job on behalf of.
*/
protected String getUser() {
return System.getProperty("user.name");
}
/**
 * {@inheritDoc}
 * <p>
 * Stops all grids and recursively deletes the base directory returned by {@link #getFsBase()}.
 */
@Override protected void afterTest() throws Exception {
    try {
        stopAllGrids(true);
    }
    finally {
        // Delete files used. Done in 'finally' so that the generated data does not stay on disk
        // when stopping the grids fails.
        getFileSystem().delete(new Path(getFsBase()), true);
    }
}
/** {@inheritDoc} */
@Override protected final boolean igfsEnabled() {
// The test works against the file:// location returned by getFsBase(), so IGFS is switched off.
return false;
}
/**
 * Runs the actual TeraSort job through the Ignite API: clears the sort output directory,
 * builds the job configuration, submits the job to the first grid and waits for its completion.
 *
 * @param gzip Whether to use GZIP.
 * @throws Exception If failed.
 */
protected final void teraSort(boolean gzip) throws Exception {
    System.out.println("TeraSort ===============================================================");

    getFileSystem().delete(new Path(sortOutDir), true);

    final Job sortJob = setupConfig(createSortJobConf(gzip));

    final HadoopJobId sortJobId = new HadoopJobId(UUID.randomUUID(), 1);

    final IgniteInternalFuture<?> sortFut =
        grid(0).hadoop().submit(sortJobId, createJobInfo(sortJob.getConfiguration()));

    // Block until the job is finished.
    sortFut.get();
}

/**
 * Builds the basic configuration of the sort job: user, file system, number of reduces,
 * split size and Ignite shuffle properties.
 *
 * @param gzip Whether to use GZIP.
 * @return The job configuration.
 */
private JobConf createSortJobConf(boolean gzip) {
    final JobConf conf = new JobConf();

    conf.setUser(getUser());
    conf.set("fs.defaultFS", getFsBase());

    log().info("Desired number of reduces: " + numReduces());

    conf.setInt("mapreduce.job.reduces", numReduces());

    log().info("Desired number of maps: " + numMaps());

    final long desiredSplitSize = dataSizeBytes() / numMaps();

    log().info("Desired split size: " + desiredSplitSize);

    // Equal min and max values force the split to be of the desired size:
    conf.setLong("mapred.min.split.size", desiredSplitSize);
    conf.setLong("mapred.max.split.size", desiredSplitSize);

    conf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true);
    conf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096);

    if (gzip)
        conf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true);

    final String cmpClsName = TextPartiallyRawComparator.class.getName();

    conf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(), cmpClsName);

    return conf;
}
/**
 * Gets the file system we work upon, resolved from the URI returned by {@link #getFsBase()}
 * and a fresh Hadoop {@link Configuration}.
 *
 * @return The file system.
 * @throws Exception If the base URI is malformed or the file system cannot be obtained.
 */
FileSystem getFileSystem() throws Exception {
    final URI baseUri = new URI(getFsBase());
    final Configuration hadoopCfg = new Configuration();

    return FileSystem.get(baseUri, hadoopCfg);
}
/**
 * Represents the data generation stage: runs TeraGen and checks that the total length of the
 * files it produced equals {@link #dataSizeBytes()}.
 *
 * @throws Exception If failed.
 */
private void teraGenerate() throws Exception {
    System.out.println("TeraGenerate ===============================================================");

    final Path genDir = new Path(generateOutDir);

    getFileSystem().delete(genDir, true);

    // TeraGen makes 100 bytes per line.
    final long rowCnt = dataSizeBytes() / 100;

    if (rowCnt < 1)
        throw new IllegalStateException("Data size is too small: " + dataSizeBytes());

    // Generate input data:
    final String[] genArgs = {"-Dmapreduce.framework.name=local", String.valueOf(rowCnt), generateOutDir};

    final int exitCode = ToolRunner.run(new Configuration(), new TeraGen(), genArgs);

    assertEquals(0, exitCode);

    long totalLen = 0;

    for (FileStatus status : getFileSystem().listStatus(genDir))
        totalLen += status.getLen();

    // Ensure correct size data is generated.
    assertEquals(dataSizeBytes(), totalLen);
}
/**
 * Creates Job instance and sets up necessary properties for it: input and output paths,
 * key/value and format classes, the partitioner (writing the partition file and registering it
 * as a cache file unless the simple partitioner is requested), output replication and final sync.
 *
 * @param conf The Job config.
 * @return The job.
 * @throws Exception On error. Exceptions and errors raised while writing the partition file are
 *      propagated unchanged; any other {@link Throwable} is wrapped into a {@link RuntimeException}.
 */
private Job setupConfig(JobConf conf) throws Exception {
    Job job = Job.getInstance(conf);

    Path inputDir = new Path(generateOutDir);
    Path outputDir = new Path(sortOutDir);

    boolean useSimplePartitioner = TeraSort.getUseSimplePartitioner(job);

    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setJobName("TeraSort");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);

    if (useSimplePartitioner)
        job.setPartitionerClass(TeraSort.SimplePartitioner.class);
    else {
        long start = System.currentTimeMillis();

        Path partFile = new Path(outputDir, PARTITION_FILENAME);

        URI partUri = new URI(partFile.toString() + "#" + PARTITION_FILENAME);

        try {
            TeraInputFormat.writePartitionFile(job, partFile);
        }
        catch (Exception e) {
            // Already covered by the 'throws' clause: keep the original type and stack trace.
            throw e;
        }
        catch (Error e) {
            // Never disguise JVM errors or assertion failures as runtime exceptions.
            throw e;
        }
        catch (Throwable e) {
            // Only a throwable that is neither Exception nor Error needs wrapping.
            throw new RuntimeException(e);
        }

        job.addCacheFile(partUri);

        long end = System.currentTimeMillis();

        System.out.println("Spent " + (end - start) + "ms computing partitions. " +
            "Partition file added to distributed cache: " + partUri);

        // The total order partitioner class is looked up reflectively (TeraSort.TotalOrderPartitioner).
        job.setPartitionerClass(getTeraSortTotalOrderPartitioner());
    }

    job.getConfiguration().setInt("dfs.replication", TeraSort.getOutputReplication(job));

    // Reflective equivalent of TeraOutputFormat.setFinalSync(job, true).
    Method m = TeraOutputFormat.class.getDeclaredMethod("setFinalSync", JobContext.class, boolean.class);

    m.setAccessible(true);

    // Null target: the method is invoked without an instance.
    m.invoke(null, job, true);

    return job;
}
/**
 * Extracts package-private TeraSort total order partitioner class by scanning the classes
 * declared inside {@link TeraSort} for one with the simple name {@code TotalOrderPartitioner}.
 *
 * @return The class.
 * @throws IllegalStateException If no such nested class is declared.
 * @throws ClassCastException If the class found is not a {@link Partitioner}.
 */
private Class<? extends Partitioner> getTeraSortTotalOrderPartitioner() {
    for (Class<?> nested : TeraSort.class.getDeclaredClasses()) {
        // asSubclass() performs a checked cast, so no unchecked warning has to be suppressed.
        if ("TotalOrderPartitioner".equals(nested.getSimpleName()))
            return nested.asSubclass(Partitioner.class);
    }

    throw new IllegalStateException("Failed to find TeraSort total order partitioner class.");
}
/**
 * Implements validation phase of the sample: runs TeraValidate over the sorted output and
 * checks the size of the single result file it produces.
 *
 * @throws Exception If failed.
 */
private void teraValidate() throws Exception {
    System.out.println("TeraValidate ===============================================================");

    final Path validateDir = new Path(validateOutDir);

    getFileSystem().delete(validateDir, true);

    // Run the validation job on the sorted output:
    final String[] validateArgs = {"-Dmapreduce.framework.name=local", sortOutDir, validateOutDir};

    final int exitCode = ToolRunner.run(new Configuration(), new TeraValidate(), validateArgs);

    assertEquals(0, exitCode);

    // Typically the result file name is "part-r-00000":
    final PathFilter resultFileFilter = new PathFilter() {
        @Override public boolean accept(Path path) {
            return path.getName().startsWith("part-r-");
        }
    };

    final FileStatus[] results = getFileSystem().listStatus(validateDir, resultFileFilter);

    // TeraValidate has only 1 reduce, so should be only 1 result file:
    assertEquals(1, results.length);

    // The result file must contain only 1 line with the checksum, like this:
    // "checksum 7a27e2d0d55de",
    // typically it has length of 23 bytes.
    // If sorting was not correct, the result contains list of K-V pairs that are not ordered correctly.
    // In such case the size of the output will be much larger.
    final long resultLen = results[0].getLen();

    final boolean lenInRange = resultLen >= 16 && resultLen <= 32;

    assertTrue("TeraValidate length: " + resultLen, lenInRange);
}
/**
 * {@inheritDoc}
 * <p>
 * Additionally clears the base directory and starts {@code gridCount()} grids.
 */
@Override protected void beforeTest() throws Exception {
    super.beforeTest();

    // Start from a clean working directory.
    final FileSystem fs = getFileSystem();

    fs.delete(new Path(getFsBase()), true);

    final int gridCnt = gridCount();

    startGrids(gridCnt);
}
/**
* Runs generate/sort/validate phases of the terasort sample with the GZIP flag switched off.
*
* @throws Exception If failed.
*/
public void testTeraSort() throws Exception {
checkTeraSort(false);
}
/**
* Runs generate/sort/validate phases of the terasort sample with the GZIP flag switched on,
* i.e. with the {@code SHUFFLE_MSG_GZIP} job property set for the sort job.
*
* @throws Exception If failed.
*/
public void testTeraSortGzip() throws Exception {
checkTeraSort(true);
}
/**
* Check terasort: runs the generation, sort and validation stages one after another.
*
* @param gzip GZIP flag, passed to the sort stage only.
* @throws Exception If failed.
*/
private void checkTeraSort(boolean gzip) throws Exception {
// Order matters: each stage reads the directory the previous one has written.
teraGenerate();
teraSort(gzip);
teraValidate();
}
/**
 * {@inheritDoc}
 * <p>
 * The configuration created by the parent is supplied with the Hadoop configuration
 * returned by {@link #createHadoopConfiguration()}.
 */
@Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
    final IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);

    cfg.setHadoopConfiguration(createHadoopConfiguration());

    return cfg;
}
/**
 * Creates Hadoop configuration for the test with an explicitly set maximum task queue size.
 *
 * @return The {@link HadoopConfiguration}.
 */
protected HadoopConfiguration createHadoopConfiguration() {
    // For the default value see org.apache.ignite.configuration.HadoopConfiguration.DFLT_MAX_TASK_QUEUE_SIZE
    final int maxTaskQueueSize = 30_000;

    final HadoopConfiguration cfg = new HadoopConfiguration();

    cfg.setMaxTaskQueueSize(maxTaskQueueSize);

    return cfg;
}
}