/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.internal.processors.hadoop.impl;

import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.net.URI;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.terasort.TeraGen;
import org.apache.hadoop.examples.terasort.TeraInputFormat;
import org.apache.hadoop.examples.terasort.TeraOutputFormat;
import org.apache.hadoop.examples.terasort.TeraSort;
import org.apache.hadoop.examples.terasort.TeraValidate;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.ignite.IgniteException;
import org.apache.ignite.configuration.HadoopConfiguration;
import org.apache.ignite.configuration.IgniteConfiguration;
import org.apache.ignite.hadoop.io.TextPartiallyRawComparator;
import org.apache.ignite.internal.IgniteInternalFuture;
import org.apache.ignite.internal.processors.hadoop.HadoopJobId;
import org.apache.ignite.internal.processors.hadoop.HadoopJobProperty;

import static org.apache.ignite.internal.processors.hadoop.impl.HadoopUtils.createJobInfo;

/**
 * Implements the TeraSort Hadoop sample as a unit test.
 */
public class HadoopTeraSortTest extends HadoopAbstractSelfTest {
    /** Copy of a Hadoop constant of package-private visibility. */
    public static final String PARTITION_FILENAME = getPartitionFileNameConstant();

    /** Generation destination dir. */
    protected final String generateOutDir = getFsBase() + "/tera-generated";

    /** Sort destination dir. */
    protected final String sortOutDir = getFsBase() + "/tera-sorted";

    /** Validation destination dir. */
    protected final String validateOutDir = getFsBase() + "/tera-validated";

    /**
     * Extracts the value of a package-private Hadoop constant via reflection.
     *
     * @return TeraInputFormat.PARTITION_FILENAME.
     */
    private static String getPartitionFileNameConstant() {
        try {
            Field f = TeraInputFormat.class.getDeclaredField("PARTITION_FILENAME");

            f.setAccessible(true);

            return (String)f.get(null);
        }
        catch (Exception e) {
            throw new IgniteException(e);
        }
    }

    /**
     * Gets the base directory.
     * Note that this directory will be completely deleted at the end of the test.
     *
     * @return The base directory.
     */
    protected String getFsBase() {
        return "file:///tmp/" + getUser() + "/hadoop-terasort-test";
    }
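    /*
     * The protected methods below (data size, number of maps/reduces, user)
     * are extension points. A minimal sketch of scaling the test up from a
     * subclass; the class name and size are hypothetical, not part of this
     * test:
     *
     *   public class HadoopTeraSortLargeTest extends HadoopTeraSortTest {
     *       @Override protected long dataSizeBytes() {
     *           return 1_000_000_000L; // 1 GB instead of the default 100 MB.
     *       }
     *   }
     */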
    /**
     * @return Full input data size, in bytes.
     */
    protected long dataSizeBytes() {
        return 100_000_000;
    }

    /**
     * Desired number of maps in the TeraSort job.
     *
     * @return The number of maps.
     */
    protected int numMaps() {
        return gridCount() * 10;
    }

    /**
     * Desired number of reduces in the TeraSort job.
     *
     * @return The number of reduces.
     */
    protected int numReduces() {
        return gridCount() * 8;
    }

    /**
     * The user to run the Hadoop job on behalf of.
     *
     * @return The user name.
     */
    protected String getUser() {
        return System.getProperty("user.name");
    }

    /** {@inheritDoc} */
    @Override protected void afterTest() throws Exception {
        stopAllGrids(true);

        // Delete the files used:
        getFileSystem().delete(new Path(getFsBase()), true);
    }

    /** {@inheritDoc} */
    @Override protected final boolean igfsEnabled() {
        return false;
    }

    /**
     * Runs the actual TeraSort job through the Ignite API.
     *
     * @param gzip Whether to use GZIP for shuffle messages.
     * @throws Exception On error.
     */
    protected final void teraSort(boolean gzip) throws Exception {
        System.out.println("TeraSort ===============================================================");

        getFileSystem().delete(new Path(sortOutDir), true);

        final JobConf jobConf = new JobConf();

        jobConf.setUser(getUser());

        jobConf.set("fs.defaultFS", getFsBase());

        log().info("Desired number of reduces: " + numReduces());

        jobConf.set("mapreduce.job.reduces", String.valueOf(numReduces()));

        log().info("Desired number of maps: " + numMaps());

        final long splitSize = dataSizeBytes() / numMaps();

        log().info("Desired split size: " + splitSize);

        // Force the split to be of the desired size:
        jobConf.set("mapred.min.split.size", String.valueOf(splitSize));
        jobConf.set("mapred.max.split.size", String.valueOf(splitSize));

        jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true);
        jobConf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096);

        if (gzip)
            jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true);

        jobConf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(),
            TextPartiallyRawComparator.class.getName());

        Job job = setupConfig(jobConf);

        HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

        fut.get();
    }

    /**
     * Gets the file system we work upon.
     *
     * @return The file system.
     * @throws Exception On error.
     */
    FileSystem getFileSystem() throws Exception {
        return FileSystem.get(new URI(getFsBase()), new Configuration());
    }

    /**
     * Represents the data generation stage.
     *
     * @throws Exception On error.
     */
    private void teraGenerate() throws Exception {
        System.out.println("TeraGenerate ===============================================================");

        getFileSystem().delete(new Path(generateOutDir), true);

        final long numLines = dataSizeBytes() / 100; // TeraGen produces 100 bytes per line.

        if (numLines < 1)
            throw new IllegalStateException("Data size is too small: " + dataSizeBytes());

        // Generate input data:
        int res = ToolRunner.run(new Configuration(), new TeraGen(),
            new String[] {"-Dmapreduce.framework.name=local", String.valueOf(numLines), generateOutDir});

        assertEquals(0, res);

        FileStatus[] fileStatuses = getFileSystem().listStatus(new Path(generateOutDir));

        long sumLen = 0;

        for (FileStatus fs : fileStatuses)
            sumLen += fs.getLen();

        assertEquals(dataSizeBytes(), sumLen); // Ensure data of the correct size is generated.
    }
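    /*
     * Note on the numLines computation above: each record TeraGen emits is
     * exactly 100 bytes long (a 10-byte key plus the value bytes padding the
     * row to 100). That is why the desired data size is divided by 100, and
     * why the summed length of the generated files is asserted to equal
     * dataSizeBytes() exactly.
     */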
    /**
     * Creates the Job instance and sets up the necessary properties for it.
     *
     * @param conf The job config.
     * @return The job.
     * @throws Exception On error.
     */
    private Job setupConfig(JobConf conf) throws Exception {
        Job job = Job.getInstance(conf);

        Path inputDir = new Path(generateOutDir);
        Path outputDir = new Path(sortOutDir);

        boolean useSimplePartitioner = TeraSort.getUseSimplePartitioner(job);

        TeraInputFormat.setInputPaths(job, inputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        job.setJobName("TeraSort");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TeraInputFormat.class);
        job.setOutputFormatClass(TeraOutputFormat.class);

        if (useSimplePartitioner)
            job.setPartitionerClass(TeraSort.SimplePartitioner.class);
        else {
            long start = System.currentTimeMillis();

            Path partFile = new Path(outputDir, PARTITION_FILENAME);

            URI partUri = new URI(partFile.toString() + "#" + PARTITION_FILENAME);

            try {
                TeraInputFormat.writePartitionFile(job, partFile);
            }
            catch (Throwable e) {
                throw new RuntimeException(e);
            }

            job.addCacheFile(partUri);

            long end = System.currentTimeMillis();

            System.out.println("Spent " + (end - start) + "ms computing partitions. " +
                "Partition file added to distributed cache: " + partUri);

            // TeraSort.TotalOrderPartitioner is package-private, so it is extracted via reflection:
            job.setPartitionerClass(getTeraSortTotalOrderPartitioner());
        }

        job.getConfiguration().setInt("dfs.replication", TeraSort.getOutputReplication(job));

        // Equivalent of the package-private call "TeraOutputFormat.setFinalSync(job, true)":
        Method m = TeraOutputFormat.class.getDeclaredMethod("setFinalSync", JobContext.class, boolean.class);

        m.setAccessible(true);

        m.invoke(null, job, true);

        return job;
    }

    /**
     * Extracts the package-private TeraSort total order partitioner class.
     *
     * @return The class.
     */
    @SuppressWarnings("unchecked")
    private Class<? extends Partitioner> getTeraSortTotalOrderPartitioner() {
        Class[] classes = TeraSort.class.getDeclaredClasses();

        Class<? extends Partitioner> totalOrderPartitionerCls = null;

        for (Class<?> x : classes) {
            if ("TotalOrderPartitioner".equals(x.getSimpleName())) {
                totalOrderPartitionerCls = (Class<? extends Partitioner>)x;

                break;
            }
        }

        if (totalOrderPartitionerCls == null)
            throw new IllegalStateException("Failed to find TeraSort total order partitioner class.");

        return totalOrderPartitionerCls;
    }
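    /*
     * Reflection is used above (as well as in getPartitionFileNameConstant()
     * and in the setFinalSync() invocation in setupConfig()) because these
     * members are package-private inside org.apache.hadoop.examples.terasort,
     * and this test lives in a different package, so it cannot reference them
     * directly.
     */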
    /**
     * Implements the validation phase of the sample.
     *
     * @throws Exception On error.
     */
    private void teraValidate() throws Exception {
        System.out.println("TeraValidate ===============================================================");

        getFileSystem().delete(new Path(validateOutDir), true);

        // Validate the sorted data:
        int res = ToolRunner.run(new Configuration(), new TeraValidate(),
            new String[] {"-Dmapreduce.framework.name=local", sortOutDir, validateOutDir});

        assertEquals(0, res);

        FileStatus[] fileStatuses = getFileSystem().listStatus(new Path(validateOutDir), new PathFilter() {
            @Override public boolean accept(Path path) {
                // Typically the name is "part-r-00000":
                return path.getName().startsWith("part-r-");
            }
        });

        // TeraValidate has only 1 reduce, so there should be only 1 result file:
        assertEquals(1, fileStatuses.length);

        // The result file must contain only 1 line with the checksum, like this:
        // "checksum        7a27e2d0d55de",
        // which typically has a length of 23 bytes.
        // If the sorting was not correct, the result contains a list of K-V pairs that
        // are not ordered correctly; in that case the output is much larger.
        long len = fileStatuses[0].getLen();

        assertTrue("TeraValidate length: " + len, len >= 16 && len <= 32);
    }

    /** {@inheritDoc} */
    @Override protected void beforeTest() throws Exception {
        super.beforeTest();

        getFileSystem().delete(new Path(getFsBase()), true);

        startGrids(gridCount());
    }

    /**
     * Runs the generate/sort/validate phases of the TeraSort sample.
     *
     * @throws Exception If failed.
     */
    public void testTeraSort() throws Exception {
        checkTeraSort(false);
    }

    /**
     * Runs the generate/sort/validate phases of the TeraSort sample with GZIP-compressed shuffle messages.
     *
     * @throws Exception If failed.
     */
    public void testTeraSortGzip() throws Exception {
        checkTeraSort(true);
    }

    /**
     * Checks TeraSort.
     *
     * @param gzip GZIP flag.
     * @throws Exception If failed.
     */
    private void checkTeraSort(boolean gzip) throws Exception {
        teraGenerate();

        teraSort(gzip);

        teraValidate();
    }

    /** {@inheritDoc} */
    @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
        IgniteConfiguration igc = super.getConfiguration(igniteInstanceName);

        HadoopConfiguration hc = createHadoopConfiguration();

        igc.setHadoopConfiguration(hc);

        return igc;
    }

    /**
     * Creates the Hadoop configuration for the test.
     *
     * @return The {@link HadoopConfiguration}.
     */
    protected HadoopConfiguration createHadoopConfiguration() {
        HadoopConfiguration hadoopCfg = new HadoopConfiguration();

        // Raise the task queue limit above the default,
        // see org.apache.ignite.configuration.HadoopConfiguration.DFLT_MAX_TASK_QUEUE_SIZE:
        hadoopCfg.setMaxTaskQueueSize(30_000);

        return hadoopCfg;
    }
}
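/*
 * For reference, the three phases exercised by this test mirror the stock
 * Hadoop example pipeline (the jar name and paths below are illustrative):
 *
 *   hadoop jar hadoop-mapreduce-examples.jar teragen 1000000 /tera-generated
 *   hadoop jar hadoop-mapreduce-examples.jar terasort /tera-generated /tera-sorted
 *   hadoop jar hadoop-mapreduce-examples.jar teravalidate /tera-sorted /tera-validated
 *
 * Only the sort phase is submitted through the Ignite Hadoop engine here;
 * generation and validation run via the local MapReduce framework
 * ("-Dmapreduce.framework.name=local").
 */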