/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.internal.processors.hadoop.impl;

import com.google.common.collect.MinMaxPriorityQueue;
import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.U;

import static com.google.common.collect.Maps.immutableEntry;
import static com.google.common.collect.MinMaxPriorityQueue.orderedBy;
import static java.util.Collections.reverseOrder;

/**
 * Hadoop-based 10 popular words example: all files in a given directory are tokenized, and for each word longer than
 * 3 characters the number of occurrences is calculated. Finally, the 10 words with the highest occurrence counts are
 * output.
 *
 * NOTE: in order to run this example on Windows please ensure that cygwin is installed and available in the system
 * path.
 */
public class HadoopPopularWordsTest {
    /** Ignite home. */
    private static final String IGNITE_HOME = U.getIgniteHome();

    /** The path to the input directory. All files in that directory will be processed. */
    private static final Path BOOKS_LOCAL_DIR =
        new Path("file:" + IGNITE_HOME, "modules/tests/java/org/apache/ignite/grid/hadoop/books");

    /** The path to the output directory. The result file will be written to this location. */
    private static final Path RESULT_LOCAL_DIR =
        new Path("file:" + IGNITE_HOME, "modules/tests/java/org/apache/ignite/grid/hadoop/output");

    /** Popular books source dir in DFS. */
    private static final Path BOOKS_DFS_DIR = new Path("tmp/word-count-example/in");

    /** Word count result dir in DFS. */
    private static final Path RESULT_DFS_DIR = new Path("tmp/word-count-example/out");

    /** Path to the distributed file system configuration. */
    private static final String DFS_CFG = "examples/config/filesystem/core-site.xml";

    /** Top N words to select. */
    private static final int POPULAR_WORDS_CNT = 10;
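    // --- Illustrative sketch, not part of the original example; the helper name
    // tokenizeSample and the sample line are made up for the demo. ---
    // Shows what the tokenization used by the mapper below (split on the regex
    // "[^a-zA-Z0-9]" plus the length > 3 filter) produces for a single line of input;
    // handy for a quick local sanity check without running a job.
    private static java.util.List<String> tokenizeSample(String line) {
        java.util.List<String> tokens = new java.util.ArrayList<>();

        for (String w : line.split("[^a-zA-Z0-9]")) {
            // Mirror the mapper's filter: skip words of 3 characters or fewer
            // (this also drops the empty strings split() yields between delimiters).
            if (w.length() > 3)
                tokens.add(w);
        }

        return tokens; // E.g. "Hello, wide world!" -> [Hello, wide, world].
    }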
    /**
     * For each token in the input string the mapper emits a {word, 1} pair.
     */
    private static class TokenizingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /** Constant value. */
        private static final IntWritable ONE = new IntWritable(1);

        /** The word converted into the Text. */
        private Text word = new Text();

        /**
         * Emits an entry where the key is the word and the value is always 1.
         *
         * @param key The current position in the input file (not used here).
         * @param val The text string.
         * @param ctx Mapper context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override protected void map(LongWritable key, Text val, Context ctx)
            throws IOException, InterruptedException {
            // Get the mapped object.
            final String line = val.toString();

            // Split the given string into words.
            final String[] words = line.split("[^a-zA-Z0-9]");

            for (final String w : words) {
                // Only emit counts for longer words.
                if (w.length() <= 3)
                    continue;

                word.set(w);

                // Write the word into the context with the initial count of 1.
                ctx.write(word, ONE);
            }
        }
    }

    /**
     * The reducer uses a priority queue to rank the words based on their number of occurrences.
     */
    private static class TopNWordsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /** Queue holding the current top-N words, ordered by descending occurrence count. */
        private MinMaxPriorityQueue<Entry<Integer, String>> q;

        /** Creates the reducer with a bounded queue of {@link #POPULAR_WORDS_CNT} entries. */
        TopNWordsReducer() {
            q = orderedBy(reverseOrder(new Comparator<Entry<Integer, String>>() {
                @Override public int compare(Entry<Integer, String> o1, Entry<Integer, String> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            })).expectedSize(POPULAR_WORDS_CNT).maximumSize(POPULAR_WORDS_CNT).create();
        }

        /**
         * This method doesn't emit anything; it just keeps track of the top N words.
         *
         * @param key The word.
         * @param vals The word counts.
         * @param ctx Reducer context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override public void reduce(Text key, Iterable<IntWritable> vals, Context ctx)
            throws IOException, InterruptedException {
            int sum = 0;

            for (IntWritable val : vals)
                sum += val.get();

            q.add(immutableEntry(sum, key.toString()));
        }

        /**
         * This method is called after all the word entries have been processed. It writes the accumulated
         * statistics to the job output file.
         *
         * @param ctx The job context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override protected void cleanup(Context ctx) throws IOException, InterruptedException {
            IntWritable i = new IntWritable();

            Text txt = new Text();

            // Iterate in descending order.
            while (!q.isEmpty()) {
                Entry<Integer, String> e = q.removeFirst();

                i.set(e.getKey());
                txt.set(e.getValue());

                ctx.write(txt, i);
            }
        }
    }
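    // --- Illustrative sketch, not part of the original example; the method name
    // topNQueueSketch and the sample entries are made up for the demo. ---
    // Demonstrates the bounded-queue technique used by TopNWordsReducer above: with
    // the reversed count comparator the queue's head is the most frequent entry,
    // maximumSize() evicts the least frequent entry once the queue overflows, and
    // removeFirst() drains in descending count order -- the order cleanup() writes in.
    private static void topNQueueSketch() {
        MinMaxPriorityQueue<Entry<Integer, String>> q =
            orderedBy(reverseOrder(new Comparator<Entry<Integer, String>>() {
                @Override public int compare(Entry<Integer, String> o1, Entry<Integer, String> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            })).maximumSize(2).create();

        q.add(immutableEntry(5, "apple"));
        q.add(immutableEntry(9, "pear"));
        q.add(immutableEntry(2, "plum")); // Evicted immediately: only the top 2 counts survive.

        while (!q.isEmpty())
            X.println(q.removeFirst().toString()); // Prints "9=pear", then "5=apple".
    }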
    /**
     * Configures the Hadoop MapReduce job.
     *
     * @return Instance of the Hadoop MapReduce job.
     * @throws IOException If failed.
     */
    @SuppressWarnings("deprecation")
    private Job createConfigBasedHadoopJob() throws IOException {
        Job jobCfg = new Job();

        Configuration cfg = jobCfg.getConfiguration();

        // Use explicit configuration of distributed file system, if provided.
        cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

        jobCfg.setJobName("HadoopPopularWordExample");
        jobCfg.setJarByClass(HadoopPopularWordsTest.class);
        jobCfg.setInputFormatClass(TextInputFormat.class);
        jobCfg.setOutputKeyClass(Text.class);
        jobCfg.setOutputValueClass(IntWritable.class);
        jobCfg.setMapperClass(TokenizingMapper.class);
        jobCfg.setReducerClass(TopNWordsReducer.class);

        FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
        FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

        // The local job tracker allows only one task per wave, but the text input format
        // replaces that with a value calculated from the input split size options.
        if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
            // Split the job into tasks using a 32MB split size.
            FileInputFormat.setMinInputSplitSize(jobCfg, 32 * 1024 * 1024);
            FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
        }

        return jobCfg;
    }

    /**
     * Runs the Hadoop job.
     *
     * @return {@code True} if succeeded, {@code false} otherwise.
     * @throws Exception If failed.
     */
    private boolean runWordCountConfigBasedHadoopJob() throws Exception {
        Job job = createConfigBasedHadoopJob();

        // Distributed file system this job will work with.
        FileSystem fs = FileSystem.get(job.getConfiguration());

        X.println(">>> Using distributed file system: " + fs.getHomeDirectory());

        // Prepare input and output job directories.
        prepareDirectories(fs);

        long time = System.currentTimeMillis();

        // Run job.
        boolean res = job.waitForCompletion(true);

        X.println(">>> Job execution time: " + (System.currentTimeMillis() - time) / 1000 + " sec.");

        // Move job results into the local file system, so you can view the calculated results.
        publishResults(fs);

        return res;
    }

    /**
     * Prepares the job's data: cleans up result directories that might be left over
     * from previous runs and copies input files from the local file system into DFS.
     *
     * @param fs Distributed file system to use in the job.
     * @throws IOException If failed.
     */
    private void prepareDirectories(FileSystem fs) throws IOException {
        X.println(">>> Cleaning up DFS result directory: " + RESULT_DFS_DIR);

        fs.delete(RESULT_DFS_DIR, true);

        X.println(">>> Cleaning up DFS input directory: " + BOOKS_DFS_DIR);

        fs.delete(BOOKS_DFS_DIR, true);

        X.println(">>> Copying local files into DFS input directory: " + BOOKS_DFS_DIR);

        fs.copyFromLocalFile(BOOKS_LOCAL_DIR, BOOKS_DFS_DIR);
    }

    /**
     * Publishes the job execution results into the local file system, so you can view them.
     *
     * @param fs Distributed file system used in the job.
     * @throws IOException If failed.
     */
    private void publishResults(FileSystem fs) throws IOException {
        X.println(">>> Cleaning up DFS input directory: " + BOOKS_DFS_DIR);

        fs.delete(BOOKS_DFS_DIR, true);

        X.println(">>> Cleaning up LOCAL result directory: " + RESULT_LOCAL_DIR);

        fs.delete(RESULT_LOCAL_DIR, true);

        X.println(">>> Moving job results into LOCAL result directory: " + RESULT_LOCAL_DIR);

        fs.copyToLocalFile(true, RESULT_DFS_DIR, RESULT_LOCAL_DIR);
    }

    /**
     * Executes a modified version of the Hadoop word count example. Here, in addition to counting the number of
     * occurrences of each word in the source files, the N most popular words are selected.
     *
     * @param args None.
     */
    public static void main(String[] args) {
        try {
            new HadoopPopularWordsTest().runWordCountConfigBasedHadoopJob();
        }
        catch (Exception e) {
            X.println(">>> Failed to run word count example: " + e.getMessage());
        }

        System.exit(0);
    }
}
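// Usage sketch (assumptions, not from the original file: the exact classpath and
// launch command depend on your environment). The example expects IGNITE_HOME to
// resolve (see U.getIgniteHome()) and the input books directory to exist under
// modules/tests/java/org/apache/ignite/grid/hadoop/books. With the Hadoop, Guava
// and Ignite jars on the classpath it can be launched as:
//
//   java -cp <classpath> org.apache.ignite.internal.processors.hadoop.impl.HadoopPopularWordsTest
//
// The top-10 word list is copied back to RESULT_LOCAL_DIR when the job completes.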