/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.internal.processors.hadoop.impl;

import com.google.common.collect.MinMaxPriorityQueue;
import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.U;

import static com.google.common.collect.Maps.immutableEntry;
import static com.google.common.collect.MinMaxPriorityQueue.orderedBy;
import static java.util.Collections.reverseOrder;

/**
 * Hadoop-based 10 popular words example: all files in a given directory are tokenized, and for each word longer than
 * 3 characters the number of occurrences is calculated. Finally, the 10 words with the highest occurrence counts are
 * output.
 *
 * NOTE: in order to run this example on Windows please ensure that cygwin is installed and available in the system
 * path.
 */
public class HadoopPopularWordsTest {
    /** Ignite home. */
    private static final String IGNITE_HOME = U.getIgniteHome();

    /** The path to the input directory. All files in that directory will be processed. */
    private static final Path BOOKS_LOCAL_DIR =
        new Path("file:" + IGNITE_HOME, "modules/tests/java/org/apache/ignite/grid/hadoop/books");

    /** The path to the output directory. The result file will be written to this location. */
    private static final Path RESULT_LOCAL_DIR =
        new Path("file:" + IGNITE_HOME, "modules/tests/java/org/apache/ignite/grid/hadoop/output");

    /** Popular books source dir in DFS. */
    private static final Path BOOKS_DFS_DIR = new Path("tmp/word-count-example/in");

    /** Word count result dir in DFS. */
    private static final Path RESULT_DFS_DIR = new Path("tmp/word-count-example/out");

    /** Path to the distributed file system configuration. */
    private static final String DFS_CFG = "examples/config/filesystem/core-site.xml";

    /** Top N words to select. */
    private static final int POPULAR_WORDS_CNT = 10;
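    // --- Illustrative sketch, not part of the original example; the helper name
    // tokenizeSample and the sample line are made up for the demo. ---
    // Shows what the tokenization used by the mapper below (split on the regex
    // "[^a-zA-Z0-9]" plus the length > 3 filter) produces for a single line of input;
    // handy for a quick local sanity check without running a job.
    private static java.util.List<String> tokenizeSample(String line) {
        java.util.List<String> tokens = new java.util.ArrayList<>();

        for (String w : line.split("[^a-zA-Z0-9]")) {
            // Mirror the mapper's filter: skip words of 3 characters or fewer
            // (this also drops the empty strings split() yields between delimiters).
            if (w.length() > 3)
                tokens.add(w);
        }

        return tokens; // E.g. "Hello, wide world!" -> [Hello, wide, world].
    }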
    /**
     * For each token in the input string the mapper emits a {word, 1} pair.
     */
    private static class TokenizingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        /** Constant value. */
        private static final IntWritable ONE = new IntWritable(1);

        /** The word converted into the Text. */
        private Text word = new Text();

        /**
         * Emits an entry where the key is the word and the value is always 1.
         *
         * @param key The current position in the input file (not used here).
         * @param val The text string.
         * @param ctx Mapper context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override protected void map(LongWritable key, Text val, Context ctx)
            throws IOException, InterruptedException {
            // Get the mapped object.
            final String line = val.toString();

            // Split the given string into words.
            final String[] words = line.split("[^a-zA-Z0-9]");

            for (final String w : words) {
                // Only emit counts for longer words.
                if (w.length() <= 3)
                    continue;

                word.set(w);

                // Write the word into the context with the initial count of 1.
                ctx.write(word, ONE);
            }
        }
    }

    /**
     * The reducer uses a priority queue to rank the words based on their number of occurrences.
     */
    private static class TopNWordsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /** Queue holding the current top-N words, ordered by descending occurrence count. */
        private MinMaxPriorityQueue<Entry<Integer, String>> q;

        /** Creates the reducer with a bounded queue of {@link #POPULAR_WORDS_CNT} entries. */
        TopNWordsReducer() {
            q = orderedBy(reverseOrder(new Comparator<Entry<Integer, String>>() {
                @Override public int compare(Entry<Integer, String> o1, Entry<Integer, String> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            })).expectedSize(POPULAR_WORDS_CNT).maximumSize(POPULAR_WORDS_CNT).create();
        }

        /**
         * This method doesn't emit anything; it just keeps track of the top N words.
         *
         * @param key The word.
         * @param vals The word counts.
         * @param ctx Reducer context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override public void reduce(Text key, Iterable<IntWritable> vals, Context ctx)
            throws IOException, InterruptedException {
            int sum = 0;

            for (IntWritable val : vals)
                sum += val.get();

            q.add(immutableEntry(sum, key.toString()));
        }

        /**
         * This method is called after all the word entries have been processed. It writes the accumulated
         * statistics to the job output file.
         *
         * @param ctx The job context.
         * @throws IOException If failed.
         * @throws InterruptedException If failed.
         */
        @Override protected void cleanup(Context ctx) throws IOException, InterruptedException {
            IntWritable i = new IntWritable();

            Text txt = new Text();

            // Iterate in descending order.
            while (!q.isEmpty()) {
                Entry<Integer, String> e = q.removeFirst();

                i.set(e.getKey());
                txt.set(e.getValue());

                ctx.write(txt, i);
            }
        }
    }
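    // --- Illustrative sketch, not part of the original example; the method name
    // topNQueueSketch and the sample entries are made up for the demo. ---
    // Demonstrates the bounded-queue technique used by TopNWordsReducer above: with
    // the reversed count comparator the queue's head is the most frequent entry,
    // maximumSize() evicts the least frequent entry once the queue overflows, and
    // removeFirst() drains in descending count order -- the order cleanup() writes in.
    private static void topNQueueSketch() {
        MinMaxPriorityQueue<Entry<Integer, String>> q =
            orderedBy(reverseOrder(new Comparator<Entry<Integer, String>>() {
                @Override public int compare(Entry<Integer, String> o1, Entry<Integer, String> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            })).maximumSize(2).create();

        q.add(immutableEntry(5, "apple"));
        q.add(immutableEntry(9, "pear"));
        q.add(immutableEntry(2, "plum")); // Evicted immediately: only the top 2 counts survive.

        while (!q.isEmpty())
            X.println(q.removeFirst().toString()); // Prints "9=pear", then "5=apple".
    }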
    /**
     * Configures the Hadoop MapReduce job.
     *
     * @return Instance of the Hadoop MapReduce job.
     * @throws IOException If failed.
     */
    @SuppressWarnings("deprecation")
    private Job createConfigBasedHadoopJob() throws IOException {
        Job jobCfg = new Job();

        Configuration cfg = jobCfg.getConfiguration();

        // Use explicit configuration of distributed file system, if provided.
        cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

        jobCfg.setJobName("HadoopPopularWordExample");
        jobCfg.setJarByClass(HadoopPopularWordsTest.class);
        jobCfg.setInputFormatClass(TextInputFormat.class);
        jobCfg.setOutputKeyClass(Text.class);
        jobCfg.setOutputValueClass(IntWritable.class);
        jobCfg.setMapperClass(TokenizingMapper.class);
        jobCfg.setReducerClass(TopNWordsReducer.class);

        FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
        FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

        // The local job tracker allows only one task per wave, but the text input format
        // replaces that with a value calculated from the input split size options.
        if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
            // Split the job into tasks using a 32MB split size.
            FileInputFormat.setMinInputSplitSize(jobCfg, 32 * 1024 * 1024);
            FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
        }

        return jobCfg;
    }

    /**
     * Runs the Hadoop job.
     *
     * @return {@code True} if succeeded, {@code false} otherwise.
     * @throws Exception If failed.
     */
    private boolean runWordCountConfigBasedHadoopJob() throws Exception {
        Job job = createConfigBasedHadoopJob();

        // Distributed file system this job will work with.
        FileSystem fs = FileSystem.get(job.getConfiguration());

        X.println(">>> Using distributed file system: " + fs.getHomeDirectory());

        // Prepare input and output job directories.
        prepareDirectories(fs);

        long time = System.currentTimeMillis();

        // Run job.
        boolean res = job.waitForCompletion(true);

        X.println(">>> Job execution time: " + (System.currentTimeMillis() - time) / 1000 + " sec.");

        // Move job results into the local file system, so you can view the calculated results.
        publishResults(fs);

        return res;
    }

    /**
     * Prepares the job's data: cleans up result directories that might be left over
     * from previous runs and copies input files from the local file system into DFS.
     *
     * @param fs Distributed file system to use in the job.
     * @throws IOException If failed.
     */
    private void prepareDirectories(FileSystem fs) throws IOException {
        X.println(">>> Cleaning up DFS result directory: " + RESULT_DFS_DIR);

        fs.delete(RESULT_DFS_DIR, true);

        X.println(">>> Cleaning up DFS input directory: " + BOOKS_DFS_DIR);

        fs.delete(BOOKS_DFS_DIR, true);

        X.println(">>> Copying local files into DFS input directory: " + BOOKS_DFS_DIR);

        fs.copyFromLocalFile(BOOKS_LOCAL_DIR, BOOKS_DFS_DIR);
    }

    /**
     * Publishes the job execution results into the local file system, so you can view them.
     *
     * @param fs Distributed file system used in the job.
     * @throws IOException If failed.
     */
    private void publishResults(FileSystem fs) throws IOException {
        X.println(">>> Cleaning up DFS input directory: " + BOOKS_DFS_DIR);

        fs.delete(BOOKS_DFS_DIR, true);

        X.println(">>> Cleaning up LOCAL result directory: " + RESULT_LOCAL_DIR);

        fs.delete(RESULT_LOCAL_DIR, true);

        X.println(">>> Moving job results into LOCAL result directory: " + RESULT_LOCAL_DIR);

        fs.copyToLocalFile(true, RESULT_DFS_DIR, RESULT_LOCAL_DIR);
    }

    /**
     * Executes a modified version of the Hadoop word count example. Here, in addition to counting the number of
     * occurrences of each word in the source files, the N most popular words are selected.
     *
     * @param args None.
     */
    public static void main(String[] args) {
        try {
            new HadoopPopularWordsTest().runWordCountConfigBasedHadoopJob();
        }
        catch (Exception e) {
            X.println(">>> Failed to run word count example: " + e.getMessage());
        }

        System.exit(0);
    }
}
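// Usage sketch (assumptions, not from the original file: the exact classpath and
// launch command depend on your environment). The example expects IGNITE_HOME to
// resolve (see U.getIgniteHome()) and the input books directory to exist under
// modules/tests/java/org/apache/ignite/grid/hadoop/books. With the Hadoop, Guava
// and Ignite jars on the classpath it can be launched as:
//
//   java -cp <classpath> org.apache.ignite.internal.processors.hadoop.impl.HadoopPopularWordsTest
//
// The top-10 word list is copied back to RESULT_LOCAL_DIR when the job completes.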