/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.*; import java.util.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.*; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.filecache.*; import org.apache.hadoop.io.MD5Hash; import java.net.URI; public class MRSharedCaching { static String testStr = null; static String TEST_ROOT_DIR; static Path concatPath; static { TEST_ROOT_DIR = new Path(System.getProperty("test.build.data","/tmp")) .toString().replace(' ', '+'); concatPath = new Path(TEST_ROOT_DIR, "sharedTest.txt"); } /** * Using the wordcount example and adding caching to it. The cache * archives/files are set and then are checked in the map if they have been * localized or not. */ public static class MapClass extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { JobConf conf; private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void configure(JobConf jconf) { conf = jconf; try { Path[] localArchives = DistributedCache.getLocalSharedCacheArchives(conf); Path[] localFiles = DistributedCache.getLocalSharedCacheFiles(conf); // read the cached files (unzipped, unjarred and text) // and put it into a single file TEST_ROOT_DIR/test.txt String TEST_ROOT_DIR = jconf.get("test.build.data","/tmp"); Path file = new Path("file:///", TEST_ROOT_DIR); FileSystem fs = FileSystem.getLocal(conf); if (!fs.mkdirs(file)) { throw new IOException("Mkdirs failed to create " + file.toString()); } Path fileOut = new Path(file, "sharedTest.txt"); fs.delete(fileOut, true); DataOutputStream out = fs.create(fileOut); if (localArchives != null) { for (int i = 0; i < localArchives.length; i++) { // read out the files from these archives File f = new File(localArchives[i].toString()); File txt = new File(f, "sharedTest.txt"); FileInputStream fin = new FileInputStream(txt); DataInputStream din = new DataInputStream(fin); String str = din.readLine(); din.close(); out.writeBytes(str); out.writeBytes("\n"); } } if (localFiles != null) { for (int i = 0; i < localFiles.length; i++) { // read out the files from these archives File txt = new File(localFiles[i].toString()); FileInputStream fin = new FileInputStream(txt); DataInputStream din = new DataInputStream(fin); String str = din.readLine(); out.writeBytes(str); out.writeBytes("\n"); } } out.close(); } catch (IOException ie) { // If file could not be opened the check at the end of // launchMRCache* will catch the error System.out.println(StringUtils.stringifyException(ie)); } } public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer itr = new StringTokenizer(line); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); output.collect(word, one); } } } /** * Using the wordcount example and adding caching to it. The cache * archives/files are set and then are checked in the map if they have been * symlinked or not. */ public static class MapClass2 extends MapClass { JobConf conf; public void configure(JobConf jconf) { conf = jconf; try { // read the cached files (unzipped, unjarred and text) // and put it into a single file TEST_ROOT_DIR/test.txt String TEST_ROOT_DIR = jconf.get("test.build.data","/tmp"); Path file = new Path("file:///", TEST_ROOT_DIR); FileSystem fs = FileSystem.getLocal(conf); if (!fs.mkdirs(file)) { throw new IOException("Mkdirs failed to create " + file.toString()); } Path fileOut = new Path(file, "sharedTest.txt"); fs.delete(fileOut, true); DataOutputStream out = fs.create(fileOut); String[] symlinks = new String[2]; symlinks[0] = "."; symlinks[1] = "sharedTest.zip"; for (int i = 0; i < symlinks.length; i++) { // read out the files from these archives File f = new File(symlinks[i]); File txt = new File(f, "sharedTest.txt"); FileInputStream fin = new FileInputStream(txt); BufferedReader reader = new BufferedReader(new InputStreamReader(fin)); String str = reader.readLine(); reader.close(); out.writeBytes(str); out.writeBytes("\n"); } out.close(); } catch (IOException ie) { // If file could not be opened the check at the end of // launchMRCache* will catch the error System.out.println(StringUtils.stringifyException(ie)); } } } /** * A reducer class that just emits the sum of the input values. */ public static class ReduceClass extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static class TestResult { public RunningJob job; public boolean isOutputOk; TestResult(RunningJob job, boolean isOutputOk) { this.job = job; this.isOutputOk = isOutputOk; } } // Boilerplate code public static FileSystem setupJob(String indir, String outdir, String cacheDir, JobConf conf, String input) throws IOException { return setupJob(indir, outdir, cacheDir, conf, input, false); } // Boilerplate code public static FileSystem setupJob(String indir, String outdir, String cacheDir, JobConf conf, String input, boolean withSymlink) throws IOException { final Path inDir = new Path(indir); final Path outDir = new Path(outdir); FileSystem fs = FileSystem.get(conf); fs.delete(outDir, true); if (!fs.mkdirs(inDir)) { throw new IOException("Mkdirs failed to create " + inDir.toString()); } { DataOutputStream file = fs.create(new Path(inDir, "part-0")); file.writeBytes(input); file.close(); } conf.setJobName("sharedcachetest"); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are counts (ints) conf.setOutputValueClass(IntWritable.class); conf.setCombinerClass(MRSharedCaching.ReduceClass.class); conf.setReducerClass(MRSharedCaching.ReduceClass.class); FileInputFormat.setInputPaths(conf, inDir); FileOutputFormat.setOutputPath(conf, outDir); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setSpeculativeExecution(false); if (!withSymlink) { conf.setMapperClass(MRSharedCaching.MapClass.class); } else { conf.setMapperClass(MRSharedCaching.MapClass2.class); } // Turn on sharing conf.set("mapred.cache.shared.enabled", "true"); return fs; } private static String getCARDir() { return "CAR/"; } private static String getSharedFilesDir() { return getCARDir() + "files/"; } private static String getSharedArchivesDir() { return getCARDir() + "archives/"; } /** * Loads a file and an archive. This test checks the basic functionality of * the DistribtedCache */ public static TestResult launchMRCache(String indir, String outdir, String cacheDir, JobConf conf, String input, boolean withSymlink) throws IOException { conf.set("test.build.data", TEST_ROOT_DIR); FileSystem fs = setupJob(indir, outdir, cacheDir, conf, input, withSymlink); URI localFS = URI.create("file:///"); Path cachePath = new Path(localFS.toString(), System.getProperty("test.cache.data")); cachePath = new Path(cachePath, "sharedTest1"); Path txtPath = new Path(cachePath, "sharedTest.txt"); Path zipPath = new Path(cachePath, "sharedTest.zip"); conf.set("tmpfiles", txtPath.toUri().toString()); conf.set("tmparchives", zipPath.toUri().toString()); String md5Txt = MD5Hash.digest(new FileInputStream(txtPath.toUri().getPath())).toString(); String md5Zip = MD5Hash.digest(new FileInputStream(zipPath.toUri().getPath())).toString(); // Read string from source file if (testStr == null) { testStr = new BufferedReader (new InputStreamReader(FileSystem.getLocal(conf). open(txtPath))).readLine(); } RunningJob job = JobClient.runJob(conf); // after the job ran check to see if the input from the localized cache // match the real string. check if there are 2 instances or not. int count = 0; Path result = concatPath; { BufferedReader file = new BufferedReader (new InputStreamReader(FileSystem.getLocal(conf).open(result))); String line = file.readLine(); while (line != null) { if (!testStr.equals(line)) { return new TestResult(job, false); } count++; line = file.readLine(); } file.close(); } if (count != 2) { return new TestResult(job, false); } // Also check that the files were loaded correctly into hdfs Path basePath = fs.makeQualified(new Path(conf.get("mapred.system.dir"))); if (!fs.exists(new Path(basePath, getSharedFilesDir() + md5Txt + "_sharedTest.txt"))) { return new TestResult(job, false); } if (!fs.exists(new Path(basePath, getSharedArchivesDir() + md5Zip + "_sharedTest.zip"))) { return new TestResult(job, false); } return new TestResult(job, true); } /** * Loads 2 different files with the same filename. This test checks that * when there are two different files with the same filename, * DistributedCache still works well */ public static TestResult launchMRCache2(String indir, String outdir, String cacheDir, JobConf conf, String input) throws IOException { conf.set("test.build.data", TEST_ROOT_DIR); FileSystem fs = setupJob(indir, outdir, cacheDir, conf, input); URI localFS = URI.create("file:///"); Path cachePath = new Path(localFS.toString(), System.getProperty("test.cache.data")); cachePath = new Path(cachePath, "sharedTest2"); Path txtPath = new Path(cachePath, "sharedTest.txt"); conf.set("tmpfiles", txtPath.toUri().toString()); String md5 = MD5Hash.digest(new FileInputStream(txtPath.toUri().getPath())).toString(); RunningJob job = JobClient.runJob(conf); // In this second test, we want to make sure we are not reading the // sharedTest.txt file from test one Path result = concatPath; { BufferedReader file = new BufferedReader (new InputStreamReader(FileSystem.getLocal(conf).open(result))); String line = file.readLine(); while (line != null) { // If the strings are equal, that means we are accessing the wrong // sharedTest.txt if (testStr.equals(line)) { return new TestResult(job, false); } line = file.readLine(); } file.close(); } // Also check that the file was loaded correctly into hdfs Path basePath = fs.makeQualified(new Path(conf.get("mapred.system.dir"))); if (!fs.exists(new Path(basePath, getSharedFilesDir() + md5 + "_sharedTest.txt"))) { return new TestResult(job, false); } return new TestResult(job, true); } /** * Loads 2 files with the same content, but different filenames. This test * checks that when there are two identical files with different filenames, * DistributedCache still works well */ public static TestResult launchMRCache3(String indir, String outdir, String cacheDir, JobConf conf, String input) throws IOException { conf.set("test.build.data", TEST_ROOT_DIR); FileSystem fs = setupJob(indir, outdir, cacheDir, conf, input); URI localFS = URI.create("file:///"); Path cachePath = new Path(localFS.toString(), System.getProperty("test.cache.data")); cachePath = new Path(cachePath, "sharedTest1"); String path1 = new Path(cachePath, "sharedTest.txt").toUri().toString(); String path2 = new Path(cachePath, "sharedTest2.txt").toUri().toString(); conf.set("tmpfiles", path1 + "," + path2); RunningJob job = JobClient.runJob(conf); int count = 0; Path result = concatPath; { BufferedReader file = new BufferedReader (new InputStreamReader(FileSystem.getLocal(conf).open(result))); String line = file.readLine(); while (line != null) { if (!testStr.equals(line)) { return new TestResult(job, false); } line = file.readLine(); count++; } file.close(); } if (count != 2) { return new TestResult(job, false); } return new TestResult(job, true); } }