/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.ClusterMapReduceTestCase;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SkipBadRecords;

public class TestStreamingBadRecords extends ClusterMapReduceTestCase {

  private static final Log LOG =
    LogFactory.getLog(TestStreamingBadRecords.class);

  private static final List<String> MAPPER_BAD_RECORDS =
    Arrays.asList("hey022", "hey023", "hey099");

  private static final List<String> REDUCER_BAD_RECORDS =
    Arrays.asList("hey001", "hey018");

  private static final String badMapper =
    StreamUtil.makeJavaCommand(BadApp.class, new String[]{});
  private static final String badReducer =
    StreamUtil.makeJavaCommand(BadApp.class, new String[]{"true"});
  private static final int INPUTSIZE = 100;

  public TestStreamingBadRecords() throws IOException {
    UtilTest utilTest = new UtilTest(getClass().getName());
    utilTest.checkUserDir();
    utilTest.redirectIfAntJunit();
  }

  private void createInput() throws Exception {
    OutputStream os = getFileSystem().create(
        new Path(getInputDir(), "text.txt"));
    Writer wr = new OutputStreamWriter(os);
    //increasing the record size so that we have stream flushing
    String prefix = new String(new byte[20*1024]);
    for (int i = 1; i <= INPUTSIZE; i++) {
      String str = "" + i;
      int zerosToPrepend = 3 - str.length();
      for (int j = 0; j < zerosToPrepend; j++) {
        str = "0" + str;
      }
      wr.write(prefix + "hey" + str + "\n");
    }
    wr.close();
  }

  private void validateOutput(RunningJob runningJob, boolean validateCount)
    throws Exception {
    LOG.info(runningJob.getCounters().toString());
    assertTrue(runningJob.isSuccessful());

    if (validateCount) {
      //validate counters
      String counterGrp = "org.apache.hadoop.mapred.Task$Counter";
      Counters counters = runningJob.getCounters();
      assertEquals(counters.findCounter(counterGrp, "MAP_SKIPPED_RECORDS").
          getCounter(), MAPPER_BAD_RECORDS.size());

      int mapRecs = INPUTSIZE - MAPPER_BAD_RECORDS.size();
      assertEquals(counters.findCounter(counterGrp, "MAP_INPUT_RECORDS").
          getCounter(), mapRecs);
      assertEquals(counters.findCounter(counterGrp, "MAP_OUTPUT_RECORDS").
          getCounter(), mapRecs);

      int redRecs = mapRecs - REDUCER_BAD_RECORDS.size();
      assertEquals(counters.findCounter(counterGrp, "REDUCE_SKIPPED_RECORDS").
          getCounter(), REDUCER_BAD_RECORDS.size());
      assertEquals(counters.findCounter(counterGrp, "REDUCE_SKIPPED_GROUPS").
          getCounter(), REDUCER_BAD_RECORDS.size());
      assertEquals(counters.findCounter(counterGrp, "REDUCE_INPUT_GROUPS").
          getCounter(), redRecs);
      assertEquals(counters.findCounter(counterGrp, "REDUCE_INPUT_RECORDS").
          getCounter(), redRecs);
      assertEquals(counters.findCounter(counterGrp, "REDUCE_OUTPUT_RECORDS").
          getCounter(), redRecs);
    }

    List<String> badRecs = new ArrayList<String>();
    badRecs.addAll(MAPPER_BAD_RECORDS);
    badRecs.addAll(REDUCER_BAD_RECORDS);

    Path[] outputFiles = FileUtil.stat2Paths(
        getFileSystem().listStatus(getOutputDir(), new OutputLogFilter()));
    if (outputFiles.length > 0) {
      InputStream is = getFileSystem().open(outputFiles[0]);
      BufferedReader reader = new BufferedReader(new InputStreamReader(is));
      String line = reader.readLine();
      int counter = 0;
      while (line != null) {
        counter++;
        StringTokenizer tokeniz = new StringTokenizer(line, "\t");
        String value = tokeniz.nextToken();
        int index = value.indexOf("hey");
        assertTrue(index > -1);
        if (index > -1) {
          String heyStr = value.substring(index);
          assertTrue(!badRecs.contains(heyStr));
        }
        line = reader.readLine();
      }
      reader.close();
      if (validateCount) {
        assertEquals(INPUTSIZE - badRecs.size(), counter);
      }
    }
  }

  public void testSkip() throws Exception {
    JobConf clusterConf = createJobConf();
    createInput();
    int attSkip = 0;
    SkipBadRecords.setAttemptsToStartSkipping(clusterConf, attSkip);
    //the no of attempts to successfully complete the task depends
    //on the no of bad records.
    int mapperAttempts = attSkip + 1 + MAPPER_BAD_RECORDS.size();
    int reducerAttempts = attSkip + 1 + REDUCER_BAD_RECORDS.size();

    String[] args = new String[] {
      "-input", (new Path(getInputDir(), "text.txt")).toString(),
      "-output", getOutputDir().toString(),
      "-mapper", badMapper,
      "-reducer", badReducer,
      "-verbose",
      "-inputformat", "org.apache.hadoop.mapred.KeyValueTextInputFormat",
      "-jobconf", "mapred.skip.attempts.to.start.skipping=" + attSkip,
      "-jobconf", "mapred.skip.out.dir=none",
      "-jobconf", "mapred.map.max.attempts=" + mapperAttempts,
      "-jobconf", "mapred.reduce.max.attempts=" + reducerAttempts,
      "-jobconf", "mapred.skip.map.max.skip.records=" + Long.MAX_VALUE,
      "-jobconf", "mapred.skip.reduce.max.skip.groups=" + Long.MAX_VALUE,
      "-jobconf", "mapred.map.tasks=1",
      "-jobconf", "mapred.reduce.tasks=1",
      "-jobconf", "fs.default.name=" + clusterConf.get("fs.default.name"),
      "-jobconf", "mapred.job.tracker=" + clusterConf.get("mapred.job.tracker"),
      "-jobconf", "mapred.job.tracker.http.address="
                    + clusterConf.get("mapred.job.tracker.http.address"),
      "-jobconf", "stream.debug=set",
      "-jobconf", "keep.failed.task.files=true",
      "-jobconf", "stream.tmpdir=" + System.getProperty("test.build.data", "/tmp")
    };
    StreamJob job = new StreamJob(args, false);
    job.go();
    validateOutput(job.running_, false);
    //validate that there is no skip directory as it has been set to "none"
    assertTrue(SkipBadRecords.getSkipOutputPath(job.jobConf_) == null);
  }

  public void testNarrowDown() throws Exception {
    createInput();
    JobConf clusterConf = createJobConf();
    String[] args = new String[] {
      "-input", (new Path(getInputDir(), "text.txt")).toString(),
      "-output", getOutputDir().toString(),
      "-mapper", badMapper,
      "-reducer", badReducer,
      "-verbose",
      "-inputformat", "org.apache.hadoop.mapred.KeyValueTextInputFormat",
      "-jobconf", "mapred.skip.attempts.to.start.skipping=1",
      //actually fewer attempts are required than specified
      //but to cater to the case of slow processed counter update, need to
      //have more attempts
      "-jobconf", "mapred.map.max.attempts=20",
      "-jobconf", "mapred.reduce.max.attempts=15",
      "-jobconf", "mapred.skip.map.max.skip.records=1",
      "-jobconf", "mapred.skip.reduce.max.skip.groups=1",
      "-jobconf", "mapred.map.tasks=1",
      "-jobconf", "mapred.reduce.tasks=1",
      "-jobconf", "fs.default.name=" + clusterConf.get("fs.default.name"),
      "-jobconf", "mapred.job.tracker=" + clusterConf.get("mapred.job.tracker"),
      "-jobconf", "mapred.job.tracker.http.address="
                    + clusterConf.get("mapred.job.tracker.http.address"),
      "-jobconf", "stream.debug=set",
      "-jobconf", "keep.failed.task.files=true",
      "-jobconf", "stream.tmpdir=" + System.getProperty("test.build.data", "/tmp")
    };
    StreamJob job = new StreamJob(args, false);
    job.go();
    validateOutput(job.running_, true);
    assertTrue(SkipBadRecords.getSkipOutputPath(job.jobConf_) != null);
  }

  static class App {
    boolean isReducer;

    public App(String[] args) throws Exception {
      if (args.length > 0) {
        isReducer = Boolean.parseBoolean(args[0]);
      }
      String counter = SkipBadRecords.COUNTER_MAP_PROCESSED_RECORDS;
      if (isReducer) {
        counter = SkipBadRecords.COUNTER_REDUCE_PROCESSED_GROUPS;
      }
      BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
      String line;
      int count = 0;
      while ((line = in.readLine()) != null) {
        processLine(line);
        count++;
        if (count >= 10) {
          System.err.println("reporter:counter:" + SkipBadRecords.COUNTER_GROUP +
              "," + counter + "," + count);
          count = 0;
        }
      }
    }

    protected void processLine(String line) throws Exception {
      System.out.println(line);
    }

    public static void main(String[] args) throws Exception {
      new App(args);
    }
  }

  static class BadApp extends App {

    public BadApp(String[] args) throws Exception {
      super(args);
    }

    protected void processLine(String line) throws Exception {
      List<String> badRecords = MAPPER_BAD_RECORDS;
      if (isReducer) {
        badRecords = REDUCER_BAD_RECORDS;
      }
      if (badRecords.size() > 0 && line.contains(badRecords.get(0))) {
        LOG.warn("Encountered BAD record");
        System.exit(-1);
      }
      else if (badRecords.size() > 1 && line.contains(badRecords.get(1))) {
        LOG.warn("Encountered BAD record");
        throw new Exception("Got bad record..crashing");
      }
      else if (badRecords.size() > 2 && line.contains(badRecords.get(2))) {
        LOG.warn("Encountered BAD record");
        System.exit(-1);
      }
      super.processLine(line);
    }

    public static void main(String[] args) throws Exception {
      new BadApp(args);
    }
  }

}