/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_BYTES;
import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.IFile.Writer;
import org.apache.hadoop.mapred.Merger.Segment;
import org.apache.hadoop.mapred.Task.TaskReporter;
import org.apache.hadoop.util.LexicographicalComparerHolder;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ResourceCalculatorPlugin.ProcResourceValues;

/**
 * A {@link BlockMapOutputCollector} that buffers {@link BytesWritable}
 * key/value pairs in a single in-memory byte array divided into
 * per-reduce-partition memory blocks, sorts each partition, spills sorted
 * output to local disk, and merges all spills into one map output file.
 */
public class BlockMapOutputBuffer<K extends BytesWritable, V extends BytesWritable>
    implements BlockMapOutputCollector<K, V> {

  private static final Log LOG =
      LogFactory.getLog(BlockMapOutputBuffer.class.getName());
  private final Partitioner<K, V> partitioner;
  private final int partitions;
  private final JobConf job;
  private final TaskReporter reporter;
  private final Class<K> keyClass;
  private final Class<V> valClass;
  private final int softBufferLimit;
  // Compression for map-outputs
  private CompressionCodec codec = null;
  // main output buffer
  private byte[] kvbuffer;
  private int kvBufferSize;
  // spill accounting
  private volatile int numSpills = 0;
  // number of spills for big records
  private volatile int numBigRecordsSpills = 0;
  private volatile int numBigRecordsWarnThreshold = 500;
  private final FileSystem localFs;
  private final FileSystem rfs;
  private final Counters.Counter mapOutputByteCounter;
  private final Counters.Counter mapOutputRecordCounter;
  private MapSpillSortCounters mapSpillSortCounter;
  private MapTask task;
  private ReducePartition<K, V>[] reducePartitions;
  private ArrayList<SpillRecord> indexCacheList;
  // an array of in-memory segments, one for each reduce partition
  private Segment<K, V>[] inMemorySegments;
  private boolean hasInMemorySpill;
  private boolean lastSpillInMem;
  private int totalIndexCacheMemory;
  private static final int INDEX_CACHE_MEMORY_LIMIT = 2 * 1024 * 1024;
  private final MemoryBlockAllocator memoryBlockAllocator;

  @SuppressWarnings({ "unchecked", "deprecation" })
  public BlockMapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job,
      TaskReporter reporter, MapTask task) throws IOException,
      ClassNotFoundException {
    this.task = task;
    this.job = job;
    this.reporter = reporter;
    localFs = FileSystem.getLocal(job);
    partitions = job.getNumReduceTasks();
    indexCacheList = new ArrayList<SpillRecord>();
    if (partitions > 0) {
      partitioner = (Partitioner<K, V>) ReflectionUtils.newInstance(
          job.getPartitionerClass(), job);
    } else {
      partitioner = new Partitioner() {
        @Override
        public int getPartition(Object key, Object value, int numPartitions) {
          return -1;
        }

        @Override
        public void configure(JobConf job) {
        }
      };
    }
    rfs = ((LocalFileSystem) localFs).getRaw();

    float spillper = job.getFloat("io.sort.spill.percent", (float) 0.9);
    if (spillper > (float) 1.0 || spillper < (float) 0.0) {
      LOG.error("Invalid \"io.sort.spill.percent\": " + spillper);
      spillper = 0.8f;
    }
    lastSpillInMem = job.getBoolean("mapred.map.lastspill.memory", true);
    numBigRecordsWarnThreshold =
        job.getInt("mapred.map.bigrecord.spill.warn.threshold", 500);

    int sortmb = job.getInt("io.sort.mb", 100);
    boolean localMode = job.get("mapred.job.tracker", "local").equals("local");
    if (localMode) {
      sortmb = job.getInt("io.sort.mb.localmode", 100);
    }
    if ((sortmb & 0x7FF) != sortmb) {
      throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
    }
    LOG.info("io.sort.mb = " + sortmb);
    // buffers and accounting
    kvBufferSize = sortmb << 20;
    kvbuffer = new byte[kvBufferSize];
    softBufferLimit = (int) (kvbuffer.length * spillper);
    // k/v serialization
    keyClass = (Class<K>) job.getMapOutputKeyClass();
    valClass = (Class<V>) job.getMapOutputValueClass();
    if (!BytesWritable.class.isAssignableFrom(keyClass)
        || !BytesWritable.class.isAssignableFrom(valClass)) {
      throw new IOException(this.getClass().getName()
          + " only supports " + BytesWritable.class.getName()
          + " as key and value classes, MapOutputKeyClass is "
          + keyClass.getName() + ", MapOutputValueClass is "
          + valClass.getName());
    }
    int numMappers = job.getNumMapTasks();
    memoryBlockAllocator = new MemoryBlockAllocator(kvBufferSize,
        softBufferLimit, numMappers, partitions, this);
    // counters
    mapOutputByteCounter = reporter.getCounter(MAP_OUTPUT_BYTES);
    mapOutputRecordCounter = reporter.getCounter(MAP_OUTPUT_RECORDS);
    mapSpillSortCounter = new MapSpillSortCounters(reporter);
    reducePartitions = new ReducePartition[partitions];
    inMemorySegments = new Segment[partitions];
    for (int i = 0; i < partitions; i++) {
      reducePartitions[i] = new ReducePartition(i, this.memoryBlockAllocator,
          this.kvbuffer, this, this.reporter);
    }
    // compression
    if (job.getCompressMapOutput()) {
      Class<? extends CompressionCodec> codecClass =
          job.getMapOutputCompressorClass(DefaultCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, job);
    }
  }

  private TaskAttemptID getTaskID() {
    return task.getTaskID();
  }

  public void collect(K key, V value, int partition) throws IOException {
    reporter.progress();
    if (key.getClass() != keyClass) {
      throw new IOException("Type mismatch in key from map: expected "
          + keyClass.getName() + ", received " + key.getClass().getName());
    }
    if (value.getClass() != valClass) {
      throw new IOException("Type mismatch in value from map: expected "
          + valClass.getName() + ", received " + value.getClass().getName());
    }
    int collected = reducePartitions[partition].collect(key, value);
    mapOutputRecordCounter.increment(1);
    mapOutputByteCounter.increment(collected);
  }

  @SuppressWarnings("deprecation")
  @Override
  public void collect(K key, V value) throws IOException {
    collect(key, value, partitioner.getPartition(key, value, partitions));
  }

  /*
   * Sorts each reduce partition and returns the ProcResourceValues captured
   * at the end of the sort for later use.
   */
  protected ProcResourceValues sortReduceParts() {
    long sortStartMilli = System.currentTimeMillis();
    ProcResourceValues sortStartProcVals = task.getCurrentProcResourceValues();
    long sortStart = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    // sort
    for (int i = 0; i < reducePartitions.length; i++) {
      reducePartitions[i].groupOrSort();
    }
    long sortEndMilli = System.currentTimeMillis();
    ProcResourceValues sortEndProcVals = task.getCurrentProcResourceValues();
    long sortEnd = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    mapSpillSortCounter.incCountersPerSort(sortStartProcVals,
        sortEndProcVals, sortEndMilli - sortStartMilli);
    mapSpillSortCounter.incJVMCPUPerSort(sortStart, sortEnd);
    return sortEndProcVals;
  }

  @Override
  public void sortAndSpill() throws IOException {
    ProcResourceValues sortEndProcVals = sortReduceParts();
    long sortEndMilli = System.currentTimeMillis();
    long spillStart = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    // spill
    FSDataOutputStream out = null;
    long spillBytes = 0;
    try {
      // create spill file
      final SpillRecord spillRec = new SpillRecord(partitions);
      final Path filename = task.mapOutputFile.getSpillFileForWrite(
          getTaskID(), numSpills,
          this.memoryBlockAllocator.getEstimatedSize());
      out = rfs.create(filename);
      for (int i = 0; i < partitions; ++i) {
        IndexRecord rec = reducePartitions[i].spill(job, out, keyClass,
            valClass, codec, task.spilledRecordsCounter);
        // record offsets
        spillBytes += rec.partLength;
        spillRec.putIndex(rec, i);
      }
      if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
        // create spill index file
        Path indexFilename = task.mapOutputFile.getSpillIndexFileForWrite(
            getTaskID(), numSpills,
            partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillRec.writeToFile(indexFilename, job);
      } else {
        indexCacheList.add(spillRec);
        totalIndexCacheMemory +=
            spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
      }
      LOG.info("Finished spill " + numSpills);
      ++numSpills;
    } finally {
      if (out != null) out.close();
    }
    long spillEndMilli = System.currentTimeMillis();
    ProcResourceValues spillEndProcVals = task.getCurrentProcResourceValues();
    long spillEnd = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    mapSpillSortCounter.incCountersPerSpill(sortEndProcVals,
        spillEndProcVals, spillEndMilli - sortEndMilli, spillBytes);
    mapSpillSortCounter.incJVMCPUPerSpill(spillStart, spillEnd);
  }

  public void spillSingleRecord(K key, V value, int part) throws IOException {
    ProcResourceValues spillStartProcVals =
        task.getCurrentProcResourceValues();
    long spillStartMilli = System.currentTimeMillis();
    long spillStart = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    // spill
    FSDataOutputStream out = null;
    long spillBytes = 0;
    try {
      // create spill file
      final SpillRecord spillRec = new SpillRecord(partitions);
      final Path filename = task.mapOutputFile.getSpillFileForWrite(
          getTaskID(), numSpills, key.getLength() + value.getLength());
      out = rfs.create(filename);
      IndexRecord rec = new IndexRecord();
      for (int i = 0; i < partitions; ++i) {
        IFile.Writer<K, V> writer = null;
        try {
          long segmentStart = out.getPos();
          // Create a new codec, don't care!
          writer = new IFile.Writer<K, V>(job, out, keyClass, valClass,
              codec, task.spilledRecordsCounter);
          if (i == part) {
            final long recordStart = out.getPos();
            writer.append(key, value);
            // Note that our map byte count will not be accurate with
            // compression
            mapOutputByteCounter.increment(out.getPos() - recordStart);
          }
          writer.close();
          // record offsets
          rec.startOffset = segmentStart;
          rec.rawLength = writer.getRawLength();
          rec.partLength = writer.getCompressedLength();
          spillBytes += writer.getCompressedLength();
          spillRec.putIndex(rec, i);
          writer = null;
        } catch (IOException e) {
          if (null != writer) writer.close();
          throw e;
        }
      }
      if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
        // create spill index file
        Path indexFilename = task.mapOutputFile.getSpillIndexFileForWrite(
            getTaskID(), numSpills,
            partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillRec.writeToFile(indexFilename, job);
      } else {
        indexCacheList.add(spillRec);
        totalIndexCacheMemory +=
            spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
      }
      LOG.info("Finished spill big record " + numBigRecordsSpills);
      ++numBigRecordsSpills;
      ++numSpills;
    } finally {
      if (out != null) out.close();
    }
    long spillEndMilli = System.currentTimeMillis();
    ProcResourceValues spillEndProcVals = task.getCurrentProcResourceValues();
    mapSpillSortCounter.incCountersPerSpill(spillStartProcVals,
        spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
    long spillEnd = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    mapSpillSortCounter.incJVMCPUPerSpill(spillStart, spillEnd);
    mapSpillSortCounter.incSpillSingleRecord();
  }

  public synchronized void flush() throws IOException,
      ClassNotFoundException, InterruptedException {
    if (numSpills > 0 && lastSpillInMem) {
      // if there is already at least one spill, we can try to hold this last
      // spill in memory
      sortReduceParts();
      for (int i = 0; i < partitions; i++) {
        this.inMemorySegments[i] =
            new Segment<K, V>(this.reducePartitions[i].getIReader(), true);
      }
      hasInMemorySpill = true;
    } else {
      sortAndSpill();
    }
    long mergeStartMilli = System.currentTimeMillis();
    ProcResourceValues mergeStartProcVals = task.getCurrentProcResourceValues();
    long mergeStart = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    mergeParts();
    long mergeEndMilli = System.currentTimeMillis();
    ProcResourceValues mergeEndProcVals = task.getCurrentProcResourceValues();
    long mergeEnd = task.jmxThreadInfoTracker.getTaskCPUTime("MAIN_TASK");
    mapSpillSortCounter.incMergeCounters(mergeStartProcVals,
        mergeEndProcVals, mergeEndMilli - mergeStartMilli);
    mapSpillSortCounter.incJVMCPUMerge(mergeStart, mergeEnd);
  }

  private void mergeParts() throws IOException, InterruptedException,
      ClassNotFoundException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final TaskAttemptID mapId = getTaskID();
    for (int i = 0; i < numSpills; i++) {
      filename[i] = task.mapOutputFile.getSpillFile(mapId, i);
      finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    for (Segment<K, V> segment : this.inMemorySegments) {
      if (segment != null) {
        finalOutFileSize += segment.getLength();
      }
    }

    // the spill is the final output
    if (numSpills == 1 && !hasInMemorySpill) {
      Path outFile = new Path(filename[0].getParent(), "file.out");
      rfs.rename(filename[0], outFile);
      if (indexCacheList.size() == 0) {
        rfs.rename(task.mapOutputFile.getSpillIndexFile(mapId, 0),
            new Path(filename[0].getParent(), "file.out.index"));
      } else {
        indexCacheList.get(0).writeToFile(
            new Path(filename[0].getParent(), "file.out.index"), job);
      }
      return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
      Path indexFileName = task.mapOutputFile.getSpillIndexFile(mapId, i);
      indexCacheList.add(new SpillRecord(indexFileName, job));
    }

    // make correction in the length to include the file header
    // lengths for each partition
    finalOutFileSize += partitions * MapTask.APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
    Path finalOutputFile = task.mapOutputFile.getOutputFileForWrite(mapId,
        finalOutFileSize);
    Path finalIndexFile = task.mapOutputFile.getOutputIndexFileForWrite(mapId,
        finalIndexFileSize);

    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
      // create dummy files
      IndexRecord rec = new IndexRecord();
      SpillRecord sr = new SpillRecord(partitions);
      try {
        for (int i = 0; i < partitions; i++) {
          long segmentStart = finalOut.getPos();
          Writer<K, V> writer =
              new Writer<K, V>(job, finalOut, keyClass, valClass, codec, null);
          writer.close();
          rec.startOffset = segmentStart;
          rec.rawLength = writer.getRawLength();
          rec.partLength = writer.getCompressedLength();
          sr.putIndex(rec, i);
        }
        sr.writeToFile(finalIndexFile, job);
      } finally {
        finalOut.close();
      }
      return;
    }

    {
      IndexRecord rec = new IndexRecord();
      final SpillRecord spillRec = new SpillRecord(partitions);
      for (int parts = 0; parts < partitions; parts++) {
        // create the segments to be merged
        List<Segment<K, V>> segmentList = new ArrayList<Segment<K, V>>(
            numSpills + this.inMemorySegments.length);
        for (int i = 0; i < numSpills; i++) {
          IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
          Segment<K, V> s = new Segment<K, V>(job, rfs, filename[i],
              indexRecord.startOffset, indexRecord.partLength, codec, true);
          segmentList.add(i, s);
          if (LOG.isDebugEnabled()) {
            LOG.debug("MapId=" + mapId + " Reducer=" + parts
                + " Spill=" + i + " (" + indexRecord.startOffset + ", "
                + indexRecord.rawLength + ", " + indexRecord.partLength + ")");
          }
        }
        if (this.inMemorySegments[parts] != null) {
          // add the in-memory spill to the end of segmentList
          segmentList.add(numSpills, this.inMemorySegments[parts]);
        }
        // merge
        RawKeyValueIterator kvIter = Merger.merge(job, rfs, keyClass,
            valClass, codec, segmentList, job.getInt("io.sort.factor", 100),
            new Path(mapId.toString()), new RawComparator<K>() {
              @Override
              public int compare(byte[] b1, int s1, int l1,
                  byte[] b2, int s2, int l2) {
                return LexicographicalComparerHolder.compareBytes(b1,
                    s1 + WritableUtils.INT_LENGTH_BYTES,
                    l1 - WritableUtils.INT_LENGTH_BYTES, b2,
                    s2 + WritableUtils.INT_LENGTH_BYTES,
                    l2 - WritableUtils.INT_LENGTH_BYTES);
              }

              @Override
              public int compare(K o1, K o2) {
                return LexicographicalComparerHolder.compareBytes(
                    o1.getBytes(), 0, o1.getLength(),
                    o2.getBytes(), 0, o2.getLength());
              }
            }, reporter, null, task.spilledRecordsCounter);

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass,
            valClass, codec, task.spilledRecordsCounter);
        Merger.writeFile(kvIter, writer, reporter, job);

        // close
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillRec.putIndex(rec, parts);
      }
      spillRec.writeToFile(finalIndexFile, job);
      finalOut.close();
      for (int i = 0; i < numSpills; i++) {
        rfs.delete(filename[i], true);
      }
    }
  }

  public void close() {
    this.mapSpillSortCounter.finalCounterUpdate();
    if (numBigRecordsSpills > numBigRecordsWarnThreshold) {
      LOG.warn("Spilled a large number of big records: "
          + numBigRecordsSpills);
    }
  }
}