package org.apache.hadoop.mapred; import static org.apache.hadoop.mapred.Task.Counter.COMBINE_INPUT_RECORDS; import static org.apache.hadoop.mapred.Task.Counter.COMBINE_OUTPUT_RECORDS; import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_BYTES; import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_MATERIALIZED_BYTES; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DefaultCodec; import org.apache.hadoop.mapred.CachePool.CacheUnit; import org.apache.hadoop.mapred.IFile.InMemoryReader; import org.apache.hadoop.mapred.IFile.Reader; import org.apache.hadoop.mapred.IFile.Writer; import org.apache.hadoop.mapred.MemoryElement.MemoryElementFullException; import org.apache.hadoop.mapred.Merger.Segment; import org.apache.hadoop.mapred.Task.CombineOutputCollector; import org.apache.hadoop.mapred.Task.CombinerRunner; import org.apache.hadoop.mapred.Task.Counter; import org.apache.hadoop.mapred.Task.TaskReporter; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; // Default JVM memory manager, backed by a shared memory pool. public class DefaultJvmMemoryManager implements JvmMemoryManager{ //static final int MEMORY_BLOCK_SIZE = 4*1024*1024; // each block is 4 MB by default class DefaultMapRamManager implements MapRamManager{ class MapReserveElement { MapReserveElement(TaskAttemptID tid) { taskId = tid; waitCond = crm.getReserveLock().newCondition(); reserveSize = 0; waitTime = 0; reservedSize = 0; maxSize = 0; } long reservedSize; TaskAttemptID taskId; Condition waitCond; long reserveSize; long maxSize; int waitTime; } private static final float MAX_STALLED_SPILL_THREADS_FRACTION = 0.75f; private ChildRamManager crm; private Map<TaskAttemptID, MapReserveElement> mapREs = new ConcurrentHashMap<TaskAttemptID, MapReserveElement>(); private List<MapReserveElement> waitMREs = new ArrayList<MapReserveElement>(); // waiting MRE Queue private TaskAttemptID curReserveID; private TaskAttemptID scheduledTask = null; private int maxWaitTime; private long size = 0; private long mapMaxHeapSize = 0; private final int maxInMemSpills; private int numRequiredMap = 0; private Object dataAvailable = new Object(); private int numPendingRequests = 0; private int numClosed = 0; private boolean isSpilling = false; private long fullSize = 0; private boolean isScheduled2Spill = false; public DefaultMapRamManager(Configuration conf, ChildRamManager centralManager) throws IOException { this.maxInMemSpills = conf.getInt("mapred.inmem.spill.num.threshold", 1000); maxWaitTime = conf.getInt("map.reserve.max.time", 3); crm = centralManager; crm.register(this); } public void
registerMap(TaskAttemptID tid) { synchronized (dataAvailable) { mapREs.put(tid, new MapReserveElement(tid)); this.numRequiredMap++; crm.setCurMapsNum(mapREs.size()); } } public void unregisterMap(TaskAttemptID tid) { synchronized (dataAvailable) { this.numRequiredMap--; MapReserveElement mre = mapREs.remove(tid); if (mre.maxSize > mapMaxHeapSize) { mapMaxHeapSize = mre.maxSize; crm.setMaxRamForMaps(mapMaxHeapSize); } crm.setCurMapsNum(mapREs.size()); dataAvailable.notify(); } } public void await() throws InterruptedException{ LOG.info("await"); LOG.info("waitMRE num : " + waitMREs.size()); try { crm.getReserveLock().lock(); mapREs.get(curReserveID).waitCond.await(); } finally { crm.getReserveLock().unlock(); } } public void awake() { LOG.info("awake"); LOG.info("waitMRE num : " + waitMREs.size()); waitMREs.remove(mapREs.get(scheduledTask)); mapREs.get(scheduledTask).waitTime = 0; mapREs.get(scheduledTask).reserveSize = 0; for (MapReserveElement mre : waitMREs) { mre.waitTime++; } mapREs.get(scheduledTask).waitCond.signal(); } // returns true if the caller would have to wait public boolean tryReserve(int requestedSize) { return crm.tryReserve(null, ChildRamManager.BufType.spill, requestedSize); } public void reserve(TaskAttemptID tid, long requestedSize) throws InterruptedException { try { long s = System.currentTimeMillis(); crm.getReserveLock().lock(); boolean isWait = crm.tryReserve(tid, ChildRamManager.BufType.spill, requestedSize); MapReserveElement mre = mapREs.get(tid); if(isWait) { LOG.info("reserve wait 111"); curReserveID = tid; mre.reserveSize = requestedSize; mre.waitTime = 0; boolean added = false; for (int i = 0; i < waitMREs.size(); i++) { if (mre.reserveSize < waitMREs.get(i).reserveSize) { waitMREs.add(i, mre); added = true; break; } } if (!added) { waitMREs.add(mre); } synchronized (dataAvailable) { ++numPendingRequests; dataAvailable.notify(); } LOG.info("reserve wait 111"); } LOG.info("reserve 111"); crm.reserve(tid, ChildRamManager.BufType.spill, requestedSize); LOG.info("reserve 222"); if(isWait) { LOG.info("reserve wait 222"); synchronized (dataAvailable) { --numPendingRequests; } } size += requestedSize; mre.reservedSize += requestedSize; if (mre.maxSize < mre.reservedSize) { mre.maxSize = mre.reservedSize; } LOG.info(tid + " map reserve : " + (System.currentTimeMillis() - s)); } finally { crm.getReserveLock().unlock(); } } public void unreserve(TaskAttemptID tid, int num, long requestedSize) { LOG.info("unreserve 111"); crm.unreserve(ChildRamManager.BufType.spill, requestedSize); synchronized (dataAvailable) { numClosed -= num; size -= requestedSize; fullSize -= requestedSize; } mapREs.get(tid).reservedSize -= requestedSize; LOG.info("unreserve requestedSize : " + requestedSize); // Notify the threads blocked on RamManager.reserve } public void closeInMemorySpill(int requestedSize) { try { crm.getReserveLock().lock(); ++numClosed; fullSize += requestedSize; crm.closeInMemBuf(ChildRamManager.BufType.spill, requestedSize); } finally { crm.getReserveLock().unlock(); } } // Count of ready in-memory spills across maps still in the spill state private int getSpillMapNum() { int num = 0; for(MapSpiller ms : mapSpillers.values()) { if (ms.getState().compareTo(MapState.spill) == 0) { num += ms.getReadySpillNum(); } } return num; } public void waitForDataToSpill() throws InterruptedException { synchronized (dataAvailable) { // Block here until there is spill work for the spill thread:
isSpilling = false; // crm.scheduleTask2Spill(); while( ( // no spill has been explicitly scheduled via spill2Disk() (!isScheduled2Spill) && // fewer than "mapred.inmem.spill.num.threshold" in-memory spills have been closed (maxInMemSpills <= 0 || numClosed < maxInMemSpills) && // not enough reserve requests are blocked on the RamManager to force a spill; if numRequiredMap is zero, no map has registered yet, so there is nothing to spill ((numRequiredMap < numJvmSlots || numPendingRequests < numJvmSlots*MAX_STALLED_SPILL_THREADS_FRACTION) && (0 >= numRequiredMap || numPendingRequests < numRequiredMap)) ) || getSpillMapNum()==0){ dataAvailable.wait(); String s = "mapSpillers"; for(MapSpiller ms : mapSpillers.values()) { s += " : " + ms.getReadySpillNum() + " , " + ms.getState(); } LOG.info(s + ". percentUsed : " + crm.getPercentUsed() + " maxInMemSpills : " + maxInMemSpills + " numClosed : " + numClosed + " numRequiredMap : " + numRequiredMap + " numPendingRequests : " + numPendingRequests + " waitingMREs : " + waitMREs.size() + ", fullSize: " + fullSize); } isScheduled2Spill = false; isSpilling = true; } } public void schedule() { LOG.info("schedule 000"); if (waitMREs.size()==0) { LOG.info("schedule 111"); scheduledTask = null; return; } int maxWait = 0; int maxInd = 0; LOG.info("schedule 222"); for (int i = 0; i < waitMREs.size(); i++) { MapReserveElement mre = waitMREs.get(i); if(mre.waitTime > maxWait) { maxWait = mre.waitTime; maxInd = i; } } LOG.info("schedule 333"); if (maxWait > maxWaitTime) { LOG.info("schedule 444"); scheduledTask = waitMREs.get(maxInd).taskId; LOG.info("scheduledTask : " + scheduledTask); } else { LOG.info("schedule 555"); scheduledTask = waitMREs.get(0).taskId; LOG.info("scheduledTask : " + scheduledTask); } LOG.info("scheduledTask : " + scheduledTask); } @Override public long getToReserveSize() { if (scheduledTask == null || waitMREs.size() == 0) { return 0; } else { return mapREs.get(scheduledTask).reserveSize; } } @Override public long getReservedSize() { return size; } @Override public int getMapNum() { return this.numRequiredMap; } @Override public long getMapHeap() { return mapMaxHeapSize; } @Override public void spill2Disk() { synchronized(dataAvailable) { if (isScheduled2Spill || isSpilling) { return; } isScheduled2Spill = true; dataAvailable.notify(); } } @Override public boolean isSpilling() { return isSpilling; } public long getFullSize() { return fullSize; } } enum MapState{spill, merge} // Handles the buffer-spill and merge work for a single map task class MapSpiller<K,V> { MapState state = MapState.spill; private ArrayList<SpillRecord> indexCacheList; private TaskAttemptID taskId; private int numSpills = 0; private int numSpilled = 0; protected final Counters.Counter spilledRecordsCounter; private final CombinerRunner combinerRunner; private final CombineOutputCollector combineCollector; private final Counters.Counter combineOutputCounter; private final Counters.Counter fileOutputByteCounter; private final Counters.Counter mapOutputByteCounter; private RawKeyValueIterator rKVIter = null; private boolean isKilled = false; private final JobConf conf;
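// Per-map spill bookkeeping: spill indices are cached in indexCacheList (up to INDEX_CACHE_MEMORY_LIMIT bytes), finished in-memory spills wait in bufList, and singleRecInds tracks spill numbers written straight to disk by spillSingleRecord().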
private final TaskReporter reporter; private MapOutputFile mapOutputFile = new MapOutputFile(); private int totalIndexCacheMemory = 0; private static final int INDEX_CACHE_MEMORY_LIMIT = 1024 * 1024; public static final int MAP_OUTPUT_INDEX_RECORD_LENGTH = 24; private List<CacheFile> bufList = new ArrayList<CacheFile>(); private List<Integer> singleRecInds = new ArrayList<Integer>(); MapSpiller(JobConf job,TaskAttemptID tid, TaskReporter rep) throws ClassNotFoundException { reporter = rep; conf = job; this.taskId = tid; mapOutputFile.setConf(conf); mapOutputByteCounter = reporter.getCounter(MAP_OUTPUT_BYTES); Counters.Counter combineInputCounter = reporter.getCounter(COMBINE_INPUT_RECORDS); combineOutputCounter = reporter.getCounter(COMBINE_OUTPUT_RECORDS); fileOutputByteCounter = reporter.getCounter(MAP_OUTPUT_MATERIALIZED_BYTES); // combiner combinerRunner = CombinerRunner.create(conf, taskId, combineInputCounter, reporter, null); if (combinerRunner != null) { combineCollector= new CombineOutputCollector(combineOutputCounter, reporter, conf); } else { combineCollector = null; } indexCacheList = new ArrayList<SpillRecord>(); spilledRecordsCounter = reporter.getCounter(Counter.SPILLED_RECORDS); } public MapState getState() { return state; } public void setState(MapState s) { state = s; } public int getReadySpillNum() { return bufList.size(); } // Object o = new Object(); public long writeFile(RawKeyValueIterator records, Writer<K, V> writer, Progressable progressable, Configuration conf) throws IOException { long progressBar = conf.getLong("mapred.merge.recordsBeforeProgress", 8192) -1; long recordCtr = 0; while(records.next()) { writer.append(records.getKey(), records.getValue()); if (((recordCtr++) & progressBar) == 0 ) { progressable.progress(); } } return recordCtr; } private void mergeParts() throws IOException, InterruptedException, ClassNotFoundException { // get the approximate size of the final output/index files long finalOutFileSize = 0; long finalIndexFileSize = 0; final Path[] filename = new Path[numSpills]; final TaskAttemptID mapId = taskId; LOG.info(taskId + " mergeParts numSpills : " + numSpills + " numSpilled : " + numSpilled + "bufList size : " + bufList.size()); for(int i = 0; i < numSpilled; i++) { filename[i] = mapOutputFile.getSpillFile(i); finalOutFileSize += rfs.getFileStatus(filename[i]).getLen(); } for(Integer ssi : this.singleRecInds) { filename[ssi.intValue()] = mapOutputFile.getSpillFile(ssi.intValue()); finalOutFileSize += rfs.getFileStatus(filename[ssi.intValue()]).getLen(); } for(CacheFile cf : this.bufList) { finalOutFileSize += cf.getLen(); } // read in paged indices for (int i = indexCacheList.size(); i < numSpills; ++i) { Path indexFileName = mapOutputFile.getSpillIndexFile(i); indexCacheList.add(new SpillRecord(indexFileName, conf, null)); } //make correction in the length to include the sequence file header //lengths for each partition finalOutFileSize += partitions * APPROX_HEADER_LENGTH; finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH; Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize); Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize); //The output stream for the final single output file FSDataOutputStream finalFileOut = rfs.create(finalOutputFile, true, 4096); long cfCap = CacheFile.size2Cap(finalOutFileSize); //ramManager.reserve(taskId, cfCap); FSDataOutputStream finalOut = new FSDataOutputStream(new SpillOutputStream(CachePool.get(), cfCap ,finalFileOut, taskId, 
SpillScheduler.SORT), null); if (numSpills == 0) { // create dummy files IndexRecord rec = new IndexRecord(); SpillRecord sr = new SpillRecord(partitions); try { for (int i = 0; i < partitions; i++) { long segmentStart = finalOut.getPos(); Writer<K, V> writer = new Writer<K, V>(conf, finalOut, keyClass, valClass, codec, null); writer.close(); rec.startOffset = segmentStart; rec.rawLength = writer.getRawLength(); rec.partLength = writer.getCompressedLength(); sr.putIndex(rec, i); } sr.writeToFile(finalIndexFile, conf); } finally { finalOut.close(); } return; } { IndexRecord rec = new IndexRecord(); final SpillRecord spillRec = new SpillRecord(partitions); for (int parts = 0; parts < partitions; parts++) { // create the segments to be merged List<Segment<K,V>> segmentList = new ArrayList<Segment<K, V>>(numSpills); for(int i = 0; i < numSpills; i++) { IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts); Segment s; if (i < numSpilled || this.singleRecInds.contains(new Integer(i))) { FSDataInputStream in =rfs.open(filename[i]); int off = (int)indexRecord.startOffset; in.skip(off); // Reader<K, V> reader = // new InMemoryReader<K, V>(null, taskId, // buf, (int)indexRecord.startOffset, (int)indexRecord.partLength); Reader<K, V> reader = new Reader(conf, in, (int)indexRecord.partLength); s = new Segment<K, V>(reader, true); // s = new Segment<K,V>(conf, rfs, filename[i], indexRecord.startOffset, // indexRecord.partLength, codec, true); } else { int ind = i; if (this.singleRecInds.size()!=0) { for(Integer ssi : this.singleRecInds) { if (ssi.intValue() > i) { break; } else { ind--; } } } ind -= numSpilled; if (ind >= bufList.size()) { LOG.info(taskId + " mergePartsError!!! "); } CacheFile cf = this.bufList.get(ind); //DataInputBuffer dib = new DataInputBuffer(); int off = (int)indexRecord.startOffset; cf.reset(); cf.skip(off); // LOG.info(taskId +", partition: " + parts + "ind: "+ ind +", off: " + off + ", partlength: " + // indexRecord.partLength + ", rawLen: " + indexRecord.rawLength); // Reader<K, V> reader = // new InMemoryReader<K, V>(null, taskId, // buf, (int)indexRecord.startOffset, (int)indexRecord.partLength); Reader<K, V> reader = new Reader(conf, cf, (int)indexRecord.partLength); s = new Segment<K, V>(reader, true); } segmentList.add(i, s); if (LOG.isDebugEnabled()) { LOG.debug("MapId=" + mapId + " Reducer=" + parts + "Spill =" + i + "(" + indexRecord.startOffset + "," + indexRecord.rawLength + ", " + indexRecord.partLength + ")"); } } // merge @SuppressWarnings("unchecked") RawKeyValueIterator kvIter = Merger.merge(conf, rfs, keyClass, valClass, codec, segmentList, segmentList.size(), new Path(mapId.toString()), conf.getOutputKeyComparator(), reporter, null, spilledRecordsCounter); //write merged output to disk long segmentStart = finalOut.getPos(); Writer<K, V> writer = new Writer<K, V>(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter); if (combinerRunner == null || numSpills < minSpillsForCombine) { // long s = System.currentTimeMillis(); rKVIter = kvIter; long t = writeFile(kvIter, writer, reporter, conf); rKVIter = null; // LOG.info(taskId + " mergeParts Merger.writeFile time: " + (System.currentTimeMillis() - s) // + ", count: " + t); } else { combineCollector.setWriter(writer); combinerRunner.combine(kvIter, combineCollector); } // long s = System.currentTimeMillis(); //close writer.close(); // LOG.info(taskId + " mergeParts writer.close : " + (System.currentTimeMillis() - s)); // record offsets rec.startOffset = segmentStart; rec.rawLength = 
writer.getRawLength(); rec.partLength = writer.getCompressedLength(); spillRec.putIndex(rec, parts); } LOG.info("spillRec.size: " + spillRec.size() + " partitions: " + partitions); finalOut.close(); spillRec.writeToFile(finalIndexFile, conf); int total = 0; for(CacheFile cf : bufList) { total += cf.getCap(); cf.clear(); } ramManager.unreserve(taskId, bufList.size(), total); bufList.clear(); for(int i = 0; i < numSpilled; i++) { rfs.delete(filename[i],true); } for(Integer ssi : this.singleRecInds) { rfs.delete(filename[ssi.intValue()],true); } } // long s0 = System.currentTimeMillis(); //finalCf.writeFile(finalFileOut); //finalCf.closeWrite(); // LOG.info(taskId + " mergeFile spill time: " + (System.currentTimeMillis() - s0)); //ramManager.unreserve(taskId, 1, cfCap); //finalCf.clear(); //finalOut.close(); Path outputPath = mapOutputFile.getOutputFile(); fileOutputByteCounter.increment(rfs.getFileStatus(outputPath).getLen()); } public void spillSingleRecord(K key, V val, int partition) throws IOException { LOG.info("spillSingleRecord ! "); long size = MemoryElement.getBigMESize() + partitions * APPROX_HEADER_LENGTH; FSDataOutputStream out = null; try { // create spill file final SpillRecord spillRec = new SpillRecord(partitions); final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size); out = rfs.create(filename); // we don't run the combiner for a single record IndexRecord rec = new IndexRecord(); for (int i = 0; i < partitions; ++i) { IFile.Writer<K, V> writer = null; try { long segmentStart = out.getPos(); // Create a new codec, don't care! writer = new IFile.Writer<K,V>(conf, out, keyClass, valClass, codec, spilledRecordsCounter); if (i == partition) { final long recordStart = out.getPos(); writer.append(key, val); // Note that our map byte count will not be accurate with // compression mapOutputByteCounter.increment(out.getPos() - recordStart); } writer.close(); // record offsets rec.startOffset = segmentStart; rec.rawLength = writer.getRawLength(); rec.partLength = writer.getCompressedLength(); spillRec.putIndex(rec, i); writer = null; } catch (IOException e) { if (null != writer) writer.close(); throw e; } } if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) { // create spill index file Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH); spillRec.writeToFile(indexFilename, conf); } else { indexCacheList.add(spillRec); totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH; } this.singleRecInds.add(new Integer(numSpills)); ++numSpills; } finally { if (out != null) out.close(); } } private void spill2Disk() throws IOException { if (bufList.size() == 0) { return; } FSDataOutputStream out = null; // MemoryElementQueue meQ = spillThread.getCurrentQueue(); // advance numSpilled past spill numbers already taken by single-record spills; stale entries are removed through the iterator to avoid a ConcurrentModificationException for (Iterator<Integer> it = this.singleRecInds.iterator(); it.hasNext();) { Integer ssi = it.next(); if (numSpilled < ssi.intValue()) { break; } else if (ssi.intValue() == numSpilled) { numSpilled++; } else { it.remove(); } } CacheFile cf; synchronized(bufList) { // LOG.info("bufList size : " + bufList.size()); cf = this.bufList.remove(0); //LOG.info("bufList size : " + bufList.size()); } LOG.info(taskId + " spill2Disk write begin "); // create spill file long len = cf.getLen(); final Path filename = mapOutputFile.getSpillFileForWrite(numSpilled, len); out = rfs.create(false, filename); cf.writeFile(out); //out.close(); // LOG.info("sortAndSpill ...
777"); LOG.info(taskId + " Finished spill to disk : " + numSpilled + " : " + len); numSpilled++; ramManager.unreserve(taskId, 1, cf.getCap()); cf.clear(); } private void sortAndSpill2Buffer() throws IOException, InterruptedException, ClassNotFoundException { //memEle内部已有序 FSDataOutputStream out = null; // MemoryElementQueue meQ = spillThread.getCurrentQueue(); MemoryElementQueue meQ = activeMemQueues.remove(taskId); try { // long s = System.currentTimeMillis(); int size = meQ.getSpillFileSize(); if (size % CacheUnit.cap > 0) { size -= size % CacheUnit.cap; size += CacheUnit.cap; } long cfLen = CacheFile.size2Cap(size); ramManager.reserve(taskId, cfLen); // LOG.info(taskId + " sortAndSpill2BufferTime reserve : " + (System.currentTimeMillis() - s)); // create spill file final SpillRecord spillRec = new SpillRecord(partitions); //byte[] buf = new byte[meQ.getSpillFileSize()]; CacheFile cf = new CacheFile(CachePool.get(), cfLen); IndexRecord rec = new IndexRecord(); out = rfs.create(cf); List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K,V>>(); // LOG.info("sortAndSpill ... 222"); for(int i = 0; i < partitions; i++) { IFile.Writer writer = null; // LOG.info("mergePart inMemorySegments size : " + inMemorySegments.size()); inMemorySegments.clear(); meQ.createInMemorySegments(inMemorySegments); meQ.setPartition(i); // LOG.info("sortAndSpill ... 111"); RawKeyValueIterator rIter = Merger.merge(conf, rfs, (Class<K>)conf.getMapOutputKeyClass(), (Class<V>)conf.getMapOutputValueClass(), inMemorySegments, inMemorySegments.size(), new Path(taskId.toString()), conf.getOutputKeyComparator(), reporter, spilledRecordsCounter, null); // LOG.info("mergePart inMemSegs size : " + inMemorySegments.size()); try { long segmentStart = out.getPos(); writer = new Writer(conf, out, keyClass, valClass); if (combinerRunner == null) { // LOG.info("sortAndSpill ... 333"); rKVIter = rIter; writeFile(rIter, writer, reporter, conf); rKVIter = null; // LOG.info("sortAndSpill ... 444"); } else { // LOG.info("sortAndSpill ... 555"); combineCollector.setWriter(writer); combinerRunner.combine(rIter, combineCollector); // LOG.info("sortAndSpill ... 666"); } writer.close(); // record offsets rec.startOffset = segmentStart; rec.rawLength = writer.getRawLength(); rec.partLength = writer.getCompressedLength(); //if (i == 1) // LOG.info("partition : " + i + " startOffset : " + rec.startOffset + // " rawLength : " + rec.rawLength + " partLength : " + rec.partLength); spillRec.putIndex(rec, i); writer = null; } finally { if (null != writer) writer.close(); } } // LOG.info("sortAndSpill ... 
777"); if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) { // create spill index file Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH); spillRec.writeToFile(indexFilename, conf); } else { indexCacheList.add(spillRec); totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH; } // LOG.info("sortAndSpill2Buffer() 111"); ramManager.closeInMemorySpill(size); // LOG.info("sortAndSpill2Buffer() 222"); synchronized(bufList) { bufList.add(cf); } LOG.info(taskId + " Finished spill to buffer : " + numSpills); ++numSpills; } finally { if (out != null) out.close(); meQ.close2Recycle(); } } public void kill() { if (rKVIter != null) { LOG.info("kill rKVIter.stop() 0"); rKVIter.stop(); LOG.info("kill rKVIter.stop() 1"); } int total = 0; for(CacheFile cf : bufList) { total += cf.getCap(); cf.clear(); } ramManager.unreserve(taskId, bufList.size(), total); bufList.clear(); isKilled = true; } } class SpillThread extends Thread { private boolean isBusy = false; private IntWritable index = new IntWritable(0); private TaskAttemptID currentQue = null; private Object free = new Object(); public boolean isBusy() { return isBusy; } public void toSpillTaskRemoved(int i) { if (i == -1) { return; } int tNum = toSpillMemQInds.size(); synchronized (index) { if (tNum == 0) { index.set(0); } else if (i < index.get()) { index.set((index.get() - 1 + tNum) % tNum); } else if (i == index.get()) { index.set(index.get()%tNum); } } } public TaskAttemptID getCurrentSpillId() { return currentQue; } private TaskAttemptID getNextSpillQue() { //这样的同步顺序可以避免交叉死锁 synchronized (toSpillMemQInds) { synchronized(index) { int tNum = toSpillMemQInds.size(); for (int i = index.get(); i < index.get() + tNum; i++) { TaskAttemptID tid = toSpillMemQInds.get(i%tNum); if (mapSpillers.get(tid).getReadySpillNum() > 0 && mapSpillers.get(tid).getState().compareTo(MapState.spill) == 0) { index.set((i + 1) % tNum); return tid; } } String s = "getNextSpillQue Null"; for (int i = index.get(); i < index.get() + tNum; i++) { s += " index : " + i; TaskAttemptID tid = toSpillMemQInds.get(i%tNum); s += " id : " + tid; s += " spillNum : " + mapSpillers.get(tid).getReadySpillNum(); s += " state : " + mapSpillers.get(tid).getState(); } LOG.info(s); return null; } } } public void waitForFree() throws InterruptedException { synchronized (free) { free.wait(); } } public void run() { while (true) { try { LOG.info("spill thread wait!"); isBusy = false; ramManager.waitForDataToSpill(); isBusy = true; } catch (InterruptedException e) { Thread.currentThread().interrupt(); break; } currentQue = getNextSpillQue(); LOG.info("spill thread spill2Disk : " + currentQue); try { if(currentQue == null || mapSpillers.get(currentQue).getState().compareTo(MapState.merge) == 0) { continue; } mapSpillers.get(currentQue).spill2Disk(); synchronized (free) { free.notify(); } currentQue = null; } catch (Throwable t) { String logMsg = "Task " + currentQue + " failed : " + StringUtils.stringifyException(t); Child.getTask(currentQue).reportFatalError(currentQue, t, logMsg); } } } } class MemoryElementQueue<K,V> { private final TaskAttemptID taskId; private int memSize = 0; private int priority = 0; private int spillFileSize = 0; private List<MemoryElement> memElementQueue = new ArrayList<MemoryElement>(); public MemoryElementQueue(TaskAttemptID tId) { taskId = tId; } public void enQueue(MemoryElement me) { if (!me.getTaskID().equals(taskId)) { return; } this.memElementQueue.add(me); if (me.isBig()) { 
priority = 1; } memSize += me.getSize(); spillFileSize += me.getSpillFileSize(); if (memSize >= perQueToSpillUpLineMemSize || memElementQueue.size() >= perQueToSpillUpLineMENum ){ startSpillQueue(this.taskId); return; } // if (memSize >= perQueToSpillDownLineMemSize && !spillThread.isBusy()){ // startSpillQueue(this.taskId); // } } public void startSpill() { for (MemoryElement me : this.memElementQueue) { me.startSpill(); } } public TaskAttemptID getTaskId() { return this.taskId; } public void close2Recycle() { // LOG.info("recycle called"); for (MemoryElement me : this.memElementQueue) { me.reset(); recycleMemElement(me); } this.memElementQueue.clear(); } public void createInMemorySegments( List<Segment<K,V>> inMemorySegments) throws IOException { for (MemoryElement me : this.memElementQueue) { Reader reader = new IFile.RawMemoryReader(me); Segment segment = new Segment(reader, true); inMemorySegments.add(segment); } } public void setPartition(int p) throws IOException { for (MemoryElement me : this.memElementQueue) { me.setCurSpillPartition(p); } } public int getPriority() { return this.priority; } public int getMemSize() { return memSize; } public int getSpillFileSize() { //(EOF+checksum) * partitions return spillFileSize + (WritableUtils.getVIntSize(-1) * 2 ) * partitions; } public boolean hasReg() { for (MemoryElement me : this.memElementQueue) { if (!me.isBig()) { return true; } } return false; } public int getQueueSize() { return memElementQueue.size(); } public List<MemoryElement> getQueue() { return this.memElementQueue; } } private JobConf conf = null; private int numJvmSlots; private long perQueToSpillUpLineMemSize; // per-queue spill threshold, in bytes private int perQueToSpillUpLineMENum; // maximum number of MemoryElements per queue; bounded to avoid deadlock private Map<TaskAttemptID, MemoryElementQueue> activeMemQueues = new ConcurrentHashMap<TaskAttemptID, MemoryElementQueue>(); //private Map<TaskAttemptID, List<MemoryElementQueue>> toSpillMemQueues = // new HashMap<TaskAttemptID, List<MemoryElementQueue>>(); private Map<TaskAttemptID, MapSpiller> mapSpillers = new ConcurrentHashMap<TaskAttemptID, MapSpiller>(); private List<TaskAttemptID> toSpillMemQInds = new ArrayList<TaskAttemptID>(); // used to round-robin over the registered tasks private SpillThread spillThread = new SpillThread(); private List<MemoryElement> regMemElePool = new LinkedList<MemoryElement>(); // private List<MemoryElement> bigMemElePool = new ArrayList<MemoryElement>(); private int partitions; private FileSystem localFs; private FileSystem rfs; private Class keyClass; private Class valClass; //Compression for map-outputs private CompressionCodec codec = null; private int minSpillsForCombine; // private TaskAttemptID flushTaskId = null; //private TaskAttemptID spillSingleRecTaskId = null; //private TaskAttemptID mergeTaskId = null; private final static int APPROX_HEADER_LENGTH = 150; private static final Log LOG = LogFactory.getLog(DefaultJvmMemoryManager.class.getName()); private DefaultMapRamManager ramManager; private long spillTime = 0; private long mergeTime = 0; private long spillWaitTime = 0; private long poolWaitTime = 0; DefaultJvmMemoryManager(int maxSlots) throws IOException { numJvmSlots = maxSlots; } private void initialize() throws IOException { final float recper = conf.getFloat("io.sort.record.percent",(float)0.05); final int regSortkb = conf.getInt("io.sort.element.kb", 1024); long regSortSize = regSortkb << 10; int perQueToSpillUpLineMemSizemb = conf.getInt("io.spill.upline.mb", 24); final int bigSortmb = conf.getInt("io.sort.big.element.mb", perQueToSpillUpLineMemSizemb);
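// Pool sizing: allocate enough regular MemoryElements for every JVM slot to fill one spill queue (perQueToSpillUpLineMemSize worth, plus slack); the remaining heap (or mapred.child.buf.total.bytes if set), scaled by mapred.child.buf.percent, is handed to ChildRamManager and CachePool below.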
this.perQueToSpillUpLineMemSize =perQueToSpillUpLineMemSizemb << 20; long regEleNum = (perQueToSpillUpLineMemSize / (int)((regSortSize) * (1 - recper)) + 2) * this.numJvmSlots; LOG.info("regEleNum : " + regEleNum); perQueToSpillUpLineMENum = (int)regEleNum / this.numJvmSlots -1; MemoryElement.initMemElements(recper, regSortkb, bigSortmb); for (int i = 0; i < regEleNum; i++) { this.regMemElePool.add(new MemoryElement(false)); } LOG.info("total regular pool size: " + regEleNum * regSortSize); LOG.info("total jvm size: " + Runtime.getRuntime().maxMemory()); long maxBufSize = conf.getLong("mapred.child.buf.total.bytes", Runtime.getRuntime().maxMemory() - regEleNum * regSortSize); float maxBufUsePer = conf.getFloat("mapred.child.buf.percent", 0.7f); long maxTotal = (long)(maxBufSize * maxBufUsePer); ChildRamManager.init(maxTotal, conf); int cacheUnitCap = conf.getInt("mapred.cache.unit.kb", 2048) << 10; CachePool.init(maxTotal, cacheUnitCap); ramManager = new DefaultMapRamManager(conf, ChildRamManager.get()); /* for (int i = 0; i < bigEleNum; i++) { this.bigMemElePool.add(new MemoryElement(true)); }*/ spillThread.setDaemon(true); spillThread.setName("SpillThread"); spillThread.start(); SpillScheduler.get().start(); } public JobConf getConf() { return this.conf; } public void setConf(JobConf job) throws IOException { this.conf = job; keyClass = (Class)conf.getMapOutputKeyClass(); valClass = (Class)conf.getMapOutputValueClass(); // compression if (conf.getCompressMapOutput()) { Class<? extends CompressionCodec> codecClass = conf.getMapOutputCompressorClass(DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, conf); } minSpillsForCombine = conf.getInt("min.num.spills.for.combine", 3); localFs = FileSystem.getLocal(conf); partitions = conf.getNumReduceTasks(); rfs = ((LocalFileSystem)localFs).getRaw(); initialize(); } public MemoryElement getRegMemoryElement() { //LOG.info("getRegMemoryElement !!"); synchronized (regMemElePool) { while (regMemElePool.size() == 0) { try { long s = System.currentTimeMillis(); regMemElePool.wait(); poolWaitTime += System.currentTimeMillis()-s; } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } return regMemElePool.remove(0); } } public MemoryElement getBigMemoryElement(TaskAttemptID tid) throws InterruptedException { if(!ramManager.tryReserve(MemoryElement.getBigMESize())) { ramManager.reserve(tid, MemoryElement.getBigMESize()); return new MemoryElement(true); } return null; } private void startSpillQueue(TaskAttemptID tid) { if(this.activeMemQueues.get(tid) == null) { LOG.info("startSpillQueue : the queue to be spilled is null."); return; } MemoryElementQueue meq; meq = this.activeMemQueues.get(tid); meq.startSpill(); try { long s = System.currentTimeMillis(); mapSpillers.get(tid).sortAndSpill2Buffer(); LOG.info(tid + " sortAndSpill2BufferTime : " + (System.currentTimeMillis() - s)); } catch (Throwable t) { String logMsg = "Task " + tid + " failed : " + StringUtils.stringifyException(t); Child.getTask(tid).reportFatalError(tid, t, logMsg); } } public void returnMemElement(MemoryElement me, boolean isEmpty) { if(me==null) { return; } if(isEmpty) { synchronized (this.regMemElePool) { this.regMemElePool.add(me); this.regMemElePool.notify(); } return; } if (activeMemQueues.get(me.getTaskID()) == null) { activeMemQueues.put(me.getTaskID(), new MemoryElementQueue(me.getTaskID())); } activeMemQueues.get(me.getTaskID()).enQueue(me); if(me.isBig()) { ramManager.closeInMemorySpill(me.getBigMESize()); } } 
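// Recycling: a regular element goes back to the shared pool and wakes any thread blocked in getRegMemoryElement(); a big element just releases its reservation with the ram manager.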
public void recycleMemElement(MemoryElement me) { if (!me.isBig()) { synchronized (this.regMemElePool) { this.regMemElePool.add(me); this.regMemElePool.notify(); } } else { ramManager.unreserve(me.getTaskID(), 1, MemoryElement.getBigMESize()); } } public void registerSpiller(JobConf job, TaskAttemptID tid, TaskReporter reporter) throws ClassNotFoundException { //synchronized (this.mapSpillers) { this.mapSpillers.put(tid, new MapSpiller(job, tid, reporter)); synchronized (this.toSpillMemQInds) { this.toSpillMemQInds.add(tid); } this.ramManager.registerMap(tid); //} } public void spillSingleRecord(TaskAttemptID tid, final Object key, final Object value, int partition) throws IOException, InterruptedException { LOG.info("spillSingleRecord : " + tid); mapSpillers.get(tid).spillSingleRecord(key, value, partition); } public void flush(TaskAttemptID tid) throws InterruptedException, IOException, ClassNotFoundException { LOG.info(tid+" flush 000"); if (this.activeMemQueues.get(tid) != null) { startSpillQueue(tid); } LOG.info(tid + " flush 444"); if (spillThread.isBusy() && tid.equals(spillThread.getCurrentSpillId())) { LOG.info("flush : spillThread.waitForFree : " + tid); spillThread.waitForFree(); } mapSpillers.get(tid).setState(MapState.merge); LOG.info(tid + " flush 111"); long s = System.currentTimeMillis(); mapSpillers.get(tid).mergeParts(); if (mapSpillers.get(tid).isKilled) { return; } LOG.info(tid + " mergeParts time : " + (System.currentTimeMillis() - s)); LOG.info(tid + " flush 222"); synchronized (this.toSpillMemQInds) { spillThread.toSpillTaskRemoved(this.toSpillMemQInds.indexOf(tid)); this.toSpillMemQInds.remove(tid); } LOG.info(tid + "flush 333"); this.mapSpillers.remove(tid); this.ramManager.unregisterMap(tid); if (tid.toString().substring(32, 34).equals("33")) LOG.info("cm %%%%%%%%%%%%% : totalSpillTime : " + spillTime + " totalMergeTime : " + mergeTime + " totalSpillWaitTime : " + spillWaitTime + " totalPoolWaitTime : " + poolWaitTime); LOG.info("cm ************* flush executed" + tid); /* if (mapSpillers.size() == 0) { System.exit(0); } */ } public void kill(TaskAttemptID tid) { if (!mapSpillers.containsKey(tid)) { return; } mapSpillers.get(tid).kill(); ramManager.unregisterMap(tid); mapSpillers.remove(tid); MemoryElementQueue meq = activeMemQueues.remove(tid); if (meq != null) { if (meq.getQueueSize() > 0) { meq.close2Recycle(); } } synchronized (this.toSpillMemQInds) { spillThread.toSpillTaskRemoved(this.toSpillMemQInds.indexOf(tid)); this.toSpillMemQInds.remove(tid); } } }