JSnapshotBuffer.java example

Explorer
Quatrain-MapReduce-master
- src
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.buffer.impl;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileHandle;
import org.apache.hadoop.mapred.InputCollector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Merger;
import org.apache.hadoop.mapred.RawKeyValueIterator;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Task;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.Merger.Segment;
import org.apache.hadoop.mapred.buffer.OutputFile;
import org.apache.hadoop.util.Progress;

/**
 * This class manages the input to a task when in the form
 * of a snapshot. 
 *
 * @param <K> The key type
 * @param <V> The value type
 */
public class JSnapshotBuffer<K extends Object, V extends Object> 
       extends Buffer implements InputCollector<K, V> {
	
	private static final Log LOG = LogFactory.getLog(JSnapshotBuffer.class.getName());

	/**
	 * Contains the data and index files making up the snapshot. Each snapshot
	 * is tagged with a given progress. This class manages all snapshots taken
	 * from the output of a given task. For now, this class only keeps track
	 * of the most recent data and index snapshot files.
	 */
	public class Snapshot {
		/* Have we applied this snapshot to the running task? */
		public boolean fresh = false;

		/* The identifier of the task that produce this snapshot. */
		private TaskID taskid;

		/* Data and index files (living on the local fs). */
		private Path data  = null;

		/* How many bytes are in the data file */
		private long length = 0;

		/* Snapshot progress. */
		private float progress = 0f;

		/* Used for naming snapshot files. */
		private int runs = 0;

		public Snapshot(TaskID taskid) {
			this.taskid = taskid;
		}

		public String toString() {
			return "JBufferSnapshot " + taskid + ": progress = " + progress;
		}

		public Path data() {
			synchronized (this) {
				return this.data;
			}
		}
		
		public void discard() {
			synchronized (this) {
				if (this.data != null) {
					try {
						localFs.delete(data, true);
					} catch (IOException e) {
						e.printStackTrace();
						LOG.error("Snapshot discard error: " + e);
					} finally {
						data = null;
					}
				}
			}
		}

		/**
		 * Create a new snapshot.
		 * @throws IOException
		 */
		public void
		read(DataInputStream istream, OutputFile.SnapshotHeader header)
		throws IOException {
			if (this.progress < header.progress()) {
				long bytes = header.compressed();
				Path filename = 
					fileHandle.getInputSnapshotFileForWrite(task.getTaskID(), taskid, runs++, bytes);
				// Copy data to local-disk
				OutputStream output = null;
				output = localFs.create(filename);

				byte[] buf = new byte[64 * 1024];
				int n = istream.read(buf, 0, (int) Math.min(bytes, buf.length));
				while (n > 0) {
					bytes -= n;
					output.write(buf, 0, n);
					n = istream.read(buf, 0, (int) Math.min(bytes, buf.length));
				}
				output.close();
				
				synchronized (this) {
					if (this.progress < header.progress()) {
						this.data = filename;
						this.progress = header.progress();
						this.length = header.decompressed();
					}
				}
			}
		}
	}
	
	/* Number of files to merge at a time */
	private final int ioSortFactor;

	/* The local filesystem handle. */
	private final FileSystem localFs;

	/* Used to name new snapshot files. */
	private final FileHandle fileHandle;

	/* Keeps track of the most recent snapshot taken
	 * from each input task. */
	private Map<TaskID, Snapshot> snapshots;

	public JSnapshotBuffer(JobConf conf, Task task, Reporter reporter, Progress progress,
			Class<K> keyClass, Class<V> valClass, 
			Class<? extends CompressionCodec> codecClass) 
	throws IOException {
		super(conf, task, reporter, progress, keyClass, valClass, codecClass);
		this.fileHandle = new FileHandle(task.getJobID());;
		this.fileHandle.setConf(conf);

		this.snapshots = new HashMap<TaskID, Snapshot>();
		this.localFs = FileSystem.getLocal(conf);
		
		this.ioSortFactor = conf.getInt("io.sort.factor", 10);
	}
	
	@Override
	public void close() {
		
	}

	@Override
	public boolean read(DataInputStream istream, OutputFile.Header header) throws IOException {
		synchronized (this.snapshots) {
			TaskID taskid = header.owner().getTaskID();
			if (!this.snapshots.containsKey(taskid)) {
				this.snapshots.put(taskid, new Snapshot(taskid));
			}

			Snapshot snapshot = this.snapshots.get(taskid);
			if (snapshot.progress < header.progress()) {
				/* read the snapshot data */
				snapshot.read(istream, (OutputFile.SnapshotHeader) header);
				return true;
			}
			return false;
		}
	}

	@Override
	public void flush() throws IOException {
		// TODO Finish when in-memory buffers implemented
	}

	@Override
	public void free() {
		for (Snapshot snapshot : this.snapshots.values()) {
			snapshot.discard();
		}
	}

	@Override
	public ValuesIterator<K, V> valuesIterator() throws IOException {
		synchronized (this.snapshots) {
			RawKeyValueIterator kvIter = this.createKVIterator(conf, localFs, reporter);
			return kvIter != null ? 
					new ValuesIterator<K, V>(kvIter, comparator, keyClass, valClass, conf, reporter) :
					null;
		}
	}
	
	@SuppressWarnings("unchecked")
	private RawKeyValueIterator 
	createKVIterator(JobConf job, FileSystem fs, Reporter reporter) throws IOException {
		// segments required to vacate memory
		final Path tmpDir = new Path(task.getTaskID().toString());

		// segments on disk
		List<Segment<K,V>> diskSegments = new ArrayList<Segment<K,V>>();
		long onDiskBytes = 0L;
		for (Snapshot snapshot : this.snapshots.values()) {
			Path data = snapshot.data();
			if (data != null && fs.exists(data)) {
				onDiskBytes += fs.getFileStatus(data).getLen();
				diskSegments.add(new Segment<K, V>(job, fs, data, codec, false));
			}
			else if (data != null && !fs.exists(data)) {
				LOG.warn("Snapshot data is missing from file system! " + data);
			}
		}
		
		LOG.info("Merging " + diskSegments.size() + " files, " +
				onDiskBytes + " bytes from disk");
		Collections.sort(diskSegments, new Comparator<Segment<K,V>>() {
			public int compare(Segment<K, V> o1, Segment<K, V> o2) {
				if (o1.getLength() == o2.getLength()) {
					return 0;
				}
				return o1.getLength() < o2.getLength() ? -1 : 1;
			}
		});

		if (0 != onDiskBytes) {
			return Merger.merge(
					job, fs, keyClass, valClass, codec, diskSegments,
					ioSortFactor, diskSegments.size(), tmpDir, comparator,
					reporter, false, null, null);
		}
		return null;
	}

}