/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.HarFileSystem;
import org.apache.hadoop.fs.HarFileSystem.HarFSDataInputStream;
import org.apache.hadoop.fs.HarProperties;
import org.apache.hadoop.fs.HarReader;
import org.apache.hadoop.fs.HarStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* An archive creation utility.
* This class provides methods that can be used
* to create Hadoop archives. For details on
* Hadoop archives, see {@link HarFileSystem}.
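*
* <p>A minimal invocation sketch (hypothetical paths), run through
* {@link ToolRunner} as in {@link #main(String[])}:
* <pre>{@code
* JobConf job = new JobConf(HadoopArchives.class);
* String[] args = {"-archiveName", "foo.har", "-p", "/user/hadoop",
*     "dir1", "/user/hadoop/out"};
* int ret = ToolRunner.run(new HadoopArchives(job), args);
* }</pre>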
*/
public class HadoopArchives implements Tool {
public static final int VERSION = 2;
private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
private static final String NAME = "har";
/** name of the file in HDFS that lists the files to be archived **/
private static final String SRC_LIST_LABEL = NAME + ".src.list";
/** total size of the files to archive **/
private static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
/** path to be used as a parent to archive files **/
private static final String SRC_PARENT_LABEL = NAME + ".parent.path";
/** the size of the blocks that will be created when archiving **/
private static final String HAR_BLOCKSIZE_LABEL = NAME + ".block.size";
private static final long HAR_BLOCKSIZE_DEFAULT = 512 * 1024 * 1024L;
/** the size of the part files that will be created when archiving **/
private static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size";
private static final long HAR_PARTSIZE_DEFAULT = 4 * 1024 * 1024 * 1024L;
private static final String PART_ID_OFFSET = NAME + ".partid.offset";
/** number of lines in each block of _index file
* see {@link HarFileSystem} for more details
*/
private static final long NUM_LINES_IN_BLOCK_INDEX = 100l;
private static final String PART_PREFIX = "part-";
private static final String USAGE =
"USAGE: java HadoopArchives [options]: \n" +
" archive -archiveName NAME -p <parent path> <src>* <dst>\n" +
" archive -append <archiveName> -p <parent path> <src>* <dstArchive>\n" +
" archive -appendFromArchive <srcArchive> <archivePaths>* <dstArchive>\n" +
" archive -copyFromLocal <srcDir> <dstArchive>\n" +
" archive -copyToLocal <srcArchive> <dstDir>\n";
private Path jobDirectory;
private Path srcFiles;
private JobConf conf;
public void setConf(Configuration conf) {
if (conf instanceof JobConf) {
this.conf = (JobConf) conf;
} else {
this.conf = new JobConf(conf, HadoopArchives.class);
}
}
public Configuration getConf() {
return this.conf;
}
/** map of possible run modes **/
private Map<String, Executor> executors;
public HadoopArchives(Configuration conf) {
setConf(conf);
executors = new HashMap<String, HadoopArchives.Executor>();
executors.put("-archiveName", new ArchiveExecutor(conf));
executors.put("-append", new AppendExecutor(conf));
executors.put("-appendFromArchive", new AppendFromArchiveExecutor(conf));
executors.put("-copyFromLocal", new CopyFromLocalExecutor(conf));
executors.put("-copyToLocal", new CopyToLocalExecutor(conf));
}
// check the src paths
private static void checkPaths(Configuration conf, List<Path> paths) throws
IOException {
for (Path p : paths) {
FileSystem fs = p.getFileSystem(conf);
if (!fs.exists(p)) {
throw new FileNotFoundException("Source " + p + " does not exist.");
}
}
}
/**
* Recursively lists a path, assuming every entry is either a file or a directory.
* @param fs the input filesystem
* @param p the top level path
* @param out the output list collecting the statuses of p and everything under it
* @throws IOException
*/
private void recursivels(FileSystem fs, Path p, List<FileStatus> out)
throws IOException {
FileStatus fstatus = fs.getFileStatus(p);
if (!fstatus.isDir()) {
out.add(fstatus);
return;
}
else {
out.add(fstatus);
FileStatus[] listStatus = fs.listStatus(p);
for (FileStatus stat: listStatus) {
recursivels(fs, stat.getPath(), out);
}
}
}
private static class HarEntry implements Writable {
String path;
String[] children;
HarProperties properties;
HarEntry() {}
public HarEntry(String path, String[] children, HarProperties properties) {
this.path = path;
this.children = children;
this.properties = properties;
}
private boolean isDir() {
return children != null;
}
@Override
public void write(DataOutput out) throws IOException {
Text.writeString(out, path);
final boolean dir = isDir();
out.writeBoolean(dir);
if (dir) {
out.writeInt(children.length);
for(String c : children) {
Text.writeString(out, c);
}
}
properties.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
path = Text.readString(in);
if (in.readBoolean()) {
children = new String[in.readInt()];
for(int i = 0; i < children.length; i++) {
children[i] = Text.readString(in);
}
} else {
children = null;
}
properties = new HarProperties();
properties.readFields(in);
}
public HarProperties getProperties() {
return properties;
}
}
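/**
* Checks that the archive name is a single path component ending in ".har",
* e.g. "foo.har" is valid while "dir/foo.har" and "foo.txt" are not.
*/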
private boolean checkValidName(String name) {
Path tmp = new Path(name);
if (tmp.depth() != 1) {
return false;
}
if (name.endsWith(".har"))
return true;
return false;
}
private Path largestDepth(List<Path> paths) {
Path deepest = paths.get(0);
for (Path p: paths) {
if (p.depth() > deepest.depth()) {
deepest = p;
}
}
return deepest;
}
/**
* truncate the prefix root from the full path
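* e.g. fullPath /home/user/dir1 with root /home yields /user/dir1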
* @param fullPath the full path
* @param root the prefix root to be truncated
* @return the relative path
*/
private Path relPathToRoot(Path fullPath, Path root) {
// walk the path components explicitly rather than
// relying on substring, so this does not break if
// Path semantics change later
Path justRoot = new Path(Path.SEPARATOR);
if (fullPath.depth() == root.depth()) {
return justRoot;
}
else if (fullPath.depth() > root.depth()) {
Path retPath = new Path(fullPath.getName());
Path parent = fullPath.getParent();
for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
retPath = new Path(parent.getName(), retPath);
parent = parent.getParent();
}
return new Path(justRoot, retPath);
}
return null;
}
/**
* this method writes all the valid top level directories into the srcWriter
* for indexing. This method is a little tricky. example- for an input with
* parent path /home/user/ and sources as /home/user/source/dir1,
* /home/user/source/dir2 - this will output {@code <source, dir, dir1, dir2>}
* (dir means that source is a dir with dir1 and dir2 as children),
* {@code <source/dir1, file, null>}, {@code <source/dir2, file, null>} and
* {@code </, dir, source>}
*
* @param srcWriter
* the sequence file writer to write the directories to
* @param paths
* the source paths provided by the user. They are glob free and have
* full path (not relative paths)
* @param parentPath
* the parent path that you want the archives to be relative to.
* example - /home/user/dir1 can be archived with parent as /home or
* /home/user.
* @throws IOException
*/
private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
List<Path> paths, Path parentPath) throws IOException {
/* find all the common parents of paths that are valid archive
* paths. This is done so that we do not add a common path
* twice, and so that we only add the valid children of a path
* that are specified by the user.
*/
TreeMap<Path, HashSet<String>> allpaths = new TreeMap<Path,
HashSet<String>>();
/* the largest depth among the paths; this bounds the number
* of times we need to iterate
*/
Path deepest = largestDepth(paths);
Path root = new Path(Path.SEPARATOR);
List<Path> justDirs = paths;
for (int i = parentPath.depth(); i < deepest.depth(); i++) {
List<Path> parents = new ArrayList<Path>();
for (Path p: justDirs) {
if (p.compareTo(root) == 0){
//do nothing
}
else {
Path parent = p.getParent();
if (allpaths.containsKey(parent)) {
HashSet<String> children = allpaths.get(parent);
children.add(p.getName());
}
else {
HashSet<String> children = new HashSet<String>();
children.add(p.getName());
allpaths.put(parent, children);
}
parents.add(parent);
}
}
justDirs = parents;
}
Set<Map.Entry<Path, HashSet<String>>> keyVals = allpaths.entrySet();
for (Map.Entry<Path, HashSet<String>> entry : keyVals) {
Path currentPath = entry.getKey();
Path relPath = relPathToRoot(currentPath, parentPath);
if (relPath != null) {
FileSystem fs = currentPath.getFileSystem(getConf());
HarProperties properties = new HarProperties(fs.getFileStatus(currentPath));
final String[] children = new String[entry.getValue().size()];
int i = 0;
for(String child: entry.getValue()) {
children[i++] = child;
}
HarEntry harEntry = new HarEntry(relPath.toString(), children, properties);
srcWriter.append(new LongWritable(0L), harEntry);
}
}
}
//delete the tmp job directory
private void cleanJobDirectory() {
try {
FileSystem jobfs = jobDirectory.getFileSystem(conf);
jobfs.delete(jobDirectory, true);
} catch(IOException ioe) {
LOG.warn("Unable to clean tmp directory " + jobDirectory, ioe);
}
}
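/**
* Reads the source archive's index and writes a SequenceFile of HarEntry
* records (the job input) for the entries under the requested relative
* paths, together with their parent directories.
* @return the total size in bytes of the selected entries
*/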
private long writeFromArchiveFilesToProcess(Path harSrc, List<Path> relativePaths) throws IOException {
Set<Path> allowedPaths = new HashSet<Path>(relativePaths);
Map<String, HashSet<String>> tree = new HashMap<String, HashSet<String>>();
HashSet<String> toTake = new HashSet<String>();
HarReader harReader = new HarReader(harSrc, conf);
List<HarStatus> allStatuses = new ArrayList<HarStatus>();
try {
while (harReader.hasNext()) {
allStatuses.add(harReader.getNext());
}
} finally {
if (harReader != null) {
harReader.close();
}
}
Path root = new Path(Path.SEPARATOR);
// decide which of the har files we need to process
// and create in-memory tree structure of the files
for (HarStatus harStatus : allStatuses) {
Path path = new Path(harStatus.getName());
Path currentPath = path;
// decide whether we need to process this har-entry
boolean allowed = false;
for (int i = 0; i <= path.depth(); ++i) {
if (allowedPaths.contains(currentPath)) {
allowed = true;
break;
}
currentPath = currentPath.getParent();
}
if (allowed) {
currentPath = path;
// update in-memory structure of har files
for (int i = 0; i <= path.depth(); ++i) {
toTake.add(currentPath.toString());
if (currentPath.equals(root)) {
break;
}
Path parent = currentPath.getParent();
String parentString = parent.toString();
HashSet<String> treeEntry = tree.get(parentString);
if (treeEntry == null) {
HashSet<String> value = new HashSet<String>();
value.add(currentPath.getName());
tree.put(parentString, value);
} else {
treeEntry.add(currentPath.getName());
}
currentPath = parent;
}
}
}
final String randomId = DistCp.getRandomId();
jobDirectory = new Path(new JobClient(conf).getSystemDir(), NAME + "_" + randomId);
//get a tmp directory for input splits
FileSystem jobfs = jobDirectory.getFileSystem(conf);
jobfs.mkdirs(jobDirectory);
srcFiles = new Path(jobDirectory, "_har_src_files");
SequenceFile.Writer srcWriter = SequenceFile.createWriter(jobfs, conf,
srcFiles, LongWritable.class, HarEntry.class,
SequenceFile.CompressionType.NONE);
long totalSize = 0;
try {
for (HarStatus harStatus : allStatuses) {
String pathString = harStatus.getName();
// skip items that we don't need
if (!toTake.contains(pathString)) {
continue;
}
HashSet<String> treeEntry = tree.get(pathString);
String[] children;
if (treeEntry == null) {
children = null;
} else {
children = treeEntry.toArray(new String[0]);
}
HarEntry harEntry = new HarEntry(harStatus.getName(), children, harStatus.getProperties());
srcWriter.append(new LongWritable(harStatus.getLength()), harEntry);
srcWriter.sync();
totalSize += harStatus.getLength();
}
} finally {
srcWriter.close();
}
return totalSize;
}
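/**
* Appends selected paths from an existing archive into another existing
* archive by running a merge map-reduce job over the source entries and
* the destination's current _index.
* @param harSrc the source archive
* @param relativePaths the paths inside the source archive to copy
* @param harDst the destination archive directory (must already exist)
*/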
private void appendFromArchive(Path harSrc, List<Path> relativePaths, Path harDst) throws IOException {
Path outputPath = harDst;
FileOutputFormat.setOutputPath(conf, outputPath);
FileSystem outFs = outputPath.getFileSystem(conf);
if (!outFs.exists(outputPath)) {
throw new IOException("Invalid Output. HAR File " + outputPath + "doesn't exist");
}
if (outFs.isFile(outputPath)) {
throw new IOException("Invalid Output. HAR File " + outputPath
+ "must be represented as directory");
}
long totalSize = writeFromArchiveFilesToProcess(harSrc, relativePaths);
//make it a har path
FileSystem fs1 = harSrc.getFileSystem(conf);
URI uri = fs1.getUri();
Path parentPath = new Path("har://" + "hdfs-" + uri.getHost() +":" +
uri.getPort() + fs1.makeQualified(harSrc).toUri().getPath());
FileSystem fs = parentPath.getFileSystem(conf);
conf.set(SRC_LIST_LABEL, srcFiles.toString());
conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
conf.setLong(TOTAL_SIZE_LABEL, totalSize);
long partSize = conf.getLong(HAR_PARTSIZE_LABEL, HAR_PARTSIZE_DEFAULT);
int numMaps = (int) (totalSize / partSize);
// run at least one map.
conf.setNumMapTasks(numMaps == 0 ? 1 : numMaps);
conf.setNumReduceTasks(1);
conf.setOutputFormat(NullOutputFormat.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(Text.class);
conf.set("hadoop.job.history.user.location", "none");
//make sure no speculative execution is done
conf.setSpeculativeExecution(false);
// set starting offset for mapper
int partId = findFirstAvailablePartId(outputPath);
conf.setInt(PART_ID_OFFSET, partId);
Path index = new Path(outputPath, HarFileSystem.INDEX_NAME);
Path indexDirectory = new Path(outputPath, HarFileSystem.INDEX_NAME + ".copy");
outFs.mkdirs(indexDirectory);
Path indexCopy = new Path(indexDirectory, "data");
outFs.rename(index, indexCopy);
MultipleInputs.addInputPath(conf, jobDirectory, HArchiveInputFormat.class,
HArchivesMapper.class);
MultipleInputs.addInputPath(conf, indexDirectory, TextInputFormat.class,
HArchivesConvertingMapper.class);
conf.setReducerClass(HArchivesMergingReducer.class);
JobClient.runJob(conf);
cleanJobDirectory();
}
/**
* Archive the given source paths into the destination archive directory.
*
* @param parentPath
* the parent path of all the source paths
* @param srcPaths
* the src paths to be archived
* @param outputPath
* the archive directory to create or append to
* @param append
* whether to append to an existing archive or create a new one
*
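* <p>Corresponds to the "-archiveName" (create) and "-append" commands, e.g.
* (hypothetical paths):
* <pre>
* archive -archiveName foo.har -p /user/hadoop dir1 dir2 /user/hadoop/out
* </pre>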
*/
private void archive(Path parentPath, List<Path> srcPaths, Path outputPath, boolean append)
throws IOException {
parentPath = parentPath.makeQualified(parentPath.getFileSystem(conf));
checkPaths(conf, srcPaths);
Path destinationDir = outputPath.getParent();
FileOutputFormat.setOutputPath(conf, outputPath);
FileSystem outFs = outputPath.getFileSystem(conf);
if (append) {
if (!outFs.exists(outputPath)) {
throw new IOException("Invalid Output. HAR File " + outputPath + "doesn't exist");
}
if (outFs.isFile(outputPath)) {
throw new IOException("Invalid Output. HAR File " + outputPath
+ "must be represented as directory");
}
} else {
if (outFs.exists(outputPath)) {
throw new IOException("Invalid Output: " + outputPath + ". File already exists");
}
if (outFs.isFile(destinationDir)) {
throw new IOException("Invalid Output. " + destinationDir + " is not a directory");
}
}
long totalSize = writeFilesToProcess(parentPath, srcPaths);
FileSystem fs = parentPath.getFileSystem(conf);
conf.set(SRC_LIST_LABEL, srcFiles.toString());
conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
conf.setLong(TOTAL_SIZE_LABEL, totalSize);
long partSize = conf.getLong(HAR_PARTSIZE_LABEL, HAR_PARTSIZE_DEFAULT);
int numMaps = (int) (totalSize / partSize);
// run at least one map.
conf.setNumMapTasks(numMaps == 0 ? 1 : numMaps);
conf.setNumReduceTasks(1);
conf.setOutputFormat(NullOutputFormat.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(Text.class);
conf.set("hadoop.job.history.user.location", "none");
//make sure no speculative execution is done
conf.setSpeculativeExecution(false);
if (append) {
// set starting offset for mapper
int partId = findFirstAvailablePartId(outputPath);
conf.setInt(PART_ID_OFFSET, partId);
Path index = new Path(outputPath, HarFileSystem.INDEX_NAME);
Path indexDirectory = new Path(outputPath, HarFileSystem.INDEX_NAME + ".copy");
outFs.mkdirs(indexDirectory);
Path indexCopy = new Path(indexDirectory, "data");
outFs.rename(index, indexCopy);
MultipleInputs.addInputPath(conf, jobDirectory, HArchiveInputFormat.class,
HArchivesMapper.class);
MultipleInputs.addInputPath(conf, indexDirectory, TextInputFormat.class,
HArchivesConvertingMapper.class);
conf.setReducerClass(HArchivesMergingReducer.class);
} else {
conf.setMapperClass(HArchivesMapper.class);
conf.setInputFormat(HArchiveInputFormat.class);
FileInputFormat.addInputPath(conf, jobDirectory);
conf.setReducerClass(HArchivesReducer.class);
}
JobClient.runJob(conf);
cleanJobDirectory();
}
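/**
* Writes the SequenceFile of HarEntry records that serves as the job input:
* first the top level directories, then a recursive listing of each source
* path, keyed by file length.
* @return the total size in bytes of the files to be archived
*/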
private long writeFilesToProcess(Path parentPath, List<Path> srcPaths) throws IOException {
final String randomId = DistCp.getRandomId();
jobDirectory = new Path(new JobClient(conf).getSystemDir(), NAME + "_" + randomId);
//get a tmp directory for input splits
FileSystem jobfs = jobDirectory.getFileSystem(conf);
jobfs.mkdirs(jobDirectory);
srcFiles = new Path(jobDirectory, "_har_src_files");
SequenceFile.Writer srcWriter = SequenceFile.createWriter(jobfs, conf,
srcFiles, LongWritable.class, HarEntry.class,
SequenceFile.CompressionType.NONE);
long totalSize = 0;
// get the list of files
// create single list of files and dirs
FileSystem fs = parentPath.getFileSystem(conf);
try {
// write the top level dirs in first
writeTopLevelDirs(srcWriter, srcPaths, parentPath);
srcWriter.sync();
// these are the input paths passed
// from the command line
// we do a recursive ls on these paths
// and then write them to the input file
// one at a time
for (Path src: srcPaths) {
ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>();
recursivels(fs, src, allFiles);
for (FileStatus stat: allFiles) {
long len = stat.isDir() ? 0 : stat.getLen();
String path = relPathToRoot(stat.getPath(), parentPath).toString();
String[] children = null;
if (stat.isDir()) {
//get the children
FileStatus[] list = fs.listStatus(stat.getPath());
children = new String[list.length];
for (int i = 0; i < list.length; i++) {
children[i] = list[i].getPath().getName();
}
}
HarEntry harEntry = new HarEntry(path, children, new HarProperties(stat));
srcWriter.append(new LongWritable(len), harEntry);
srcWriter.sync();
totalSize += len;
}
}
} finally {
srcWriter.close();
}
jobfs.setReplication(srcFiles, (short) 10);
return totalSize;
}
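/**
* Scans the existing archive directory for "part-N" files and returns the
* first part id larger than all existing ones, so that appended part files
* do not overwrite existing ones.
*/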
private int findFirstAvailablePartId(Path archivePath) throws IOException {
FileSystem fs = archivePath.getFileSystem(conf);
FileStatus[] fileStatuses = fs.listStatus(archivePath);
int result = 0;
for (FileStatus fileStatus : fileStatuses) {
String name = fileStatus.getPath().getName();
if (name.startsWith(PART_PREFIX)) {
int id = Integer.parseInt(name.substring(PART_PREFIX.length()));
result = Math.max(result, id + 1);
}
}
return result;
}
/**
* Input format of a hadoop archive job responsible for
* generating splits of the file list
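* so that each map copies roughly totalSize / numSplits bytes of file data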
*/
private static class HArchiveInputFormat implements InputFormat<LongWritable, HarEntry> {
//generate input splits from the src file lists
public InputSplit[] getSplits(JobConf jconf, int numSplits)
throws IOException {
String srcfilelist = jconf.get(SRC_LIST_LABEL, "");
if ("".equals(srcfilelist)) {
throw new IOException("Unable to get the " +
"src file for archive generation.");
}
long totalSize = jconf.getLong(TOTAL_SIZE_LABEL, -1);
if (totalSize == -1) {
throw new IOException("Invalid size of files to archive");
}
//we should be safe since this is set by our own code
Path src = new Path(srcfilelist);
FileSystem fs = src.getFileSystem(jconf);
FileStatus fstatus = fs.getFileStatus(src);
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
LongWritable key = new LongWritable();
HarEntry value = new HarEntry();
SequenceFile.Reader reader = null;
// the remaining bytes of the src file list
long remaining = fstatus.getLen();
// the total size of the files assigned to the current split so far
long currentCount = 0L;
// the end position of the split
long lastPos = 0L;
// the start position of the split
long startPos = 0L;
long targetSize = totalSize / numSplits;
// create splits of roughly targetSize so that all the maps
// have equal-sized data to read and write.
try {
reader = new SequenceFile.Reader(fs, src, jconf);
while(reader.next(key, value)) {
if (currentCount + key.get() > targetSize && currentCount != 0){
long size = lastPos - startPos;
splits.add(new FileSplit(src, startPos, size, (String[]) null));
remaining = remaining - size;
startPos = lastPos;
currentCount = 0L;
}
currentCount += key.get();
lastPos = reader.getPosition();
}
// add whatever remains as the final split.
if (remaining != 0) {
splits.add(new FileSplit(src, startPos, remaining, (String[])null));
}
}
finally {
if (reader != null) {
reader.close();
}
}
return splits.toArray(new FileSplit[splits.size()]);
}
public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
JobConf job, Reporter reporter) throws IOException {
return new SequenceFileRecordReader<LongWritable, HarEntry>(job,
(FileSplit)split);
}
}
/**
* A mapper that reads lines from the _index file and emits
* each line keyed by the hash of its path field
*/
private static class HArchivesConvertingMapper
implements Mapper<LongWritable, Text, IntWritable, Text> {
public void configure(JobConf conf) {
}
public void map(LongWritable key, Text value, OutputCollector<IntWritable, Text> out,
Reporter reporter) throws IOException {
reporter.setStatus("Passing file " + value + " to archive.");
reporter.progress();
HarStatus harStatus = new HarStatus(value.toString());
int hash = HarFileSystem.getHarHash(harStatus.getName());
out.collect(new IntWritable(hash), value);
}
public void close() throws IOException {
}
}
private static class HArchivesMapper
implements Mapper<LongWritable, HarEntry, IntWritable, Text> {
private JobConf conf = null;
private int partId = -1 ;
private Path tmpOutputDir = null;
Path tmpOutput = null;
String partName = null;
Path rootPath = null;
FSDataOutputStream partStream = null;
FileSystem destFs = null;
byte[] buffer;
final static int BUFFER_SIZE = 128 * 1024;
long blockSize;
// configure the mapper and create
// the part file.
// use map reduce framework to write into
// tmp files.
public void configure(JobConf conf) {
this.conf = conf;
int partIdOffset = conf.getInt(PART_ID_OFFSET, 0);
// this is tightly tied to map reduce
// since it does not expose an api
// to get the partition
partId = conf.getInt("mapred.task.partition", -1) + partIdOffset;
// create a file name using the partition
// we need to write to this directory
tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf);
blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, HAR_BLOCKSIZE_DEFAULT);
// get the output path and write to the tmp
// directory
partName = PART_PREFIX + partId;
tmpOutput = new Path(tmpOutputDir, partName);
String rootPathString = conf.get(SRC_PARENT_LABEL, null);
if (rootPathString == null) {
throw new RuntimeException("Unable to read parent " +
"path for har from config");
}
rootPath = new Path(rootPathString);
try {
destFs = tmpOutput.getFileSystem(conf);
// delete any stale copy left from a previous attempt
if (destFs.exists(tmpOutput)) {
destFs.delete(tmpOutput, false);
}
partStream = destFs.create(tmpOutput, false, conf.getInt("io.file.buffer.size", 4096),
destFs.getDefaultReplication(), blockSize);
} catch(IOException ie) {
throw new RuntimeException("Unable to open output file " + tmpOutput, ie);
}
buffer = new byte[BUFFER_SIZE];
}
// copy raw data.
public void copyData(Path input, FSDataInputStream fsin,
FSDataOutputStream fout, Reporter reporter) throws IOException {
try {
for (int cbread=0; (cbread = fsin.read(buffer))>= 0;) {
fout.write(buffer, 0,cbread);
reporter.progress();
}
} finally {
fsin.close();
}
}
/**
* Resolves a path that is relative to the archive root against the given parent.
* @param p the path relative to the archive root (starts with /)
* @param parent the parent path to resolve against
* @return the resolved path, or parent itself if p is the root
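* e.g. p "/dir1/file1" with parent "/home/user" yields "/home/user/dir1/file1"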
*/
private Path realPath(Path p, Path parent) {
Path rootPath = new Path(Path.SEPARATOR);
if (rootPath.compareTo(p) == 0) {
return parent;
}
return new Path(parent, new Path(p.toString().substring(1)));
}
// read files from the split input
// and write them onto the part files.
// also output hash(name) and string
// for reducer to create index
// and masterindex files.
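// For reference, the index line formats produced below (schematic):
//   "<encoded path> dir <props> 0 0 <encoded child>* "  for directories
//   "<encoded path> file <part name> <start> <length> <props> "  for files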
public void map(LongWritable key, HarEntry value,
OutputCollector<IntWritable, Text> out,
Reporter reporter) throws IOException {
Path relativePath = new Path(value.path);
int hash = HarFileSystem.getHarHash(relativePath.toString());
String towrite = null;
Path srcPath = realPath(relativePath, rootPath);
long startPos = partStream.getPos();
FileSystem srcFs = srcPath.getFileSystem(conf);
HarProperties properties = value.getProperties();
String propStr = properties.serialize();
if (value.isDir()) {
towrite = HarFileSystem.encode(relativePath.toString())
+ " dir " + propStr + " 0 0 ";
StringBuffer sbuff = new StringBuffer();
sbuff.append(towrite);
for (String child: value.children) {
sbuff.append(HarFileSystem.encode(child) + " ");
}
towrite = sbuff.toString();
//reading directories is also progress
reporter.progress();
}
else {
FSDataInputStream input = srcFs.open(srcPath);
reporter.setStatus("Copying file " + srcPath +
" to archive.");
copyData(srcPath, input, partStream, reporter);
long len = partStream.getPos() - startPos;
towrite = HarFileSystem.encode(relativePath.toString())
+ " file " + partName + " " + startPos
+ " " + len + " " + propStr + " ";
}
out.collect(new IntWritable(hash), new Text(towrite));
}
public void close() throws IOException {
// close the part files.
partStream.close();
}
}
/**
* Base reducer for creating the index and the master index
*/
private static class HArchivesReducer implements Reducer<IntWritable,
Text, Text, Text> {
private JobConf conf = null;
private long startIndex = 0;
private long endIndex = 0;
private long startPos = 0;
private Path masterIndex = null;
private Path index = null;
private FileSystem fs = null;
private FSDataOutputStream outStream = null;
private FSDataOutputStream indexStream = null;
private Path tmpOutputDir = null;
private int written = 0;
private int keyVal = 0;
/**
* Configure the reducer: open the _index and _masterindex files for writing
*/
public void configure(JobConf conf) {
this.conf = conf;
tmpOutputDir = FileOutputFormat.getWorkOutputPath(this.conf);
masterIndex = new Path(tmpOutputDir, HarFileSystem.MASTER_INDEX_NAME);
index = new Path(tmpOutputDir, HarFileSystem.INDEX_NAME);
try {
fs = masterIndex.getFileSystem(conf);
if (fs.exists(masterIndex)) {
fs.delete(masterIndex, false);
}
if (fs.exists(index)) {
fs.delete(index, false);
}
indexStream = fs.create(index);
outStream = fs.create(masterIndex);
String version = VERSION + " \n";
outStream.write(version.getBytes());
} catch(IOException e) {
throw new RuntimeException(e);
}
}
// create the index and master index. The input to
// the reduce is already sorted by the hash of the
// files, so we just need to write it to the index.
// We add a line to the master index every
// NUM_LINES_IN_BLOCK_INDEX index entries.
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<Text, Text> out,
Reporter reporter) throws IOException {
keyVal = key.get();
while(values.hasNext()) {
Text value = values.next();
String towrite = value.toString() + "\n";
indexStream.write(towrite.getBytes());
written++;
if (written > HadoopArchives.NUM_LINES_IN_BLOCK_INDEX - 1) {
// report status and add a master index entry every NUM_LINES_IN_BLOCK_INDEX entries
reporter.setStatus("Creating index for archives");
reporter.progress();
endIndex = keyVal;
writeLineToMasterIndex(outStream, startIndex, endIndex, startPos, indexStream.getPos());
startPos = indexStream.getPos();
startIndex = endIndex;
written = 0;
}
}
}
public void close() throws IOException {
//write the last part of the master index.
if (written > 0) {
writeLineToMasterIndex(outStream, startIndex, keyVal, startPos, indexStream.getPos());
}
// close the streams
outStream.close();
indexStream.close();
// try increasing the replication
fs.setReplication(index, (short) 5);
fs.setReplication(masterIndex, (short) 5);
}
}
/**
* Reducer that merges entries for _index file
*/
private static class HArchivesMergingReducer implements Reducer<IntWritable,
Text, Text, Text> {
private JobConf conf = null;
private long startIndex = 0;
private long endIndex = 0;
private long startPos = 0;
private Path masterIndex = null;
private Path index = null;
private FileSystem fs = null;
private FSDataOutputStream outStream = null;
private FSDataOutputStream indexStream = null;
private Path outputDir = null;
private int written = 0;
private int keyVal = 0;
/**
* Configure the reducer: open the _index and _masterindex files for writing
*/
public void configure(JobConf conf) {
this.conf = conf;
outputDir = FileOutputFormat.getWorkOutputPath(this.conf);
masterIndex = new Path(outputDir, HarFileSystem.MASTER_INDEX_NAME);
index = new Path(outputDir, HarFileSystem.INDEX_NAME);
try {
fs = masterIndex.getFileSystem(conf);
if (fs.exists(masterIndex)) {
fs.delete(masterIndex, false);
}
if (fs.exists(index)) {
fs.delete(index, false);
}
indexStream = fs.create(index);
outStream = fs.create(masterIndex);
String version = VERSION + " \n";
outStream.write(version.getBytes());
} catch(IOException e) {
throw new RuntimeException(e);
}
}
/**
* Write the data to index and master index. The input to the reduce is
* already sorted by the hash of the files.
*/
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<Text, Text> out,
Reporter reporter) throws IOException {
// merge the children of the same directories
Map<String, HarStatus> harItems = new HashMap<String, HarStatus>();
while(values.hasNext()) {
Text value = values.next();
HarStatus harStatus = new HarStatus(value.toString());
if (harItems.containsKey(harStatus.getName())) {
if (!harStatus.isDir()) {
throw new RuntimeException("File " + harStatus.getName() + " already exists in har");
}
HarStatus existingHarStatus = harItems.get(harStatus.getName());
existingHarStatus.getChildren().addAll(harStatus.getChildren());
} else {
harItems.put(harStatus.getName(), harStatus);
}
}
// write to _index file and update _masterindex
keyVal = key.get();
for (HarStatus harStatus: harItems.values()) {
String towrite = harStatus.serialize() + "\n";
indexStream.write(towrite.getBytes());
written++;
if (written > HadoopArchives.NUM_LINES_IN_BLOCK_INDEX - 1) {
// report status and add a master index entry every NUM_LINES_IN_BLOCK_INDEX entries
reporter.setStatus("Creating index for archives");
reporter.progress();
endIndex = keyVal;
writeLineToMasterIndex(outStream, startIndex, endIndex, startPos, indexStream.getPos());
startPos = indexStream.getPos();
startIndex = endIndex;
written = 0;
}
}
}
public void close() throws IOException {
//write the last part of the master index.
if (written > 0) {
writeLineToMasterIndex(outStream, startIndex, keyVal, startPos, indexStream.getPos());
}
// close the streams
outStream.close();
indexStream.close();
// try increasing the replication
fs.setReplication(index, (short) 5);
fs.setReplication(masterIndex, (short) 5);
}
}
/**
* Writes data corresponding to part of _index to master index
* @param stream stream where to write data
* @param startHash hash of first entry in block
* @param endHash hash of last entry in block
* @param indexStartPos position (in bytes) of the beginning of the block
* @param indexEndPos position (in bytes) of the end of the block
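* e.g. a written line looks like "0 21583 0 4532\n" (hypothetical values)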
* @throws IOException
*/
private static void writeLineToMasterIndex(FSDataOutputStream stream, long startHash,
long endHash, long indexStartPos, long indexEndPos) throws IOException {
String toWrite = startHash + " " + endHash + " " + indexStartPos + " " + indexEndPos + "\n";
stream.write(toWrite.getBytes());
}
/**
* Creates new stream to write actual file data
* @param dst parent of the part-id file
* @param partId id of the part
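* e.g. partId 3 creates the file {@code dst/part-3}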
* @return the open stream
* @throws IOException
*/
private FSDataOutputStream createNewPartStream(Path dst, int partId) throws IOException {
String partName = PART_PREFIX + partId;
Path output = new Path(dst, partName);
FileSystem destFs = output.getFileSystem(conf);
FSDataOutputStream partStream = destFs.create(output, false,
conf.getInt("io.file.buffer.size", 4096), destFs.getDefaultReplication(),
conf.getLong(HAR_BLOCKSIZE_LABEL, HAR_BLOCKSIZE_DEFAULT));
return partStream;
}
private static final class LocalAndArchivePaths {
private final Path localPath;
private final String archivePath;
public Path getLocalPath() {
return localPath;
}
public String getArchivePath() {
return archivePath;
}
public LocalAndArchivePaths(Path localPath, String archivePath) {
super();
this.localPath = localPath;
this.archivePath = archivePath;
}
}
/**
* Uploads a local directory as a har archive
* @param srcDir path to local directory to upload
* @param dst path to har archive
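* e.g. invoked via the "-copyFromLocal" command (hypothetical paths):
* <pre>archive -copyFromLocal /tmp/mydir /user/hadoop/out.har</pre>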
* @throws IOException
*/
private void copyFromLocal(Path srcDir, Path dst) throws IOException {
long partSize = conf.getLong(HAR_PARTSIZE_LABEL, HAR_PARTSIZE_DEFAULT);
FileSystem srcFS = FileSystem.getLocal(conf);
int partId = 0;
FSDataOutputStream partStream = null;
// index entries will be sorted by hash
TreeMap<Integer, String> indexEntries = new TreeMap<Integer, String>();
Queue<LocalAndArchivePaths> queue = new LinkedList<LocalAndArchivePaths>();
try {
queue.add(new LocalAndArchivePaths(srcDir, Path.SEPARATOR));
while (!queue.isEmpty()) {
LocalAndArchivePaths item = queue.remove();
Path localPath = item.getLocalPath();
String archiveItem = item.getArchivePath();
FileStatus currentPathFileStatus = srcFS.getFileStatus(localPath);
StringBuilder toWrite = new StringBuilder(
URLEncoder.encode(item.getArchivePath(), "UTF-8"));
String properties = new HarProperties(currentPathFileStatus).serialize();
if (currentPathFileStatus.isDir()) {
FileStatus[] children = srcFS.listStatus(localPath);
toWrite.append(" dir ");
toWrite.append(properties);
toWrite.append(" 0 0");
for (FileStatus child : children) {
Path childPath = child.getPath();
String nextArchiveItem = new Path(archiveItem, childPath.getName()).toString();
queue.add(new LocalAndArchivePaths(childPath, nextArchiveItem));
toWrite.append(" ");
toWrite.append(URLEncoder.encode(childPath.getName(), "UTF-8"));
}
toWrite.append("\n");
} else {
if (partStream == null) {
partStream = createNewPartStream(dst, partId);
}
toWrite.append(" file ");
toWrite.append("part-" + partId);
toWrite.append(" ");
toWrite.append(partStream.getPos() + " " + currentPathFileStatus.getLen());
toWrite.append(" " + properties);
toWrite.append("\n");
InputStream input = srcFS.open(localPath);
IOUtils.copyBytes(input, partStream, conf, false);
// proceed to next part
if (partStream.getPos() >= partSize) {
++partId;
partStream.close();
partStream = null;
}
}
int hash = HarFileSystem.getHarHash(archiveItem);
indexEntries.put(hash, toWrite.toString());
}
} finally {
if (partStream != null) {
partStream.close();
}
}
// Now create master index
// IndexEntries are already sorted by hash
Path index = new Path(dst, HarFileSystem.INDEX_NAME);
Path masterIndex = new Path(dst, HarFileSystem.MASTER_INDEX_NAME);
FSDataOutputStream indexStream = null;
FSDataOutputStream masterIndexStream = null;
try {
FileSystem dstFS = index.getFileSystem(conf);
indexStream = dstFS.create(index);
masterIndexStream = dstFS.create(masterIndex);
String version = VERSION + "\n";
masterIndexStream.write(version.getBytes());
int startHash = 0;
int endHash = 0;
long indexStartPos = 0;
long indexEndPos = 0;
long numLines = 0;
for (Map.Entry<Integer, String> indexEntry : indexEntries.entrySet()) {
if (numLines == 0) {
startHash = indexEntry.getKey();
indexStartPos = indexStream.getPos();
}
endHash = indexEntry.getKey();
indexStream.write(indexEntry.getValue().getBytes());
++numLines;
if (numLines >= HadoopArchives.NUM_LINES_IN_BLOCK_INDEX) {
numLines = 0;
indexEndPos = indexStream.getPos();
writeLineToMasterIndex(masterIndexStream, startHash, endHash, indexStartPos, indexEndPos);
}
}
if (numLines > 0) {
numLines = 0;
indexEndPos = indexStream.getPos();
writeLineToMasterIndex(masterIndexStream, startHash, endHash, indexStartPos, indexEndPos);
}
} finally {
if (indexStream != null) {
indexStream.close();
}
if (masterIndexStream != null) {
masterIndexStream.close();
}
}
}
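/**
* Extracts the contents of a har archive into an existing local directory,
* recreating the directory layout and copying file data out of the part files.
* @param archivePath path to the har archive
* @param local existing local destination directory
* @throws IOException
*/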
private void copyToLocal(Path archivePath, Path local) throws IOException {
HarReader harReader = new HarReader(archivePath, conf);
FileSystem localFS = FileSystem.getLocal(conf);
FileSystem fs = archivePath.getFileSystem(conf);
if (!localFS.getFileStatus(local).isDir()) {
throw new IOException("Path " + local + " is not a directory");
}
try {
while (harReader.hasNext()) {
HarStatus harStatus = harReader.getNext();
String harPath = harStatus.getName();
// skip top level dir
if (harPath.equals(Path.SEPARATOR)) {
continue;
}
String relativePath = harPath.substring(1);
Path output = new Path(local, relativePath);
if (harStatus.isDir()) {
localFS.mkdirs(output);
} else {
OutputStream outputStream = null;
FSDataInputStream inputStream = null;
try {
outputStream = localFS.create(output);
Path partFile = new Path(archivePath, harStatus.getPartName());
inputStream = new HarFSDataInputStream(fs, partFile,
harStatus.getStartIndex(), harStatus.getLength(), conf.getInt("io.file.buffer.size", 4096));
IOUtils.copyBytes(inputStream, outputStream, conf);
} finally {
if (outputStream != null) {
outputStream.close();
}
}
}
}
} finally {
if (harReader != null) {
harReader.close();
}
}
}
/**
* General interface to parse command line arguments
* and then execute needed actions
*/
private interface Executor {
public void parse(String[] args) throws Exception;
public void run() throws Exception;
}
private class CopyFromLocalExecutor implements Executor {
private Path sourceDir;
private Path harDestination;
Configuration conf;
public CopyFromLocalExecutor(Configuration conf) {
this.conf = conf;
}
@Override
public void parse(String[] args) throws Exception {
if (args.length != 2) {
throw new ParseException("Not enough arguments to parse: expected 2, found "
+ args.length);
}
sourceDir = new Path(args[0]);
harDestination = new Path(args[1]);
}
@Override
public void run() throws Exception {
copyFromLocal(sourceDir, harDestination);
}
}
private class CopyToLocalExecutor implements Executor {
private Path harArchive;
private Path localFolder;
Configuration conf;
public CopyToLocalExecutor(Configuration conf) {
this.conf = conf;
}
@Override
public void parse(String[] args) throws Exception {
if (args.length != 2) {
throw new ParseException("Not enough arguments to parse: expected 2, found "
+ args.length);
}
harArchive = new Path(args[0]);
localFolder = new Path(args[1]);
}
@Override
public void run() throws Exception {
copyToLocal(harArchive, localFolder);
}
}
private class AppendFromArchiveExecutor implements Executor {
private Path harSource;
private List<Path> pathsInHar;
private Path harDestination;
private Configuration conf;
public AppendFromArchiveExecutor(Configuration conf) {
this.conf = conf;
}
public void parse(String[] args) throws Exception {
if (args.length < 3) {
throw new ParseException("Not enough arguments to parse: expected >= 3, found " + args.length);
}
harSource = new Path(args[0]);
harDestination = new Path(args[args.length - 1]);
pathsInHar = new ArrayList<Path>();
for (int i = 1; i < args.length - 1; ++i) {
pathsInHar.add(new Path(args[i]));
}
}
public void run() throws Exception {
appendFromArchive(harSource, pathsInHar, harDestination);
}
}
private abstract class ArchiveExecutorBase implements Executor {
protected Path parentPath;
protected List<Path> pathsToArchive;
protected Path harDestination;
protected Configuration conf;
public ArchiveExecutorBase(Configuration conf) {
this.conf = conf;
}
@Override
public void parse(String[] args) throws Exception {
Options options = new Options();
Option parentOption = OptionBuilder.isRequired().hasArg().create("p");
options.addOption(parentOption);
CommandLineParser parser = new PosixParser();
CommandLine cmd = parser.parse(options, args);
parentPath = new Path(cmd.getOptionValue("p"));
parsePositionalOptions(cmd.getArgs());
}
private void parsePositionalOptions(String[] args) throws Exception {
if (args.length < 2) {
throw new ParseException("Not enough arguments to parse: expected >= 2, found " + args.length);
}
String archiveName = args[0];
if (!checkValidName(archiveName)) {
throw new ParseException("Invalid archive name: " + archiveName);
}
harDestination = new Path(args[args.length - 1], archiveName);
pathsToArchive = new ArrayList<Path>();
if (args.length == 2) {
// if the user does not specify any source paths,
// the whole parent directory is archived.
pathsToArchive.add(parentPath);
return;
}
// process other paths
for (int i = 1; i < args.length - 1; ++i) {
Path argPath = new Path(args[i]);
if (argPath.isAbsolute()) {
throw new ParseException("source path " + argPath +
" is not relative to "+ parentPath);
}
Path srcPath = new Path(parentPath, argPath);
FileSystem fs = srcPath.getFileSystem(conf);
FileStatus[] statuses = fs.globStatus(srcPath);
for (FileStatus status : statuses) {
pathsToArchive.add(fs.makeQualified(status.getPath()));
}
}
}
}
private class ArchiveExecutor extends ArchiveExecutorBase {
public ArchiveExecutor(Configuration conf) {
super(conf);
}
@Override
public void run() throws Exception {
archive(parentPath, pathsToArchive, harDestination, false);
}
}
private class AppendExecutor extends ArchiveExecutorBase {
public AppendExecutor(Configuration conf) {
super(conf);
}
@Override
public void run() throws Exception {
archive(parentPath, pathsToArchive, harDestination, true);
}
}
private void doRun(String[] args) throws Exception {
if (args.length < 1) {
System.out.println(USAGE);
throw new ParseException("Invalid usage: command was not specified");
}
String command = args[0];
Executor executor = executors.get(command);
if (executor == null) {
System.err.println("Unknown command: " + command + ". Available commands:");
for (String cmd : executors.keySet()) {
System.err.println(cmd);
}
throw new ParseException("Unknown command: " + command);
}
String[] otherArgs = new String[args.length - 1];
for (int i = 1; i < args.length; ++i) {
otherArgs[i-1] = args[i];
}
try {
executor.parse(otherArgs);
} catch (Exception e) {
throw new Exception("Error, while parsing args.", e);
}
executor.run();
}
@Override
public int run(String[] args) throws Exception {
try {
doRun(args);
} catch(Exception e) {
LOG.debug("Exception in archives ", e);
e.printStackTrace();
System.err.println("Exception in archives");
System.err.println(e.getLocalizedMessage());
return 1;
}
return 0;
}
public static void main(String[] args) throws Exception {
JobConf job = new JobConf(HadoopArchives.class);
HadoopArchives harchives = new HadoopArchives(job);
int ret = ToolRunner.run(harchives, args);
System.exit(ret);
}
}