/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.archive;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import com.indeed.imhotep.archive.compression.SquallArchiveCompressor;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import com.indeed.util.compress.CompressionOutputStream;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.security.DigestOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author jsgroth
*/
/**
 * Writes files and directories into a "squall archive": one or more
 * {@code archiveN.bin} data files plus a tab-separated {@code metadata.txt}
 * index, all stored under a single directory on a Hadoop {@link FileSystem}.
 *
 * <p>Appended files are recorded in memory and only flushed to
 * {@code metadata.txt} when {@link #commit()} is called (the batch methods
 * commit automatically). This class is not thread-safe.
 *
 * @author jsgroth
 */
public class SquallArchiveWriter {
    private static final Joiner TAB = Joiner.on("\t");
    private static final Joiner SLASH = Joiner.on("/");

    @VisibleForTesting
    static final Pattern ARCHIVE_FILENAME_PATTERN = Pattern.compile("^archive(\\d+)\\.bin$");

    private final FileSystem fs;
    private final Path path;
    // metadata entries for files appended since the last commit()
    private final List<FileMetadata> pendingMetadataWrites;
    private final SquallArchiveCompressor defaultCompressor;
    // numeric suffix to use for the next archiveN.bin file
    private int archivePathCounter;

    /**
     * create an archive writer
     * be aware that if create is set to true, any existing archive at the given path will be destroyed
     * if there is currently no archive at the given path, create MUST be set to true or the first call to appendFile
     * will throw FileNotFoundException
     *
     * @param fs a file system implementation
     * @param path the directory to write this archive to
     * @param create whether to create from scratch or append to
     * @throws IOException if there is an IO problem
     */
    public SquallArchiveWriter(FileSystem fs, Path path, boolean create) throws IOException {
        this(fs, path, create, SquallArchiveCompressor.NONE);
    }

    /**
     * create an archive writer with an explicit default compressor
     *
     * @param fs a file system implementation
     * @param path the directory to write this archive to
     * @param create whether to create from scratch or append to
     * @param defaultCompressor compressor used by the append methods that don't take one explicitly
     * @throws IOException if there is an IO problem
     */
    public SquallArchiveWriter(FileSystem fs, Path path, boolean create, SquallArchiveCompressor defaultCompressor) throws IOException {
        this.fs = fs;
        this.path = path;
        pendingMetadataWrites = Lists.newArrayList();
        this.defaultCompressor = defaultCompressor;
        if (create) {
            archivePathCounter = 0;
            // truncate (or create) the metadata index, then remove any stale data files
            fs.create(new Path(path, "metadata.txt"), true).close();
            deleteExistingArchiveFiles(fs, path);
        } else {
            archivePathCounter = computeCurrentArchivePathCounter(fs, path);
        }
    }

    // removes every archiveN.bin file under path; used when creating an archive from scratch
    private static void deleteExistingArchiveFiles(FileSystem fs, Path path) throws IOException {
        for (final FileStatus status : fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return ARCHIVE_FILENAME_PATTERN.matcher(path.getName()).matches();
            }
        })) {
            fs.delete(status.getPath(), true);
        }
    }

    // scans existing archiveN.bin files and returns max(N) + 1, or 0 if none exist,
    // so appends to an existing archive never clobber a previous data file
    private static int computeCurrentArchivePathCounter(FileSystem fs, Path path) throws IOException {
        int max = -1;
        for (final FileStatus status : fs.listStatus(path)) {
            final String pathName = status.getPath().getName();
            final Matcher matcher = ARCHIVE_FILENAME_PATTERN.matcher(pathName);
            if (matcher.matches()) {
                final int number = Integer.parseInt(matcher.group(1));
                max = Math.max(max, number);
            }
        }
        return max + 1;
    }

    // allocates the next archive data file name; note the counter advances even if the
    // subsequent create fails, which at worst leaves a gap in the numbering
    private Path newArchivePath() {
        return new Path(path, "archive" + (archivePathCounter++) + ".bin");
    }

    /**
     * convenience method, calls appendFile or appendDirectory depending on whether or not file refers to a directory
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param file the file or directory to append
     * @throws IOException if there is an IO problem
     */
    public void append(File file) throws IOException {
        append(file, defaultCompressor);
    }

    /**
     * convenience method, calls appendFile or appendDirectory depending on whether or not file refers to a directory
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param file the file or directory to append
     * @param compressor the compression to apply to the appended data
     * @throws IOException if there is an IO problem
     */
    public void append(File file, SquallArchiveCompressor compressor) throws IOException {
        if (file.isDirectory()) {
            appendDirectory(file, compressor);
        } else {
            appendFile(file, compressor);
        }
    }

    /**
     * recursively appends a directory, stripping the root directory name from the filenames
     * this is safer than appendDirectory because it only makes one call to {@link org.apache.hadoop.fs.FileSystem#append(Path)}
     * there is no need to call {@link #commit()} after calling this method
     *
     * @param directory the directory to append
     * @throws IOException if there is an IO problem
     */
    public void batchAppendDirectory(File directory) throws IOException {
        batchAppendDirectory(directory, defaultCompressor);
    }

    /**
     * recursively appends a directory, stripping the root directory name from the filenames
     * there is no need to call {@link #commit()} after calling this method
     *
     * @param directory the directory to append
     * @param compressor the compression to apply to the appended data
     * @throws IOException if there is an IO problem
     */
    public void batchAppendDirectory(File directory, SquallArchiveCompressor compressor) throws IOException {
        if (!directory.isDirectory()) {
            throw new FileNotFoundException(directory.getAbsolutePath() + " is not a directory");
        }
        batchAppend(Arrays.asList(sorted(listFilesOrThrow(directory))), compressor);
    }

    /**
     * appends a set of files while only making a single call to {@link org.apache.hadoop.fs.FileSystem#append(Path)}
     * there is no need to call {@link #commit()} after calling this method
     *
     * @param files the files to append
     * @throws IOException if there is an IO problem
     */
    public void batchAppend(Iterable<File> files) throws IOException {
        batchAppend(files, defaultCompressor);
    }

    /**
     * appends a set of files while only making a single call to {@link org.apache.hadoop.fs.FileSystem#append(Path)}
     * there is no need to call {@link #commit()} after calling this method
     *
     * @param files the files to append
     * @param compressor the compression to apply to the appended data
     * @throws IOException if there is an IO problem
     */
    public void batchAppend(Iterable<File> files, SquallArchiveCompressor compressor) throws IOException {
        batchAppend(files, compressor, newArchivePath());
    }

    // writes all of the given files into a single archive data file, then commits the metadata
    private void batchAppend(Iterable<File> files, SquallArchiveCompressor compressor, Path archivePath) throws IOException {
        final FSDataOutputStream os = fs.create(archivePath, false);
        try {
            for (final File file : files) {
                if (file.isDirectory()) {
                    batchAppendDirectory(os, file, Lists.newArrayList(file.getName()), compressor, archivePath.getName());
                } else {
                    internalAppendFile(os, file, Collections.<String>emptyList(), compressor, archivePath.getName());
                }
            }
        } finally {
            os.close();
        }
        // commit only after the archive stream is flushed and closed so that metadata.txt
        // never references data that hasn't been durably written
        commit();
    }

    // recursive worker for batchAppend: walks the directory tree, accumulating the
    // relative path components in parentDirectories
    private void batchAppendDirectory(FSDataOutputStream os, File directory, List<String> parentDirectories, SquallArchiveCompressor compressor, String archiveFilename) throws IOException {
        for (final File file : sorted(listFilesOrThrow(directory))) {
            if (file.isDirectory()) {
                final List<String> newParentDirectories = Lists.newArrayList(parentDirectories);
                newParentDirectories.add(file.getName());
                batchAppendDirectory(os, file, newParentDirectories, compressor, archiveFilename);
            } else {
                internalAppendFile(os, file, parentDirectories, compressor, archiveFilename);
            }
        }
    }

    /**
     * recursively append a directory to the archive
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param directory the directory to append
     * @throws IOException if there is an IO problem
     */
    public void appendDirectory(File directory) throws IOException {
        appendDirectory(directory, defaultCompressor);
    }

    /**
     * recursively append a directory to the archive
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param directory the directory to append
     * @param compressor the compression to apply to the appended data
     * @throws IOException if there is an IO problem
     */
    public void appendDirectory(File directory, SquallArchiveCompressor compressor) throws IOException {
        appendDirectory(directory, Collections.<String>emptyList(), compressor);
    }

    // recursive worker for appendDirectory; unlike the batch variant this includes the
    // root directory's own name in the stored paths and creates one archive file per leaf
    private void appendDirectory(File directory, List<String> parentDirectories, SquallArchiveCompressor compressor) throws IOException {
        if (!directory.exists() || !directory.isDirectory()) {
            throw new FileNotFoundException(directory.getAbsolutePath() + " either does not exist or is not a directory");
        }
        final List<String> newParentDirectories = new ArrayList<String>(parentDirectories);
        // whitespace is not allowed in stored names because metadata.txt is tab-delimited
        newParentDirectories.add(directory.getName().replaceAll("\\s+", "_"));
        for (final File file : sorted(listFilesOrThrow(directory))) {
            if (file.isDirectory()) {
                appendDirectory(file, newParentDirectories, compressor);
            } else {
                appendFile(file, newParentDirectories, compressor);
            }
        }
    }

    /**
     * append a file from the local file system into the archive
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param file a file on the local file system
     * @throws IOException if the file does not exist or if there is an IO problem
     */
    public void appendFile(File file) throws IOException {
        appendFile(file, defaultCompressor);
    }

    /**
     * append a file from the local file system into the archive
     * does not modify metadata.txt until {@link #commit()} is called
     *
     * @param file a file on the local file system
     * @param compressor the compression to apply to the appended data
     * @throws IOException if the file does not exist or if there is an IO problem
     */
    public void appendFile(File file, SquallArchiveCompressor compressor) throws IOException {
        appendFile(file, Collections.<String>emptyList(), compressor);
    }

    // writes a single local file into a freshly-created archive data file
    private void appendFile(File file, List<String> parentDirectories, SquallArchiveCompressor compressor) throws IOException {
        if (!file.exists() || file.isDirectory()) {
            throw new FileNotFoundException(file.getAbsolutePath() + " either does not exist or is a directory");
        }
        final Path archivePath = newArchivePath();
        final FSDataOutputStream os = fs.create(archivePath, false);
        try {
            internalAppendFile(os, file, parentDirectories, compressor, archivePath.getName());
        } finally {
            os.close();
        }
    }

    // copies one local file into the open archive stream and queues its metadata entry;
    // the MD5 checksum is computed over the UNCOMPRESSED bytes (the digest stream sits
    // in front of the compressor)
    private void internalAppendFile(FSDataOutputStream os, File file, List<String> parentDirectories, SquallArchiveCompressor compressor, String archiveFilename) throws IOException {
        final String baseFilename = file.getName().replaceAll("\\s+", "_");
        final String filename = makeFilename(parentDirectories, baseFilename);
        final long size = file.length();
        final long timestamp = file.lastModified();
        final long startOffset = os.getPos();
        final InputStream is = new BufferedInputStream(new FileInputStream(file));
        final String checksum;
        try {
            final CompressionOutputStream cos = compressor.newOutputStream(os);
            final DigestOutputStream dos = new DigestOutputStream(cos, ArchiveUtils.getMD5Digest());
            ByteStreams.copy(is, dos);
            checksum = ArchiveUtils.toHex(dos.getMessageDigest().digest());
            // finish() flushes the compressor's trailer into os without closing os,
            // allowing further entries to be appended to the same archive file
            cos.finish();
        } finally {
            is.close();
        }
        pendingMetadataWrites.add(new FileMetadata(filename, size, timestamp, checksum, startOffset, compressor, archiveFilename));
    }

    /**
     * flushes pending metadata writes to metadata.txt
     *
     * the new index is written to a temporary file and swapped into place, so a crash
     * mid-write cannot corrupt the existing metadata.txt
     *
     * @throws IOException if there is an IO problem, including a failed write or rename
     *                     of the new metadata file
     */
    public void commit() throws IOException {
        if (pendingMetadataWrites.isEmpty()) {
            return;
        }
        final Path metadataPath = new Path(path, "metadata.txt");
        final Path tmpMetadataPath = new Path(path, "metadata." + UUID.randomUUID() + ".txt.tmp");
        final BufferedReader r = new BufferedReader(new InputStreamReader(fs.open(metadataPath), Charsets.UTF_8));
        final PrintWriter w;
        try {
            w = new PrintWriter(new OutputStreamWriter(fs.create(tmpMetadataPath, false), Charsets.UTF_8));
            try {
                // carry over the existing entries, then append the pending ones
                for (String line = r.readLine(); line != null; line = r.readLine()) {
                    w.println(line);
                }
                for (final FileMetadata file : pendingMetadataWrites) {
                    w.println(TAB.join(file.getFilename(), file.getSize(), file.getTimestamp(), file.getChecksum(), file.getStartOffset(), file.getCompressor().getKey(), file.getArchiveFilename()));
                }
            } finally {
                w.close();
            }
        } finally {
            r.close();
        }
        // PrintWriter swallows IOExceptions, so a failed write must be detected explicitly
        // before the old index is replaced
        if (w.checkError()) {
            throw new IOException("error writing temporary metadata file " + tmpMetadataPath);
        }
        fs.delete(metadataPath, false);
        // Hadoop's rename reports failure via its return value rather than by throwing
        if (!fs.rename(tmpMetadataPath, metadataPath)) {
            throw new IOException("failed to rename " + tmpMetadataPath + " to " + metadataPath);
        }
        pendingMetadataWrites.clear();
    }

    // joins the accumulated directory components and the base name with '/' to form
    // the path stored in metadata.txt
    private static String makeFilename(List<String> parentDirectories, String baseFilename) {
        final List<String> stringsToJoin = new ArrayList<String>(parentDirectories);
        stringsToJoin.add(baseFilename);
        return SLASH.join(stringsToJoin);
    }

    // File.listFiles() returns null on an I/O error or when the path is not a directory;
    // surface that as an IOException instead of an NPE inside Arrays.sort
    private static File[] listFilesOrThrow(File directory) throws IOException {
        final File[] files = directory.listFiles();
        if (files == null) {
            throw new IOException("unable to list files in " + directory.getAbsolutePath());
        }
        return files;
    }

    // sorts by file name so archive contents are written in a deterministic order
    private static File[] sorted(File[] files) {
        Arrays.sort(files, new Comparator<File>() {
            @Override
            public int compare(File o1, File o2) {
                return o1.getName().compareTo(o2.getName());
            }
        });
        return files;
    }
}