package eu.fbk.knowledgestore.filestore;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import javax.annotation.Nullable;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.fbk.knowledgestore.data.Data;
import eu.fbk.knowledgestore.data.Stream;
/**
* A {@code FileStore} implementation based on the Hadoop API.
* <p>
 * A {@code HadoopFileStore} stores its files in a Hadoop {@link FileSystem}, under a
 * configurable root path; the filesystem can be any of the filesystems supported by the Hadoop
 * API, including the local (raw) filesystem and the distributed HDFS filesystem.
* </p>
* <p>
 * Files are stored in a two-level directory structure, where first-level directories reflect
 * the MIME types of the stored files and second-level directories are buckets whose names are
 * obtained by hashing the filename; buckets serve to split a large number of files evenly
 * across several subdirectories, overcoming possible filesystem limitations on the maximum
 * number of files storable in a single directory.
* </p>
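 * <p>
 * For illustration, a file named {@code paper.pdf} whose hash starts with {@code a3} (the
 * actual bucket name depends on the hash computed by {@code Data.hash}) ends up at
 * {@code <root>/application_pdf/a3/paper.pdf}. A minimal usage sketch on the local
 * filesystem, with an illustrative root path and filename:
 * </p>
 *
 * <pre>
 * FileSystem fs = FileSystem.getLocal(new Configuration());
 * HadoopFileStore store = new HadoopFileStore(fs, "files");
 * store.init();
 * try (OutputStream out = store.write("paper.pdf")) {
 *     out.write(content); // content: the file bytes (byte[])
 * }
 * </pre>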
*/
public class HadoopFileStore implements FileStore {
private static final Logger LOGGER = LoggerFactory.getLogger(HadoopFileStore.class);
private static final String DEFAULT_PATH = "files";
private final FileSystem fileSystem;
private final Path rootPath;
/**
     * Creates a new {@code HadoopFileStore} that stores files in the {@code FileSystem} and
     * under the root {@code path} specified.
*
* @param fileSystem
* the file system, not null
* @param path
     *            the root path where files are stored, possibly relative to the filesystem
     *            working directory; if null, the default root path {@code files} will be used
*/
public HadoopFileStore(final FileSystem fileSystem, @Nullable final String path) {
this.fileSystem = Preconditions.checkNotNull(fileSystem);
this.rootPath = new Path(MoreObjects.firstNonNull(path, DEFAULT_PATH))
.makeQualified(this.fileSystem); // resolve wrt workdir
LOGGER.info("{} configured, path={}", getClass().getSimpleName(), this.rootPath);
}
@Override
public void init() throws IOException {
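        // create the root directory (and any missing parents) if it does not exist yet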
if (!this.fileSystem.exists(this.rootPath)) {
this.fileSystem.mkdirs(this.rootPath);
}
}
@Override
public InputStream read(final String fileName) throws FileMissingException, IOException {
final Path path = getFullPath(fileName);
try {
final InputStream stream = this.fileSystem.open(path);
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Reading file " + getRelativePath(path));
}
return stream;
} catch (final IOException ex) {
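            // translate the failure into a FileMissingException if the file turns out not to
            // exist; the existence check is done only on this failure path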
if (!this.fileSystem.exists(path)) {
throw new FileMissingException(fileName, "Cannot read non-existing file");
}
throw ex;
}
}
@Override
public OutputStream write(final String fileName) throws FileExistsException, IOException {
final Path path = getFullPath(fileName);
try {
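            // the second argument (overwrite = false) makes create() fail if the file exists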
final OutputStream stream = this.fileSystem.create(path, false);
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Creating file " + getRelativePath(path));
}
return stream;
} catch (final IOException ex) {
if (this.fileSystem.exists(path)) {
throw new FileExistsException(fileName, "Cannot overwrite file");
}
throw ex;
}
}
@Override
public void delete(final String fileName) throws FileMissingException, IOException {
final Path path = getFullPath(fileName);
boolean deleted = false;
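        // delete() returns false (or fails) also when the path does not exist; in that case
        // report the missing file through a FileMissingException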
try {
deleted = this.fileSystem.delete(path, false);
} finally {
if (!deleted && !this.fileSystem.exists(path)) {
                throw new FileMissingException(fileName, "Cannot delete non-existing file");
}
}
        if (!deleted) {
            // delete() returned false with the file still in place: report the failure
            // instead of logging a deletion that did not happen
            throw new IOException("Cannot delete file " + getRelativePath(path));
        }
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Deleted file " + getRelativePath(path));
        }
}
@Override
public Stream<String> list() throws IOException {
return Stream.create(new HadoopIterator());
}
@Override
public void close() {
// Nothing to do here. FileSystems are cached and closed by Hadoop at shutdown.
}
@Override
public String toString() {
return getClass().getSimpleName();
}
private Path getFullPath(final String fileName) {
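        // layout: <root>/<MIME type with '/' replaced by '_'>/<2-char hash prefix>/<filename>,
        // e.g. files/application_pdf/a3/paper.pdf (the 'a3' bucket is illustrative: it
        // depends on the hash computed by Data.hash)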
final String typeDirectory = MoreObjects.firstNonNull(Data.extensionToMimeType(fileName),
"application/octet-stream").replace('/', '_');
final String bucketDirectory = Data.hash(fileName).substring(0, 2);
return new Path(this.rootPath, typeDirectory + "/" + bucketDirectory + "/" + fileName);
}
private String getRelativePath(final Path path) {
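        // strip the root path prefix, keeping the leading '/' of the relative part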
return path.toString().substring(this.rootPath.toString().length());
}
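    /**
     * Iterator returning the names of all the files in the store, walking the two-level
     * directory structure lazily: for each type directory, its bucket directories are
     * listed; for each bucket directory, the names of the contained files are emitted. At
     * most one {@code listStatus} result per level is kept in memory at a time.
     */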
private class HadoopIterator extends AbstractIterator<String> {
private final FileStatus[] typeDirectories;
private FileStatus[] bucketDirectories;
private FileStatus[] files;
private int typeIndex;
private int bucketIndex;
private int fileIndex;
HadoopIterator() throws IOException {
this.typeDirectories = HadoopFileStore.this.fileSystem
.listStatus(HadoopFileStore.this.rootPath);
this.bucketDirectories = new FileStatus[] {};
this.files = new FileStatus[] {};
}
@Override
protected String computeNext() {
try {
while (true) {
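                    // emit the files of the current bucket first; when exhausted, move to
                    // the next bucket of the current type directory, then to the next type
                    // directory; end the iteration once all type directories are consumed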
if (this.fileIndex < this.files.length) {
final FileStatus file = this.files[this.fileIndex++];
if (!file.isDir()) {
return file.getPath().getName();
}
} else if (this.bucketIndex < this.bucketDirectories.length) {
final FileStatus bucketDirectory;
bucketDirectory = this.bucketDirectories[this.bucketIndex++];
if (bucketDirectory.isDir()) {
this.files = HadoopFileStore.this.fileSystem
.listStatus(bucketDirectory.getPath());
this.fileIndex = 0;
}
} else if (this.typeIndex < this.typeDirectories.length) {
final FileStatus typeDirectory;
typeDirectory = this.typeDirectories[this.typeIndex++];
if (typeDirectory.isDir()) {
this.bucketDirectories = HadoopFileStore.this.fileSystem
.listStatus(typeDirectory.getPath());
this.bucketIndex = 0;
}
} else {
return endOfData();
}
}
} catch (final Throwable ex) {
throw Throwables.propagate(ex);
}
}
}
}