/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.blur.kvs;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RemoteException;
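/**
 * A {@link Store} backed by an append-only log of numbered segment files in
 * HDFS. Each put or delete is recorded as an {@link Operation} in the current
 * segment, while the full key/value set is held in an in-memory sorted map
 * that is rebuilt by replaying the segments on startup. A writer owns the
 * store only while no other process has created the next segment file (see
 * {@link #isOwner()}).
 * <p>
 * A minimal usage sketch. The path and values are hypothetical, and it
 * assumes {@code BytesRef} offers a UTF-8 {@code String} constructor as
 * Lucene's class of the same name does:
 *
 * <pre>
 * Timer timer = new Timer("HdfsKeyValueStoreTimer", true);
 * Configuration configuration = new Configuration();
 * HdfsKeyValueStore store = new HdfsKeyValueStore(false, timer, configuration, new Path("/blur/kv"));
 * store.put(new BytesRef("key"), new BytesRef("value"));
 * store.sync();
 * BytesRef value = new BytesRef();
 * if (store.get(new BytesRef("key"), value)) {
 *   // use value
 * }
 * store.close();
 * </pre>
 */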
public class HdfsKeyValueStore implements Store {
public static final int DEFAULT_MAX_AMOUNT_ALLOWED_PER_FILE = 64 * 1024 * 1024;
public static final long DEFAULT_MAX_OPEN_FOR_WRITING = TimeUnit.MINUTES.toMillis(1);
private static final String UTF_8 = "UTF-8";
private static final String BLUR_KEY_VALUE = "blur_key_value";
private static final Log LOG = LogFactory.getLog(HdfsKeyValueStore.class);
private static final byte[] MAGIC;
private static final int VERSION = 1;
private static final long DAEMON_POLL_TIME = TimeUnit.SECONDS.toMillis(5);
private static final int VERSION_LENGTH = 4;
static {
try {
MAGIC = BLUR_KEY_VALUE.getBytes(UTF_8);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
static enum OperationType {
PUT, DELETE
}
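  /**
   * A single log record. On disk each record is a one-byte opcode (0 =
   * DELETE, 1 = PUT) followed by the key and, for puts, the value, each
   * serialized as a length-prefixed {@link BytesWritable}.
   */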
static class Operation implements Writable {
OperationType type;
BytesWritable key = new BytesWritable();
BytesWritable value = new BytesWritable();
@Override
public void write(DataOutput out) throws IOException {
if (type == OperationType.DELETE) {
out.write(0);
key.write(out);
} else if (type == OperationType.PUT) {
out.write(1);
key.write(out);
value.write(out);
} else {
throw new RuntimeException("Not supported [" + type + "]");
}
}
@Override
public void readFields(DataInput in) throws IOException {
byte b = in.readByte();
switch (b) {
case 0:
type = OperationType.DELETE;
key.readFields(in);
return;
case 1:
type = OperationType.PUT;
key.readFields(in);
value.readFields(in);
return;
default:
throw new RuntimeException("Not supported [" + b + "]");
}
}
}
static class Value {
Value(BytesRef bytesRef, Path path) {
_bytesRef = bytesRef;
_path = path;
}
BytesRef _bytesRef;
Path _path;
}
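  // In-memory index: a sorted map from key to the latest value together with
  // the segment file containing the record that produced it.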
private final ConcurrentNavigableMap<BytesRef, Value> _pointers = new ConcurrentSkipListMap<BytesRef, Value>();
private final Path _path;
private final ReentrantReadWriteLock _readWriteLock;
private final AtomicReference<SortedSet<FileStatus>> _fileStatus = new AtomicReference<SortedSet<FileStatus>>();
private final FileSystem _fileSystem;
private final AtomicLong _currentFileCounter = new AtomicLong();
private final WriteLock _writeLock;
private final ReadLock _readLock;
private final AtomicLong _size = new AtomicLong();
private final long _maxAmountAllowedPerFile;
private final TimerTask _idleLogTimerTask;
private final TimerTask _oldFileCleanerTimerTask;
private final AtomicLong _lastWrite = new AtomicLong();
private final Timer _hdfsKeyValueTimer;
private final long _maxTimeOpenForWriting;
private final boolean _readOnly;
private FSDataOutputStream _output;
private Path _outputPath;
private boolean _isClosed;
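  /**
   * Creates a store rooted at {@code path} using the default maximum segment
   * size (64 MB) and the default idle-writer timeout (one minute).
   */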
public HdfsKeyValueStore(boolean readOnly, Timer hdfsKeyValueTimer, Configuration configuration, Path path)
throws IOException {
this(readOnly, hdfsKeyValueTimer, configuration, path, DEFAULT_MAX_AMOUNT_ALLOWED_PER_FILE,
DEFAULT_MAX_OPEN_FOR_WRITING);
}
public HdfsKeyValueStore(boolean readOnly, Timer hdfsKeyValueTimer, Configuration configuration, Path path,
long maxAmountAllowedPerFile) throws IOException {
this(readOnly, hdfsKeyValueTimer, configuration, path, maxAmountAllowedPerFile, DEFAULT_MAX_OPEN_FOR_WRITING);
}
public HdfsKeyValueStore(boolean readOnly, Timer hdfsKeyValueTimer, Configuration configuration, Path path,
long maxAmountAllowedPerFile, long maxTimeOpenForWriting) throws IOException {
_readOnly = readOnly;
_maxTimeOpenForWriting = maxTimeOpenForWriting;
_maxAmountAllowedPerFile = maxAmountAllowedPerFile;
_path = path;
_fileSystem = _path.getFileSystem(configuration);
_fileSystem.mkdirs(_path);
_readWriteLock = new ReentrantReadWriteLock();
_writeLock = _readWriteLock.writeLock();
_readLock = _readWriteLock.readLock();
_fileStatus.set(getSortedSet(_path));
if (!_fileStatus.get().isEmpty()) {
_currentFileCounter.set(Long.parseLong(_fileStatus.get().last().getPath().getName()));
}
removeAnyTruncatedFiles();
loadIndexes();
cleanupOldFiles();
if (!_readOnly) {
_idleLogTimerTask = getIdleLogTimer();
_oldFileCleanerTimerTask = getOldFileCleanerTimer();
_hdfsKeyValueTimer = hdfsKeyValueTimer;
_hdfsKeyValueTimer.schedule(_idleLogTimerTask, DAEMON_POLL_TIME, DAEMON_POLL_TIME);
_hdfsKeyValueTimer.schedule(_oldFileCleanerTimerTask, DAEMON_POLL_TIME, DAEMON_POLL_TIME);
} else {
_idleLogTimerTask = null;
_oldFileCleanerTimerTask = null;
_hdfsKeyValueTimer = null;
}
}
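  /**
   * Deletes any segment file shorter than the MAGIC header plus version,
   * which a writer that died before finishing the header can leave behind.
   */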
  private void removeAnyTruncatedFiles() throws IOException {
    boolean removedFiles = false;
    for (FileStatus fileStatus : _fileStatus.get()) {
      Path path = fileStatus.getPath();
      FSDataInputStream inputStream = _fileSystem.open(path);
      long len;
      try {
        len = HdfsUtils.getFileLength(_fileSystem, path, inputStream);
      } finally {
        inputStream.close();
      }
      if (len < MAGIC.length + VERSION_LENGTH) {
        // Too short to contain the MAGIC header and version; remove it.
        LOG.warn("Removing file [{0}] because length of [{1}] is less than MAGIC plus version length of [{2}]", path,
            len, MAGIC.length + VERSION_LENGTH);
        _fileSystem.delete(path, false);
        removedFiles = true;
      }
    }
    if (removedFiles) {
      // Refresh the cached listing so loadIndexes() does not try to replay
      // the files that were just deleted.
      _fileStatus.set(getSortedSet(_path));
    }
  }
private TimerTask getOldFileCleanerTimer() {
return new TimerTask() {
@Override
public void run() {
try {
cleanupOldFiles();
} catch (Throwable e) {
LOG.error("Unknown error while trying to clean up old files.", e);
}
}
};
}
private TimerTask getIdleLogTimer() {
return new TimerTask() {
@Override
public void run() {
try {
closeLogFileIfIdle();
} catch (Throwable e) {
LOG.error("Unknown error while trying to close output file.", e);
}
}
};
}
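  /**
   * Flushes buffered operations and syncs the current segment file to HDFS.
   * Fails if another process has taken ownership of the store by starting
   * the next segment.
   */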
@Override
public void sync() throws IOException {
ensureOpen();
    _writeLock.lock();
    try {
      ensureOpenForWriting();
syncInternal();
} catch (RemoteException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} catch (LeaseExpiredException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} finally {
_writeLock.unlock();
}
}
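  /**
   * Returns entries at or after {@code key} (from the first key when
   * {@code key} is null), backed by a point-in-time snapshot of the index.
   */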
@Override
public Iterable<Entry<BytesRef, BytesRef>> scan(BytesRef key) throws IOException {
ensureOpen();
NavigableMap<BytesRef, Value> pointers = createSnapshot();
return getIterable(key, pointers);
}
  private Iterable<Entry<BytesRef, BytesRef>> getIterable(BytesRef key, NavigableMap<BytesRef, Value> pointers) {
    if (key == null) {
      if (pointers.isEmpty()) {
        // An empty snapshot has no first key; return an empty scan rather
        // than letting firstKey() throw NoSuchElementException.
        return Collections.<Entry<BytesRef, BytesRef>> emptyList();
      }
      key = pointers.firstKey();
    }
    NavigableMap<BytesRef, Value> tailMap = pointers.tailMap(key, true);
    return getIterable(tailMap);
  }
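  // The copy is made under the write lock so it captures a point-in-time
  // view of the index with no mutation in flight.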
private NavigableMap<BytesRef, Value> createSnapshot() {
_writeLock.lock();
try {
return new ConcurrentSkipListMap<BytesRef, Value>(_pointers);
} finally {
_writeLock.unlock();
}
}
private Iterable<Entry<BytesRef, BytesRef>> getIterable(NavigableMap<BytesRef, Value> map) {
final Set<Entry<BytesRef, Value>> entrySet = map.entrySet();
return new Iterable<Entry<BytesRef, BytesRef>>() {
@Override
public Iterator<Entry<BytesRef, BytesRef>> iterator() {
final Iterator<Entry<BytesRef, Value>> iterator = entrySet.iterator();
return new Iterator<Entry<BytesRef, BytesRef>>() {
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public Entry<BytesRef, BytesRef> next() {
final Entry<BytesRef, Value> e = iterator.next();
return new Entry<BytesRef, BytesRef>() {
@Override
public BytesRef setValue(BytesRef value) {
              throw new UnsupportedOperationException("Read only.");
}
@Override
public BytesRef getValue() {
return e.getValue()._bytesRef;
}
@Override
public BytesRef getKey() {
return e.getKey();
}
};
}
@Override
public void remove() {
          throw new UnsupportedOperationException("Read only.");
}
};
}
};
}
@Override
public void put(BytesRef key, BytesRef value) throws IOException {
ensureOpen();
if (value == null) {
delete(key);
return;
}
    _writeLock.lock();
    try {
      ensureOpenForWriting();
Operation op = getPutOperation(OperationType.PUT, key, value);
Path path = write(op);
BytesRef deepCopyOf = BytesRef.deepCopyOf(value);
_size.addAndGet(deepCopyOf.bytes.length);
Value old = _pointers.put(BytesRef.deepCopyOf(key), new Value(deepCopyOf, path));
if (old != null) {
_size.addAndGet(-old._bytesRef.bytes.length);
}
} catch (RemoteException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} catch (LeaseExpiredException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} finally {
_writeLock.unlock();
}
}
private void ensureOpenForWriting() throws IOException {
if (_output == null) {
openWriter();
}
}
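  /**
   * Appends the operation to the current segment and returns the path the
   * record was written to, captured before any roll to a new segment.
   */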
private Path write(Operation op) throws IOException {
op.write(_output);
Path p = _outputPath;
if (_output.getPos() >= _maxAmountAllowedPerFile) {
rollFile();
}
return p;
}
private void rollFile() throws IOException {
LOG.info("Rolling file [" + _outputPath + "]");
_output.close();
_output = null;
openWriter();
}
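  /**
   * Deletes segment files that no in-memory pointer references any longer.
   * Runs only while this store is open for writing, and verifies that the
   * newest segment is still ours before deleting anything.
   */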
public void cleanupOldFiles() throws IOException {
_writeLock.lock();
try {
if (!isOpenForWriting()) {
return;
}
SortedSet<FileStatus> fileStatusSet = getSortedSet(_path);
if (fileStatusSet == null || fileStatusSet.size() < 1) {
return;
}
Path newestGen = fileStatusSet.last().getPath();
if (!newestGen.equals(_outputPath)) {
throw new IOException("No longer the owner of [" + _path + "]");
}
Set<Path> existingFiles = new HashSet<Path>();
for (FileStatus fileStatus : fileStatusSet) {
existingFiles.add(fileStatus.getPath());
}
Set<Entry<BytesRef, Value>> entrySet = _pointers.entrySet();
existingFiles.remove(_outputPath);
for (Entry<BytesRef, Value> e : entrySet) {
Path p = e.getValue()._path;
existingFiles.remove(p);
}
for (Path p : existingFiles) {
LOG.info("Removing file no longer referenced [{0}]", p);
_fileSystem.delete(p, false);
}
} finally {
_writeLock.unlock();
}
}
private void closeLogFileIfIdle() throws IOException {
_writeLock.lock();
try {
if (_output != null && _lastWrite.get() + _maxTimeOpenForWriting < System.currentTimeMillis()) {
// Close writer
LOG.info("Closing KV log due to inactivity [{0}].", _path);
try {
_output.close();
} finally {
_output = null;
}
}
} finally {
_writeLock.unlock();
}
}
private boolean isOpenForWriting() {
return _output != null;
}
private Operation getPutOperation(OperationType put, BytesRef key, BytesRef value) {
Operation operation = new Operation();
operation.type = put;
operation.key.set(key.bytes, key.offset, key.length);
operation.value.set(value.bytes, value.offset, value.length);
return operation;
}
private Operation getDeleteOperation(OperationType delete, BytesRef key) {
Operation operation = new Operation();
operation.type = delete;
operation.key.set(key.bytes, key.offset, key.length);
return operation;
}
@Override
public boolean get(BytesRef key, BytesRef value) throws IOException {
ensureOpen();
_readLock.lock();
try {
Value internalValue = _pointers.get(key);
if (internalValue == null) {
return false;
}
value.copyBytes(internalValue._bytesRef);
return true;
} finally {
_readLock.unlock();
}
}
@Override
public void delete(BytesRef key) throws IOException {
ensureOpen();
    _writeLock.lock();
    try {
      ensureOpenForWriting();
Operation op = getDeleteOperation(OperationType.DELETE, key);
write(op);
Value old = _pointers.remove(key);
if (old != null) {
_size.addAndGet(-old._bytesRef.bytes.length);
}
} catch (RemoteException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} catch (LeaseExpiredException e) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.", e);
} finally {
_writeLock.unlock();
}
}
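  /**
   * Cancels this store's timer tasks (the {@link Timer} itself is supplied
   * by the caller and is not cancelled here), performs a final sync, and
   * closes the output segment.
   */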
@Override
public void close() throws IOException {
if (!_isClosed) {
_isClosed = true;
if (_idleLogTimerTask != null) {
_idleLogTimerTask.cancel();
}
if (_oldFileCleanerTimerTask != null) {
_oldFileCleanerTimerTask.cancel();
}
if (_hdfsKeyValueTimer != null) {
_hdfsKeyValueTimer.purge();
}
_writeLock.lock();
try {
if (isOpenForWriting()) {
try {
syncInternal();
} finally {
IOUtils.closeQuietly(_output);
_output = null;
}
}
} finally {
_writeLock.unlock();
}
}
}
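  /**
   * Starts the next segment: creates the file, writes the MAGIC header and
   * version, and syncs so the new segment is immediately visible to other
   * processes checking ownership.
   */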
private void openWriter() throws IOException {
if (_readOnly) {
throw new IOException("Key value store is set in read only mode.");
}
_outputPath = getSegmentPath(_currentFileCounter.incrementAndGet());
LOG.info("Opening for writing [{0}].", _outputPath);
_output = _fileSystem.create(_outputPath, false);
_output.write(MAGIC);
_output.writeInt(VERSION);
syncInternal();
}
private Path getSegmentPath(long segment) {
return new Path(_path, buffer(segment));
}
  private static String buffer(long number) {
    // Zero-pad to 12 digits so that lexicographic ordering of segment file
    // names matches numeric ordering of segment numbers.
    return String.format("%012d", number);
  }
private void loadIndexes() throws IOException {
for (FileStatus fileStatus : _fileStatus.get()) {
loadIndex(fileStatus.getPath());
}
}
private void ensureOpen() throws IOException {
if (_isClosed) {
throw new IOException("Already closed.");
}
}
private void syncInternal() throws IOException {
validateNextSegmentHasNotStarted();
_output.flush();
_output.sync();
_lastWrite.set(System.currentTimeMillis());
}
private void validateNextSegmentHasNotStarted() throws IOException {
if (!isOwner()) {
throw new IOException("Another HDFS KeyStore has taken ownership of this key value store.");
}
}
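  /**
   * Replays one segment file into the in-memory index: verifies the MAGIC
   * header and version, then reads operations until the end of the valid
   * data; a partial trailing record is treated as the end of the file.
   */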
  private void loadIndex(Path path) throws IOException {
    FSDataInputStream inputStream = _fileSystem.open(path);
    try {
      byte[] buf = new byte[MAGIC.length];
      inputStream.readFully(buf);
      if (!Arrays.equals(MAGIC, buf)) {
        throw new IOException("File [" + path + "] not a " + BLUR_KEY_VALUE + " file.");
      }
      int version = inputStream.readInt();
      if (version != 1) {
        throw new IOException("Unknown version [" + version + "]");
      }
      long fileLength = HdfsUtils.getFileLength(_fileSystem, path, inputStream);
      Operation operation = new Operation();
      while (inputStream.getPos() < fileLength) {
        try {
          operation.readFields(inputStream);
        } catch (IOException e) {
          // A partial trailing record marks the end of the valid data.
          return;
        }
        loadIndex(path, operation);
      }
    } finally {
      // Close the stream on all paths, including validation failures.
      inputStream.close();
    }
  }
  private void loadIndex(Path path, Operation operation) {
    Value old;
    switch (operation.type) {
    case PUT:
      BytesRef value = BytesRef.deepCopyOf(toBytesRef(operation.value));
      _size.addAndGet(value.bytes.length);
      old = _pointers.put(BytesRef.deepCopyOf(toBytesRef(operation.key)), new Value(value, path));
      break;
    case DELETE:
      old = _pointers.remove(toBytesRef(operation.key));
      break;
    default:
      throw new RuntimeException("Not supported [" + operation.type + "]");
    }
    if (old != null) {
      _size.addAndGet(-old._bytesRef.bytes.length);
    }
  }

  private BytesRef toBytesRef(BytesWritable writable) {
    // Wraps the writable's backing array in a BytesRef, honoring its logical
    // length rather than the (possibly larger) buffer capacity.
    return new BytesRef(writable.getBytes(), 0, writable.getLength());
  }
private SortedSet<FileStatus> getSortedSet(Path p) throws IOException {
if (_fileSystem.exists(p)) {
FileStatus[] listStatus = _fileSystem.listStatus(p);
if (listStatus != null) {
TreeSet<FileStatus> result = new TreeSet<FileStatus>();
for (FileStatus fileStatus : listStatus) {
if (!fileStatus.isDir()) {
result.add(fileStatus);
}
}
return result;
}
}
return new TreeSet<FileStatus>();
}
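  /**
   * This process owns the store as long as the next segment file does not
   * exist, since any writer taking over starts by creating that file.
   */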
@Override
public boolean isOwner() throws IOException {
    Path p = getSegmentPath(_currentFileCounter.get() + 1);
    return !_fileSystem.exists(p);
}
}