package io.qdb.kvstore; import io.qdb.buffer.MessageBuffer; import io.qdb.buffer.MessageCursor; import io.qdb.buffer.PersistentMessageBuffer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.channels.FileChannel; import java.nio.channels.FileLock; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; /** * KV store implementation. Create these using {@link KeyValueStoreBuilder}. */ public class KeyValueStoreImpl<K, V> implements KeyValueStore<K, V> { private static final Logger log = LoggerFactory.getLogger(KeyValueStoreImpl.class); private final KeyValueStoreSerializer serializer; private final VersionProvider<V> versionProvider; private final Listener<K, V> listener; private final File dir; private final int snapshotCount; private final int snapshotIntervalSecs; private final Timer snapshotTimer; private FileOutputStream lockFile; private FileLock lock; private MessageBuffer txLog; private long mostRecentSnapshotId; private boolean busySavingSnapshot; private boolean snapshotScheduled; private final ConcurrentMap<String, ConcurrentMap<K, V>> maps = new ConcurrentHashMap<String, ConcurrentMap<K, V>>(); @SuppressWarnings("unchecked") KeyValueStoreImpl(KeyValueStoreSerializer serializer, VersionProvider<V> versionProvider, Listener<K, V> listener, File dir, int txLogSizeM, int maxObjectSize, int snapshotCount, int snapshotIntervalSecs) throws IOException { this.serializer = serializer; this.versionProvider = versionProvider; this.dir = dir; this.snapshotCount = snapshotCount; this.snapshotIntervalSecs = snapshotIntervalSecs; dir = DirUtil.ensureDirectory(dir); lockFile = new FileOutputStream(new File(dir, "lock")); lockFile.write(0); lock = lockFile.getChannel().tryLock(); if (lock == null) { lockFile.close(); throw new DirLockedException(dir + " is in use"); } txLog = new PersistentMessageBuffer(DirUtil.ensureDirectory(new File(dir, "txlog"))); txLog.setMaxSize(txLogSizeM * 1000000); txLog.setMaxPayloadSize(maxObjectSize + 100); File[] files = getSnapshotFiles(); Map<String, Map<K, V>> snapshot = null; for (int i = files.length - 1; i >= 0; i--) { File f = files[i]; BufferedInputStream in = new BufferedInputStream(new FileInputStream(f)); try { snapshot = (Map<String, Map<K, V>>)this.serializer.deserialize(in, Map.class); } catch (Exception e) { log.error("Error loading " + f + ", ignoring: " + e); continue; } finally { try { in.close(); } catch (IOException ignore) { } } String name = f.getName(); int j = name.indexOf('-'); int k = name.lastIndexOf('.'); mostRecentSnapshotId = Long.parseLong(name.substring(j + 1, k), 16); if (log.isDebugEnabled()) log.debug("Loaded " + f); break; } if (mostRecentSnapshotId < txLog.getOldestId()) { throw new IOException("Most recent snapshot " + Long.toHexString(mostRecentSnapshotId) + " is older than oldest record in txlog " + Long.toHexString(txLog.getOldestId())); } if (txLog.getNextId() == 0 && mostRecentSnapshotId > 0) { // probably this a recovery after a cluster failure by copying snapshot files around and nuking tx logs // to get everyone in sync log.info("The txlog is empty but we have snapshot " + Long.toHexString(mostRecentSnapshotId) + " so using that as next id"); txLog.setFirstId(mostRecentSnapshotId); } if (snapshot != null) { for (Map.Entry<String, Map<K, V>> e : snapshot.entrySet()) { maps.put(e.getKey(), new ConcurrentHashMap<K, V>(e.getValue())); } } int count = 0; for (MessageCursor c = txLog.cursor(mostRecentSnapshotId); c.next(); count++) { StoreTx tx = this.serializer.deserialize(new ByteArrayInputStream(c.getPayload()), StoreTx.class); try { apply(tx); } catch (KeyValueStoreException e) { if (log.isDebugEnabled()) log.debug("Got " + e + " replaying " + tx); } } if (log.isDebugEnabled()) log.debug("Replayed " + count + " transaction(s)"); // set listener now so it doesn't get events when transactions are replayed this.listener = listener; snapshotTimer = new Timer("kvstore-snapshot-" + dir.getName(), true); } private File[] getSnapshotFiles() { File[] files = dir.listFiles(new RegexFilenameFilter("[0-9a-f]+\\.snapshot")); Arrays.sort(files); return files; } @Override public void close() throws IOException { snapshotTimer.cancel(); txLog.close(); lock.release(); lockFile.close(); } @Override public boolean isEmpty() { return maps.isEmpty(); } /** * Save a snapshot. This is a NOP if we are already busy saving a snapshot or if no new transactions have been * applied since the most recent snapshot was saved. */ public void saveSnapshot() throws IOException { Map<String, Map<K, V>> snapshot; long id; try { synchronized (this) { if (busySavingSnapshot) return; busySavingSnapshot = true; txLog.sync(); id = txLog.getNextId(); if (id == mostRecentSnapshotId) return; // nothing to do snapshot = new HashMap<String, Map<K, V>>(); for (Map.Entry<String, ConcurrentMap<K, V>> e : maps.entrySet()) { snapshot.put(e.getKey(), new HashMap<K, V>(e.getValue())); } } File f = new File(dir, String.format("%016x", id) + ".snapshot"); if (log.isDebugEnabled()) log.debug("Creating " + f); boolean ok = false; FileOutputStream out = new FileOutputStream(f); try { serializer.serialize(snapshot, true, out); out.flush(); out.getChannel().force(true); out.close(); synchronized (this) { mostRecentSnapshotId = id; } ok = true; } finally { if (!ok) { try { out.close(); } catch (IOException ignore) { } if (!f.delete()) { log.error("Unable to delete bad snapshot: " + f); } } } deleteOldSnapshots(); } finally { synchronized (this) { busySavingSnapshot = false; } } } private void deleteOldSnapshots() { File[] a = getSnapshotFiles(); for (int i = 0; i < (a.length - snapshotCount); i++) { if (a[i].delete()) { if (log.isDebugEnabled()) log.debug("Deleted " + a[i]); } else { log.error("Unable to delete " + a[i]); } } } /** * Attempt to apply tx. It is written to the transaction log and then applied to our maps. */ @SuppressWarnings("unchecked") private Object exec(StoreTx<K, V> tx) { ByteArrayOutputStream bos = new ByteArrayOutputStream(); try { serializer.serialize(tx, false, bos); } catch (IOException e) { throw new KeyValueStoreException("Error serializing tx: " + e, e); } byte[] payload = bos.toByteArray(); long timestamp = System.currentTimeMillis(); boolean snapshotNow = false; synchronized (this) { try { long txId = txLog.append(timestamp, null, payload); // the bytes calculation isn't perfectly accurate but good enough long bytes = (txId + payload.length) - mostRecentSnapshotId; snapshotNow = bytes > txLog.getMaxSize() / 2; // half our log space is gone so do a snapshot now } catch (IOException e) { throw new KeyValueStoreException("Error appending to tx log: " + e, e); } finally { scheduleSnapshot(snapshotNow); } return apply(tx); } } private synchronized void scheduleSnapshot(boolean asap) { if (!snapshotScheduled) { snapshotTimer.schedule(new TimerTask() { @Override public void run() { try { synchronized (KeyValueStoreImpl.this) { snapshotScheduled = false; } saveSnapshot(); } catch (Throwable e) { log.error("Error saving snapshot: " + e, e); // todo the store should go offline if it cannot save snapshots } } }, asap ? 1L : snapshotIntervalSecs * 1000L); } } private void dispatch(ObjectEvent<K, V> ev) { try { listener.onObjectEvent(ev); } catch (Exception e) { log.error(e.toString(), e); } } /** * Make changes to our in memory maps based on tx. */ private synchronized Object apply(StoreTx<K, V> tx) { ConcurrentMap<K, V> m = maps.get(tx.map); V existing; switch (tx.op) { case NOP: return null; case PUT: case REPLACE: existing = m != null ? m.get(tx.key) : null; if (existing != null) checkVersionNumbers(tx, existing); if (tx.op == StoreTx.Operation.PUT || existing != null) { if (m == null) maps.put(tx.map, m = new ConcurrentHashMap<K, V>()); versionProvider.incVersion(tx.value); m.put(tx.key, tx.value); if (listener != null) { dispatch(new ObjectEvent<K, V>(this, tx.map, existing == null ? ObjectEvent.Type.CREATED : ObjectEvent.Type.UPDATED, tx.key, tx.value)); } } return existing; case REPLACE_KVV: if (m == null) return Boolean.FALSE; versionProvider.incVersion(tx.value); boolean replace = m.replace(tx.key, tx.oldValue, tx.value); if (replace && listener != null) { dispatch(new ObjectEvent<K, V>(this, tx.map, ObjectEvent.Type.UPDATED, tx.key, tx.value)); } return replace; case PUT_IF_ABSENT: if (m == null) maps.put(tx.map, m = new ConcurrentHashMap<K, V>()); versionProvider.incVersion(tx.value); V v = m.putIfAbsent(tx.key, tx.value); if (v == null && listener != null) { dispatch(new ObjectEvent<K, V>(this, tx.map, ObjectEvent.Type.CREATED, tx.key, tx.value)); } return v; case REMOVE: if (m == null) return null; V ans = m.remove(tx.key); if (m.isEmpty()) maps.remove(tx.map); if (ans != null && listener != null) { dispatch(new ObjectEvent<K, V>(this, tx.map, ObjectEvent.Type.DELETED, tx.key, ans)); } return ans; case REMOVE_KV: if (m == null) return Boolean.FALSE; existing = m.get(tx.key); if (existing == null) return Boolean.FALSE; checkVersionNumbers(tx, existing); Boolean removed = m.remove(tx.key, tx.value); if (m.isEmpty()) maps.remove(tx.map); if (removed && listener != null) { dispatch(new ObjectEvent<K, V>(this, tx.map, ObjectEvent.Type.DELETED, tx.key, tx.value)); } return removed; } throw new KeyValueStoreException("Unhandled operation: " + tx); } private void checkVersionNumbers(StoreTx<K, V> tx, V existing) { Object v1 = versionProvider.getVersion(existing); Object v2 = versionProvider.getVersion(tx.value); if (v1 != null && !v1.equals(v2)) { throw new OptimisticLockingException("Existing value for " + tx.map + "." + tx.key + " " + "has version " + v1 + ", value has version " + v2 + ": " + tx.value); } } @Override public List<String> getMapNames() { return new ArrayList<String>(maps.keySet()); } @Override public ConcurrentMap<K, V> getMap(String name) { return new Namespace(name); } @SuppressWarnings("unchecked") @Override public <T extends V> ConcurrentMap<K, T> getMap(String name, Class<T> cls) { return (ConcurrentMap<K, T>)getMap(name); } @SuppressWarnings({"unchecked", "NullableProblems"}) public class Namespace implements ConcurrentMap<K, V> { private final String name; public Namespace(String name) { this.name = name; } public V put(K key, V value) { return (V)exec(new StoreTx<K, V>(name, StoreTx.Operation.PUT, key, value)); } public V putIfAbsent(K key, V value) { return (V)exec(new StoreTx<K, V>(name, StoreTx.Operation.PUT_IF_ABSENT, key, value)); } public V remove(Object key) { return (V)exec(new StoreTx<K, V>(name, StoreTx.Operation.REMOVE, (K) key)); } public boolean remove(Object key, Object value) { return (Boolean)exec(new StoreTx<K, V>(name, StoreTx.Operation.REMOVE_KV, (K) key, (V) value)); } public V replace(K key, V value) { return (V)exec(new StoreTx<K, V>(name, StoreTx.Operation.REPLACE, key, value)); } public boolean replace(K key, V oldValue, V newValue) { return (Boolean)exec(new StoreTx<K, V>(name, StoreTx.Operation.REPLACE_KVV, key, newValue, oldValue)); } public void putAll(Map<? extends K, ? extends V> m) { for (Entry<? extends K, ? extends V> e : m.entrySet()) put(e.getKey(), e.getValue()); } public void clear() { ConcurrentMap<K, V> m = maps.get(name); if (m == null) return; List<K> list = new ArrayList<K>(m.keySet()); for (K id : list) remove(id); } public int size() { ConcurrentMap<K, V> m = maps.get(name); return m == null ? 0 : m.size(); } public boolean isEmpty() { ConcurrentMap<K, V> m = maps.get(name); return m == null || m.isEmpty(); } public boolean containsKey(Object key) { ConcurrentMap<K, V> m = maps.get(name); return m != null && m.containsKey(key); } public boolean containsValue(Object value) { ConcurrentMap<K, V> m = maps.get(name); return m != null && m.containsValue(value); } public V get(Object key) { ConcurrentMap<K, V> m = maps.get(name); return m == null ? null : m.get(key); } public Set<K> keySet() { ConcurrentMap<K, V> m = maps.get(name); return m == null ? Collections.EMPTY_SET : m.keySet(); } public Collection<V> values() { ConcurrentMap<K, V> m = maps.get(name); return m == null ? Collections.EMPTY_LIST : m.values(); } public Set<Entry<K, V>> entrySet() { ConcurrentMap<K, V> m = maps.get(name); return m == null ? Collections.EMPTY_SET : m.entrySet(); } } }