/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.alibaba.jstorm.hdfs.transaction;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.FileStatus;
import org.rocksdb.Checkpoint;
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.DBOptions;
import org.rocksdb.FlushOptions;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.TtlDB;
import org.rocksdb.WriteBatch;
import org.rocksdb.WriteOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.state.Serializer;
import backtype.storm.task.TopologyContext;
import backtype.storm.utils.Utils;

import com.alibaba.jstorm.cache.rocksdb.RocksDbOptionsFactory;
import com.alibaba.jstorm.client.ConfigExtension;
import com.alibaba.jstorm.cluster.Common;
import com.alibaba.jstorm.common.metric.AsmHistogram;
import com.alibaba.jstorm.hdfs.HdfsCache;
import com.alibaba.jstorm.metric.JStormMetrics;
import com.alibaba.jstorm.metric.MetricClient;
import com.alibaba.jstorm.transactional.state.IRichCheckpointKvState;
import com.alibaba.jstorm.utils.JStormUtils;
import com.alibaba.jstorm.utils.SerializerFactory;

/**
 * 1. Backup checkpoint state from local FS to remote FS
 * 2. Restore checkpoint state from remote FS to local FS
 *
 * HDFS state structure:
 * base_dir/topology_name/rocksdb/key_range/0001.sst
 *                                          0002.sst
 *                                          .......
 *                                          checkpoint/batchId/sstFile.list
 *                                                             CURRENT
 *                                                             MANIFEST-*
 *
 * Local RocksDB structure:
 * worker_dir/transactionState/rocksdb/key_range/rocksdb_files
 *                                               checkpoint/...
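 *
 * Checkpoint lifecycle per batch: checkpoint(batchId) flushes the memtable and
 * creates a local RocksDB checkpoint; backup(batchId) incrementally uploads the
 * new sst files plus the checkpoint metadata to HDFS; remove(batchId) prunes
 * obsolete local and remote checkpoints once the batch has succeeded.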
 *
 * Transaction State
 */
public class RocksDbHdfsState<K, V> implements IRichCheckpointKvState<K, V, String> {
    private static final long serialVersionUID = 7907988526099798193L;
    private static final Logger LOG = LoggerFactory.getLogger(RocksDbHdfsState.class);

    protected static final String ROCKSDB_DATA_FILE_EXT = "sst";
    protected static final String SST_FILE_LIST = "sstFile.list";
    protected static final String ENABLE_METRICS = "rocksdb.hdfs.state.metrics";

    protected String topologyName;
    protected Map conf;
    protected String stateName;

    protected HdfsCache hdfsCache;
    protected String hdfsDbDir;
    protected String hdfsCheckpointDir;

    protected RocksDB rocksDb;
    protected String rocksDbDir;
    protected String rocksDbCheckpointDir;
    protected Collection<String> lastCheckpointFiles;

    protected Serializer<Object> serializer;
    protected int ttlTimeSec;

    protected long lastCleanTime;
    protected long cleanPeriod;
    protected long lastSuccessBatchId;

    protected MetricClient metricClient;
    protected AsmHistogram hdfsWriteLatency;
    protected AsmHistogram hdfsDeleteLatency;
    protected AsmHistogram rocksDbFlushAndCpLatency;

    public RocksDbHdfsState() {
    }

    @Override
    public void init(TopologyContext context) {
        try {
            this.topologyName = Common.topologyIdToName(context.getTopologyId());
        } catch (InvalidTopologyException e) {
            LOG.error("Failed to get topology name by id {}", context.getTopologyId());
            throw new RuntimeException(e.getMessage());
        }
        String workerDir = context.getWorkerIdDir();

        metricClient = new MetricClient(context);
        hdfsWriteLatency = metricClient.registerHistogram("HDFS write latency");
        hdfsDeleteLatency = metricClient.registerHistogram("HDFS delete latency");
        rocksDbFlushAndCpLatency = metricClient.registerHistogram("RocksDB flush and checkpoint latency");

        cleanPeriod = ConfigExtension.getTransactionBatchSnapshotTimeout(context.getStormConf()) * 5 * 1000;
        serializer = SerializerFactory.createSerailzer(context.getStormConf());
        initEnv(topologyName, context.getStormConf(), workerDir);
    }

    public void initEnv(String topologyName, Map conf, String workerDir) {
        this.conf = conf;

        // init hdfs env
        this.hdfsCache = new HdfsCache(conf);
        this.hdfsDbDir = hdfsCache.getBaseDir() + "/" + topologyName + "/rocksDb/" + stateName;
        this.hdfsCheckpointDir = hdfsDbDir + "/checkpoint";
        try {
            if (!hdfsCache.exist(hdfsDbDir))
                hdfsCache.mkdir(hdfsDbDir);
            if (!hdfsCache.exist(hdfsCheckpointDir))
                hdfsCache.mkdir(hdfsCheckpointDir);
        } catch (IOException e) {
            LOG.error("Failed to create hdfs dir for path=" + hdfsCheckpointDir, e);
            throw new RuntimeException(e.getMessage());
        }

        // init local rocksdb env
        this.rocksDbDir = workerDir + "/transactionState/rocksdb/" + stateName;
        this.rocksDbCheckpointDir = rocksDbDir + "/checkpoint";
        initLocalRocksDbDir();
        initRocksDb();

        LOG.info("HDFS: dataDir={}, checkpointDir={}", hdfsDbDir, hdfsCheckpointDir);
        LOG.info("Local: dataDir={}, checkpointDir={}", rocksDbDir, rocksDbCheckpointDir);
    }

    protected void initRocksDb() {
        RocksDbOptionsFactory optionFactory = new RocksDbOptionsFactory.Defaults();
        Options options = optionFactory.createOptions(null);
        DBOptions dbOptions = optionFactory.createDbOptions(null);
        ColumnFamilyOptions cfOptions = optionFactory.createColumnFamilyOptions(null);
        String optionsFactoryClass = (String) conf.get(ConfigExtension.ROCKSDB_OPTIONS_FACTORY_CLASS);
        if (optionsFactoryClass != null) {
            RocksDbOptionsFactory udfOptionFactory = (RocksDbOptionsFactory) Utils.newInstance(optionsFactoryClass);
            options = udfOptionFactory.createOptions(options);
            dbOptions = udfOptionFactory.createDbOptions(dbOptions);
            cfOptions = udfOptionFactory.createColumnFamilyOptions(cfOptions);
        }
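        // A positive TTL opens a TtlDB: entries older than ttlTimeSec become
        // eligible for deletion during compaction (expired values may still be
        // returned until they are compacted away). Otherwise a plain RocksDB
        // is opened.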
        try {
            ttlTimeSec = ConfigExtension.getStateTtlTime(conf);
            if (ttlTimeSec > 0)
                rocksDb = TtlDB.open(options, rocksDbDir, ttlTimeSec, false);
            else
                rocksDb = RocksDB.open(options, rocksDbDir);
            // trigger a manual full-range compaction
            rocksDb.compactRange();
            LOG.info("Finished the initialization of RocksDB");
        } catch (RocksDBException e) {
            LOG.error("Failed to open rocksdb located at " + rocksDbDir, e);
            throw new RuntimeException(e.getMessage());
        }

        lastCheckpointFiles = new HashSet<String>();
        lastCleanTime = System.currentTimeMillis();
        lastSuccessBatchId = -1;
    }

    private void initLocalRocksDbDir() {
        try {
            File file = new File(rocksDbDir);
            if (file.exists())
                FileUtils.cleanDirectory(file);
            FileUtils.forceMkdir(new File(rocksDbCheckpointDir));
        } catch (IOException e) {
            LOG.error("Failed to create dir for path=" + rocksDbCheckpointDir, e);
            throw new RuntimeException(e.getMessage());
        }
    }

    @Override
    public void setStateName(String stateName) {
        this.stateName = stateName;
    }

    @Override
    public void put(K key, V value) {
        try {
            rocksDb.put(serializer.serialize(key), serializer.serialize(value));
        } catch (RocksDBException e) {
            LOG.error("Failed to put data, key={}, value={}", key, value);
            throw new RuntimeException(e.getMessage());
        }
    }

    @Override
    public void putBatch(Map<K, V> batch) {
        try {
            WriteBatch writeBatch = new WriteBatch();
            for (Map.Entry<K, V> entry : batch.entrySet()) {
                writeBatch.put(serializer.serialize(entry.getKey()), serializer.serialize(entry.getValue()));
            }
            rocksDb.write(new WriteOptions(), writeBatch);
        } catch (RocksDBException e) {
            LOG.error("Failed to put batch={}", batch);
            throw new RuntimeException(e.getMessage());
        }
    }

    @Override
    public V get(K key) {
        try {
            V ret = null;
            if (key != null) {
                byte[] rawKey = serializer.serialize(key);
                byte[] rawData = rocksDb.get(rawKey);
                ret = rawData != null ? (V) serializer.deserialize(rawData) : null;
            }
            return ret;
        } catch (RocksDBException e) {
            LOG.error("Failed to get value by key {}", key);
            throw new RuntimeException(e.getMessage());
        }
    }

    @Override
    public Map<K, V> getBatch(Collection<K> keys) {
        Map<K, V> batch = new HashMap<K, V>();
        for (K key : keys) {
            V value = get(key);
            if (value != null)
                batch.put(key, value);
        }
        return batch;
    }
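    // Note: the no-arg getBatch and getAllKeys below perform a full scan with a
    // RocksIterator, so their cost grows with the total number of keys in state.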
    @Override
    public Map<K, V> getBatch() {
        Map<K, V> batch = new HashMap<K, V>();
        RocksIterator itr = rocksDb.newIterator();
        itr.seekToFirst();
        while (itr.isValid()) {
            byte[] rawKey = itr.key();
            byte[] rawValue = itr.value();
            V value = rawValue != null ? (V) serializer.deserialize(rawValue) : null;
            batch.put((K) serializer.deserialize(rawKey), value);
            itr.next();
        }
        return batch;
    }

    @Override
    public Collection<K> getAllKeys() {
        Collection<K> keys = new ArrayList<K>();
        RocksIterator itr = rocksDb.newIterator();
        itr.seekToFirst();
        while (itr.isValid()) {
            keys.add((K) serializer.deserialize(itr.key()));
            itr.next();
        }
        return keys;
    }

    @Override
    public void remove(K key) {
        try {
            rocksDb.remove(serializer.serialize(key));
        } catch (RocksDBException e) {
            LOG.warn("Failed to remove " + key, e);
        }
    }

    @Override
    public void clear() {
        for (K key : getAllKeys()) {
            remove(key);
        }
    }

    @Override
    public void cleanup() {
        if (rocksDb != null)
            rocksDb.dispose();
        if (hdfsCache != null)
            hdfsCache.close();
    }

    @Override
    public void restore(String checkpointBackupDir) {
        LOG.info("Start restore from remote: {}", checkpointBackupDir);
        if (rocksDb != null)
            rocksDb.dispose();
        initLocalRocksDbDir();

        // Restore db files from hdfs to local disk
        try {
            if (checkpointBackupDir != null) {
                // Get dir of sst files
                int index = checkpointBackupDir.lastIndexOf("checkpoint");
                String remoteDbBackupDir = checkpointBackupDir.substring(0, index);

                // copy sstFile.list, CURRENT, MANIFEST to local disk for the specified batch
                Collection<String> files = hdfsCache.listFile(checkpointBackupDir, false);
                LOG.debug("Restore checkpoint files: {}", files);
                for (String fileName : files)
                    hdfsCache.copyToLocal(checkpointBackupDir + "/" + fileName, rocksDbDir);

                // copy all rocksDB sst files listed in sstFile.list to local disk
                String sstFileList = rocksDbDir + "/" + SST_FILE_LIST;
                File file = new File(sstFileList);
                List<String> sstFiles = FileUtils.readLines(file);
                LOG.debug("Restore sst files: {}", sstFiles);
                for (String sstFile : sstFiles) {
                    hdfsCache.copyToLocal(remoteDbBackupDir + "/" + sstFile, rocksDbDir);
                }
                FileUtils.deleteQuietly(file);
            }
            initRocksDb();
        } catch (IOException e) {
            LOG.error("Failed to restore checkpoint", e);
            throw new RuntimeException(e.getMessage());
        }
    }

    @Override
    public String backup(long batchId) {
        try {
            String hdfsCpDir = getRemoteCheckpointPath(batchId);
            String batchCpPath = getLocalCheckpointPath(batchId);
            long startTime = System.currentTimeMillis();

            // Incremental upload: only sst files that were not part of the previous
            // checkpoint are copied into the shared db dir on hdfs
            Collection<File> sstFiles = FileUtils.listFiles(new File(batchCpPath), new String[] { ROCKSDB_DATA_FILE_EXT }, false);
            for (File sstFile : sstFiles) {
                if (!lastCheckpointFiles.contains(sstFile.getName())) {
                    hdfsCache.copyToDfs(batchCpPath + "/" + sstFile.getName(), hdfsDbDir, true);
                }
            }

            // upload sstFile.list, CURRENT, MANIFEST to the per-batch checkpoint dir on hdfs
            Collection<String> sstFileList = getFileList(sstFiles);
            File cpFileList = new File(batchCpPath + "/" + SST_FILE_LIST);
            FileUtils.writeLines(cpFileList, sstFileList);
            if (hdfsCache.exist(hdfsCpDir))
                hdfsCache.remove(hdfsCpDir, true);
            hdfsCache.mkdir(hdfsCpDir);
            Collection<File> allFiles = FileUtils.listFiles(new File(batchCpPath), null, false);
            allFiles.removeAll(sstFiles);
            Collection<File> nonSstFiles = allFiles;
            for (File nonSstFile : nonSstFiles) {
                hdfsCache.copyToDfs(batchCpPath + "/" + nonSstFile.getName(), hdfsCpDir, true);
            }
            if (JStormMetrics.enabled)
                hdfsWriteLatency.update(System.currentTimeMillis() - startTime);
            lastCheckpointFiles = sstFileList;
            return hdfsCpDir;
        } catch (IOException e) {
            LOG.error("Failed to upload checkpoint", e);
            throw new RuntimeException(e.getMessage());
        }
    }

    /**
     * Flush the data in the RocksDB memtable to disk, then create a checkpoint
     *
     * @param batchId
     */
    @Override
    public void checkpoint(long batchId) {
        long startTime = System.currentTimeMillis();
        try {
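            // Flush the memtable to sst files first so that the checkpoint
            // (which hard-links existing sst files where possible and copies
            // the manifest) contains every write of this batch.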
            rocksDb.flush(new FlushOptions());
            Checkpoint cp = Checkpoint.create(rocksDb);
            cp.createCheckpoint(getLocalCheckpointPath(batchId));
        } catch (RocksDBException e) {
            LOG.error("Failed to create checkpoint for batch-" + batchId, e);
            throw new RuntimeException(e.getMessage());
        }
        if (JStormMetrics.enabled)
            rocksDbFlushAndCpLatency.update(System.currentTimeMillis() - startTime);
    }

    /**
     * Remove obsolete checkpoint data on local disk and in remote backup storage
     *
     * @param batchId id of the successful batch
     */
    @Override
    public void remove(long batchId) {
        removeObsoleteLocalCheckpoints(batchId);
        removeObsoleteRemoteCheckpoints(batchId);
    }

    private String getLocalCheckpointPath(long batchId) {
        return rocksDbCheckpointDir + "/" + batchId;
    }

    private String getRemoteCheckpointPath(long batchId) {
        return hdfsCheckpointDir + "/" + batchId;
    }

    private String getRemoteCheckpointSstListFile(long batchId) {
        return getRemoteCheckpointPath(batchId) + "/" + SST_FILE_LIST;
    }

    private Collection<String> getFileList(Collection<File> files) {
        Collection<String> ret = new HashSet<String>();
        for (File file : files)
            ret.add(file.getName());
        return ret;
    }

    private void removeObsoleteLocalCheckpoints(long successBatchId) {
        File cpRootDir = new File(rocksDbCheckpointDir);
        for (String cpDir : cpRootDir.list()) {
            try {
                long cpId = JStormUtils.parseLong(cpDir);
                if (cpId < successBatchId)
                    FileUtils.deleteQuietly(new File(rocksDbCheckpointDir + "/" + cpDir));
            } catch (Throwable e) {
                File file = new File(rocksDbCheckpointDir + "/" + cpDir);
                // If the unexpected file has existed for more than one hour, remove it
                if (System.currentTimeMillis() - file.lastModified() > 60 * 60 * 1000) {
                    LOG.debug("Unexpected file-" + cpDir + " in local checkpoint dir, " + rocksDbCheckpointDir, e);
                    FileUtils.deleteQuietly(file);
                }
            }
        }
    }

    private void removeObsoleteRemoteCheckpoints(long successBatchId) {
        long startTime = System.currentTimeMillis();
        if (lastSuccessBatchId != -1 && lastSuccessBatchId != (successBatchId - 1)) {
            LOG.warn("Some ack msgs from TM were lost! Last success batch id: {}, current success batch id: {}",
                    lastSuccessBatchId, successBatchId);
        }
        lastSuccessBatchId = successBatchId;

        long obsoleteBatchId = successBatchId - 1;
        try {
            Collection<String> lastSuccessSStFiles = hdfsCache.readLines(getRemoteCheckpointSstListFile(successBatchId));
            if (hdfsCache.exist(getRemoteCheckpointPath(obsoleteBatchId))) {
                // remove sst files which are no longer referenced by the latest checkpoint
                Collection<String> obsoleteSstFiles = hdfsCache.readLines(getRemoteCheckpointSstListFile(obsoleteBatchId));
                obsoleteSstFiles.removeAll(lastSuccessSStFiles);
                for (String sstFile : obsoleteSstFiles) {
                    hdfsCache.remove(hdfsDbDir + "/" + sstFile, false);
                }

                // remove the obsolete checkpoint dir
                hdfsCache.remove(getRemoteCheckpointPath(obsoleteBatchId), true);
            }

            // If a remove failed earlier, some checkpoint files may be left behind
            // in the remote FS. So check periodically whether a full clean is
            // required, and if so, remove all checkpoint files which have expired.
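            // A file counts as expired when it is older than the sstFile.list of
            // the current successful checkpoint and is not referenced by it.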
            long currentTime = System.currentTimeMillis();
            if (currentTime - lastCleanTime > cleanPeriod) {
                FileStatus successCpFileStatus = hdfsCache.getFileStatus(getRemoteCheckpointSstListFile(successBatchId));
                // remove obsolete sst files
                FileStatus[] fileStatuses = hdfsCache.listFileStatus(hdfsDbDir);
                for (FileStatus fileStatus : fileStatuses) {
                    String fileName = fileStatus.getPath().getName();
                    if (fileStatus.getModificationTime() < successCpFileStatus.getModificationTime()
                            && !lastSuccessSStFiles.contains(fileName)) {
                        hdfsCache.remove(hdfsDbDir + "/" + fileName, true);
                    }
                }

                // remove obsolete checkpoint dirs
                fileStatuses = hdfsCache.listFileStatus(hdfsCheckpointDir);
                for (FileStatus fileStatus : fileStatuses) {
                    String checkpointId = fileStatus.getPath().getName();
                    if (fileStatus.getModificationTime() < successCpFileStatus.getModificationTime()
                            && Integer.valueOf(checkpointId) != successBatchId) {
                        hdfsCache.remove(hdfsCheckpointDir + "/" + checkpointId, true);
                    }
                }
                lastCleanTime = currentTime;
            }
        } catch (IOException e) {
            LOG.error("Failed to remove obsolete checkpoint data for batch-" + obsoleteBatchId, e);
        }
        if (JStormMetrics.enabled)
            this.hdfsDeleteLatency.update(System.currentTimeMillis() - startTime);
    }

    public static void main(String[] args) {
        Map conf = new HashMap<Object, Object>();
        conf.putAll(Utils.loadConf(args[0]));
        RocksDbHdfsState<String, Integer> state = new RocksDbHdfsState<String, Integer>();
        state.setStateName(String.valueOf(1));

        // write some data, then checkpoint, backup and prune for each batch
        int batchNum = JStormUtils.parseInt(conf.get("batch.num"), 100);
        state.initEnv("test", conf, "/tmp/rocksdb_test");
        String remoteCpPath = null;
        for (int i = 0; i < batchNum; i++) {
            state.put(String.valueOf(i % 20), i);
            state.checkpoint(i);
            remoteCpPath = state.backup(i);
            state.remove(i);
        }
        state.cleanup();

        // restore from the last backup and verify the restored values
        state.initEnv("test", conf, "/tmp/rocksdb_test");
        state.restore(remoteCpPath);
        for (int i = 0; i < 20; i++) {
            Integer value = state.get(String.valueOf(i));
            LOG.info("key={}, value={}", String.valueOf(i), value);
        }
        state.cleanup();
    }
}