/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb;

import static org.voltdb.VoltDB.exitAfterMessage;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.lang.management.ManagementFactory;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.aeonbits.owner.ConfigFactory;
import org.apache.cassandra_voltpatches.GCInspector;
import org.apache.hadoop_voltpatches.util.PureJavaCrc32C;
import org.apache.log4j.Appender;
import org.apache.log4j.DailyRollingFileAppender;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Logger;
import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.Code;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.Level;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.HostMessenger;
import org.voltcore.messaging.HostMessenger.HostInfo;
import org.voltcore.messaging.SiteMailbox;
import org.voltcore.network.CipherExecutor;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.OnDemandBinaryLogger;
import org.voltcore.utils.Pair;
import org.voltcore.utils.ShutdownHooks;
import org.voltcore.utils.VersionChecker;
import org.voltcore.zk.CoreZK;
import org.voltcore.zk.ZKCountdownLatch;
import org.voltcore.zk.ZKUtil;
import org.voltdb.ProducerDRGateway.MeshMemberInfo;
import org.voltdb.TheHashinator.HashinatorType;
import org.voltdb.VoltDB.Configuration;
import org.voltdb.catalog.Catalog;
import org.voltdb.catalog.CatalogMap;
import org.voltdb.catalog.Cluster;
import org.voltdb.catalog.Deployment;
import org.voltdb.catalog.PlanFragment;
import org.voltdb.catalog.Procedure;
import org.voltdb.catalog.SnapshotSchedule;
import org.voltdb.catalog.Statement;
import org.voltdb.catalog.Systemsettings;
import org.voltdb.catalog.Table;
import org.voltdb.common.Constants;
import org.voltdb.common.NodeState;
import org.voltdb.compiler.AdHocCompilerCache;
import org.voltdb.compiler.AsyncCompilerAgent;
import org.voltdb.compiler.VoltCompiler;
import org.voltdb.compiler.deploymentfile.ClusterType;
import org.voltdb.compiler.deploymentfile.ConsistencyType;
import org.voltdb.compiler.deploymentfile.DeploymentType;
import org.voltdb.compiler.deploymentfile.DrRoleType;
import org.voltdb.compiler.deploymentfile.HeartbeatType;
import org.voltdb.compiler.deploymentfile.PartitionDetectionType;
import org.voltdb.compiler.deploymentfile.PathsType;
import org.voltdb.compiler.deploymentfile.SecurityType;
import org.voltdb.compiler.deploymentfile.SystemSettingsType;
import org.voltdb.dtxn.InitiatorStats;
import org.voltdb.dtxn.LatencyHistogramStats;
import org.voltdb.dtxn.LatencyStats;
import org.voltdb.dtxn.LatencyUncompressedHistogramStats;
import org.voltdb.dtxn.SiteTracker;
import org.voltdb.export.ExportManager;
import org.voltdb.importer.ImportManager;
import org.voltdb.iv2.BaseInitiator;
import org.voltdb.iv2.Cartographer;
import org.voltdb.iv2.Initiator;
import org.voltdb.iv2.KSafetyStats;
import org.voltdb.iv2.LeaderAppointer;
import org.voltdb.iv2.MpInitiator;
import org.voltdb.iv2.SpInitiator;
import org.voltdb.iv2.SpScheduler.DurableUniqueIdListener;
import org.voltdb.iv2.TxnEgo;
import org.voltdb.jni.ExecutionEngine;
import org.voltdb.join.BalancePartitionsStatistics;
import org.voltdb.join.ElasticJoinService;
import org.voltdb.licensetool.LicenseApi;
import org.voltdb.messaging.VoltDbMessageFactory;
import org.voltdb.modular.ModuleManager;
import org.voltdb.planner.ActivePlanRepository;
import org.voltdb.probe.MeshProber;
import org.voltdb.processtools.ShellTools;
import org.voltdb.rejoin.Iv2RejoinCoordinator;
import org.voltdb.rejoin.JoinCoordinator;
import org.voltdb.settings.ClusterSettings;
import org.voltdb.settings.ClusterSettingsRef;
import org.voltdb.settings.DbSettings;
import org.voltdb.settings.NodeSettings;
import org.voltdb.settings.Settings;
import org.voltdb.settings.SettingsException;
import org.voltdb.snmp.DummySnmpTrapSender;
import org.voltdb.snmp.FaultFacility;
import org.voltdb.snmp.FaultLevel;
import org.voltdb.snmp.SnmpTrapSender;
import org.voltdb.sysprocs.saverestore.SnapshotPathType;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;
import org.voltdb.sysprocs.saverestore.SnapshotUtil.Snapshot;
import org.voltdb.utils.CLibrary;
import org.voltdb.utils.CatalogUtil;
import org.voltdb.utils.CatalogUtil.CatalogAndIds;
import org.voltdb.utils.Encoder;
import org.voltdb.utils.HTTPAdminListener;
import org.voltdb.utils.InMemoryJarfile;
import org.voltdb.utils.LogKeys;
import org.voltdb.utils.MiscUtils;
import org.voltdb.utils.PlatformProperties;
import org.voltdb.utils.SystemStatsCollector;
import org.voltdb.utils.TopologyZKUtils;
import org.voltdb.utils.VoltFile;
import org.voltdb.utils.VoltSampler;

import com.google_voltpatches.common.base.Charsets;
import com.google_voltpatches.common.base.Joiner;
import com.google_voltpatches.common.base.Supplier;
import com.google_voltpatches.common.base.Suppliers;
import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.ImmutableList;
import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Sets;
import com.google_voltpatches.common.hash.Hashing;
import com.google_voltpatches.common.net.HostAndPort;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.ListeningExecutorService;
import com.google_voltpatches.common.util.concurrent.SettableFuture;

/**
 * RealVoltDB initializes global server components, like the messaging
 * layer, ExecutionSite(s), and ClientInterface. It provides accessors
 * or references to those global objects. It is basically the global
 * namespace. A lot of the global namespace is described by VoltDBInterface
 * to allow test mocking.
 */
public class RealVoltDB implements VoltDBInterface, RestoreAgent.Callback, HostMessenger.HostWatcher {
    private static final boolean DISABLE_JMX = Boolean.valueOf(System.getProperty("DISABLE_JMX", "true"));

    /** Default deployment file contents if path to deployment is null */
    private static final String[] defaultDeploymentXML = {
        "<?xml version=\"1.0\"?>",
        "<!-- This file is an auto-generated default deployment configuration. -->",
        "<deployment>",
        " <cluster hostcount=\"1\" />",
        " <httpd enabled=\"true\">",
        " <jsonapi enabled=\"true\" />",
        " </httpd>",
        "</deployment>"
    };

    private static final VoltLogger hostLog = new VoltLogger("HOST");
    private static final VoltLogger consoleLog = new VoltLogger("CONSOLE");

    private VoltDB.Configuration m_config = new VoltDB.Configuration();
    int m_configuredNumberOfPartitions;
    int m_configuredReplicationFactor;

    // CatalogContext is immutable, just make sure that accessors see a consistent version
    volatile CatalogContext m_catalogContext;

    // Managed voltdb directories settings
    volatile NodeSettings m_nodeSettings;

    // Cluster settings reference and supplier
    final ClusterSettingsRef m_clusterSettings = new ClusterSettingsRef();

    private String m_buildString;
    static final String m_defaultVersionString = "7.2";
    // by default set the version to only be compatible with itself
    static final String m_defaultHotfixableRegexPattern = "^\\Q7.2\\E\\z";
    // these next two are non-static because they can be overridden on the CLI for test
    private String m_versionString = m_defaultVersionString;
    private String m_hotfixableRegexPattern = m_defaultHotfixableRegexPattern;
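
    // A sketch of how the hotfixable pattern above behaves (the default pattern is real;
    // the widened variant below is a hypothetical illustration): "^\\Q7.2\\E\\z" quotes
    // "7.2" literally and anchors both ends, so only the exact string "7.2" matches and,
    // by default, a node is only version-compatible with itself. A hotfix build could
    // ship a wider pattern such as "^\\Q7.2\\E(\\.\\d+)?\\z", which would also accept
    // "7.2.1", "7.2.2", and so on.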
    HostMessenger m_messenger = null;
    private ClientInterface m_clientInterface = null;
    HTTPAdminListener m_adminListener;
    private OpsRegistrar m_opsRegistrar = new OpsRegistrar();

    private AsyncCompilerAgent m_asyncCompilerAgent = null;
    public AsyncCompilerAgent getAsyncCompilerAgent() { return m_asyncCompilerAgent; }

    private PartitionCountStats m_partitionCountStats = null;
    private IOStats m_ioStats = null;
    private MemoryStats m_memoryStats = null;
    private CpuStats m_cpuStats = null;
    private GcStats m_gcStats = null;
    private CommandLogStats m_commandLogStats = null;
    private DRRoleStats m_drRoleStats = null;
    private StatsManager m_statsManager = null;
    private SnapshotCompletionMonitor m_snapshotCompletionMonitor;

    // These are unused locally, but they need to be registered with the StatsAgent so they're
    // globally available
    @SuppressWarnings("unused")
    private InitiatorStats m_initiatorStats;
    private LiveClientsStats m_liveClientsStats = null;

    int m_myHostId;
    String m_httpPortExtraLogMessage = null;
    boolean m_jsonEnabled;

    // IV2 things
    TreeMap<Integer, Initiator> m_iv2Initiators = new TreeMap<>();
    Cartographer m_cartographer = null;
    Supplier<Boolean> m_partitionZeroLeader = null;
    LeaderAppointer m_leaderAppointer = null;
    GlobalServiceElector m_globalServiceElector = null;
    MpInitiator m_MPI = null;
    Map<Integer, Long> m_iv2InitiatorStartingTxnIds = new HashMap<>();
    private ScheduledFuture<?> resMonitorWork;
    private HealthMonitor m_healthMonitor;

    private NodeStateTracker m_statusTracker;

    // Should the execution sites be started in recovery mode
    // (used for joining a node to an existing cluster)?
    // If CL is enabled this will be set to true
    // by the CL when the truncation snapshot completes
    // and this node is viable for replay.
    volatile boolean m_rejoining = false;
    // Need to separate the concepts of rejoin data transfer and rejoin
    // completion. This boolean tracks whether or not the data transfer
    // process is done. CL truncation snapshots will not flip the all-complete
    // boolean until no more data is pending.
    // Yes, this is fragile having two booleans. We could aggregate them into
    // some rejoining state enum at some point.
    volatile boolean m_rejoinDataPending = false;
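
    // Rough lifecycle sketch of the two flags above (illustrative, not an exhaustive
    // state machine): a rejoining node starts with m_rejoining = true and
    // m_rejoinDataPending = true; when the snapshot/data transfer finishes,
    // m_rejoinDataPending flips to false while m_rejoining stays true; once the command
    // log truncation snapshot completes, m_rejoining flips to false and the node counts
    // as fully recovered.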
    // Since m_rejoinDataPending is set asynchronously, sites could have an inconsistent
    // view of what the value is during the execution of a sysproc. Use this and
    // m_safeMpTxnId to prevent the race. The m_safeMpTxnId is updated once in the
    // lifetime of the node to reflect the first MP txn that witnessed the flip of
    // m_rejoinDataPending.
    private final Object m_safeMpTxnIdLock = new Object();
    private long m_lastSeenMpTxnId = Long.MIN_VALUE;
    private long m_safeMpTxnId = Long.MAX_VALUE;

    String m_rejoinTruncationReqId = null;

    // Are we adding the node to the cluster instead of rejoining?
    volatile boolean m_joining = false;

    private boolean m_preparingShuttingdown = false;

    long m_clusterCreateTime;
    AtomicBoolean m_replicationActive = new AtomicBoolean(false);
    private ProducerDRGateway m_producerDRGateway = null;
    private ConsumerDRGateway m_consumerDRGateway = null;

    // Only restrict recovery completion during test
    static Semaphore m_testBlockRecoveryCompletion = new Semaphore(Integer.MAX_VALUE);
    private long m_executionSiteRecoveryFinish;
    private long m_executionSiteRecoveryTransferred;

    // Rejoin coordinator
    private JoinCoordinator m_joinCoordinator = null;
    private ElasticJoinService m_elasticJoinService = null;

    // Snapshot IO agent
    private SnapshotIOAgent m_snapshotIOAgent = null;

    // id of the leader, or the host restore planner says has the catalog
    int m_hostIdWithStartupCatalog;
    String m_pathToStartupCatalog;

    // Synchronize initialize and shutdown
    private final Object m_startAndStopLock = new Object();

    // Synchronize updates of catalog contexts across the multiple sites on this host.
    // Ensure that the first site to reach catalogUpdate() does all the work and that no
    // others enter until that's finished. CatalogContext is immutable and volatile, accessors
    // should be able to always get a valid context without needing this lock.
    private final Object m_catalogUpdateLock = new Object();

    // add a random number to the sampler output to make it likely to be unique for this process.
    private final VoltSampler m_sampler = new VoltSampler(10, "sample" + String.valueOf(new Random().nextInt() % 10000) + ".txt");
    private final AtomicBoolean m_hasStartedSampler = new AtomicBoolean(false);

    List<Integer> m_partitionsToSitesAtStartupForExportInit;

    RestoreAgent m_restoreAgent = null;

    private final ListeningExecutorService m_es = CoreUtils.getCachedSingleThreadExecutor("StartAction ZK Watcher", 15000);

    private volatile boolean m_isRunning = false;
    private boolean m_isRunningWithOldVerb = true;
    private boolean m_isBare = false;
    private static final String SECONDARY_PICONETWORK_THREADS = "secondaryPicoNetworkThreads";

    /** Last transaction ID at which the logging config updated.
     * Also, use the intrinsic lock to safeguard access from multiple
     * execution site threads */
    private Long m_lastLogUpdateTxnId = 0L;

    /**
     * Startup snapshot nonce taken on shutdown --save
     */
    String m_terminusNonce = null;

    private int m_maxThreadsCount;

    @Override
    public boolean isRunningWithOldVerbs() {
        return m_isRunningWithOldVerb;
    }

    @Override
    public boolean isPreparingShuttingdown() {
        return m_preparingShuttingdown;
    }

    @Override
    public void setShuttingdown(boolean preparingShuttingdown) {
        m_preparingShuttingdown = preparingShuttingdown;
    }

    @Override
    public boolean rejoining() {
        return m_rejoining;
    }

    @Override
    public boolean rejoinDataPending() {
        return m_rejoinDataPending;
    }

    @Override
    public boolean isMpSysprocSafeToExecute(long txnId) {
        synchronized (m_safeMpTxnIdLock) {
            if (txnId >= m_safeMpTxnId) {
                return true;
            }
            if (txnId > m_lastSeenMpTxnId) {
                m_lastSeenMpTxnId = txnId;
                if (!rejoinDataPending() && m_safeMpTxnId == Long.MAX_VALUE) {
                    m_safeMpTxnId = txnId;
                }
            }
            return txnId >= m_safeMpTxnId;
        }
    }
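
    // Worked example for the method above (hypothetical txn ids): while rejoin data is
    // still pending, MP txns 100 and 200 arrive; both advance m_lastSeenMpTxnId, but
    // m_safeMpTxnId stays Long.MAX_VALUE, so both report unsafe. After
    // m_rejoinDataPending flips to false, the next new txn, say 300, pins
    // m_safeMpTxnId = 300 and returns true; from then on any txnId >= 300 is safe,
    // while a straggler such as 200 still reports unsafe.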
    StartAction getStartAction() {
        return m_config.m_startAction;
    }

    private long m_recoveryStartTime;

    CommandLog m_commandLog;
    SnmpTrapSender m_snmp;

    private volatile OperationMode m_mode = OperationMode.INITIALIZING;
    private OperationMode m_startMode = null;

    volatile String m_localMetadata = "";

    private ListeningExecutorService m_computationService;

    private Thread m_configLogger;

    // methods accessed via the singleton
    @Override
    public void startSampler() {
        if (m_hasStartedSampler.compareAndSet(false, true)) {
            m_sampler.start();
        }
    }

    private ScheduledThreadPoolExecutor m_periodicWorkThread;
    private ScheduledThreadPoolExecutor m_periodicPriorityWorkThread;

    // The configured license api: use to decide enterprise/community edition feature enablement
    LicenseApi m_licenseApi;
    String m_licenseInformation = "";
    private LatencyStats m_latencyStats;
    private LatencyHistogramStats m_latencyCompressedStats;
    private LatencyUncompressedHistogramStats m_latencyHistogramStats;

    private File getConfigDirectory() {
        return getConfigDirectory(m_config);
    }

    private File getConfigDirectory(Configuration config) {
        return getConfigDirectory(config.m_voltdbRoot);
    }

    private File getConfigDirectory(File voltdbroot) {
        return new VoltFile(voltdbroot, Constants.CONFIG_DIR);
    }

    private File getConfigLogDeployment() {
        return getConfigLogDeployment(m_config);
    }

    private File getConfigLogDeployment(Configuration config) {
        return new VoltFile(getConfigDirectory(config), "deployment.xml");
    }

    @Override
    public LicenseApi getLicenseApi() {
        return m_licenseApi;
    }

    @Override
    public String getLicenseInformation() {
        return m_licenseInformation;
    }

    @Override
    public String getVoltDBRootPath(PathsType.Voltdbroot path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.getVoltDBRoot().getPath();
    }

    @Override
    public String getCommandLogPath(PathsType.Commandlog path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.resolve(m_nodeSettings.getCommandLog()).getPath();
    }

    @Override
    public String getCommandLogSnapshotPath(PathsType.Commandlogsnapshot path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.resolve(m_nodeSettings.getCommandLogSnapshot()).getPath();
    }

    @Override
    public String getSnapshotPath(PathsType.Snapshots path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.resolve(m_nodeSettings.getSnapshoth()).getPath();
    }

    @Override
    public String getExportOverflowPath(PathsType.Exportoverflow path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.resolve(m_nodeSettings.getExportOverflow()).getPath();
    }

    @Override
    public String getDROverflowPath(PathsType.Droverflow path) {
        if (isRunningWithOldVerbs()) {
            return path.getPath();
        }
        return m_nodeSettings.resolve(m_nodeSettings.getDROverflow()).getPath();
    }

    @Override
    public String getVoltDBRootPath() {
        return m_nodeSettings.getVoltDBRoot().getPath();
    }

    @Override
    public String getCommandLogPath() {
        return m_nodeSettings.resolve(m_nodeSettings.getCommandLog()).getPath();
    }

    @Override
    public String getCommandLogSnapshotPath() {
        return m_nodeSettings.resolve(m_nodeSettings.getCommandLogSnapshot()).getPath();
    }

    @Override
    public String getSnapshotPath() {
        return m_nodeSettings.resolve(m_nodeSettings.getSnapshoth()).getPath();
    }

    @Override
    public String getExportOverflowPath() {
        return m_nodeSettings.resolve(m_nodeSettings.getExportOverflow()).getPath();
    }

    @Override
    public String getDROverflowPath() {
        return m_nodeSettings.resolve(m_nodeSettings.getDROverflow()).getPath();
    }

    public static String getStagedCatalogPath(String voltDbRoot) {
        return voltDbRoot + File.separator + CatalogUtil.STAGED_CATALOG_PATH;
    }

    private String managedPathEmptyCheck(String voltDbRoot, String path) {
        VoltFile managedPath;
        if (new File(path).isAbsolute())
            managedPath = new VoltFile(path);
        else
            managedPath = new VoltFile(voltDbRoot, path);
        if (managedPath.exists() && managedPath.canRead() && managedPath.list().length > 0)
            return managedPath.getAbsolutePath();
        return null;
    }
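
    // Illustrative behavior of managedPathEmptyCheck (hypothetical paths): with
    // voltDbRoot = "/opt/voltdb/voltdbroot", a relative path "snapshots" is checked as
    // "/opt/voltdb/voltdbroot/snapshots", while an absolute "/mnt/snapshots" is checked
    // as-is. The method returns the absolute path only when the directory exists, is
    // readable, and contains at least one entry; otherwise it returns null, meaning
    // nothing is left over from a previous session at that location.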
    private void managedPathsEmptyCheck(Configuration config) {
        List<String> nonEmptyPaths = managedPathsWithFiles(config, m_catalogContext.getDeployment());
        if (!nonEmptyPaths.isEmpty()) {
            StringBuilder crashMessage =
                    new StringBuilder("Files from a previous database session exist in the managed directories:");
            for (String nonEmptyPath : nonEmptyPaths) {
                crashMessage.append("\n - " + nonEmptyPath);
            }
            if (config.m_startAction.isLegacy()) {
                crashMessage.append("\nUse the recover command to restore the previous database or use create --force" +
                        " to start a new database session overwriting existing files.");
            } else {
                crashMessage.append("\nUse start to restore the previous database or use init --force" +
                        " to start a new database session overwriting existing files.");
            }
            VoltDB.crashLocalVoltDB(crashMessage.toString());
        }
    }

    private List<String> managedPathsWithFiles(Configuration config, DeploymentType deployment) {
        ImmutableList.Builder<String> nonEmptyPaths = ImmutableList.builder();
        PathsType paths = deployment.getPaths();
        String voltDbRoot = getVoltDBRootPath(paths.getVoltdbroot());
        String path;
        if (!config.m_isEnterprise) {
            return nonEmptyPaths.build();
        }
        if ((path = managedPathEmptyCheck(voltDbRoot, getSnapshotPath(paths.getSnapshots()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getExportOverflowPath(paths.getExportoverflow()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getDROverflowPath(paths.getDroverflow()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getCommandLogPath(paths.getCommandlog()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getCommandLogSnapshotPath(paths.getCommandlogsnapshot()))) != null)
            nonEmptyPaths.add(path);
        return nonEmptyPaths.build();
    }

    private final List<String> pathsWithRecoverableArtifacts(DeploymentType deployment) {
        ImmutableList.Builder<String> nonEmptyPaths = ImmutableList.builder();
        if (!MiscUtils.isPro()) {
            return nonEmptyPaths.build();
        }
        PathsType paths = deployment.getPaths();
        String voltDbRoot = getVoltDBRootPath(paths.getVoltdbroot());
        String path;
        if ((path = managedPathEmptyCheck(voltDbRoot, getSnapshotPath(paths.getSnapshots()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getCommandLogPath(paths.getCommandlog()))) != null)
            nonEmptyPaths.add(path);
        if ((path = managedPathEmptyCheck(voltDbRoot, getCommandLogSnapshotPath(paths.getCommandlogsnapshot()))) != null)
            nonEmptyPaths.add(path);
        return nonEmptyPaths.build();
    }

    private int outputDeployment(Configuration config) {
        try {
            File configInfoDir = new VoltFile(config.m_voltdbRoot, Constants.CONFIG_DIR);
            File depFH = new VoltFile(configInfoDir, "deployment.xml");
            if (!depFH.isFile() || !depFH.canRead()) {
                consoleLog.fatal("Failed to get configuration or deployment configuration is invalid. "
                        + depFH.getAbsolutePath());
                return -1;
            }
            config.m_pathToDeployment = depFH.getCanonicalPath();
        } catch (IOException e) {
            consoleLog.fatal("Failed to read deployment: " + e.getMessage());
            return -1;
        }

        ReadDeploymentResults readDepl = readPrimedDeployment(config);
        try {
            DeploymentType dt = CatalogUtil.updateRuntimeDeploymentPaths(readDepl.deployment);
            // We don't have catalog context so host count is not there.
            String out;
            if ((out = CatalogUtil.getDeployment(dt, true)) != null) {
                if ((new File(config.m_getOutput)).exists() && !config.m_forceGetCreate) {
                    consoleLog.fatal("Failed to save deployment, file already exists: " + config.m_getOutput);
                    return -1;
                }
                try (FileOutputStream fos = new FileOutputStream(config.m_getOutput.trim())) {
                    fos.write(out.getBytes());
                } catch (IOException e) {
                    consoleLog.fatal("Failed to write deployment to " + config.m_getOutput + " : " + e.getMessage());
                    return -1;
                }
                consoleLog.info("Deployment configuration saved in " + config.m_getOutput.trim());
            } else {
                consoleLog.fatal("Failed to get configuration or deployment configuration is invalid.");
                return -1;
            }
        } catch (Exception e) {
            consoleLog.fatal("Failed to get configuration or deployment configuration is invalid. "
                    + "Please make sure voltdbroot is a valid directory. " + e.getMessage());
" + e.getMessage()); return -1; } return 0; } private int outputSchema(Configuration config) { if ((new File(config.m_getOutput)).exists() && !config.m_forceGetCreate) { consoleLog.fatal("Failed to save schema file, file already exists: " + config.m_getOutput); return -1; } try { InMemoryJarfile catalogJar = CatalogUtil.loadInMemoryJarFile(MiscUtils.fileToBytes(new File (config.m_pathToCatalog))); String ddl = CatalogUtil.getAutoGenDDLFromJar(catalogJar); try (FileOutputStream fos = new FileOutputStream(config.m_getOutput.trim())){ fos.write(ddl.getBytes()); } catch (IOException e) { consoleLog.fatal("Failed to write schema to " + config.m_getOutput + " : " + e.getMessage()); return -1; } consoleLog.info("Schema saved in " + config.m_getOutput.trim()); } catch (IOException e) { consoleLog.fatal("Failed to load the catalog jar from " + config.m_pathToCatalog + " : " + e.getMessage()); return -1; } return 0; } private int outputProcedures(Configuration config) { File outputFile = new File(config.m_getOutput); if (outputFile.exists() && !config.m_forceGetCreate) { consoleLog.fatal("Failed to save classes, file already exists: " + config.m_getOutput); return -1; } try { InMemoryJarfile catalogJar = CatalogUtil.loadInMemoryJarFile(MiscUtils.fileToBytes(new File (config.m_pathToCatalog))); InMemoryJarfile filteredJar = CatalogUtil.getCatalogJarWithoutDefaultArtifacts(catalogJar); filteredJar.writeToFile(outputFile); consoleLog.info("Classes saved in " + outputFile.getPath()); } catch (IOException e) { consoleLog.fatal("Failed to read classes " + config.m_pathToCatalog + " : " + e.getMessage()); return -1; } return 0; } @Override public void cli(Configuration config) { if (config.m_startAction != StartAction.GET) { System.err.println("This can only be called for GET action."); VoltDB.exit(-1); } if (!config.m_voltdbRoot.exists() || !config.m_voltdbRoot.canRead() || !config.m_voltdbRoot.canExecute() || !config.m_voltdbRoot.isDirectory()) { try { System.err.println("FATAL: Invalid Voltdbroot directory: " + config.m_voltdbRoot.getCanonicalPath()); } catch (IOException ex) { //Ignore; } VoltDB.exit(-1); } // Handle multiple invocations of server thread in the same JVM. // by clearing static variables/properties which ModuleManager, // and Settings depend on ConfigFactory.clearProperty(Settings.CONFIG_DIR); int returnStatus = -1;; switch (config.m_getOption) { case DEPLOYMENT: returnStatus = outputDeployment(config); break; case SCHEMA: returnStatus = outputSchema(config); break; case CLASSES: returnStatus = outputProcedures(config); break; } VoltDB.exit(returnStatus); } /** * Initialize all the global components, then initialize all the m_sites. * @param config configuration that gets passed in from commandline. */ @Override public void initialize(Configuration config) { hostLog.info("PID of this Volt process is " + CLibrary.getpid()); ShutdownHooks.enableServerStopLogging(); synchronized(m_startAndStopLock) { exitAfterMessage = false; // Handle multiple invocations of server thread in the same JVM. // by clearing static variables/properties which ModuleManager, // and Settings depend on ConfigFactory.clearProperty(Settings.CONFIG_DIR); ModuleManager.resetCacheRoot(); CipherExecutor.SERVER.shutdown(); m_isRunningWithOldVerb = config.m_startAction.isLegacy(); // check that this is a 64 bit VM if (System.getProperty("java.vm.name").contains("64") == false) { hostLog.fatal("You are running on an unsupported (probably 32 bit) JVM. Exiting."); System.exit(-1); } // print the ascii art!. 
    /**
     * Initialize all the global components, then initialize all the m_sites.
     * @param config configuration that gets passed in from commandline.
     */
    @Override
    public void initialize(Configuration config) {
        hostLog.info("PID of this Volt process is " + CLibrary.getpid());
        ShutdownHooks.enableServerStopLogging();
        synchronized (m_startAndStopLock) {
            exitAfterMessage = false;

            // Handle multiple invocations of server thread in the same JVM
            // by clearing static variables/properties which ModuleManager
            // and Settings depend on.
            ConfigFactory.clearProperty(Settings.CONFIG_DIR);
            ModuleManager.resetCacheRoot();
            CipherExecutor.SERVER.shutdown();

            m_isRunningWithOldVerb = config.m_startAction.isLegacy();

            // check that this is a 64 bit VM
            if (System.getProperty("java.vm.name").contains("64") == false) {
                hostLog.fatal("You are running on an unsupported (probably 32 bit) JVM. Exiting.");
                System.exit(-1);
            }

            // print the ascii art!
            // determine the edition
            // check license availability
            // all of the above - not for init
            String edition = "Community Edition";
            if (config.m_startAction != StartAction.INITIALIZE) {
                consoleLog.l7dlog(Level.INFO, LogKeys.host_VoltDB_StartupString.name(), null);

                // load license API
                if (config.m_pathToLicense == null) {
                    m_licenseApi = MiscUtils.licenseApiFactory();
                    if (m_licenseApi == null) {
                        hostLog.fatal("Unable to open license file in default directories");
                    }
                } else {
                    m_licenseApi = MiscUtils.licenseApiFactory(config.m_pathToLicense);
                    if (m_licenseApi == null) {
                        hostLog.fatal("Unable to open license file in provided path: " + config.m_pathToLicense);
                    }
                }
                if (m_licenseApi == null) {
                    hostLog.fatal("Please contact sales@voltdb.com to request a license.");
                    VoltDB.crashLocalVoltDB(
                            "Failed to initialize license verifier. " +
                            "See previous log message for details.", false, null);
                }

                if (config.m_isEnterprise) {
                    if (m_licenseApi.isEnterprise()) edition = "Enterprise Edition";
                    if (m_licenseApi.isPro()) edition = "Pro Edition";
                    if (m_licenseApi.isTrial()) edition = "Enterprise Edition";
                    if (m_licenseApi.isAWSMarketplace()) edition = "AWS Marketplace Pro Edition";
                }

                // this also prints out the license type on the console
                readBuildInfo(edition);

                // print out the licensee on the license
                if (config.m_isEnterprise) {
                    String licensee = m_licenseApi.licensee();
                    if ((licensee != null) && (licensee.length() > 0)) {
                        consoleLog.info(String.format("Licensed to: %s", licensee));
                    }
                }
            }

            // Replay command line args that we can see
            StringBuilder sb = new StringBuilder(2048).append("Command line arguments: ");
            sb.append(System.getProperty("sun.java.command", "[not available]"));
            hostLog.info(sb.toString());

            List<String> iargs = ManagementFactory.getRuntimeMXBean().getInputArguments();
            sb.delete(0, sb.length()).append("Command line JVM arguments:");
            for (String iarg : iargs) {
                sb.append(" ").append(iarg);
            }
            if (iargs.size() > 0) {
                hostLog.info(sb.toString());
            } else {
                hostLog.info("No JVM command line args known.");
            }

            sb.delete(0, sb.length()).append("Command line JVM classpath: ");
            sb.append(System.getProperty("java.class.path", "[not available]"));
            hostLog.info(sb.toString());

            if (config.m_startAction == StartAction.INITIALIZE) {
                if (config.m_forceVoltdbCreate) {
                    deleteInitializationMarkers(config);
                }
            }

            // If there's no deployment provide a default and put it under voltdbroot.
            if (config.m_pathToDeployment == null) {
                try {
                    config.m_pathToDeployment = setupDefaultDeployment(hostLog, config.m_voltdbRoot);
                    config.m_deploymentDefault = true;
                } catch (IOException e) {
                    VoltDB.crashLocalVoltDB("Failed to write default deployment.", false, null);
                    return;
                }
            }

            ReadDeploymentResults readDepl = readPrimedDeployment(config);

            if (config.m_startAction == StartAction.INITIALIZE) {
                if (config.m_forceVoltdbCreate && m_nodeSettings.clean()) {
                    String msg = "Archived previous snapshot directory to " + m_nodeSettings.getSnapshoth() + ".1";
                    consoleLog.info(msg);
                    hostLog.info(msg);
                }
                stageDeploymentFileForInitialize(config, readDepl.deployment);
                stageSchemaFiles(config);
                stageInitializedMarker(config);
                hostLog.info("Initialized VoltDB root directory " + config.m_voltdbRoot.getPath());
                consoleLog.info("Initialized VoltDB root directory " + config.m_voltdbRoot.getPath());
                VoltDB.exit(0);
            }

            if (config.m_startAction.isLegacy()) {
                consoleLog.warn("The \"" + config.m_startAction.m_verb
                        + "\" command is deprecated, please use \"init\" and \"start\" for your cluster operations.");
            }

            // config UUID is part of the status tracker.
            m_statusTracker = new NodeStateTracker();
            final File stagedCatalogLocation =
                    new VoltFile(RealVoltDB.getStagedCatalogPath(config.m_voltdbRoot.getAbsolutePath()));

            if (config.m_startAction.isLegacy()) {
                File rootFH = CatalogUtil.getVoltDbRoot(readDepl.deployment.getPaths());
                File inzFH = new VoltFile(rootFH, VoltDB.INITIALIZED_MARKER);
                if (inzFH.exists()) {
                    VoltDB.crashLocalVoltDB("Cannot use legacy start action " + config.m_startAction
                            + " on voltdbroot " + rootFH + " that was initialized with the init command");
                    return;
                }
                // Case where you give a primed deployment with -d: look in ../../ for the initialized marker.
                // Also check whether the parents are config and voltdbroot.
                File cfile = (new File(config.m_pathToDeployment)).getParentFile();
                if (cfile != null) {
                    rootFH = cfile.getParentFile();
                    if ("config".equals(cfile.getName()) && VoltDB.DBROOT.equals(rootFH.getName())) {
                        inzFH = new VoltFile(rootFH, VoltDB.INITIALIZED_MARKER);
                        if (inzFH.exists()) {
                            VoltDB.crashLocalVoltDB("Cannot use legacy start action " + config.m_startAction
                                    + " on voltdbroot " + rootFH + " that was initialized with the init command");
                            return;
                        }
                    }
                }
                if (stagedCatalogLocation.isFile()) {
                    hostLog.warn("Initialized schema is present, but is being ignored and may be removed.");
                }
            } else {
                assert (config.m_startAction == StartAction.PROBE);
                if (stagedCatalogLocation.isFile()) {
                    assert (config.m_pathToCatalog == null) : config.m_pathToCatalog;
                    config.m_pathToCatalog = stagedCatalogLocation.getAbsolutePath();
                }
            }

            List<String> failed = m_nodeSettings.ensureDirectoriesExist();
            if (!failed.isEmpty()) {
                String msg = "Unable to access or create the following directories:\n - "
                        + Joiner.on("\n - ").join(failed);
                VoltDB.crashLocalVoltDB(msg);
                return;
            }

            if (config.m_hostCount == VoltDB.UNDEFINED) {
                config.m_hostCount = readDepl.deployment.getCluster().getHostcount();
            }

            // set the mode first thing
            m_mode = OperationMode.INITIALIZING;
            m_config = config;
            m_startMode = null;

            // set a bunch of things to null/empty/new for tests
            // which reuse the process
            m_safeMpTxnId = Long.MAX_VALUE;
            m_lastSeenMpTxnId = Long.MIN_VALUE;
            m_clientInterface = null;
            m_adminListener = null;
            m_commandLog = new DummyCommandLog();
            m_snmp = new DummySnmpTrapSender();
            m_messenger = null;
            m_opsRegistrar = new OpsRegistrar();
            m_asyncCompilerAgent = null;
            m_snapshotCompletionMonitor = null;
            m_catalogContext = null;
            m_partitionCountStats = null;
            m_ioStats = null;
            m_memoryStats = null;
            m_commandLogStats = null;
            m_statsManager = null;
            m_restoreAgent = null;
            m_recoveryStartTime = System.currentTimeMillis();
            m_hostIdWithStartupCatalog = 0;
            m_pathToStartupCatalog = m_config.m_pathToCatalog;
            m_replicationActive = new AtomicBoolean(false);
            m_configLogger = null;
            ActivePlanRepository.clear();

            updateMaxThreadsLimit();

            // set up site structure
            final int computationThreads = Math.max(2, CoreUtils.availableProcessors() / 4);
            m_computationService =
                    CoreUtils.getListeningExecutorService(
                            "Computation service thread",
                            computationThreads,
                            m_config.m_computationCoreBindings);
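
            // Worked example for the sizing above (illustrative): on a 16-core host,
            // 16 / 4 = 4 computation threads; on a 4-core host, 4 / 4 = 1 would be too
            // few, so the Math.max floor keeps it at 2.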
Exiting."); VoltDB.exit(-1); } m_snapshotCompletionMonitor = new SnapshotCompletionMonitor(); // use CLI overrides for testing hotfix version compatibility if (m_config.m_versionStringOverrideForTest != null) { m_versionString = m_config.m_versionStringOverrideForTest; } if (m_config.m_versionCompatibilityRegexOverrideForTest != null) { m_hotfixableRegexPattern = m_config.m_versionCompatibilityRegexOverrideForTest; } if (m_config.m_buildStringOverrideForTest != null) { m_buildString = m_config.m_buildStringOverrideForTest; } // Prime cluster settings from configuration parameters // evaluate properties with the following sources in terms of priority // 1) properties from command line options // 2) properties from the cluster.properties files // 3) properties from the deployment file // this reads the file config/cluster.properties ClusterSettings fromPropertyFile = ClusterSettings.create(); // handle case we recover clusters that were elastically expanded if (m_config.m_startAction.doesRecover()) { m_config.m_hostCount = fromPropertyFile.hostcount(); } Map<String, String> fromCommandLine = m_config.asClusterSettingsMap(); Map<String, String> fromDeploymentFile = CatalogUtil. asClusterSettingsMap(readDepl.deployment); ClusterSettings clusterSettings = ClusterSettings.create( fromCommandLine, fromPropertyFile.asMap(), fromDeploymentFile); // persist the merged settings clusterSettings.store(); m_clusterSettings.set(clusterSettings, 1); MeshProber.Determination determination = buildClusterMesh(readDepl); if (m_config.m_startAction == StartAction.PROBE) { String action = "Starting a new database cluster"; if (determination.startAction.doesRejoin()) { action = "Rejoining a running cluster"; } else if (determination.startAction == StartAction.JOIN) { action = "Adding this node to a running cluster"; } else if (determination.startAction.doesRecover()) { action = "Restarting the database cluster from the command logs"; } hostLog.info(action); consoleLog.info(action); } m_config.m_startAction = determination.startAction; m_config.m_hostCount = determination.hostCount; m_terminusNonce = determination.terminusNonce; // determine if this is a rejoining node // (used for license check and later the actual rejoin) m_rejoining = m_config.m_startAction.doesRejoin(); m_rejoinDataPending = m_config.m_startAction.doesJoin(); m_joining = m_config.m_startAction == StartAction.JOIN; if (m_rejoining || m_joining) { m_statusTracker.setNodeState(NodeState.REJOINING); } //Register dummy agents immediately m_opsRegistrar.registerMailboxes(m_messenger); //Start validating the build string in the background final Future<?> buildStringValidation = validateBuildString(getBuildString(), m_messenger.getZK()); // race to create start action nodes and then verify theirs compatibility. m_messenger.getZK().create(VoltZK.start_action, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, new ZKUtil.StringCallback(), null); VoltZK.createStartActionNode(m_messenger.getZK(), m_messenger.getHostId(), m_config.m_startAction); validateStartAction(); // durable means commandlogging is enabled. boolean durable = readDeploymentAndCreateStarterCatalogContext(config); if (config.m_isEnterprise && m_config.m_startAction.doesRequireEmptyDirectories() && !config.m_forceVoltdbCreate && durable) { managedPathsEmptyCheck(config); } //If we are not durable and we are not rejoining we backup auto snapshots if present. //If terminus is present we will recover from shutdown save so dont move. 
            MeshProber.Determination determination = buildClusterMesh(readDepl);
            if (m_config.m_startAction == StartAction.PROBE) {
                String action = "Starting a new database cluster";
                if (determination.startAction.doesRejoin()) {
                    action = "Rejoining a running cluster";
                } else if (determination.startAction == StartAction.JOIN) {
                    action = "Adding this node to a running cluster";
                } else if (determination.startAction.doesRecover()) {
                    action = "Restarting the database cluster from the command logs";
                }
                hostLog.info(action);
                consoleLog.info(action);
            }

            m_config.m_startAction = determination.startAction;
            m_config.m_hostCount = determination.hostCount;
            m_terminusNonce = determination.terminusNonce;

            // determine if this is a rejoining node
            // (used for license check and later the actual rejoin)
            m_rejoining = m_config.m_startAction.doesRejoin();
            m_rejoinDataPending = m_config.m_startAction.doesJoin();

            m_joining = m_config.m_startAction == StartAction.JOIN;

            if (m_rejoining || m_joining) {
                m_statusTracker.setNodeState(NodeState.REJOINING);
            }

            // Register dummy agents immediately
            m_opsRegistrar.registerMailboxes(m_messenger);

            // Start validating the build string in the background
            final Future<?> buildStringValidation = validateBuildString(getBuildString(), m_messenger.getZK());

            // race to create start action nodes and then verify their compatibility.
            m_messenger.getZK().create(VoltZK.start_action, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT,
                    new ZKUtil.StringCallback(), null);
            VoltZK.createStartActionNode(m_messenger.getZK(), m_messenger.getHostId(), m_config.m_startAction);
            validateStartAction();

            // durable means command logging is enabled.
            boolean durable = readDeploymentAndCreateStarterCatalogContext(config);
            if (config.m_isEnterprise && m_config.m_startAction.doesRequireEmptyDirectories()
                    && !config.m_forceVoltdbCreate && durable) {
                managedPathsEmptyCheck(config);
            }

            // If we are not durable and we are not rejoining, we back up auto snapshots if present.
            // If terminus is present we will recover from the shutdown save, so don't move it.
            if (!durable && m_config.m_startAction.doesRecover() && determination.terminusNonce == null) {
                if (m_nodeSettings.clean()) {
                    String msg = "Archiving old snapshots to " + m_nodeSettings.getSnapshoth()
                            + ".1 and starting an empty database."
                            + " Use voltadmin restore if you wish to restore an old database instance.";
                    consoleLog.info(msg);
                    hostLog.info(msg);
                }
            }

            // wait to make sure every host actually *sees* every other host's ZK node state.
            final int numberOfNodes = m_messenger.getLiveHostIds().size();
            Map<Integer, HostInfo> hostInfos = m_messenger.waitForGroupJoin(numberOfNodes);
            Map<Integer, String> hostGroups = Maps.newHashMap();
            Map<Integer, Integer> sitesPerHostMap = Maps.newHashMap();
            hostInfos.forEach((k, v) -> {
                hostGroups.put(k, v.m_group);
                sitesPerHostMap.put(k, v.m_localSitesCount);
            });
            if (m_messenger.isPaused() || m_config.m_isPaused) {
                setStartMode(OperationMode.PAUSED);
            }

            // Create the thread pool here. It's needed by buildClusterMesh()
            m_periodicWorkThread =
                    CoreUtils.getScheduledThreadPoolExecutor("Periodic Work", 1, CoreUtils.SMALL_STACK_SIZE);
            m_periodicPriorityWorkThread =
                    CoreUtils.getScheduledThreadPoolExecutor("Periodic Priority Work", 1, CoreUtils.SMALL_STACK_SIZE);

            Class<?> snapshotIOAgentClass = MiscUtils.loadProClass("org.voltdb.SnapshotIOAgentImpl", "Snapshot", true);
            if (snapshotIOAgentClass != null) {
                try {
                    m_snapshotIOAgent = (SnapshotIOAgent) snapshotIOAgentClass
                            .getConstructor(HostMessenger.class, long.class)
                            .newInstance(m_messenger, m_messenger.getHSIdForLocalSite(HostMessenger.SNAPSHOT_IO_AGENT_ID));
                    m_messenger.createMailbox(m_snapshotIOAgent.getHSId(), m_snapshotIOAgent);
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Failed to instantiate snapshot IO agent", true, e);
                }
            }

            m_asyncCompilerAgent = new AsyncCompilerAgent(m_licenseApi);

            try {
                SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM d, yyyy");
                JSONObject jo = new JSONObject();
                jo.put("trial", m_licenseApi.isTrial());
                jo.put("hostcount", m_licenseApi.maxHostcount());
                jo.put("commandlogging", m_licenseApi.isCommandLoggingAllowed());
                jo.put("wanreplication", m_licenseApi.isDrReplicationAllowed());
                jo.put("expiration", sdf.format(m_licenseApi.expires().getTime()));
                m_licenseInformation = jo.toString();
            } catch (JSONException ex) {
                // Ignore
            }
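
            // Shape of the summary built above (hypothetical values):
            //   {"trial":false,"hostcount":5,"commandlogging":true,
            //    "wanreplication":true,"expiration":"Mon Jan 1, 2018"}
            // It is kept as a string in m_licenseInformation for reporting via
            // getLicenseInformation().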
            // Create the GlobalServiceElector. Do this here so we can register the MPI with it
            // when we construct it below.
            m_globalServiceElector = new GlobalServiceElector(m_messenger.getZK(), m_messenger.getHostId());
            // Start the GlobalServiceElector. Not sure where this will actually belong.
            try {
                m_globalServiceElector.start();
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("Unable to start GlobalServiceElector", true, e);
            }

            // Always create a mailbox for elastic join data transfer
            if (m_config.m_isEnterprise) {
                long elasticHSId = m_messenger.getHSIdForLocalSite(HostMessenger.REBALANCE_SITE_ID);
                m_messenger.createMailbox(elasticHSId, new SiteMailbox(m_messenger, elasticHSId));
            }

            if (m_joining) {
                Class<?> elasticJoinCoordClass =
                        MiscUtils.loadProClass("org.voltdb.join.ElasticJoinNodeCoordinator", "Elastic", false);
                try {
                    Constructor<?> constructor = elasticJoinCoordClass.getConstructor(HostMessenger.class, String.class);
                    m_joinCoordinator = (JoinCoordinator) constructor.newInstance(m_messenger,
                            VoltDB.instance().getVoltDBRootPath());
                    m_messenger.registerMailbox(m_joinCoordinator);
                    m_joinCoordinator.initialize(m_catalogContext.getDeployment().getCluster().getKfactor());
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Failed to instantiate join coordinator", true, e);
                }
            }

            /*
             * Construct all the mailboxes for things that need to be globally addressable so they can be published
             * in one atomic shot.
             *
             * The starting state for partition assignments is statically derived from the host id generated
             * by host messenger and the k-factor/host count/sites per host. This starting state
             * is published to ZK as the topology metadata node.
             *
             * On join and rejoin the node has to inspect the topology meta node to find out what is missing
             * and then update the topology listing itself as the replica for those partitions.
             * Then it does a compare and set of the topology.
             *
             * Ning: topology may not reflect the true partitions in the cluster during join. So if another node
             * is trying to rejoin, it should rely on the cartographer's view to pick the partitions to replace.
             */
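
            // For orientation (standard VoltDB sizing, illustrative numbers): the
            // initial topology yields roughly
            //     partitions = (hostCount * sitesPerHost) / (kfactor + 1)
            // e.g. 3 hosts with 8 sites each and k=1 gives 12 partitions, each with
            // 2 replicas.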
            AbstractTopology topo = getTopology(config.m_startAction, hostGroups, sitesPerHostMap, m_joinCoordinator);
            m_partitionsToSitesAtStartupForExportInit = new ArrayList<>();
            try {
                // IV2 mailbox stuff
                m_configuredReplicationFactor = topo.getReplicationFactor();
                m_cartographer = new Cartographer(m_messenger, m_configuredReplicationFactor,
                        m_catalogContext.cluster.getNetworkpartition());
                m_partitionZeroLeader = new Supplier<Boolean>() {
                    @Override
                    public Boolean get() {
                        return m_cartographer.isPartitionZeroLeader();
                    }
                };
                List<Integer> partitions = null;
                if (m_rejoining) {
                    m_configuredNumberOfPartitions = m_cartographer.getPartitionCount();
                    partitions = recoverPartitions(topo, hostGroups.get(m_messenger.getHostId()));
                    if (partitions == null) {
                        partitions = m_cartographer.getIv2PartitionsToReplace(m_configuredReplicationFactor,
                                m_catalogContext.getNodeSettings().getLocalSitesCount(),
                                m_messenger.getHostId(),
                                hostGroups);
                    }
                    if (partitions.size() == 0) {
                        VoltDB.crashLocalVoltDB("The VoltDB cluster already has enough nodes to satisfy "
                                + "the requested k-safety factor of "
                                + m_configuredReplicationFactor + ".\n"
                                + "No more nodes can join.", false, null);
                    }
                } else {
                    m_configuredNumberOfPartitions = topo.getPartitionCount();
                    partitions = topo.getPartitionIdList(m_messenger.getHostId());
                }
                for (int ii = 0; ii < partitions.size(); ii++) {
                    Integer partition = partitions.get(ii);
                    m_iv2InitiatorStartingTxnIds.put(partition, TxnEgo.makeZero(partition).getTxnId());
                }
                m_iv2Initiators = createIv2Initiators(
                        partitions,
                        m_config.m_startAction,
                        m_partitionsToSitesAtStartupForExportInit);
                m_iv2InitiatorStartingTxnIds.put(
                        MpInitiator.MP_INIT_PID,
                        TxnEgo.makeZero(MpInitiator.MP_INIT_PID).getTxnId());
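
                // A note on the starting txn ids above (sketch, based on TxnEgo's
                // encoding): a TxnEgo packs a per-partition sequence number together
                // with the partition id into a single 64-bit txn id, so makeZero(p)
                // yields the first id in partition p's sequence. Seeding every local
                // partition (and the MP pseudo-partition) with its zero id gives
                // replay and rejoin a well-defined starting point.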
                // Pass the local HSIds to the MPI so it can farm out buddy sites
                // to the RO MP site pool
                List<Long> localHSIds = new ArrayList<>();
                for (Initiator ii : m_iv2Initiators.values()) {
                    localHSIds.add(ii.getInitiatorHSId());
                }
                m_MPI = new MpInitiator(m_messenger, localHSIds, getStatsAgent());
                m_iv2Initiators.put(MpInitiator.MP_INIT_PID, m_MPI);

                // Make a list of HSIds to join
                Map<Integer, Long> partsToHSIdsToRejoin = new HashMap<>();
                for (Initiator init : m_iv2Initiators.values()) {
                    if (init.isRejoinable()) {
                        partsToHSIdsToRejoin.put(init.getPartitionId(), init.getInitiatorHSId());
                    }
                }
                OnDemandBinaryLogger.path = VoltDB.instance().getVoltDBRootPath();
                if (m_rejoining) {
                    SnapshotSaveAPI.recoveringSiteCount.set(partsToHSIdsToRejoin.size());
                    hostLog.info("Set recovering site count to " + partsToHSIdsToRejoin.size());

                    m_joinCoordinator = new Iv2RejoinCoordinator(m_messenger, partsToHSIdsToRejoin.values(),
                            VoltDB.instance().getVoltDBRootPath(),
                            m_config.m_startAction == StartAction.LIVE_REJOIN);
                    m_joinCoordinator.initialize(m_configuredReplicationFactor);
                    m_messenger.registerMailbox(m_joinCoordinator);
                    if (m_config.m_startAction == StartAction.LIVE_REJOIN) {
                        hostLog.info("Using live rejoin.");
                    } else {
                        hostLog.info("Using blocking rejoin.");
                    }
                } else if (m_joining) {
                    m_joinCoordinator.setPartitionsToHSIds(partsToHSIdsToRejoin);
                }
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
            }

            // do the many init tasks in the Inits class
            Inits inits = new Inits(m_statusTracker, this, 1, durable);
            inits.doInitializationWork();

            // Need the catalog so that we know how many tables there are and can guess at the
            // necessary heap size. This is done under Inits.doInitializationWork(), so we need
            // to wait until we get here. The current calculation needs pro/community knowledge,
            // the number of tables, and the sites/host, which is the number of initiators
            // (minus the possibly idle MPI initiator).
            checkHeapSanity(MiscUtils.isPro(), m_catalogContext.tables.size(),
                    (m_iv2Initiators.size() - 1), m_configuredReplicationFactor);

            if (m_joining && getReplicationRole() == ReplicationRole.REPLICA) {
                VoltDB.crashLocalVoltDB("Elastic join is prohibited on a replica cluster.", false, null);
            }

            collectLocalNetworkMetadata();

            /*
             * Construct an adhoc planner for the initial catalog
             */
            final CatalogSpecificPlanner csp = new CatalogSpecificPlanner(m_asyncCompilerAgent, m_catalogContext);

            // Initialize stats
            m_ioStats = new IOStats();
            getStatsAgent().registerStatsSource(StatsSelector.IOSTATS, 0, m_ioStats);
            m_memoryStats = new MemoryStats();
            getStatsAgent().registerStatsSource(StatsSelector.MEMORY, 0, m_memoryStats);
            getStatsAgent().registerStatsSource(StatsSelector.TOPO, 0, m_cartographer);
            m_partitionCountStats = new PartitionCountStats(m_cartographer);
            getStatsAgent().registerStatsSource(StatsSelector.PARTITIONCOUNT, 0, m_partitionCountStats);
            m_initiatorStats = new InitiatorStats(m_myHostId);
            m_liveClientsStats = new LiveClientsStats();
            getStatsAgent().registerStatsSource(StatsSelector.LIVECLIENTS, 0, m_liveClientsStats);

            m_latencyStats = new LatencyStats();
            getStatsAgent().registerStatsSource(StatsSelector.LATENCY, 0, m_latencyStats);
            m_latencyCompressedStats = new LatencyHistogramStats(m_myHostId);
            getStatsAgent().registerStatsSource(StatsSelector.LATENCY_COMPRESSED, 0, m_latencyCompressedStats);
            m_latencyHistogramStats = new LatencyUncompressedHistogramStats(m_myHostId);
            getStatsAgent().registerStatsSource(StatsSelector.LATENCY_HISTOGRAM, 0, m_latencyHistogramStats);

            BalancePartitionsStatistics rebalanceStats = new BalancePartitionsStatistics();
            getStatsAgent().registerStatsSource(StatsSelector.REBALANCE, 0, rebalanceStats);

            KSafetyStats kSafetyStats = new KSafetyStats();
            getStatsAgent().registerStatsSource(StatsSelector.KSAFETY, 0, kSafetyStats);
            m_cpuStats = new CpuStats();
            getStatsAgent().registerStatsSource(StatsSelector.CPU, 0, m_cpuStats);
            m_gcStats = new GcStats();
            getStatsAgent().registerStatsSource(StatsSelector.GC, 0, m_gcStats);
            // ENG-6321
            m_commandLogStats = new CommandLogStats(m_commandLog);
            getStatsAgent().registerStatsSource(StatsSelector.COMMANDLOG, 0, m_commandLogStats);

            // Dummy DRCONSUMER stats
            replaceDRConsumerStatsWithDummy();

            /*
             * Initialize the command log on rejoin and join before configuring the IV2
             * initiators. This will prevent them from receiving transactions
             * which need logging before the internal file writers are
             * initialized. Root cause of ENG-4136.
             *
             * If sync command log is on, not initializing the command log before the initiators
             * are up would cause deadlock.
             */
            if ((m_commandLog != null) && (m_commandLog.needsInitialization())) {
                consoleLog.l7dlog(Level.INFO, LogKeys.host_VoltDB_StayTunedForLogging.name(), null);
            } else {
                consoleLog.l7dlog(Level.INFO, LogKeys.host_VoltDB_StayTunedForNoLogging.name(), null);
            }
            if (m_commandLog != null && (m_rejoining || m_joining)) {
                // On rejoin the starting IDs are all 0, so technically it will load any snapshot,
                // but the newest snapshot will always be the truncation snapshot taken after rejoin
                // completes, at which point the node will mark itself as actually recovered.
                //
                // Use the partition count from the cluster config instead of the cartographer
                // here. Since the initiators are not started yet, the cartographer still doesn't
                // know about the new partitions at this point.
                m_commandLog.initForRejoin(
                        m_catalogContext.cluster.getLogconfig().get("log").getLogsize(),
                        Long.MIN_VALUE,
                        m_configuredNumberOfPartitions,
                        true,
                        m_config.m_commandLogBinding,
                        m_iv2InitiatorStartingTxnIds);
            }

            // Create the client interface
            try {
                InetAddress clientIntf = null;
                InetAddress adminIntf = null;
                if (!m_config.m_externalInterface.trim().equals("")) {
                    clientIntf = InetAddress.getByName(m_config.m_externalInterface);
                    // client and admin interfaces are the same by default.
                    adminIntf = clientIntf;
                }
                // If the user has specified host:port on the command line, override the client and admin interfaces.
                if (m_config.m_clientInterface != null && m_config.m_clientInterface.trim().length() > 0) {
                    clientIntf = InetAddress.getByName(m_config.m_clientInterface);
                }
                if (m_config.m_adminInterface != null && m_config.m_adminInterface.trim().length() > 0) {
                    adminIntf = InetAddress.getByName(m_config.m_adminInterface);
                }
                m_clientInterface = ClientInterface.create(m_messenger, m_catalogContext, getReplicationRole(),
                        m_cartographer,
                        clientIntf,
                        config.m_port,
                        adminIntf,
                        config.m_adminPort,
                        m_config.m_sslContext);
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
            }
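
            // Interface selection in the block above, with hypothetical addresses: an
            // external-interface setting of 10.0.0.5 binds both the client and admin
            // listeners to 10.0.0.5; an additional admin-interface setting of 10.0.0.9
            // then moves only the admin listener there, while the client listener stays
            // on 10.0.0.5. With neither set, both InetAddress values stay null and the
            // listeners keep their default binding.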
            // DR overflow directory
            if (VoltDB.instance().getLicenseApi().isDrReplicationAllowed()) {
                try {
                    Class<?> ndrgwClass = null;
                    ndrgwClass = Class.forName("org.voltdb.dr2.DRProducer");
                    Constructor<?> ndrgwConstructor = ndrgwClass.getConstructor(File.class, File.class,
                            boolean.class, boolean.class, boolean.class, int.class, int.class);
                    m_producerDRGateway = (ProducerDRGateway) ndrgwConstructor.newInstance(
                            new VoltFile(VoltDB.instance().getDROverflowPath()),
                            new VoltFile(VoltDB.instance().getSnapshotPath()),
                            (m_config.m_startAction.doesRecover() && (durable || determination.terminusNonce != null)),
                            m_config.m_startAction.doesRejoin(),
                            m_replicationActive.get(),
                            m_configuredNumberOfPartitions,
                            (m_catalogContext.getClusterSettings().hostcount() - m_config.m_missingHostCount));
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Unable to load DR system", true, e);
                }
            } else {
                // set up empty stats for the DR Producer
                getStatsAgent().registerStatsSource(StatsSelector.DRPRODUCERNODE, 0,
                        new DRProducerStatsBase.DRProducerNodeStatsBase());
                getStatsAgent().registerStatsSource(StatsSelector.DRPRODUCERPARTITION, 0,
                        new DRProducerStatsBase.DRProducerPartitionStatsBase());
            }
            m_drRoleStats = new DRRoleStats(this);
            getStatsAgent().registerStatsSource(StatsSelector.DRROLE, 0, m_drRoleStats);

            /*
             * Configure and start all the IV2 sites
             */
            try {
                final String serializedCatalog = m_catalogContext.catalog.serialize();
                for (Initiator iv2init : m_iv2Initiators.values()) {
                    iv2init.configure(
                            getBackendTargetType(),
                            m_catalogContext,
                            serializedCatalog,
                            csp,
                            m_configuredNumberOfPartitions,
                            m_config.m_startAction,
                            getStatsAgent(),
                            m_memoryStats,
                            m_commandLog,
                            m_config.m_executionCoreBindings.poll(),
                            shouldInitiatorCreateMPDRGateway(iv2init));
                }

                // LeaderAppointer startup blocks if the initiators are not initialized,
                // so create the LeaderAppointer after the initiators.
                boolean expectSyncSnapshot = getReplicationRole() == ReplicationRole.REPLICA
                        && config.m_startAction == StartAction.CREATE;
                m_leaderAppointer = new LeaderAppointer(
                        m_messenger,
                        m_configuredNumberOfPartitions,
                        m_catalogContext.getDeployment().getCluster().getKfactor(),
                        topo.topologyToJSON(),
                        m_MPI,
                        kSafetyStats,
                        expectSyncSnapshot);
                m_globalServiceElector.registerService(m_leaderAppointer);
            } catch (Exception e) {
                Throwable toLog = e;
                if (e instanceof ExecutionException) {
                    toLog = ((ExecutionException) e).getCause();
                }
                VoltDB.crashLocalVoltDB("Error configuring IV2 initiator.", true, toLog);
            }

            // Create the statistics manager and register it to the JMX registry
            m_statsManager = null;
            try {
                final Class<?> statsManagerClass = MiscUtils.loadProClass("org.voltdb.management.JMXStatsManager", "JMX", true);
                if (statsManagerClass != null && !DISABLE_JMX) {
                    m_statsManager = (StatsManager) statsManagerClass.newInstance();
                    m_statsManager.initialize();
                }
            } catch (Exception e) {
                // JMXStatsManager will log and we continue.
            }

            try {
                m_snapshotCompletionMonitor.init(m_messenger.getZK());
            } catch (Exception e) {
                hostLog.fatal("Error initializing snapshot completion monitor", e);
                VoltDB.crashLocalVoltDB("Error initializing snapshot completion monitor", true, e);
            }

            /*
             * Make sure the build string was successfully validated
             * before continuing to do operations
             * that might return wrong answers or lose data.
             */
            try {
                buildStringValidation.get();
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("Failed to validate cluster build string", false, e);
            }

            // On elastic join, make sure all the joining nodes are ready
            // so that the secondary connections can be created.
            if (m_joining) {
                int expectedHosts = m_configuredReplicationFactor + 1;
                m_messenger.waitForJoiningHostsToBeReady(expectedHosts, this.m_myHostId);
            } else if (!m_rejoining) {
                // initial start or recover
                int expectedHosts = m_catalogContext.getClusterSettings().hostcount() - m_config.m_missingHostCount;
                m_messenger.waitForAllHostsToBeReady(expectedHosts);
            }

            // Create secondary connections within partition group
            createSecondaryConnections(m_rejoining);

            if (!m_joining && (m_cartographer.getPartitionCount()) != m_configuredNumberOfPartitions) {
                for (Map.Entry<Integer, ImmutableList<Long>> entry :
                        getSiteTrackerForSnapshot().m_partitionsToSitesImmutable.entrySet()) {
                    hostLog.info(entry.getKey() + " -- " + CoreUtils.hsIdCollectionToString(entry.getValue()));
                }
                VoltDB.crashGlobalVoltDB("Mismatch between configured number of partitions ("
                        + m_configuredNumberOfPartitions + ") and actual ("
                        + m_cartographer.getPartitionCount() + ")", true, null);
            }

            schedulePeriodicWorks();
            m_clientInterface.schedulePeriodicWorks();

            // print out a bunch of useful system info
            logDebuggingInfo(m_config.m_adminPort, m_config.m_httpPort, m_httpPortExtraLogMessage, m_jsonEnabled);

            // warn the user on the console if k=0 or if no command logging
            if (m_configuredReplicationFactor == 0) {
                consoleLog.warn("This is not a highly available cluster. K-Safety is set to 0.");
            }
K-Safety is set to 0."); } boolean usingCommandLog = m_config.m_isEnterprise && (m_catalogContext.cluster.getLogconfig() != null) && (m_catalogContext.cluster.getLogconfig().get("log") != null) && m_catalogContext.cluster.getLogconfig().get("log").getEnabled(); if (!usingCommandLog) { // figure out if using a snapshot schedule boolean usingPeridoicSnapshots = false; for (SnapshotSchedule ss : m_catalogContext.database.getSnapshotschedule()) { if (ss.getEnabled()) { usingPeridoicSnapshots = true; } } // print the right warning depending on durability settings if (usingPeridoicSnapshots) { consoleLog.warn("Durability is limited to periodic snapshots. Command logging is off."); } else { consoleLog.warn("Durability is turned off. Command logging is off."); } } // warn if cluster is partitionable, but partition detection is off if ((m_catalogContext.cluster.getNetworkpartition() == false) && (m_configuredReplicationFactor > 0)) { hostLog.warn("Running a redundant (k-safe) cluster with network " + "partition detection disabled is not recommended for production use."); // we decided not to include the stronger language below for the 3.0 version (ENG-4215) //hostLog.warn("With partition detection disabled, data may be lost or " + // "corrupted by certain classes of network failures."); } assert (m_clientInterface != null); m_clientInterface.initializeSnapshotDaemon(m_messenger, m_globalServiceElector); // Start elastic join service try { if (m_config.m_isEnterprise && TheHashinator.getCurrentConfig().type == HashinatorType.ELASTIC) { Class<?> elasticServiceClass = MiscUtils.loadProClass("org.voltdb.join.ElasticJoinCoordinator", "Elastic join", false); if (elasticServiceClass == null) { VoltDB.crashLocalVoltDB("Missing the ElasticJoinCoordinator class file in the enterprise " + "edition", false, null); } Constructor<?> constructor = elasticServiceClass.getConstructor(HostMessenger.class, ClientInterface.class, Cartographer.class, BalancePartitionsStatistics.class, String.class, int.class, Supplier.class); m_elasticJoinService = (ElasticJoinService) constructor.newInstance( m_messenger, m_clientInterface, m_cartographer, rebalanceStats, VoltDB.instance().getCommandLogSnapshotPath(), m_catalogContext.getDeployment().getCluster().getKfactor(), m_clusterSettings); m_elasticJoinService.updateConfig(m_catalogContext); } } catch (Exception e) { VoltDB.crashLocalVoltDB("Failed to instantiate elastic join service", false, e); } // set additional restore agent stuff if (m_restoreAgent != null) { m_restoreAgent.setInitiator(new Iv2TransactionCreator(m_clientInterface)); } // Start the stats agent at the end, after everything has been constructed m_opsRegistrar.setDummyMode(false); m_configLogger = new Thread(new ConfigLogging()); m_configLogger.start(); scheduleDailyLoggingWorkInNextCheckTime(); } } /** * recover the partition assignment from one of lost hosts in the same placement group for rejoin * Use the placement group of the recovering host to find a matched host from the lost nodes in the topology * If the partition count from the lost node is the same as the site count of the recovering host, * The partitions on the lost node will be placed on the recovering host. Partition group layout will be maintained. * Topology will be updated on ZK if successful * @param topology The topology from ZK, which contains the partition assignments for live or lost hosts * @param haGroup The placement group of the recovering host * @return A list of partitions if recover effort is a success. 
private List<Integer> recoverPartitions(AbstractTopology topology, String haGroup) {
    AbstractTopology recoveredTopo = AbstractTopology.mutateRecoverTopology(topology,
            m_messenger.getLiveHostIds(),
            m_messenger.getHostId(),
            haGroup);
    if (recoveredTopo == null) {
        return null;
    }
    List<Integer> partitions = recoveredTopo.getPartitionIdList(m_messenger.getHostId());
    if (partitions != null && partitions.size() == m_catalogContext.getNodeSettings().getLocalSitesCount()) {
        TopologyZKUtils.updateTopologyToZK(m_messenger.getZK(), recoveredTopo);
        return partitions;
    }
    return null;
}

@Override
public void hostsFailed(Set<Integer> failedHosts) {
    final ScheduledExecutorService es = getSES(true);
    if (es != null && !es.isShutdown()) {
        es.submit(new Runnable() {
            @Override
            public void run() {
                // First check to make sure that the cluster is still viable before
                // allowing the fault log to be updated by the notifications
                // generated below.
                Set<Integer> hostsOnRing = new HashSet<>();
                if (!m_leaderAppointer.isClusterKSafe(hostsOnRing)) {
                    VoltDB.crashLocalVoltDB("Some partitions have no replicas. Cluster has become unviable.",
                            false, null);
                    return;
                }
                // Send the KSafety trap. Note that a side effect of calling
                // m_leaderAppointer.isClusterKSafe(..) is that the leader appointer
                // creates the k-safety stats set.
                if (m_cartographer.isPartitionZeroLeader() || isFirstZeroPartitionReplica(failedHosts)) {
                    // Send hostDown traps
                    for (int hostId : failedHosts) {
                        m_snmp.hostDown(FaultLevel.ERROR, hostId, "Host left cluster mesh due to connection loss");
                    }
                    final int missing = m_leaderAppointer.getKSafetyStatsSet().stream()
                            .max((s1, s2) -> s1.getMissingCount() - s2.getMissingCount())
                            .map(s -> s.getMissingCount()).orElse(failedHosts.size());
                    final int expected = m_clusterSettings.getReference().hostcount();
                    m_snmp.statistics(FaultFacility.CLUSTER, "Node lost. Cluster is down to "
                            + (expected - missing) + " members out of original " + expected + ".");
                }
                // Clean up the rejoin blocker in case the rejoining node failed.
                // This has to run on a separate thread because the callback is
                // invoked on the ZooKeeper server thread.
                //
                // This cleanup code deliberately runs on all live nodes. One of
                // them will succeed in cleaning up the rejoin ZK nodes. The others
                // will just do nothing if the ZK nodes are already gone. If this
                // node is still initializing when a rejoining node fails, there
                // must be a live node that can clean things up. It's okay to skip
                // this if the executor services are not set up yet.
                for (int hostId : failedHosts) {
                    CoreZK.removeRejoinNodeIndicatorForHost(m_messenger.getZK(), hostId);
                }
                // If the current node hasn't finished rejoining when another
                // node fails, fail this node to prevent locking up the system.
                if (m_rejoining) {
                    VoltDB.crashLocalVoltDB("Another node failed before this node could finish rejoining. " +
                            "As a result, the rejoin operation has been canceled. Please try again.");
                }
                // Let the client interface know that host(s) have failed, so it can clean
                // up any outstanding work, especially non-transactional work.
                m_clientInterface.handleFailedHosts(failedHosts);
            }
        });
    }
}
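/**
 * Determines whether this node is the surviving replica of partition zero with the lowest
 * host id, given that the partition zero master is among the failed hosts. This picks
 * exactly one live node to emit the cluster-wide SNMP notifications above when the
 * partition zero leader itself has failed.
 */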
" + "Please try again."); } // let the client interface know host(s) have failed to clean up any outstanding work // especially non-transactional work m_clientInterface.handleFailedHosts(failedHosts); } }); } } private boolean isFirstZeroPartitionReplica(Set<Integer> failedHosts) { int partitionZeroMaster = CoreUtils.getHostIdFromHSId(m_cartographer.getHSIdForMaster(0)); if (!failedHosts.contains(partitionZeroMaster)) { return false; } int firstReplica = m_cartographer .getReplicasForPartition(0) .stream() .map(l->CoreUtils.getHostIdFromHSId(l)) .filter(i-> !failedHosts.contains(i)) .min((i1,i2) -> i1 - i2) .orElse(m_messenger.getHostId() + 1); return firstReplica == m_messenger.getHostId(); } class DailyLogTask implements Runnable { @Override public void run() { m_myHostId = m_messenger.getHostId(); hostLog.info(String.format("Host id of this node is: %d", m_myHostId)); hostLog.info("URL of deployment info: " + m_config.m_pathToDeployment); hostLog.info("Cluster uptime: " + MiscUtils.formatUptime(getClusterUptime())); logDebuggingInfo(m_config.m_adminPort, m_config.m_httpPort, m_httpPortExtraLogMessage, m_jsonEnabled); // log system setting information logSystemSettingFromCatalogContext(); scheduleDailyLoggingWorkInNextCheckTime(); // daily maintenance EnterpriseMaintenance em = EnterpriseMaintenance.get(); if (em != null) { em.dailyMaintenaceTask(); } } } /** * Get the next check time for a private member in log4j library, which is not a reliable idea. * It adds 30 seconds for the initial delay and uses a periodical thread to schedule the daily logging work * with this delay. * @return */ void scheduleDailyLoggingWorkInNextCheckTime() { DailyRollingFileAppender dailyAppender = null; Enumeration<?> appenders = Logger.getRootLogger().getAllAppenders(); while (appenders.hasMoreElements()) { Appender appender = (Appender) appenders.nextElement(); if (appender instanceof DailyRollingFileAppender){ dailyAppender = (DailyRollingFileAppender) appender; } } final DailyRollingFileAppender dailyRollingFileAppender = dailyAppender; Field field = null; if (dailyRollingFileAppender != null) { try { field = dailyRollingFileAppender.getClass().getDeclaredField("nextCheck"); field.setAccessible(true); } catch (NoSuchFieldException e) { hostLog.error("Failed to set daily system info logging: " + e.getMessage()); } } final Field nextCheckField = field; long nextCheck = System.currentTimeMillis(); // the next part may throw exception, current time is the default value if (dailyRollingFileAppender != null && nextCheckField != null) { try { nextCheck = nextCheckField.getLong(dailyRollingFileAppender); scheduleWork(new DailyLogTask(), nextCheck - System.currentTimeMillis() + 30 * 1000, 0, TimeUnit.MILLISECONDS); } catch (Exception e) { hostLog.error("Failed to set daily system info logging: " + e.getMessage()); } } } class StartActionWatcher implements Watcher { @Override public void process(WatchedEvent event) { if (m_mode == OperationMode.SHUTTINGDOWN) return; m_es.submit(new Runnable() { @Override public void run() { validateStartAction(); } }); } } private void validateStartAction() { ZooKeeper zk = m_messenger.getZK(); boolean initCompleted = false; List<String> children = null; try { initCompleted = zk.exists(VoltZK.init_completed, false) != null; children = zk.getChildren(VoltZK.start_action, new StartActionWatcher(), null); } catch (KeeperException e) { hostLog.error("Failed to validate the start actions", e); return; } catch (InterruptedException e) { VoltDB.crashLocalVoltDB("Interrupted during start 
action validation:" + e.getMessage(), true, e); } if (children != null && !children.isEmpty()) { for (String child : children) { byte[] data = null; try { data = zk.getData(VoltZK.start_action + "/" + child, false, null); } catch (KeeperException excp) { if (excp.code() == Code.NONODE) { hostLog.debug("Failed to validate the start action as node " + VoltZK.start_action + "/" + child + " got disconnected", excp); } else { hostLog.error("Failed to validate the start actions ", excp); } return; } catch (InterruptedException e) { VoltDB.crashLocalVoltDB("Interrupted during start action validation:" + e.getMessage(), true, e); } if (data == null) { VoltDB.crashLocalVoltDB("Couldn't find " + VoltZK.start_action + "/" + child); } String startAction = new String(data); if ((startAction.equals(StartAction.JOIN.toString()) || startAction.equals(StartAction.REJOIN.toString()) || startAction.equals(StartAction.LIVE_REJOIN.toString())) && !initCompleted) { int nodeId = VoltZK.getHostIDFromChildName(child); if (nodeId == m_messenger.getHostId()) { VoltDB.crashLocalVoltDB("This node was started with start action " + startAction + " during cluster creation. " + "All nodes should be started with matching create or recover actions when bring up a cluster. " + "Join and rejoin are for adding nodes to an already running cluster."); } else { hostLog.warn("Node " + nodeId + " tried to " + startAction + " cluster but it is not allowed during cluster creation. " + "All nodes should be started with matching create or recover actions when bring up a cluster. " + "Join and rejoin are for adding nodes to an already running cluster."); } } } } } private class ConfigLogging implements Runnable { private void logConfigInfo() { hostLog.info("Logging config info"); File configInfoDir = getConfigDirectory(); configInfoDir.mkdirs(); File configInfo = new File(configInfoDir, "config.json"); byte jsonBytes[] = null; try { JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.keySymbolValuePair("workingDir", System.getProperty("user.dir")); stringer.keySymbolValuePair("pid", CLibrary.getpid()); stringer.key("log4jDst").array(); Enumeration<?> appenders = Logger.getRootLogger().getAllAppenders(); while (appenders.hasMoreElements()) { Appender appender = (Appender) appenders.nextElement(); if (appender instanceof FileAppender){ stringer.object(); stringer.keySymbolValuePair("path", new File(((FileAppender) appender).getFile()).getCanonicalPath()); if (appender instanceof DailyRollingFileAppender) { stringer.keySymbolValuePair("format", ((DailyRollingFileAppender)appender).getDatePattern()); } stringer.endObject(); } } Enumeration<?> loggers = Logger.getRootLogger().getLoggerRepository().getCurrentLoggers(); while (loggers.hasMoreElements()) { Logger logger = (Logger) loggers.nextElement(); appenders = logger.getAllAppenders(); while (appenders.hasMoreElements()) { Appender appender = (Appender) appenders.nextElement(); if (appender instanceof FileAppender){ stringer.object(); stringer.keySymbolValuePair("path", new File(((FileAppender) appender).getFile()).getCanonicalPath()); if (appender instanceof DailyRollingFileAppender) { stringer.keySymbolValuePair("format", ((DailyRollingFileAppender)appender).getDatePattern()); } stringer.endObject(); } } } stringer.endArray(); stringer.endObject(); JSONObject jsObj = new JSONObject(stringer.toString()); jsonBytes = jsObj.toString(4).getBytes(Charsets.UTF_8); } catch (JSONException e) { Throwables.propagate(e); } catch (IOException e) { e.printStackTrace(); } try { 
        try {
            FileOutputStream fos = new FileOutputStream(configInfo);
            fos.write(jsonBytes);
            fos.getFD().sync();
            fos.close();
        } catch (IOException e) {
            hostLog.error("Failed to log config info: " + e.getMessage());
            e.printStackTrace();
        }
    }

    private void logCatalogAndDeployment() {
        File configInfoDir = getConfigDirectory();
        configInfoDir.mkdirs();

        try {
            m_catalogContext.writeCatalogJarToFile(configInfoDir.getPath(), "catalog.jar");
        } catch (IOException e) {
            hostLog.error("Failed to log catalog: " + e.getMessage(), e);
            e.printStackTrace();
        }
        logDeployment();
    }

    private void logDeployment() {
        File configInfoDir = getConfigDirectory();
        configInfoDir.mkdirs();

        try {
            File deploymentFile = getConfigLogDeployment();
            if (deploymentFile.exists()) {
                deploymentFile.delete();
            }
            FileOutputStream fileOutputStream = new FileOutputStream(deploymentFile);
            fileOutputStream.write(m_catalogContext.getDeploymentBytes());
            fileOutputStream.close();
        } catch (Exception e) {
            hostLog.error("Failed to log deployment file: " + e.getMessage(), e);
            e.printStackTrace();
        }
    }

    @Override
    public void run() {
        logConfigInfo();
        logCatalogAndDeployment();
    }
}

// Get topology information. If rejoining, get it directly from ZK.
// Otherwise, try to do the write/read race to ZK on startup.
private AbstractTopology getTopology(StartAction startAction, Map<Integer, String> hostGroups,
        Map<Integer, Integer> sitesPerHostMap, JoinCoordinator joinCoordinator)
{
    AbstractTopology topology = null;
    if (startAction == StartAction.JOIN) {
        assert(joinCoordinator != null);
        JSONObject topoJson = joinCoordinator.getTopology();
        try {
            topology = AbstractTopology.topologyFromJSON(topoJson);
        } catch (JSONException e) {
            VoltDB.crashLocalVoltDB("Unable to get topology from Json object", true, e);
        }
    } else if (startAction.doesRejoin()) {
        topology = TopologyZKUtils.readTopologyFromZK(m_messenger.getZK());
    } else {
        // initial start or recover
        int hostcount = m_clusterSettings.get().hostcount();
        if (sitesPerHostMap.size() != (hostcount - m_config.m_missingHostCount)) {
            VoltDB.crashLocalVoltDB("The total number of live and missing hosts must be the same as the cluster host count", false, null);
        }
        int kfactor = m_catalogContext.getDeployment().getCluster().getKfactor();
        if (kfactor == 0 && m_config.m_missingHostCount > 0) {
            VoltDB.crashLocalVoltDB("A cluster with 0 kfactor cannot be started with missing nodes ", false, null);
        }
        if (hostcount <= kfactor) {
            VoltDB.crashLocalVoltDB("Not enough nodes to ensure K-Safety.", false, null);
        }

        // Start up or recover a cluster with missing nodes: make up the missing hosts so the
        // topology can be computed. The topology will contain hosts that are marked as missing.
        // The missing hosts will not host any master partitions. At least one replica of every
        // partition must be on a live (non-missing) host, otherwise the cluster will not start.
        // The LeaderAppointer will ignore these hosts during startup.
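        // Illustrative example (hypothetical numbers): hostcount = 3, kfactor = 1, one missing
        // host and 8 sites per host. One synthetic host id (counting down from Integer.MAX_VALUE)
        // is added with 8 sites in the default placement group, the topology is computed for
        // 3 hosts / 12 partitions, and partition masters are then shifted off the missing host.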
        int sph = sitesPerHostMap.values().iterator().next();
        int missingHostId = Integer.MAX_VALUE;
        Set<Integer> missingHosts = Sets.newHashSet();
        for (int i = 0; i < m_config.m_missingHostCount; i++) {
            sitesPerHostMap.put(missingHostId, sph);
            hostGroups.put(missingHostId, AbstractTopology.PLACEMENT_GROUP_DEFAULT);
            missingHosts.add(missingHostId--);
        }
        int totalSites = sitesPerHostMap.values().stream().mapToInt(Number::intValue).sum();
        if (totalSites % (kfactor + 1) != 0) {
            VoltDB.crashLocalVoltDB("Total number of sites is not divisible by the number of partition replicas (kfactor + 1).", false, null);
        }
        topology = AbstractTopology.getTopology(sitesPerHostMap, hostGroups, kfactor);
        if (topology.hasMissingPartitions()) {
            VoltDB.crashLocalVoltDB("Some partitions are missing in the topology", false, null);
        }
        // move partition masters from missing hosts to live hosts
        topology = AbstractTopology.shiftPartitionLeaders(topology, missingHosts);
        TopologyZKUtils.registerTopologyToZK(m_messenger.getZK(), topology);
    }
    return topology;
}

private TreeMap<Integer, Initiator> createIv2Initiators(Collection<Integer> partitions,
        StartAction startAction, List<Integer> m_partitionsToSitesAtStartupForExportInit)
{
    TreeMap<Integer, Initiator> initiators = new TreeMap<>();
    for (Integer partition : partitions) {
        Initiator initiator = new SpInitiator(m_messenger, partition, getStatsAgent(),
                m_snapshotCompletionMonitor, startAction);
        initiators.put(partition, initiator);
        m_partitionsToSitesAtStartupForExportInit.add(partition);
    }
    return initiators;
}

private void createSecondaryConnections(boolean isRejoin) {
    int partitionGroupCount = m_clusterSettings.get().hostcount() / (m_configuredReplicationFactor + 1);
    int localHostId = m_messenger.getHostId();
    Set<Integer> peers = Sets.newHashSet();
    if (m_configuredReplicationFactor > 0 && partitionGroupCount > 1) {
        Set<Integer> hostIdsWithinGroup = m_cartographer.getHostIdsWithinPartitionGroup(localHostId);
        if (isRejoin) {
            peers.addAll(hostIdsWithinGroup);
            // exclude the local host id
            peers.remove(m_messenger.getHostId());
        } else {
            for (Integer host : hostIdsWithinGroup) {
                // This node sends a connection request to each of its peers; once the connection
                // is established, both nodes create a foreign host (which contains a PicoNetwork
                // thread). Therefore we only connect to the nodes with a higher host id, to
                // avoid doubling the number of network threads we expect.
                if (host > localHostId) {
                    peers.add(host);
                }
            }
        }
        // peers can be empty if some nodes in the group are inactive
        if (peers.isEmpty()) return;
        /**
         * The basic goal is that each host should have the same number of connections as it
         * would have without the partition group layout.
         *
         * (targetConnectionsWithinPG - existingConnectionsWithinPG) is the total number of
         * secondary connections to create. They should be distributed evenly across all nodes
         * within the partition group, rounding the integer division up. The usual trick for
         * rounding up is (a + (b - 1)) / b, so the numerator becomes
         * (targetConnectionsWithinPG - existingConnectionsWithinPG) + (existingConnectionsWithinPG - 1),
         * which simplifies to (targetConnectionsWithinPG - 1).
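         *
         * Worked example (hypothetical numbers): with hostcount = 6, kfactor = 1 and 16 cores,
         * each partition group has 2 hosts, connectionsWithoutPG = 5, existingConnectionsWithinPG = 1
         * and targetConnectionsWithinPG = min(5, 16 / 4) = 4, so secondaryConnections = (4 - 1) / 1 = 3:
         * three secondary connections are created between the two hosts in a group, giving each
         * host 4 connections, close to the 5 it would have in a full mesh without partition groups.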
         *
         * All the numbers are on a per-node basis; PG is short for Partition Group.
         */
        int connectionsWithoutPG = m_clusterSettings.get().hostcount() - 1;
        int existingConnectionsWithinPG = hostIdsWithinGroup.size() - 1;
        int targetConnectionsWithinPG = Math.min(connectionsWithoutPG, CoreUtils.availableProcessors() / 4);

        int secondaryConnections = (targetConnectionsWithinPG - 1) / existingConnectionsWithinPG;
        Integer configNumberOfConnections = Integer.getInteger(SECONDARY_PICONETWORK_THREADS);
        if (configNumberOfConnections != null) {
            secondaryConnections = configNumberOfConnections;
            hostLog.info("Overridden secondary PicoNetwork thread count: " + configNumberOfConnections);
        } else {
            hostLog.info("This node has " + secondaryConnections + " secondary PicoNetwork thread" +
                    ((secondaryConnections > 1) ? "s" : ""));
        }
        m_messenger.createAuxiliaryConnections(peers, secondaryConnections);
    }
}

private final List<ScheduledFuture<?>> m_periodicWorks = new ArrayList<>();

/**
 * Schedule all the periodic work.
 */
private void schedulePeriodicWorks() {
    // JMX stats broadcast
    m_periodicWorks.add(scheduleWork(new Runnable() {
        @Override
        public void run() {
            // A null here was causing a steady stream of annoying but apparently
            // inconsequential NPEs during a debug session of an unrelated unit test.
            if (m_statsManager != null) {
                m_statsManager.sendNotification();
            }
        }
    }, 0, StatsManager.POLL_INTERVAL, TimeUnit.MILLISECONDS));

    // small stats samples
    m_periodicWorks.add(scheduleWork(new Runnable() {
        @Override
        public void run() {
            SystemStatsCollector.asyncSampleSystemNow(false, false);
        }
    }, 0, 5, TimeUnit.SECONDS));

    // medium stats samples
    m_periodicWorks.add(scheduleWork(new Runnable() {
        @Override
        public void run() {
            SystemStatsCollector.asyncSampleSystemNow(true, false);
        }
    }, 0, 1, TimeUnit.MINUTES));

    // large stats samples
    m_periodicWorks.add(scheduleWork(new Runnable() {
        @Override
        public void run() {
            SystemStatsCollector.asyncSampleSystemNow(true, true);
        }
    }, 0, 6, TimeUnit.MINUTES));

    // other enterprise setup
    EnterpriseMaintenance em = EnterpriseMaintenance.get();
    if (em != null) {
        em.setupMaintenaceTasks();
    }

    GCInspector.instance.start(m_periodicPriorityWorkThread, m_gcStats);
}
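/**
 * (Re)starts resource usage monitoring: cancels and waits out any previously scheduled
 * resource monitor task, then rebuilds the health monitor from the current deployment's
 * system settings and, if resource limits are configured, registers and schedules it again.
 */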
private void startHealthMonitor() {
    if (resMonitorWork != null) {
        m_globalServiceElector.unregisterService(m_healthMonitor);
        resMonitorWork.cancel(false);
        try {
            resMonitorWork.get();
        } catch (Exception e) {
            // Ignore exceptions because we don't really care about the result here.
        }
        m_periodicWorks.remove(resMonitorWork);
    }
    m_healthMonitor = new HealthMonitor(m_catalogContext.getDeployment().getSystemsettings(), getSnmpTrapSender());
    m_healthMonitor.logResourceLimitConfigurationInfo();
    if (m_healthMonitor.hasResourceLimitsConfigured()) {
        m_globalServiceElector.registerService(m_healthMonitor);
        resMonitorWork = scheduleWork(m_healthMonitor, m_healthMonitor.getResourceCheckInterval(),
                m_healthMonitor.getResourceCheckInterval(), TimeUnit.SECONDS);
        m_periodicWorks.add(resMonitorWork);
    }
}

/**
 * Takes the deployment file given at initialization and the voltdbroot given as
 * a command line option, and performs the following tasks:
 * <p><ul>
 * <li>creates the voltdbroot directory if necessary
 * <li>fails if voltdbroot is already configured and populated with database artifacts
 * <li>creates the command log, DR, snapshot, and export directories
 * <li>creates the config directory under voltdbroot
 * <li>moves the deployment file under the config directory
 * </ul>
 * @param config
 * @param dt a {@link DeploymentType}
 */
private void stageDeploymentFileForInitialize(Configuration config, DeploymentType dt) {
    String deprootFN = dt.getPaths().getVoltdbroot().getPath();
    File deprootFH = new VoltFile(deprootFN);
    File cnfrootFH = config.m_voltdbRoot;

    if (!cnfrootFH.exists() && !cnfrootFH.mkdirs()) {
        VoltDB.crashLocalVoltDB("Unable to create the voltdbroot directory in " + cnfrootFH, false, null);
    }
    try {
        File depcanoFH = null;
        try {
            depcanoFH = deprootFH.getCanonicalFile();
        } catch (IOException e) {
            depcanoFH = deprootFH;
        }
        File cnfcanoFH = cnfrootFH.getCanonicalFile();
        if (!cnfcanoFH.equals(depcanoFH)) {
            dt.getPaths().getVoltdbroot().setPath(cnfrootFH.getPath());
        }
        // root in deployment conflicts with command line voltdbroot
        if (!VoltDB.DBROOT.equals(deprootFN)) {
            consoleLog.info("Ignoring voltdbroot \"" + deprootFN + "\" specified in the deployment file");
            hostLog.info("Ignoring voltdbroot \"" + deprootFN + "\" specified in the deployment file");
        }
    } catch (IOException e) {
        VoltDB.crashLocalVoltDB("Unable to resolve voltdbroot location: " + config.m_voltdbRoot, false, e);
        return;
    }

    // check for already existing artifacts
    List<String> nonEmptyPaths = managedPathsWithFiles(config, dt);
    if (!nonEmptyPaths.isEmpty()) {
        StringBuilder crashMessage = new StringBuilder("Files from a previous database session exist in the managed directories:");
        for (String nonEmptyPath : nonEmptyPaths) {
            crashMessage.append("\n - " + nonEmptyPath);
        }
        crashMessage.append("\nUse the start command to start the initialized database or use init --force" +
                " to initialize a new database session overwriting existing files.");
        VoltDB.crashLocalVoltDB(crashMessage.toString());
        return;
    }
    // create the config subdirectory
    File confDH = getConfigDirectory(config);
    if (!confDH.exists() && !confDH.mkdirs()) {
        VoltDB.crashLocalVoltDB("Unable to create the config directory " + confDH);
        return;
    }
    // create the remaining paths
    if (config.m_isEnterprise) {
        List<String> failed = m_nodeSettings.ensureDirectoriesExist();
        if (!failed.isEmpty()) {
            String msg = "Unable to access or create the following directories:\n " +
                    Joiner.on("\n ").join(failed);
            VoltDB.crashLocalVoltDB(msg);
            return;
        }
    }

    // Now it's safe to save .paths
    m_nodeSettings.store();

    // Now that we are done with the deployment configuration, null out all the paths.
    dt.setPaths(null);

    // log a message unconditionally indicating that the provided host count and admin mode
    // settings in the deployment, if any, will be ignored
    consoleLog.info("When using the INIT command, some deployment file settings (hostcount and voltdbroot path) " +
            "are ignored");
    hostLog.info("When using the INIT command, some deployment file settings (hostcount and voltdbroot path) are " +
            "ignored");

    File depFH = getConfigLogDeployment(config);
    try (FileWriter fw = new FileWriter(depFH)) {
        fw.write(CatalogUtil.getDeployment(dt, true /* pretty print indent */));
    } catch (IOException|RuntimeException e) {
        VoltDB.crashLocalVoltDB("Unable to marshal deployment configuration to " + depFH, false, e);
    }

    // Save the cluster settings properties derived from the deployment file
    ClusterSettings.create(CatalogUtil.asClusterSettingsMap(dt)).store();
}

private void stageSchemaFiles(Configuration config) {
    if (config.m_userSchema == null) {
        return; // nothing to do
    }
    // this is validated during command line parsing and will be true barring disk faults
    assert( config.m_userSchema.isFile() );
    File stagedCatalogFH = new VoltFile(getStagedCatalogPath(getVoltDBRootPath()));

    if (!config.m_forceVoltdbCreate && stagedCatalogFH.exists()) {
        VoltDB.crashLocalVoltDB("A previous database was initialized with a schema. You must init with --force to overwrite the schema.");
    }
    final boolean standalone = false;
    final boolean isXCDR = false;
    VoltCompiler compiler = new VoltCompiler(standalone, isXCDR);
    if (!compiler.compileFromSchemaAndClasses(config.m_userSchema, config.m_stagedClassesPath, stagedCatalogFH)) {
        VoltDB.crashLocalVoltDB("Could not compile specified schema " + config.m_userSchema);
    }
}

private void stageInitializedMarker(Configuration config) {
    File depFH = new VoltFile(config.m_voltdbRoot, VoltDB.INITIALIZED_MARKER);
    try (PrintWriter pw = new PrintWriter(new FileWriter(depFH), true)) {
        pw.println(config.m_clusterName);
    } catch (IOException e) {
        VoltDB.crashLocalVoltDB("Unable to stage cluster name destination", false, e);
    }
}

private void deleteInitializationMarkers(Configuration configuration) {
    for (File c : configuration.getInitMarkers()) {
        MiscUtils.deleteRecursively(c);
    }
}

public static final String SECURITY_OFF_WARNING = "User authentication is not enabled." +
        " The database is accessible and could be modified or shut down by anyone on the network.";

boolean readDeploymentAndCreateStarterCatalogContext(VoltDB.Configuration config) {
    /*
     * Debate with the cluster what the deployment file should be
     */
    try {
        ZooKeeper zk = m_messenger.getZK();
        byte deploymentBytes[] = null;

        try {
            deploymentBytes = org.voltcore.utils.CoreUtils.urlToBytes(m_config.m_pathToDeployment);
        } catch (Exception ex) {
            // Let us get the bytes from ZK
        }
        DeploymentType deployment = null;
        try {
            if (deploymentBytes != null) {
                CatalogUtil.writeCatalogToZK(zk,
                        // Fill in innocuous values for non-deployment stuff
                        0,
                        0L,
                        0L,
                        new byte[] {},  // spin loop in Inits.LoadCatalog.run() needs
                                        // this to be of zero length until we have a real catalog
                        null,
                        deploymentBytes);
                hostLog.info("URL of deployment: " + m_config.m_pathToDeployment);
            } else {
                CatalogAndIds catalogStuff = CatalogUtil.getCatalogFromZK(zk);
                deploymentBytes = catalogStuff.deploymentBytes;
            }
        } catch (KeeperException.NodeExistsException e) {
            CatalogAndIds catalogStuff = CatalogUtil.getCatalogFromZK(zk);
            byte[] deploymentBytesTemp = catalogStuff.deploymentBytes;
            if (deploymentBytesTemp != null) {
                // Check the hash if it's a deployment supplied on the command line.
                // We will ignore the supplied or default deployment anyway.
                if (deploymentBytes != null && !m_config.m_deploymentDefault) {
                    byte[] deploymentHashHere = CatalogUtil.makeDeploymentHash(deploymentBytes);
                    if (!(Arrays.equals(deploymentHashHere, catalogStuff.getDeploymentHash()))) {
                        hostLog.warn("The locally provided deployment configuration did not " +
                                "match the configuration information found in the cluster.");
                    } else {
                        hostLog.info("Deployment configuration pulled from other cluster node.");
                    }
                }
                // Use the remote deployment that was obtained.
                deploymentBytes = deploymentBytesTemp;
            } else {
                hostLog.error("Deployment file could not be loaded locally or remotely, " +
                        "local supplied path: " + m_config.m_pathToDeployment);
                deploymentBytes = null;
            }
        } catch (KeeperException.NoNodeException e) {
            // The no-deploymentBytes case is handled below, so just log this error.
            if (hostLog.isDebugEnabled()) {
                hostLog.debug("Error trying to get deployment bytes from cluster", e);
            }
        }
        if (deploymentBytes == null) {
            hostLog.error("Deployment information could not be obtained from cluster node or locally");
            VoltDB.crashLocalVoltDB("No such deployment file: " + m_config.m_pathToDeployment, false, null);
        }

        if (deployment == null) {
            deployment = CatalogUtil.getDeployment(new ByteArrayInputStream(deploymentBytes));
        }

        // wasn't a valid xml deployment file
        if (deployment == null) {
            hostLog.error("Not a valid XML deployment file at URL: " + m_config.m_pathToDeployment);
            VoltDB.crashLocalVoltDB("Not a valid XML deployment file at URL: " + m_config.m_pathToDeployment, false, null);
        }

        /*
         * Check for invalid deployment file settings (enterprise-only) in the community edition.
         * The trick here is to print out all applicable problems and then stop, rather than
         * stopping after the first one is found.
         */
        if (!m_config.m_isEnterprise) {
            boolean shutdownDeployment = false;
            boolean shutdownAction = false;

            // check license features for the community version
            if ((deployment.getCluster() != null) && (deployment.getCluster().getKfactor() > 0)) {
                consoleLog.error("K-Safety is not supported " +
                        "in the community edition of VoltDB.");
                shutdownDeployment = true;
            }
            if ((deployment.getSnapshot() != null) && (deployment.getSnapshot().isEnabled())) {
                consoleLog.error("Snapshots are not supported " +
                        "in the community edition of VoltDB.");
                shutdownDeployment = true;
            }
            if ((deployment.getCommandlog() != null) && (deployment.getCommandlog().isEnabled())) {
                consoleLog.error("Command logging is not supported " +
                        "in the community edition of VoltDB.");
                shutdownDeployment = true;
            }
            if ((deployment.getExport() != null) && deployment.getExport().getConfiguration() != null && !deployment.getExport().getConfiguration().isEmpty()) {
                consoleLog.error("Export is not supported " +
                        "in the community edition of VoltDB.");
                shutdownDeployment = true;
            }
            // check the start action for the community edition
            if (m_config.m_startAction != StartAction.CREATE) {
                consoleLog.error("Start action \"" + m_config.m_startAction.getClass().getSimpleName() +
                        "\" is not supported in the community edition of VoltDB.");
                shutdownAction = true;
            }
            // if the process needs to stop, try to be helpful
            if (shutdownAction || shutdownDeployment) {
                String msg = "This process will exit. 
Please run VoltDB with "; if (shutdownDeployment) { msg += "a deployment file compatible with the community edition"; } if (shutdownDeployment && shutdownAction) { msg += " and "; } if (shutdownAction && !shutdownDeployment) { msg += "the CREATE start action"; } msg += "."; VoltDB.crashLocalVoltDB(msg, false, null); } } // note the heart beats are specified in seconds in xml, but ms internally HeartbeatType hbt = deployment.getHeartbeat(); if (hbt != null) { m_config.m_deadHostTimeoutMS = hbt.getTimeout() * 1000; m_messenger.setDeadHostTimeout(m_config.m_deadHostTimeoutMS); } else { hostLog.info("Dead host timeout set to " + m_config.m_deadHostTimeoutMS + " milliseconds"); } PartitionDetectionType pt = deployment.getPartitionDetection(); if (pt != null) { m_config.m_partitionDetectionEnabled = pt.isEnabled(); m_messenger.setPartitionDetectionEnabled(m_config.m_partitionDetectionEnabled); } // get any consistency settings into config ConsistencyType consistencyType = deployment.getConsistency(); if (consistencyType != null) { m_config.m_consistencyReadLevel = Consistency.ReadLevel.fromReadLevelType(consistencyType.getReadlevel()); } final String elasticSetting = deployment.getCluster().getElastic().trim().toUpperCase(); if (elasticSetting.equals("ENABLED")) { TheHashinator.setConfiguredHashinatorType(HashinatorType.ELASTIC); } else if (!elasticSetting.equals("DISABLED")) { VoltDB.crashLocalVoltDB("Error in deployment file, elastic attribute of " + "cluster element must be " + "'enabled' or 'disabled' but was '" + elasticSetting + "'", false, null); } else { TheHashinator.setConfiguredHashinatorType(HashinatorType.LEGACY); } // log system setting information SystemSettingsType sysType = deployment.getSystemsettings(); if (sysType != null) { if (sysType.getElastic() != null) { hostLog.info("Elastic duration set to " + sysType.getElastic().getDuration() + " milliseconds"); hostLog.info("Elastic throughput set to " + sysType.getElastic().getThroughput() + " mb/s"); } if (sysType.getTemptables() != null) { hostLog.info("Max temptable size set to " + sysType.getTemptables().getMaxsize() + " mb"); } if (sysType.getSnapshot() != null) { hostLog.info("Snapshot priority set to " + sysType.getSnapshot().getPriority() + " [0 - 10]"); } if (sysType.getQuery() != null) { if (sysType.getQuery().getTimeout() > 0) { hostLog.info("Query timeout set to " + sysType.getQuery().getTimeout() + " milliseconds"); m_config.m_queryTimeout = sysType.getQuery().getTimeout(); } else if (sysType.getQuery().getTimeout() == 0) { hostLog.info("Query timeout set to unlimited"); m_config.m_queryTimeout = 0; } } } // log a warning on console log if security setting is turned off, like durability warning. 
SecurityType securityType = deployment.getSecurity(); if (securityType == null || !securityType.isEnabled()) { consoleLog.warn(SECURITY_OFF_WARNING); } // create a dummy catalog to load deployment info into Catalog catalog = new Catalog(); // Need these in the dummy catalog Cluster cluster = catalog.getClusters().add("cluster"); cluster.getDatabases().add("database"); String result = CatalogUtil.compileDeployment(catalog, deployment, true); if (result != null) { // Any other non-enterprise deployment errors will be caught and handled here // (such as <= 0 host count) VoltDB.crashLocalVoltDB(result); } m_catalogContext = new CatalogContext( TxnEgo.makeZero(MpInitiator.MP_INIT_PID).getTxnId(), //txnid 0, //timestamp catalog, new DbSettings(m_clusterSettings, m_nodeSettings), new byte[] {}, null, deploymentBytes, 0, m_messenger); return ((deployment.getCommandlog() != null) && (deployment.getCommandlog().isEnabled())); } catch (Exception e) { throw new RuntimeException(e); } } @Override public void loadLegacyPathProperties(DeploymentType deployment) throws IOException { //Load deployment paths now if Legacy so that we access through the interface all the time. if (isRunningWithOldVerbs() && m_nodeSettings == null) { m_nodeSettings = NodeSettings.create(CatalogUtil.asNodeSettingsMap(deployment)); List<String> failed = m_nodeSettings.ensureDirectoriesExist(); if (!failed.isEmpty()) { String msg = "Unable to validate path settings:\n " + Joiner.on("\n ").join(failed); hostLog.fatal(msg); throw new IOException(msg); } } } static class ReadDeploymentResults { final byte [] deploymentBytes; final DeploymentType deployment; ReadDeploymentResults(byte [] deploymentBytes, DeploymentType deployment) { this.deploymentBytes = deploymentBytes; this.deployment = deployment; } } ReadDeploymentResults readPrimedDeployment(Configuration config) { /* * Debate with the cluster what the deployment file should be */ try { byte deploymentBytes[] = null; try { deploymentBytes = org.voltcore.utils.CoreUtils.urlToBytes(config.m_pathToDeployment); } catch (Exception ex) { //Let us get bytes from ZK } if (deploymentBytes == null) { hostLog.error("Deployment information could not be obtained from cluster node or locally"); VoltDB.crashLocalVoltDB("No such deployment file: " + config.m_pathToDeployment, false, null); } DeploymentType deployment = CatalogUtil.getDeployment(new ByteArrayInputStream(deploymentBytes)); // wasn't a valid xml deployment file if (deployment == null) { hostLog.error("Not a valid XML deployment file at URL: " + config.m_pathToDeployment); VoltDB.crashLocalVoltDB("Not a valid XML deployment file at URL: " + config.m_pathToDeployment, false, null); return new ReadDeploymentResults(deploymentBytes, deployment); } // Override local sites count if possible if (config.m_sitesperhost == VoltDB.UNDEFINED) { config.m_sitesperhost = deployment.getCluster().getSitesperhost(); } else { hostLog.info("Set the local sites count to " + config.m_sitesperhost); consoleLog.info("CLI overrides the local sites count to " + config.m_sitesperhost); } NodeSettings nodeSettings = null; // adjust deployment host count when the cluster members are given by mesh configuration // providers switch(config.m_startAction) { case GET: // once a voltdbroot is inited, the path properties contain the true path values Settings.initialize(config.m_voltdbRoot); // only override the local sites count nodeSettings = NodeSettings.create(config.asNodeSettingsMap()); break; case PROBE: // once a voltdbroot is inited, the path properties 
contain the true path values Settings.initialize(config.m_voltdbRoot); // only override the local sites count nodeSettings = NodeSettings.create(config.asNodeSettingsMap()); File nodeSettingsFH = new File(getConfigDirectory(config), "path.properties"); consoleLog.info("Loaded node-specific settings from " + nodeSettingsFH.getPath()); hostLog.info("Loaded node-specific settings from " + nodeSettingsFH.getPath()); break; case INITIALIZE: Settings.initialize(config.m_voltdbRoot); // voltdbroot value from config overrides voltdbroot value in the deployment // file nodeSettings = NodeSettings.create( config.asNodeSettingsMap(), config.asPathSettingsMap(), CatalogUtil.asNodeSettingsMap(deployment)); break; default: nodeSettings = NodeSettings.create( config.asNodeSettingsMap(), CatalogUtil.asNodeSettingsMap(deployment)); Settings.initialize(nodeSettings.getVoltDBRoot()); config.m_voltdbRoot = nodeSettings.getVoltDBRoot(); break; } m_nodeSettings = nodeSettings; //Now its safe to save node settings if (config.m_startAction != StartAction.GET) { m_nodeSettings.store(); } if (config.m_startAction == StartAction.PROBE) { // once initialized the path properties contain the true path values if (config.m_hostCount == VoltDB.UNDEFINED) { config.m_hostCount = 1; } } else { config.m_hostCount = deployment.getCluster().getHostcount(); } /* * Check for invalid deployment file settings (enterprise-only) in the community edition. * Trick here is to print out all applicable problems and then stop, rather than stopping * after the first one is found. */ if (!config.m_isEnterprise) { boolean shutdownDeployment = false; boolean shutdownAction = false; // check license features for community version if ((deployment.getCluster() != null) && (deployment.getCluster().getKfactor() > 0)) { consoleLog.error("K-Safety is not supported " + "in the community edition of VoltDB."); shutdownDeployment = true; } if ((deployment.getSnapshot() != null) && (deployment.getSnapshot().isEnabled())) { consoleLog.error("Snapshots are not supported " + "in the community edition of VoltDB."); shutdownDeployment = true; } if ((deployment.getCommandlog() != null) && (deployment.getCommandlog().isEnabled())) { consoleLog.error("Command logging is not supported " + "in the community edition of VoltDB."); shutdownDeployment = true; } if ((deployment.getExport() != null) && deployment.getExport().getConfiguration() != null && !deployment.getExport().getConfiguration().isEmpty()) { consoleLog.error("Export is not supported " + "in the community edition of VoltDB."); shutdownDeployment = true; } // check the start action for the community edition if (m_config.m_startAction != StartAction.CREATE) { consoleLog.error("Start action \"" + m_config.m_startAction.getClass().getSimpleName() + "\" is not supported in the community edition of VoltDB."); shutdownAction = true; } // if the process needs to stop, try to be helpful if (shutdownAction || shutdownDeployment) { String msg = "This process will exit. 
Please run VoltDB with "; if (shutdownDeployment) { msg += "a deployment file compatible with the community edition"; } if (shutdownDeployment && shutdownAction) { msg += " and "; } if (shutdownAction && !shutdownDeployment) { msg += "the CREATE start action"; } msg += "."; VoltDB.crashLocalVoltDB(msg, false, null); } } return new ReadDeploymentResults(deploymentBytes, deployment); } catch (Exception e) { throw new RuntimeException(e); } } void collectLocalNetworkMetadata() { boolean threw = false; JSONStringer stringer = new JSONStringer(); try { stringer.object(); stringer.key("interfaces").array(); /* * If no interface was specified, do a ton of work * to identify all ipv4 or ipv6 interfaces and * marshal them into JSON. Always put the ipv4 address first * so that the export client will use it */ if (m_config.m_externalInterface.equals("")) { LinkedList<NetworkInterface> interfaces = new LinkedList<>(); try { Enumeration<NetworkInterface> intfEnum = NetworkInterface.getNetworkInterfaces(); while (intfEnum.hasMoreElements()) { NetworkInterface intf = intfEnum.nextElement(); if (intf.isLoopback() || !intf.isUp()) { continue; } interfaces.offer(intf); } } catch (SocketException e) { throw new RuntimeException(e); } if (interfaces.isEmpty()) { stringer.value("localhost"); } else { boolean addedIp = false; while (!interfaces.isEmpty()) { NetworkInterface intf = interfaces.poll(); Enumeration<InetAddress> inetAddrs = intf.getInetAddresses(); Inet6Address inet6addr = null; Inet4Address inet4addr = null; while (inetAddrs.hasMoreElements()) { InetAddress addr = inetAddrs.nextElement(); if (addr instanceof Inet6Address) { inet6addr = (Inet6Address)addr; if (inet6addr.isLinkLocalAddress()) { inet6addr = null; } } else if (addr instanceof Inet4Address) { inet4addr = (Inet4Address)addr; } } if (inet4addr != null) { stringer.value(inet4addr.getHostAddress()); addedIp = true; } if (inet6addr != null) { stringer.value(inet6addr.getHostAddress()); addedIp = true; } } if (!addedIp) { stringer.value("localhost"); } } } else { stringer.value(m_config.m_externalInterface); } } catch (Exception e) { threw = true; hostLog.warn("Error while collecting data about local network interfaces", e); } try { if (threw) { stringer = new JSONStringer(); stringer.object(); stringer.key("interfaces").array(); stringer.value("localhost"); stringer.endArray(); } else { stringer.endArray(); } stringer.keySymbolValuePair("clientPort", m_config.m_port); stringer.keySymbolValuePair("clientInterface", m_config.m_clientInterface); stringer.keySymbolValuePair("adminPort", m_config.m_adminPort); stringer.keySymbolValuePair("adminInterface", m_config.m_adminInterface); stringer.keySymbolValuePair("httpPort", m_config.m_httpPort); stringer.keySymbolValuePair("httpInterface", m_config.m_httpPortInterface); stringer.keySymbolValuePair("internalPort", m_config.m_internalPort); stringer.keySymbolValuePair("internalInterface", m_config.m_internalInterface); String[] zkInterface = m_config.m_zkInterface.split(":"); stringer.keySymbolValuePair("zkPort", zkInterface[1]); stringer.keySymbolValuePair("zkInterface", zkInterface[0]); stringer.keySymbolValuePair("drPort", VoltDB.getReplicationPort(m_catalogContext.cluster.getDrproducerport())); stringer.keySymbolValuePair("drInterface", VoltDB.getDefaultReplicationInterface()); stringer.keySymbolValuePair("publicInterface", m_config.m_publicInterface); stringer.endObject(); JSONObject obj = new JSONObject(stringer.toString()); // possibly atomic swap from null to realz m_localMetadata = 
obj.toString(4);
        hostLog.debug("System Metadata is: " + m_localMetadata);
    } catch (Exception e) {
        hostLog.warn("Failed to collect data about local network interfaces", e);
    }
}

@Override
public boolean isBare() {
    return m_isBare;
}

void setBare(boolean flag) {
    m_isBare = flag;
}

/**
 * Start the voltcore HostMessenger. This joins the node
 * to the existing cluster. In the non-rejoin case, this
 * function will return when the mesh is complete. If
 * rejoining, it will return when the node and agreement
 * site are synched to the existing cluster.
 */
MeshProber.Determination buildClusterMesh(ReadDeploymentResults readDepl) {
    final boolean bareAtStartup = m_config.m_forceVoltdbCreate
            || pathsWithRecoverableArtifacts(readDepl.deployment).isEmpty();
    setBare(bareAtStartup);

    final Supplier<Integer> hostCountSupplier = new Supplier<Integer>() {
        @Override
        public Integer get() {
            return m_clusterSettings.get().hostcount();
        }
    };

    ClusterType clusterType = readDepl.deployment.getCluster();
    MeshProber criteria = MeshProber.builder()
            .coordinators(m_config.m_coordinators)
            .versionChecker(m_versionChecker)
            .enterprise(m_config.m_isEnterprise)
            .startAction(m_config.m_startAction)
            .bare(bareAtStartup)
            .configHash(CatalogUtil.makeDeploymentHashForConfig(readDepl.deploymentBytes))
            .hostCountSupplier(hostCountSupplier)
            .kfactor(clusterType.getKfactor())
            .paused(m_config.m_isPaused)
            .nodeStateSupplier(m_statusTracker.getNodeStateSupplier())
            .addAllowed(m_config.m_enableAdd)
            .safeMode(m_config.m_safeMode)
            .terminusNonce(getTerminusNonce())
            .missingHostCount(m_config.m_missingHostCount)
            .build();

    HostAndPort hostAndPort = criteria.getLeader();
    String hostname = hostAndPort.getHostText();
    int port = hostAndPort.getPort();

    org.voltcore.messaging.HostMessenger.Config hmconfig;
    hmconfig = new org.voltcore.messaging.HostMessenger.Config(hostname, port);
    if (m_config.m_placementGroup != null) {
        hmconfig.group = m_config.m_placementGroup;
    }
    hmconfig.internalPort = m_config.m_internalPort;
    hmconfig.internalInterface = m_config.m_internalInterface;
    hmconfig.zkInterface = m_config.m_zkInterface;
    hmconfig.deadHostTimeout = m_config.m_deadHostTimeoutMS;
    hmconfig.factory = new VoltDbMessageFactory();
    hmconfig.coreBindIds = m_config.m_networkCoreBindings;
    hmconfig.acceptor = criteria;
    hmconfig.localSitesCount = m_config.m_sitesperhost;

    m_messenger = new org.voltcore.messaging.HostMessenger(hmconfig, this);

    hostLog.info(String.format("Beginning inter-node communication on port %d.", m_config.m_internalPort));

    try {
        m_messenger.start();
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
    }

    VoltZK.createPersistentZKNodes(m_messenger.getZK());

    // Use the host messenger's hostId.
    m_myHostId = m_messenger.getHostId();
    hostLog.info(String.format("Host id of this node is: %d", m_myHostId));
    consoleLog.info(String.format("Host id of this node is: %d", m_myHostId));

    MeshProber.Determination determination = criteria.waitForDetermination();

    // paused is determined in the mesh formation exchange
    if (determination.paused) {
        m_messenger.pause();
    } else {
        m_messenger.unpause();
    }

    // Semi-hacky check to see if we're attempting to rejoin to ourselves.
    // The leader node gets assigned host ID 0, always, so if we're the
    // leader and we're rejoining, this is clearly bad.
    if (m_myHostId == 0 && determination.startAction.doesJoin()) {
        VoltDB.crashLocalVoltDB("Unable to rejoin a node to itself. " +
                "Please check your command line and start action and try again.", false, null);
    }
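    // Joining or rejoining nodes adopt the cluster-wide settings already published in
    // ZooKeeper and persist them locally; the leader (host id 0) instead publishes its
    // own settings to ZooKeeper for the rest of the cluster to pick up.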
" + "Please check your command line and start action and try again.", false, null); } // load or store settings form/to zookeeper if (determination.startAction.doesJoin()) { m_clusterSettings.load(m_messenger.getZK()); m_clusterSettings.get().store(); } else if (m_myHostId == 0) { m_clusterSettings.store(m_messenger.getZK()); } m_clusterCreateTime = m_messenger.getInstanceId().getTimestamp(); return determination; } void logDebuggingInfo(int adminPort, int httpPort, String httpPortExtraLogMessage, boolean jsonEnabled) { String startAction = m_config.m_startAction.toString(); String startActionLog = "Database start action is " + (startAction.substring(0, 1).toUpperCase() + startAction.substring(1).toLowerCase()) + "."; if (!m_rejoining) { hostLog.info(startActionLog); } // print out awesome network stuff hostLog.info(String.format("Listening for native wire protocol clients on port %d.", m_config.m_port)); hostLog.info(String.format("Listening for admin wire protocol clients on port %d.", adminPort)); if (m_startMode == OperationMode.PAUSED) { hostLog.info(String.format("Started in admin mode. Clients on port %d will be rejected in admin mode.", m_config.m_port)); } if (getReplicationRole() == ReplicationRole.REPLICA) { consoleLog.info("Started as " + getReplicationRole().toString().toLowerCase() + " cluster. " + "Clients can only call read-only procedures."); } if (httpPortExtraLogMessage != null) { hostLog.info(httpPortExtraLogMessage); } if (httpPort != -1) { hostLog.info(String.format("Local machine HTTP monitoring is listening on port %d.", httpPort)); } else { hostLog.info(String.format("Local machine HTTP monitoring is disabled.")); } if (jsonEnabled) { hostLog.info(String.format("Json API over HTTP enabled at path /api/1.0/, listening on port %d.", httpPort)); } else { hostLog.info("Json API disabled."); } // java heap size long javamaxheapmem = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax(); javamaxheapmem /= (1024 * 1024); hostLog.info(String.format("Maximum usable Java heap set to %d mb.", javamaxheapmem)); // Computed minimum heap requirement long minRqt = computeMinimumHeapRqt(MiscUtils.isPro(), m_catalogContext.tables.size(), (m_iv2Initiators.size() - 1), m_configuredReplicationFactor); hostLog.info("Minimum required Java heap for catalog and server config is " + minRqt + " MB."); SortedMap<String, String> dbgMap = m_catalogContext.getDebuggingInfoFromCatalog(true); for (String line : dbgMap.values()) { hostLog.info(line); } // print out a bunch of useful system info PlatformProperties pp = PlatformProperties.getPlatformProperties(); String[] lines = pp.toLogLines(getVersionString()).split("\n"); for (String line : lines) { hostLog.info(line.trim()); } if (m_catalogContext.cluster.getDrconsumerenabled() || m_catalogContext.cluster.getDrproducerenabled()) { hostLog.info("DR initializing with Cluster Id " + m_catalogContext.cluster.getDrclusterid() + ". 
The DR cluster was first started at " + new Date(m_clusterCreateTime).toString() + "."); } final ZooKeeper zk = m_messenger.getZK(); ZKUtil.ByteArrayCallback operationModeFuture = new ZKUtil.ByteArrayCallback(); /* * Publish our cluster metadata, and then retrieve the metadata * for the rest of the cluster */ try { zk.create( VoltZK.cluster_metadata + "/" + m_messenger.getHostId(), getLocalMetadata().getBytes("UTF-8"), Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, new ZKUtil.StringCallback(), null); zk.getData(VoltZK.operationMode, false, operationModeFuture, null); } catch (Exception e) { VoltDB.crashLocalVoltDB("Error creating \"/cluster_metadata\" node in ZK", true, e); } Map<Integer, String> clusterMetadata = new HashMap<>(0); /* * Spin and attempt to retrieve cluster metadata for all nodes in the cluster. */ Set<Integer> metadataToRetrieve = new HashSet<>(m_messenger.getLiveHostIds()); metadataToRetrieve.remove(m_messenger.getHostId()); while (!metadataToRetrieve.isEmpty()) { Map<Integer, ZKUtil.ByteArrayCallback> callbacks = new HashMap<>(); for (Integer hostId : metadataToRetrieve) { ZKUtil.ByteArrayCallback cb = new ZKUtil.ByteArrayCallback(); zk.getData(VoltZK.cluster_metadata + "/" + hostId, false, cb, null); callbacks.put(hostId, cb); } for (Map.Entry<Integer, ZKUtil.ByteArrayCallback> entry : callbacks.entrySet()) { try { ZKUtil.ByteArrayCallback cb = entry.getValue(); Integer hostId = entry.getKey(); clusterMetadata.put(hostId, new String(cb.getData(), "UTF-8")); metadataToRetrieve.remove(hostId); } catch (KeeperException.NoNodeException e) {} catch (Exception e) { VoltDB.crashLocalVoltDB("Error retrieving cluster metadata", true, e); } } } // print out cluster membership hostLog.info("About to list cluster interfaces for all nodes with format [ip1 ip2 ... 
ipN] client-port,admin-port,http-port");
    for (int hostId : m_messenger.getLiveHostIds()) {
        if (hostId == m_messenger.getHostId()) {
            hostLog.info(String.format(" Host id: %d with interfaces: %s [SELF]",
                    hostId, MiscUtils.formatHostMetadataFromJSON(getLocalMetadata())));
        } else {
            String hostMeta = clusterMetadata.get(hostId);
            hostLog.info(String.format(" Host id: %d with interfaces: %s [PEER]",
                    hostId, MiscUtils.formatHostMetadataFromJSON(hostMeta)));
        }
    }

    try {
        if (operationModeFuture.getData() != null) {
            String operationModeStr = new String(operationModeFuture.getData(), "UTF-8");
            m_startMode = OperationMode.valueOf(operationModeStr);
        }
    } catch (KeeperException.NoNodeException e) {
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

public static String[] extractBuildInfo(VoltLogger logger) {
    StringBuilder sb = new StringBuilder(64);
    try {
        InputStream buildstringStream = ClassLoader.getSystemResourceAsStream("buildstring.txt");
        if (buildstringStream != null) {
            // Read one byte at a time; use an int so that the end-of-stream marker (-1)
            // is not confused with a legitimate 0xFF byte.
            int b;
            while ((b = buildstringStream.read()) != -1) {
                sb.append((char) b);
            }
            String parts[] = sb.toString().split(" ", 2);
            if (parts.length == 2) {
                parts[0] = parts[0].trim();
                parts[1] = parts[0] + "_" + parts[1].trim();
                return parts;
            }
        }
    } catch (Exception ignored) {
    }
    try {
        InputStream versionstringStream = new FileInputStream("version.txt");
        try {
            int b;
            while ((b = versionstringStream.read()) != -1) {
                sb.append((char) b);
            }
            return new String[] { sb.toString().trim(), "VoltDB" };
        } finally {
            versionstringStream.close();
        }
    } catch (Exception ignored2) {
        if (logger != null) {
            logger.l7dlog(Level.ERROR, LogKeys.org_voltdb_VoltDB_FailedToRetrieveBuildString.name(), null);
        }
        return new String[] { m_defaultVersionString, "VoltDB" };
    }
}

@Override
public void readBuildInfo(String editionTag) {
    String buildInfo[] = extractBuildInfo(hostLog);
    m_versionString = buildInfo[0];
    m_buildString = buildInfo[1];
    String buildString = m_buildString;
    if (m_buildString.contains("_")) {
        buildString = m_buildString.split("_", 2)[1];
    }
    consoleLog.info(String.format("Build: %s %s %s", m_versionString, buildString, editionTag));
}

void logSystemSettingFromCatalogContext() {
    if (m_catalogContext == null) {
        return;
    }
    Deployment deploy = m_catalogContext.cluster.getDeployment().get("deployment");
    Systemsettings sysSettings = deploy.getSystemsettings().get("systemsettings");

    if (sysSettings == null) {
        return;
    }
    hostLog.info("Elastic duration set to " + sysSettings.getElasticduration() + " milliseconds");
    hostLog.info("Elastic throughput set to " + sysSettings.getElasticthroughput() + " mb/s");
    hostLog.info("Max temptable size set to " + sysSettings.getTemptablemaxsize() + " mb");
    hostLog.info("Snapshot priority set to " + sysSettings.getSnapshotpriority() + " [0 - 10]");
    if (sysSettings.getQuerytimeout() > 0) {
        hostLog.info("Query timeout set to " + sysSettings.getQuerytimeout() + " milliseconds");
        m_config.m_queryTimeout = sysSettings.getQuerytimeout();
    } else if (sysSettings.getQuerytimeout() == 0) {
        hostLog.info("Query timeout set to unlimited");
        m_config.m_queryTimeout = 0;
    }
}
/**
 * Start all the site's event loops. That's it.
 */
@Override
public void run() {
    if (m_restoreAgent != null) {
        // start the restore process
        m_restoreAgent.restore();
    } else {
        onSnapshotRestoreCompletion();
        onReplayCompletion(Long.MIN_VALUE, m_iv2InitiatorStartingTxnIds);
    }

    // Start the rejoin coordinator
    if (m_joinCoordinator != null) {
        try {
            m_statusTracker.setNodeState(NodeState.REJOINING);
            if (!m_joinCoordinator.startJoin(m_catalogContext.database)) {
                VoltDB.crashLocalVoltDB("Failed to join the cluster", true, null);
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Failed to join the cluster", true, e);
        }
    }

    m_isRunning = true;
}

/**
 * Try to shut everything down so the system is ready to call
 * initialize again.
 * @param mainSiteThread The thread that initialized VoltDB or
 * null if called from that thread.
 */
@Override
public boolean shutdown(Thread mainSiteThread) throws InterruptedException {
    synchronized (m_startAndStopLock) {
        boolean did_it = false;
        if (m_mode != OperationMode.SHUTTINGDOWN) {
            did_it = true;
            m_mode = OperationMode.SHUTTINGDOWN;

            /*
             * Various scheduled tasks get crashy in unit tests if they happen to run
             * while other stuff is being shut down. The double catch of Throwable is
             * only for the sake of tests.
             */
            try {
                for (ScheduledFuture<?> sc : m_periodicWorks) {
                    sc.cancel(false);
                    try {
                        sc.get();
                    } catch (Throwable t) {
                    }
                }
            } catch (Throwable t) {
            }

            // Shut down the import processors.
            ImportManager.instance().shutdown();

            // clear resMonitorWork
            resMonitorWork = null;

            m_periodicWorks.clear();
            m_snapshotCompletionMonitor.shutdown();
            m_periodicWorkThread.shutdown();
            m_periodicWorkThread.awaitTermination(356, TimeUnit.DAYS);
            m_periodicPriorityWorkThread.shutdown();
            m_periodicPriorityWorkThread.awaitTermination(356, TimeUnit.DAYS);

            if (m_elasticJoinService != null) {
                m_elasticJoinService.shutdown();
            }

            if (m_leaderAppointer != null) {
                m_leaderAppointer.shutdown();
            }
            m_globalServiceElector.shutdown();

            if (m_hasStartedSampler.get()) {
                m_sampler.setShouldStop();
                m_sampler.join();
            }

            // shutdown the web monitoring / json
            if (m_adminListener != null) {
                m_adminListener.stop();
            }

            // shut down the client interface
            if (m_clientInterface != null) {
                m_clientInterface.shutdown();
                m_clientInterface = null;
            }
            // send a hostDown trap as the client interface is no longer available
            m_snmp.hostDown(FaultLevel.INFO, m_messenger.getHostId(), "Host is shutting down");

            // tell the iv2 sites to stop their runloop
            if (m_iv2Initiators != null) {
                for (Initiator init : m_iv2Initiators.values()) {
                    init.shutdown();
                }
            }

            if (m_cartographer != null) {
                m_cartographer.shutdown();
            }

            if (m_configLogger != null) {
                m_configLogger.join();
            }

            // shut down Export and its connectors.
            ExportManager.instance().shutdown();

            // After the sites are terminated, shut down the DRProducer.
            // The DRProducer is shared by all sites; don't kill it while any site is active.
if (m_producerDRGateway != null) { try { m_producerDRGateway.shutdown(); } catch (InterruptedException e) { hostLog.warn("Interrupted shutting down invocation buffer server", e); } finally { m_producerDRGateway = null; } } shutdownReplicationConsumerRole(); if (m_snapshotIOAgent != null) { m_snapshotIOAgent.shutdown(); } // shut down the network/messaging stuff // Close the host messenger first, which should close down all of // the ForeignHost sockets cleanly if (m_messenger != null) { m_messenger.shutdown(); } m_messenger = null; // shutdown the cipher service CipherExecutor.SERVER.shutdown(); //Also for test code that expects a fresh stats agent if (m_opsRegistrar != null) { try { m_opsRegistrar.shutdown(); } finally { m_opsRegistrar = null; } } if (m_asyncCompilerAgent != null) { m_asyncCompilerAgent.shutdown(); m_asyncCompilerAgent = null; } ExportManager.instance().shutdown(); m_computationService.shutdown(); m_computationService.awaitTermination(1, TimeUnit.DAYS); m_computationService = null; m_catalogContext = null; m_initiatorStats = null; m_latencyStats = null; m_latencyCompressedStats = null; m_latencyHistogramStats = null; AdHocCompilerCache.clearHashCache(); org.voltdb.iv2.InitiatorMailbox.m_allInitiatorMailboxes.clear(); PartitionDRGateway.m_partitionDRGateways = ImmutableMap.of(); // probably unnecessary, but for tests it's nice because it // will do the memory checking and run finalizers System.gc(); System.runFinalization(); m_isRunning = false; } return did_it; } } @Override synchronized public void logUpdate(String xmlConfig, long currentTxnId, File voltroot) { // another site already did this work. if (currentTxnId == m_lastLogUpdateTxnId) { return; } else if (currentTxnId < m_lastLogUpdateTxnId) { throw new RuntimeException( "Trying to update logging config at transaction " + m_lastLogUpdateTxnId + " with an older transaction: " + currentTxnId); } hostLog.info("Updating RealVoltDB logging config from txnid: " + m_lastLogUpdateTxnId + " to " + currentTxnId); m_lastLogUpdateTxnId = currentTxnId; VoltLogger.configure(xmlConfig, voltroot); } /** Struct to associate a context with a counter of served sites */ private static class ContextTracker { ContextTracker(CatalogContext context, CatalogSpecificPlanner csp) { m_dispensedSites = 1; m_context = context; m_csp = csp; } long m_dispensedSites; final CatalogContext m_context; final CatalogSpecificPlanner m_csp; } /** Associate transaction ids to contexts */ private final HashMap<Long, ContextTracker>m_txnIdToContextTracker = new HashMap<>(); @Override public Pair<CatalogContext, CatalogSpecificPlanner> catalogUpdate( String diffCommands, byte[] newCatalogBytes, byte[] catalogBytesHash, int expectedCatalogVersion, long currentTxnId, long currentTxnUniqueId, byte[] deploymentBytes, byte[] deploymentHash, boolean requireCatalogDiffCmdsApplyToEE, boolean hasSchemaChange, boolean requiresNewExportGeneration) { try { synchronized(m_catalogUpdateLock) { final ReplicationRole oldRole = getReplicationRole(); m_statusTracker.setNodeState(NodeState.UPDATING); // A site is catching up with catalog updates if (currentTxnId <= m_catalogContext.m_transactionId && !m_txnIdToContextTracker.isEmpty()) { ContextTracker contextTracker = m_txnIdToContextTracker.get(currentTxnId); // This 'dispensed' concept is a little crazy fragile. Maybe it would be better // to keep a rolling N catalogs? Or perhaps to keep catalogs for N minutes? Open // to opinions here. 
contextTracker.m_dispensedSites++; int ttlsites = VoltDB.instance().getSiteTrackerForSnapshot().getSitesForHost(m_messenger.getHostId()).size(); if (contextTracker.m_dispensedSites == ttlsites) { m_txnIdToContextTracker.remove(currentTxnId); } return Pair.of( contextTracker.m_context, contextTracker.m_csp); } else if (m_catalogContext.catalogVersion != expectedCatalogVersion) { hostLog.fatal("Failed catalog update." + " expectedCatalogVersion: " + expectedCatalogVersion + " currentTxnId: " + currentTxnId + " currentTxnUniqueId: " + currentTxnUniqueId + " m_catalogContext.catalogVersion: " + m_catalogContext.catalogVersion); throw new RuntimeException("Trying to update main catalog context with diff " + "commands generated for an out-of-date catalog. Expected catalog version: " + expectedCatalogVersion + " does not match actual version: " + m_catalogContext.catalogVersion); } // get old debugging info SortedMap<String, String> oldDbgMap = m_catalogContext.getDebuggingInfoFromCatalog(false); byte[] oldDeployHash = m_catalogContext.deploymentHash; final String oldDRConnectionSource = m_catalogContext.cluster.getDrmasterhost(); // 0. A new catalog! Update the global context and the context tracker m_catalogContext = m_catalogContext.update( currentTxnId, currentTxnUniqueId, newCatalogBytes, catalogBytesHash, diffCommands, true, deploymentBytes, m_messenger, hasSchemaChange); final CatalogSpecificPlanner csp = new CatalogSpecificPlanner( m_asyncCompilerAgent, m_catalogContext); m_txnIdToContextTracker.put(currentTxnId, new ContextTracker( m_catalogContext, csp)); // log the stuff that's changed in this new catalog update SortedMap<String, String> newDbgMap = m_catalogContext.getDebuggingInfoFromCatalog(false); for (Entry<String, String> e : newDbgMap.entrySet()) { // skip log lines that are unchanged if (oldDbgMap.containsKey(e.getKey()) && oldDbgMap.get(e.getKey()).equals(e.getValue())) { continue; } hostLog.info(e.getValue()); } // Construct the list of partitions and sites because it simply doesn't exist anymore SiteTracker siteTracker = VoltDB.instance().getSiteTrackerForSnapshot(); List<Long> sites = siteTracker.getSitesForHost(m_messenger.getHostId()); List<Integer> partitions = new ArrayList<>(); for (Long site : sites) { Integer partition = siteTracker.getPartitionForSite(site); partitions.add(partition); } // 1. update the export manager. ExportManager.instance().updateCatalog(m_catalogContext, requireCatalogDiffCmdsApplyToEE, requiresNewExportGeneration, partitions); // 1.1 Update the elastic join throughput settings if (m_elasticJoinService != null) m_elasticJoinService.updateConfig(m_catalogContext); // 1.5 update the dead host timeout if (m_catalogContext.cluster.getHeartbeattimeout() * 1000 != m_config.m_deadHostTimeoutMS) { m_config.m_deadHostTimeoutMS = m_catalogContext.cluster.getHeartbeattimeout() * 1000; m_messenger.setDeadHostTimeout(m_config.m_deadHostTimeoutMS); } // 2. update client interface (asynchronously) // CI in turn updates the planner thread. if (m_clientInterface != null) { m_clientInterface.notifyOfCatalogUpdate(); } // 3. update HTTPClientInterface (asynchronously) // This purges cached connection state so that access with // stale auth info is prevented. if (m_adminListener != null) { m_adminListener.notifyOfCatalogUpdate(); } m_clientInterface.getDispatcher().notifyNTProcedureServiceOfPreCatalogUpdate(); // 4. Flush StatisticsAgent old user PROCEDURE statistics. // The stats agent will hold all other stats in memory.
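// Steps 4-5 are ordered deliberately: procedure stats are flushed before the MPI update
// below re-registers its statistics.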
getStatsAgent().notifyOfCatalogUpdate(); // 4.5. (added) // Update the NT procedure service AFTER stats are cleared in the previous step m_clientInterface.getDispatcher().notifyNTProcedureServiceOfCatalogUpdate(); // 5. MPIs don't run fragments. Update them here. Do // this after flushing the stats -- this will re-register // the MPI statistics. if (m_MPI != null) { m_MPI.updateCatalog(diffCommands, m_catalogContext, csp, requireCatalogDiffCmdsApplyToEE, requiresNewExportGeneration); } // Update the catalog for the import processor; this should just be stop/start and update partitions. ImportManager.instance().updateCatalog(m_catalogContext, m_messenger); // 6. Perform updates required by the DR subsystem // 6.1. Perform any actions that would have been taken during the ordinary initialization path if (m_consumerDRGateway != null) { // 6.2. If we are a DR replica and the consumer was created // before the catalog update, we may care about a deployment // update. If it was created above, no need to notify // because the consumer already has the latest catalog. final String newDRConnectionSource = m_catalogContext.cluster.getDrmasterhost(); m_consumerDRGateway.updateCatalog(m_catalogContext, (newDRConnectionSource != null && !newDRConnectionSource.equals(oldDRConnectionSource) ? newDRConnectionSource : null), (byte) m_catalogContext.cluster.getPreferredsource()); } // Check if this is a promotion if (oldRole == ReplicationRole.REPLICA && m_catalogContext.cluster.getDrrole().equals("master")) { // Promote replica to master promoteToMaster(); } // 6.3. If we are a DR master, update the DR table signature hash if (m_producerDRGateway != null) { m_producerDRGateway.updateCatalog(m_catalogContext, VoltDB.getReplicationPort(m_catalogContext.cluster.getDrproducerport())); } new ConfigLogging().logCatalogAndDeployment(); // log system setting information if the deployment config has changed if (!Arrays.equals(oldDeployHash, m_catalogContext.deploymentHash)) { logSystemSettingFromCatalogContext(); } // Before starting the resource monitor, apply any SNMP configuration changes. if (m_snmp != null) { m_snmp.notifyOfCatalogUpdate(m_catalogContext.getDeployment().getSnmp()); } // restart resource usage monitoring task startHealthMonitor(); checkHeapSanity(MiscUtils.isPro(), m_catalogContext.tables.size(), (m_iv2Initiators.size() - 1), m_configuredReplicationFactor); checkThreadsSanity(); return Pair.of(m_catalogContext, csp); } } finally { // Set state back to UP m_statusTracker.setNodeState(NodeState.UP); } } @Override public Pair<CatalogContext, CatalogSpecificPlanner> settingsUpdate( ClusterSettings settings, final int expectedVersionId) { CatalogSpecificPlanner csp = new CatalogSpecificPlanner(m_asyncCompilerAgent, m_catalogContext); synchronized(m_catalogUpdateLock) { int stamp [] = new int[]{0}; ClusterSettings expect = m_clusterSettings.get(stamp); if ( stamp[0] == expectedVersionId && m_clusterSettings.compareAndSet(expect, settings, stamp[0], expectedVersionId+1) ) { try { settings.store(); } catch (SettingsException e) { hostLog.error(e); throw e; } } else if (stamp[0] != expectedVersionId+1) { String msg = "Failed to update cluster setting to version " + (expectedVersionId + 1) + ", from current version " + stamp[0] + ". Reloading from ZooKeeper"; hostLog.warn(msg); m_clusterSettings.load(m_messenger.getZK()); }
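// The stamped compare-and-set above rejects stale writers: on a version mismatch the node
// reloads the authoritative settings from ZooKeeper instead of applying its own copy.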
if (m_MPI != null) { m_MPI.updateSettings(m_catalogContext, csp); } // good place to set deadhost timeout once we make it a config } return Pair.of(m_catalogContext, csp); } @Override public VoltDB.Configuration getConfig() { return m_config; } @Override public String getBuildString() { return m_buildString == null ? "VoltDB" : m_buildString; } @Override public String getVersionString() { return m_versionString; } public final VersionChecker m_versionChecker = new VersionChecker() { @Override public boolean isCompatibleVersionString(String other) { return RealVoltDB.this.isCompatibleVersionString(other); } @Override public String getVersionString() { return RealVoltDB.this.getVersionString(); } @Override public String getBuildString() { return RealVoltDB.this.getBuildString(); } }; /** * Used for testing when you don't have an instance. Should do roughly what * {@link #isCompatibleVersionString(String)} does. */ public static boolean staticIsCompatibleVersionString(String versionString) { return versionString.matches(m_defaultHotfixableRegexPattern); } @Override public boolean isCompatibleVersionString(String versionString) { return versionString.matches(m_hotfixableRegexPattern); } @Override public String getEELibraryVersionString() { return m_defaultVersionString; } @Override public HostMessenger getHostMessenger() { return m_messenger; } @Override public ClientInterface getClientInterface() { return m_clientInterface; } @Override public OpsAgent getOpsAgent(OpsSelector selector) { return m_opsRegistrar.getAgent(selector); } @Override public StatsAgent getStatsAgent() { OpsAgent statsAgent = m_opsRegistrar.getAgent(OpsSelector.STATISTICS); assert(statsAgent instanceof StatsAgent); return (StatsAgent)statsAgent; } @Override public MemoryStats getMemoryStatsSource() { return m_memoryStats; } @Override public CatalogContext getCatalogContext() { return m_catalogContext; } /** * Tells whether VoltDB is running. m_isRunning is set to true * when the run() method is called, and set to false when shutting down. * * @return true if VoltDB is running. */ @Override public boolean isRunning() { return m_isRunning; } @Override public void halt() { SnmpTrapSender snmp = getSnmpTrapSender(); if (snmp != null) { try { snmp.hostDown(FaultLevel.INFO, m_messenger.getHostId(), "Host is shutting down because of @StopNode"); snmp.shutdown(); } catch (Throwable t) { VoltLogger log = new VoltLogger("HOST"); log.warn("failed to issue a crash SNMP trap", t); } } Thread shutdownThread = new Thread() { @Override public void run() { hostLog.warn("VoltDB node shutting down as requested by @StopNode command."); System.exit(0); } }; shutdownThread.start(); } /** * Debugging function - creates a record of the current state of the system. * @param out PrintStream to write the report to. */ public void createRuntimeReport(PrintStream out) { // This function may be running in its own thread.
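// The report is emitted as a MIME multipart document; each section is introduced by the
// "reportsection" boundary written below.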
out.print("MIME-Version: 1.0\n"); out.print("Content-type: multipart/mixed; boundary=\"reportsection\""); out.print("\n\n--reportsection\nContent-Type: text/plain\n\nClientInterface Report\n"); if (m_clientInterface != null) { out.print(m_clientInterface.toString() + "\n"); } } @Override public BackendTarget getBackendTargetType() { return m_config.m_backend; } @Override public synchronized void onExecutionSiteRejoinCompletion(long transferred) { m_executionSiteRecoveryFinish = System.currentTimeMillis(); m_executionSiteRecoveryTransferred = transferred; onRejoinCompletion(); } private void onRejoinCompletion() { // null out the rejoin coordinator if (m_joinCoordinator != null) { m_joinCoordinator.close(); } m_joinCoordinator = null; // Mark the data transfer as done so CL can make the right decision when a truncation snapshot completes m_rejoinDataPending = false; try { m_testBlockRecoveryCompletion.acquire(); } catch (InterruptedException e) {} final long delta = ((m_executionSiteRecoveryFinish - m_recoveryStartTime) / 1000); final long megabytes = m_executionSiteRecoveryTransferred / (1024 * 1024); final double megabytesPerSecond = megabytes / ((m_executionSiteRecoveryFinish - m_recoveryStartTime) / 1000.0); deleteStagedCatalogIfNeeded(); if (m_clientInterface != null) { m_clientInterface.mayActivateSnapshotDaemon(); try { m_clientInterface.startAcceptingConnections(); } catch (IOException e) { hostLog.l7dlog(Level.FATAL, LogKeys.host_VoltDB_ErrorStartAcceptingConnections.name(), e); VoltDB.crashLocalVoltDB("Error starting client interface.", true, e); } // send hostUp trap m_snmp.hostUp("Host is now a cluster member"); if (m_producerDRGateway != null && !m_producerDRGateway.isStarted()) { // Initialize DR producer and consumer start listening on the DR ports initializeDRProducer(); createDRConsumerIfNeeded(); prepareReplication(); } } startHealthMonitor(); try { if (m_adminListener != null) { m_adminListener.start(); } } catch (Exception e) { hostLog.l7dlog(Level.FATAL, LogKeys.host_VoltDB_ErrorStartHTTPListener.name(), e); VoltDB.crashLocalVoltDB("HTTP service unable to bind to port.", true, e); } // Allow export datasources to start consuming their binary deques safely // as at this juncture the initial truncation snapshot is already complete ExportManager.instance().startPolling(m_catalogContext); //Tell import processors that they can start ingesting data. ImportManager.instance().readyForData(m_catalogContext, m_messenger); if (m_config.m_startAction == StartAction.REJOIN) { consoleLog.info( "Node data recovery completed after " + delta + " seconds with " + megabytes + " megabytes transferred at a rate of " + megabytesPerSecond + " megabytes/sec"); } try { final ZooKeeper zk = m_messenger.getZK(); boolean logRecoveryCompleted = false; if (getCommandLog().getClass().getName().equals("org.voltdb.CommandLogImpl")) { String requestNode = zk.create(VoltZK.request_truncation_snapshot_node, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL); if (m_rejoinTruncationReqId == null) { m_rejoinTruncationReqId = requestNode; } } else { logRecoveryCompleted = true; } // Join creates a truncation snapshot as part of the join process, // so there is no need to wait for the truncation snapshot requested // above to finish. 
if (logRecoveryCompleted || m_joining) { if (m_rejoining) { CoreZK.removeRejoinNodeIndicatorForHost(m_messenger.getZK(), m_myHostId); m_rejoining = false; } if (m_joining) { CoreZK.removeJoinNodeIndicatorForHost(m_messenger.getZK(), m_myHostId); } String actionName = m_joining ? "join" : "rejoin"; m_joining = false; consoleLog.info(String.format("Node %s completed", actionName)); } } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to log host rejoin completion to ZK", true, e); } hostLog.info("Logging host rejoin completion to ZK"); m_statusTracker.setNodeState(NodeState.UP); Object args[] = { (VoltDB.instance().getMode() == OperationMode.PAUSED) ? "PAUSED" : "NORMAL"}; consoleLog.l7dlog( Level.INFO, LogKeys.host_VoltDB_ServerOpMode.name(), args, null); consoleLog.l7dlog( Level.INFO, LogKeys.host_VoltDB_ServerCompletedInitialization.name(), null, null); } @Override public CommandLog getCommandLog() { return m_commandLog; } @Override public OperationMode getMode() { return m_mode; } @Override public void setMode(OperationMode mode) { if (m_mode != mode) { if (mode == OperationMode.PAUSED) { m_config.m_isPaused = true; m_statusTracker.setNodeState(NodeState.PAUSED); hostLog.info("Server is entering admin mode and pausing."); } else if (m_mode == OperationMode.PAUSED) { m_config.m_isPaused = false; m_statusTracker.setNodeState(NodeState.UP); hostLog.info("Server is exiting admin mode and resuming operation."); } } m_mode = mode; } @Override public void setStartMode(OperationMode mode) { m_startMode = mode; } @Override public OperationMode getStartMode() { return m_startMode; } @Override public void promoteToMaster() { consoleLog.info("Promoting replication role from replica to master."); hostLog.info("Promoting replication role from replica to master."); shutdownReplicationConsumerRole(); if (m_clientInterface != null) { m_clientInterface.setReplicationRole(getReplicationRole()); } } private void replaceDRConsumerStatsWithDummy() { getStatsAgent().deregisterStatsSourcesFor(StatsSelector.DRCONSUMERNODE, 0); getStatsAgent().deregisterStatsSourcesFor(StatsSelector.DRCONSUMERPARTITION, 0); getStatsAgent().registerStatsSource(StatsSelector.DRCONSUMERNODE, 0, new DRConsumerStatsBase.DRConsumerNodeStatsBase()); getStatsAgent().registerStatsSource(StatsSelector.DRCONSUMERPARTITION, 0, new DRConsumerStatsBase.DRConsumerPartitionStatsBase()); } private void shutdownReplicationConsumerRole() { if (m_consumerDRGateway != null) { try { m_consumerDRGateway.shutdown(true); } catch (InterruptedException|ExecutionException e) { hostLog.warn("Interrupted shutting down dr replication", e); } finally { m_consumerDRGateway = null; } } } @Override public ReplicationRole getReplicationRole() { final String role = m_catalogContext.cluster.getDrrole(); if (role.equals(DrRoleType.REPLICA.value())) { return ReplicationRole.REPLICA; } else { return ReplicationRole.NONE; } } /** * Metadata is a JSON object */ @Override public String getLocalMetadata() { return m_localMetadata; } @Override public void onSnapshotRestoreCompletion() { if (!m_rejoining && !m_joining) { initializeDRProducer(); } } @Override public void onReplayCompletion(long txnId, Map<Integer, Long> perPartitionTxnIds) { /* * Remove the terminus file if it is there, which is written on shutdown --save */ new File(m_nodeSettings.getVoltDBRoot(), VoltDB.TERMINUS_MARKER).delete(); /* * Command log is already initialized if this is a rejoin or a join */ if ((m_commandLog != null) && (m_commandLog.needsInitialization())) { // Initialize command logger 
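// init() is passed the configured log size, the replay txnId, the partition count,
// the command log binding, and the per-partition txn ids.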
m_commandLog.init(m_catalogContext.cluster.getLogconfig().get("log").getLogsize(), txnId, m_cartographer.getPartitionCount(), m_config.m_commandLogBinding, perPartitionTxnIds); try { ZKCountdownLatch latch = new ZKCountdownLatch(m_messenger.getZK(), VoltZK.commandlog_init_barrier, m_messenger.getLiveHostIds().size()); latch.countDown(true); latch.await(); } catch (Exception e) { VoltDB.crashLocalVoltDB("Failed to init and wait on command log init barrier", true, e); } } /* * IV2: After the command log is initialized, force the writing of the initial * viable replay set. Turns into a no-op with no command log, on the non-leader sites, and on the MPI. */ for (Initiator initiator : m_iv2Initiators.values()) { initiator.enableWritingIv2FaultLog(); } /* * IV2: From this point on, not all node failures should crash global VoltDB. */ if (m_leaderAppointer != null) { m_leaderAppointer.onReplayCompletion(); } deleteStagedCatalogIfNeeded(); if (m_startMode != null) { m_mode = m_startMode; } else { // Shouldn't be here, but to be safe m_mode = OperationMode.RUNNING; } if (!m_rejoining && !m_joining) { if (m_clientInterface != null) { try { m_clientInterface.startAcceptingConnections(); } catch (IOException e) { hostLog.l7dlog(Level.FATAL, LogKeys.host_VoltDB_ErrorStartAcceptingConnections.name(), e); VoltDB.crashLocalVoltDB("Error starting client interface.", true, e); } // send hostUp trap m_snmp.hostUp("host is now a cluster member"); } // Start listening on the DR ports createDRConsumerIfNeeded(); prepareReplication(); startHealthMonitor(); // Allow export datasources to start consuming their binary deques safely // as at this juncture the initial truncation snapshot is already complete ExportManager.instance().startPolling(m_catalogContext); //Tell import processors that they can start ingesting data. ImportManager.instance().readyForData(m_catalogContext, m_messenger); } try { if (m_adminListener != null) { m_adminListener.start(); } } catch (Exception e) { hostLog.l7dlog(Level.FATAL, LogKeys.host_VoltDB_ErrorStartHTTPListener.name(), e); VoltDB.crashLocalVoltDB("HTTP service unable to bind to port.", true, e); } if (!m_rejoining && !m_joining) { Object args[] = { (m_mode == OperationMode.PAUSED) ? "PAUSED" : "NORMAL"}; consoleLog.l7dlog( Level.INFO, LogKeys.host_VoltDB_ServerOpMode.name(), args, null); consoleLog.l7dlog( Level.INFO, LogKeys.host_VoltDB_ServerCompletedInitialization.name(), null, null); m_statusTracker.setNodeState(NodeState.UP); } // Create a zk node to indicate initialization is completed m_messenger.getZK().create(VoltZK.init_completed, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, new ZKUtil.StringCallback(), null); } private void deleteStagedCatalogIfNeeded() { if (((m_commandLog != null) && m_commandLog.isEnabled()) || (m_terminusNonce != null)) { File stagedCatalog = new VoltFile(RealVoltDB.getStagedCatalogPath(getVoltDBRootPath())); if (stagedCatalog.exists()) { if (stagedCatalog.delete()) { hostLog.info("Saved copy of the initialized schema deleted because command logs and/or snapshots are in use."); } else { hostLog.warn("Failed to delete the saved copy of the initialized schema."); } } } } @Override public SnapshotCompletionMonitor getSnapshotCompletionMonitor() { return m_snapshotCompletionMonitor; } @Override public synchronized void recoveryComplete(String requestId) { assert(m_rejoinDataPending == false); if (m_rejoining) { if (m_rejoinTruncationReqId.compareTo(requestId) <= 0) { String actionName = m_joining ? 
"join" : "rejoin"; // remove the rejoin blocker CoreZK.removeRejoinNodeIndicatorForHost(m_messenger.getZK(), m_myHostId); consoleLog.info(String.format("Node %s completed", actionName)); m_rejoinTruncationReqId = null; m_rejoining = false; } else { // If we saw some other truncation request ID, then try the same one again. As long as we // don't flip the m_rejoining state, all truncation snapshot completions will call back to here. try { final ZooKeeper zk = m_messenger.getZK(); String requestNode = zk.create(VoltZK.request_truncation_snapshot_node, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT_SEQUENTIAL); if (m_rejoinTruncationReqId == null) { m_rejoinTruncationReqId = requestNode; } } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to retry post-rejoin truncation snapshot request.", true, e); } } } } @Override public ScheduledExecutorService getSES(boolean priority) { return priority ? m_periodicPriorityWorkThread : m_periodicWorkThread; } /** * See comment on {@link VoltDBInterface#scheduleWork(Runnable, long, long, TimeUnit)} vs * {@link VoltDBInterface#schedulePriorityWork(Runnable, long, long, TimeUnit)} */ @Override public ScheduledFuture<?> scheduleWork(Runnable work, long initialDelay, long delay, TimeUnit unit) { if (delay > 0) { return m_periodicWorkThread.scheduleWithFixedDelay(work, initialDelay, delay, unit); } else { return m_periodicWorkThread.schedule(work, initialDelay, unit); } } @Override public ListeningExecutorService getComputationService() { return m_computationService; } /** * Initialize the DR producer so that any binary log generated on recover * will be queued. This does NOT open the DR port. That will happen after * command log replay finishes. */ private void initializeDRProducer() { try { if (m_producerDRGateway != null) { m_producerDRGateway.startAndWaitForGlobalAgreement(); for (Initiator iv2init : m_iv2Initiators.values()) { iv2init.initDRGateway(m_config.m_startAction, m_producerDRGateway, shouldInitiatorCreateMPDRGateway(iv2init)); } m_producerDRGateway.truncateDRLog(); } } catch (Exception ex) { CoreUtils.printPortsInUse(hostLog); VoltDB.crashLocalVoltDB("Failed to initialize DR producer", false, ex); } } private void prepareReplication() { // Warning: This is called on the site thread if this host is rejoining try { if (m_consumerDRGateway != null) { if (m_config.m_startAction != StartAction.CREATE) { Pair<Byte, List<MeshMemberInfo>> expectedClusterMembers = m_producerDRGateway.getInitialConversations(); m_consumerDRGateway.setInitialConversationMembership(expectedClusterMembers.getFirst(), expectedClusterMembers.getSecond()); } m_consumerDRGateway.initialize(m_config.m_startAction != StartAction.CREATE); } if (m_producerDRGateway != null) { m_producerDRGateway.startListening(m_catalogContext.cluster.getDrproducerenabled(), VoltDB.getReplicationPort(m_catalogContext.cluster.getDrproducerport()), VoltDB.getDefaultReplicationInterface()); } } catch (Exception ex) { CoreUtils.printPortsInUse(hostLog); VoltDB.crashLocalVoltDB("Failed to initialize DR", false, ex); } } private boolean shouldInitiatorCreateMPDRGateway(Initiator initiator) { // The initiator map is sorted, the initiator that has the lowest local // partition ID gets to create the MP DR gateway return initiator.getPartitionId() == m_iv2Initiators.firstKey(); } private boolean createDRConsumerIfNeeded() { if (!m_config.m_isEnterprise || (m_consumerDRGateway != null)) { return false; } final String drRole = m_catalogContext.getCluster().getDrrole(); if 
(DrRoleType.REPLICA.value().equals(drRole) || DrRoleType.XDCR.value().equals(drRole)) { byte drConsumerClusterId = (byte)m_catalogContext.cluster.getDrclusterid(); final Pair<String, Integer> drIfAndPort = VoltZK.getDRInterfaceAndPortFromMetadata(m_localMetadata); try { Class<?> rdrgwClass = Class.forName("org.voltdb.dr2.ConsumerDRGatewayImpl"); Constructor<?> rdrgwConstructor = rdrgwClass.getConstructor( ClientInterface.class, Cartographer.class, HostMessenger.class, byte.class, byte.class, String.class, int.class); m_consumerDRGateway = (ConsumerDRGateway) rdrgwConstructor.newInstance( m_clientInterface, m_cartographer, m_messenger, drConsumerClusterId, (byte) m_catalogContext.cluster.getPreferredsource(), drIfAndPort.getFirst(), drIfAndPort.getSecond()); m_globalServiceElector.registerService(m_consumerDRGateway); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to load DR system", true, e); } return true; } return false; } // Thread safe @Override public void setReplicationActive(boolean active) { if (m_replicationActive.compareAndSet(!active, active)) { try { JSONStringer js = new JSONStringer(); js.object(); js.keySymbolValuePair("active", m_replicationActive.get()); js.endObject(); getHostMessenger().getZK().setData(VoltZK.replicationconfig, js.toString().getBytes("UTF-8"), -1); } catch (Exception e) { e.printStackTrace(); hostLog.error("Failed to write replication active state to ZK: " + e.getMessage()); } if (m_producerDRGateway != null) { m_producerDRGateway.setActive(active); } } } @Override public boolean getReplicationActive() { return m_replicationActive.get(); } @Override public ProducerDRGateway getNodeDRGateway() { return m_producerDRGateway; } @Override public ConsumerDRGateway getConsumerDRGateway() { return m_consumerDRGateway; } @Override public void onSyncSnapshotCompletion() { m_leaderAppointer.onSyncSnapshotCompletion(); } @Override public void setDurabilityUniqueIdListener(Integer partition, DurableUniqueIdListener listener) { if (partition == MpInitiator.MP_INIT_PID) { m_iv2Initiators.get(m_iv2Initiators.firstKey()).setDurableUniqueIdListener(listener); } else { Initiator init = m_iv2Initiators.get(partition); assert init != null; init.setDurableUniqueIdListener(listener); } } public ExecutionEngine debugGetSpiedEE(int partitionId) { if (m_config.m_backend == BackendTarget.NATIVE_EE_SPY_JNI) { BaseInitiator init = (BaseInitiator)m_iv2Initiators.get(partitionId); return init.debugGetSpiedEE(); } else { return null; } } @Override public SiteTracker getSiteTrackerForSnapshot() { return new SiteTracker(m_messenger.getHostId(), m_cartographer.getSiteTrackerMailboxMap(), 0); } /** * Create default deployment.xml file in voltdbroot if the deployment path is null. * * @return path to default deployment file * @throws IOException */ static String setupDefaultDeployment(VoltLogger logger) throws IOException { return setupDefaultDeployment(logger, CatalogUtil.getVoltDbRoot(null)); } /** * Create default deployment.xml file in voltdbroot if the deployment path is null. 
* * @return path to the default deployment file * @throws IOException */ static String setupDefaultDeployment(VoltLogger logger, File voltdbroot) throws IOException { File configInfoDir = new VoltFile(voltdbroot, Constants.CONFIG_DIR); configInfoDir.mkdirs(); File depFH = new VoltFile(configInfoDir, "deployment.xml"); if (!depFH.exists()) { logger.info("Generating default deployment file \"" + depFH.getAbsolutePath() + "\""); try (BufferedWriter bw = new BufferedWriter(new FileWriter(depFH))) { for (String line : defaultDeploymentXML) { bw.write(line); bw.newLine(); } } } return depFH.getAbsolutePath(); } /* * Validate the build string with the rest of the cluster * by racing to publish it to ZK and then comparing the one this process * has to the one in ZK. They should all match. The method returns a future * so that init can continue while the ZK call is pending, since ZK is pretty * slow. */ private Future<?> validateBuildString(final String buildString, ZooKeeper zk) { final SettableFuture<Object> retval = SettableFuture.create(); byte buildStringBytes[] = null; try { buildStringBytes = buildString.getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new AssertionError(e); } final byte buildStringBytesFinal[] = buildStringBytes; // Can use a void callback because ZK will execute the create and then the get in order. // It's a race, so it doesn't have to succeed. zk.create( VoltZK.buildstring, buildStringBytes, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, new ZKUtil.StringCallback(), null); zk.getData(VoltZK.buildstring, false, new org.apache.zookeeper_voltpatches.AsyncCallback.DataCallback() { @Override public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat) { KeeperException.Code code = KeeperException.Code.get(rc); if (code == KeeperException.Code.OK) { if (Arrays.equals(buildStringBytesFinal, data)) { retval.set(null); } else { try { hostLog.info("Different but compatible software versions on the cluster " + "and the rejoining node. Cluster version is {" + (new String(data, "UTF-8")).split("_")[0] + "}. Rejoining node version is {" + m_defaultVersionString + "}."); retval.set(null); } catch (UnsupportedEncodingException e) { retval.setException(new AssertionError(e)); } } } else { retval.setException(KeeperException.create(code)); } } }, null); return retval; } /** * See comment on {@link VoltDBInterface#schedulePriorityWork(Runnable, long, long, TimeUnit)} vs * {@link VoltDBInterface#scheduleWork(Runnable, long, long, TimeUnit)} */ @Override public ScheduledFuture<?> schedulePriorityWork(Runnable work, long initialDelay, long delay, TimeUnit unit) { if (delay > 0) { return m_periodicPriorityWorkThread.scheduleWithFixedDelay(work, initialDelay, delay, unit); } else { return m_periodicPriorityWorkThread.schedule(work, initialDelay, unit); } }
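// The heap and thread sanity checks below only log warnings; they never block startup.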
", maxMemory)); builder.append("Please increase the maximum heap size using the VOLTDB_HEAPMAX environment variable and then restart VoltDB."); consoleLog.warn(builder.toString()); } } // Compute the minimum required heap to run this configuration. This comes from the documentation, // http://voltdb.com/docs/PlanningGuide/MemSizeServers.php#MemSizeHeapGuidelines // Any changes there should get reflected here and vice versa. static public long computeMinimumHeapRqt(boolean isPro, int tableCount, int sitesPerHost, int kfactor) { long baseRqt = 384; long tableRqt = 10 * tableCount; // K-safety Heap consumption drop to 8 MB (per node) // Snapshot cost 32 MB (per node) // Theoretically, 40 MB (per node) should be enough long rejoinRqt = (isPro && kfactor > 0) ? 128 * sitesPerHost : 0; return baseRqt + tableRqt + rejoinRqt; } private void checkThreadsSanity() { int tableCount = m_catalogContext.tables.size(); int partitions = m_iv2Initiators.size() - 1; int replicates = m_configuredReplicationFactor; int importPartitions = ImportManager.getPartitionsCount(); int exportTableCount = ExportManager.instance().getExportTablesCount(); int exportNonceCount = ExportManager.instance().getConnCount(); int expThreadsCount = computeThreadsCount(tableCount, partitions, replicates, importPartitions, exportTableCount, exportNonceCount); // if the expected number of threads exceeds the limit, update the limit. if (m_maxThreadsCount < expThreadsCount) { updateMaxThreadsLimit(); } // do insane check again. if (m_maxThreadsCount < expThreadsCount) { StringBuilder builder = new StringBuilder(); builder.append(String.format("The configuration of %d tables, %d partitions, %d replicates, ", tableCount, partitions, replicates)); builder.append(String.format("with importer configuration of %d importer partitions, ", importPartitions)); builder.append(String.format("with exporter configuration of %d export tables %d partitions %d replicates, ", exportTableCount, partitions, replicates)); builder.append(String.format("approximately requires %d threads.", expThreadsCount)); builder.append(String.format("The maximum number of threads to the system is %d. \n", m_maxThreadsCount)); builder.append("Please increase the maximum system threads number or reduce the number of threads in your program, and then restart VoltDB. 
\n"); consoleLog.warn(builder.toString()); } } private void updateMaxThreadsLimit() { String[] command = {"bash", "-c" ,"ulimit -u"}; String cmd_rst = ShellTools.local_cmd(command); try { m_maxThreadsCount = Integer.parseInt(cmd_rst.substring(0, cmd_rst.length() - 1)); } catch(Exception e) { m_maxThreadsCount = Integer.MAX_VALUE; } } private int computeThreadsCount(int tableCount, int partitionCount, int replicateCount, int importerPartitionCount, int exportTableCount, int exportNonceCount) { final int clusterBaseCount = 5; final int hostBaseCount = 56; return clusterBaseCount + (hostBaseCount + partitionCount) + computeImporterThreads(importerPartitionCount) + computeExporterThreads(exportTableCount, partitionCount, replicateCount, exportNonceCount); } private int computeImporterThreads(int importerPartitionCount) { if (importerPartitionCount == 0) { return 0; } int importerBaseCount = 6; return importerBaseCount + importerPartitionCount; } private int computeExporterThreads(int exportTableCount, int partitionCount, int replicateCount, int exportNonceCount) { if (exportTableCount == 0) { return 0; } int exporterBaseCount = 1; return exporterBaseCount + partitionCount * exportTableCount + exportNonceCount; } @Override public <T> ListenableFuture<T> submitSnapshotIOWork(Callable<T> work) { assert m_snapshotIOAgent != null; return m_snapshotIOAgent.submit(work); } @Override public long getClusterUptime() { return System.currentTimeMillis() - getHostMessenger().getInstanceId().getTimestamp(); } @Override public long getClusterCreateTime() { return m_clusterCreateTime; } @Override public void setClusterCreateTime(long clusterCreateTime) { m_clusterCreateTime = clusterCreateTime; if (m_catalogContext.cluster.getDrconsumerenabled() || m_catalogContext.cluster.getDrproducerenabled()) { hostLog.info("Restoring DR with Cluster Id " + m_catalogContext.cluster.getDrclusterid() + ". The DR cluster was first started at " + new Date(m_clusterCreateTime).toString() + "."); } } @Override public SnmpTrapSender getSnmpTrapSender() { return m_snmp; } private final Supplier<String> terminusNonceSupplier = Suppliers.memoize(new Supplier<String>() { @Override public String get() { File markerFH = new File(m_nodeSettings.getVoltDBRoot(), VoltDB.TERMINUS_MARKER); // file needs to be both writable and readable as it will be deleted onRestoreComplete if (!markerFH.exists() || !markerFH.isFile() || !markerFH.canRead() || !markerFH.canWrite()) { return null; } String nonce = null; try (BufferedReader rdr = new BufferedReader(new FileReader(markerFH))){ nonce = rdr.readLine(); } catch (IOException e) { Throwables.propagate(e); // highly unlikely } // make sure that there is a snapshot associated with the terminus nonce HashMap<String, Snapshot> snapshots = new HashMap<>(); FileFilter filter = new SnapshotUtil.SnapshotFilter(); SnapshotUtil.retrieveSnapshotFiles( m_nodeSettings.resolve(m_nodeSettings.getSnapshoth()), snapshots, filter, false, SnapshotPathType.SNAP_AUTO, hostLog); return snapshots.containsKey(nonce) ? 
nonce : null; } }); /** * Reads the file containing the startup snapshot nonce * @return null if the file is not accessible, or the startup snapshot nonce */ private String getTerminusNonce() { return terminusNonceSupplier.get(); } @Override public Cartographer getCartograhper() { return m_cartographer; } @Override public void swapTables(String oneTable, String otherTable) { if (m_consumerDRGateway != null) { Table tableA = m_catalogContext.tables.get(oneTable); Table tableB = m_catalogContext.tables.get(otherTable); assert (tableA != null && tableB != null); if (tableA.getIsdred() && tableB.getIsdred()) { long signatureHashA = Hashing.sha1().hashString(tableA.getSignature(), Charsets.UTF_8).asLong(); long signatureHashB = Hashing.sha1().hashString(tableB.getSignature(), Charsets.UTF_8).asLong(); Set<Pair<String, Long>> swappedTables = new HashSet<>(); swappedTables.add(Pair.of(oneTable.toUpperCase(), signatureHashA)); swappedTables.add(Pair.of(otherTable.toUpperCase(), signatureHashB)); m_consumerDRGateway.swapTables(swappedTables); } } } public static void printDiagnosticInformation(CatalogContext context, String procName, LoadedProcedureSet procSet) { StringBuilder sb = new StringBuilder(); final CatalogMap<Procedure> catalogProcedures = context.database.getProcedures(); PureJavaCrc32C crc = new PureJavaCrc32C(); sb.append("Statements within " + procName + ": ").append("\n"); for (final Procedure proc : catalogProcedures) { if (proc.getTypeName().equals(procName)) { for (Statement stmt : proc.getStatements()) { // compute hash for determinism check crc.reset(); String sqlText = stmt.getSqltext(); crc.update(sqlText.getBytes(Constants.UTF8ENCODING)); int hash = (int) crc.getValue(); sb.append("Statement Hash: ").append(hash); sb.append(", Statement SQL: ").append(sqlText); for (PlanFragment frag : stmt.getFragments()) { byte[] planHash = Encoder.hexDecode(frag.getPlanhash()); long planId = ActivePlanRepository.getFragmentIdForPlanHash(planHash); String stmtText = ActivePlanRepository.getStmtTextForPlanHash(planHash); byte[] jsonPlan = ActivePlanRepository.planForFragmentId(planId); sb.append(", Plan Fragment Id:").append(planId); sb.append(", Plan Stmt Text:").append(stmtText); sb.append(", Json Plan:").append(new String(jsonPlan)); } sb.append("\n"); } } } sb.append("Default CRUD Procedures: ").append("\n"); for (Entry<String, Procedure> pair : context.m_defaultProcs.m_defaultProcMap.entrySet()) { crc.reset(); String sqlText = DefaultProcedureManager.sqlForDefaultProc(pair.getValue()); crc.update(sqlText.getBytes(Constants.UTF8ENCODING)); int hash = (int) crc.getValue(); sb.append("Statement Hash: ").append(hash); sb.append(", Statement SQL: ").append(sqlText); ProcedureRunner runner = procSet.getProcByName(pair.getValue().getTypeName()); for (Statement stmt : runner.getCatalogProcedure().getStatements()) { for (PlanFragment frag : stmt.getFragments()) { byte[] planHash = Encoder.hexDecode(frag.getPlanhash()); long planId = ActivePlanRepository.getFragmentIdForPlanHash(planHash); String stmtText = ActivePlanRepository.getStmtTextForPlanHash(planHash); byte[] jsonPlan = ActivePlanRepository.planForFragmentId(planId); sb.append(", Plan Fragment Id:").append(planId); sb.append(", Plan Stmt Text:").append(stmtText); sb.append(", Json Plan:").append(new String(jsonPlan)); } } sb.append("\n"); } hostLog.error(sb.toString()); } }