/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.sysprocs.saverestore;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.Pair;
import org.voltdb.ParameterSet;
import org.voltdb.VoltDB;
import org.voltdb.VoltSystemProcedure.SynthesizedPlanFragment;
import org.voltdb.VoltTableRow;
import org.voltdb.catalog.Table;
import org.voltdb.dtxn.SiteTracker;
import org.voltdb.sysprocs.SysProcFragmentId;

/**
 * Tracks the save-file state of a single partitioned table across all
 * hosts and generates the plan fragments needed to restore it.
 */
public class PartitionedTableSaveFileState extends TableSaveFileState
{
    private static final VoltLogger LOG =
        new VoltLogger(PartitionedTableSaveFileState.class.getName());
    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    public PartitionedTableSaveFileState(String tableName, long txnId)
    {
        super(tableName, txnId);
    }

    @Override
    void addHostData(VoltTableRow row) throws IOException
    {
        assert(row.getString("TABLE").equals(getTableName()));

        if (m_totalPartitions == 0)
        {
            // XXX this cast should be okay unless we exceed MAX_INT partitions
            m_totalPartitions = (int) row.getLong("TOTAL_PARTITIONS");
        }
        checkSiteConsistency(row); // throws if inconsistent

        int originalPartitionId = (int) row.getLong("PARTITION");
        m_partitionsSeen.add(originalPartitionId);
        int currentHostId = (int) row.getLong("CURRENT_HOST_ID");

        Set<Pair<Integer, Integer>> partitions_at_host =
            m_partitionsAtHost.get(currentHostId);
        if (partitions_at_host == null)
        {
            partitions_at_host = new HashSet<Pair<Integer, Integer>>();
            m_partitionsAtHost.put(currentHostId, partitions_at_host);
        }
        partitions_at_host.add(Pair.of(originalPartitionId,
                                       (int) row.getLong("ORIGINAL_HOST_ID")));
    }

    @Override
    public boolean isConsistent()
    {
        // Guard against an empty partition set; first()/last() would throw
        // NoSuchElementException on an empty TreeSet.
        boolean consistent =
            (!m_partitionsSeen.isEmpty() &&
             (m_partitionsSeen.size() == m_totalPartitions) &&
             (m_partitionsSeen.first() == 0) &&
             (m_partitionsSeen.last() == m_totalPartitions - 1));
        if (!consistent)
        {
            m_consistencyResult = "Table: " + getTableName() +
                " is missing " +
                (m_totalPartitions - m_partitionsSeen.size()) +
                " out of " + m_totalPartitions + " total partitions" +
                " (partitions seen: " + m_partitionsSeen + ")";
        }
        else
        {
            m_consistencyResult = "Table: " + getTableName() +
                " has consistent savefile state.";
        }
        return consistent;
    }

    int getTotalPartitions()
    {
        return m_totalPartitions;
    }

    @Override
    public SynthesizedPlanFragment[]
    generateRestorePlan(Table catalogTable, SiteTracker st)
    {
        SynthesizedPlanFragment[] restore_plan = null;
        LOG.info("Total partitions for Table: " + getTableName() + ": " +
                 getTotalPartitions());
        if (!catalogTable.getIsreplicated())
        {
            restore_plan = generatePartitionedToPartitionedPlan(st);
        }
        else
        {
            restore_plan = generatePartitionedToReplicatedPlan(st);
        }
        return restore_plan;
    }
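
    /*
     * Illustrative example of the collected state (example data, not part
     * of the restore path): after scanning the digests of a four-partition
     * table saved on a two-host cluster, the state might be
     *
     *   m_totalPartitions  = 4
     *   m_partitionsSeen   = {0, 1, 2, 3}
     *   m_partitionsAtHost = {0 -> {(0, 0), (1, 0)},
     *                         1 -> {(2, 1), (3, 1)}}
     *
     * isConsistent() accepts this because partitions 0..3 are all present;
     * a lost .vpt file would leave a hole in m_partitionsSeen and the check
     * would report the missing partitions.
     */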

    private void checkSiteConsistency(VoltTableRow row) throws IOException
    {
        if (!row.getString("IS_REPLICATED").equals("FALSE"))
        {
            String error = "Table: " + getTableName() + " was partitioned " +
                "but has a savefile which indicates replication at host: " +
                row.getLong("CURRENT_HOST_ID");
            m_consistencyResult = error;
            throw new IOException(error);
        }
        if ((int) row.getLong("TOTAL_PARTITIONS") != getTotalPartitions())
        {
            String error = "Table: " + getTableName() + " has a savefile " +
                "with an inconsistent number of total partitions: " +
                row.getLong("TOTAL_PARTITIONS") + " (previous value was " +
                getTotalPartitions() + ") at host: " +
                row.getLong("CURRENT_HOST_ID");
            m_consistencyResult = error;
            throw new IOException(error);
        }
    }

    private SynthesizedPlanFragment[]
    generatePartitionedToReplicatedPlan(SiteTracker st)
    {
        ArrayList<SynthesizedPlanFragment> restorePlan =
            new ArrayList<SynthesizedPlanFragment>();
        Set<Integer> coveredPartitions = new HashSet<Integer>();
        Iterator<Entry<Integer, Set<Pair<Integer, Integer>>>> partitionAtHostItr =
            m_partitionsAtHost.entrySet().iterator();
        // Loop over all current hosts that have .vpt files for this table.
        while (partitionAtHostItr.hasNext())
        {
            Entry<Integer, Set<Pair<Integer, Integer>>> partitionAtHost =
                partitionAtHostItr.next();
            Integer host = partitionAtHost.getKey();
            List<Integer> loadPartitions = new ArrayList<Integer>();
            List<Integer> loadOrigHosts = new ArrayList<Integer>();
            Set<Pair<Integer, Integer>> partitionAndOrigHostSet =
                partitionAtHost.getValue();
            Iterator<Pair<Integer, Integer>> itr =
                partitionAndOrigHostSet.iterator();
            // Determine which available partitions have not yet been covered
            // and record their partition id and original host id in
            // loadPartitions and loadOrigHosts.
            while (itr.hasNext())
            {
                Pair<Integer, Integer> pair = itr.next();
                if (!coveredPartitions.contains(pair.getFirst()))
                {
                    loadPartitions.add(pair.getFirst());
                    loadOrigHosts.add(pair.getSecond());
                    coveredPartitions.add(pair.getFirst());
                }
            }
            // If this host has newly covered partitions to distribute...
            if (loadPartitions.size() > 0)
            {
                int[] relevantPartitionIds =
                    com.google_voltpatches.common.primitives.Ints.toArray(loadPartitions);
                int[] originalHosts =
                    com.google_voltpatches.common.primitives.Ints.toArray(loadOrigHosts);
                List<Long> sitesAtHost = st.getSitesForHost(host);
                // Generate one work fragment per site at this host so the
                // sites can execute in parallel.
                for (Long site : sitesAtHost)
                {
                    restorePlan.add(constructDistributePartitionedTableFragment(
                            site, relevantPartitionIds, originalHosts, true));
                }
            }
        }
        restorePlan.add(constructDistributePartitionedTableAggregatorFragment(true));
        assert(coveredPartitions.size() == m_partitionsSeen.size());

        return restorePlan.toArray(new SynthesizedPlanFragment[0]);
    }
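
    /*
     * Illustrative trace of generatePartitionedToReplicatedPlan (example
     * data, not part of the restore path): with
     *
     *   m_partitionsAtHost = {0 -> {(0, 0), (1, 1)},
     *                         1 -> {(1, 1), (2, 2)}}
     *
     * and supposing host 0 is visited first, host 0 claims partitions 0
     * and 1; host 1 then finds partition 1 already covered and claims only
     * partition 2. Each host gets one distribution fragment per local site,
     * followed by a single aggregator fragment collecting all results.
     */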

    private SynthesizedPlanFragment[]
    generatePartitionedToPartitionedPlan(SiteTracker st)
    {
        LOG.info("Partition set: " + m_partitionsSeen);
        ArrayList<SynthesizedPlanFragment> restorePlan =
            new ArrayList<SynthesizedPlanFragment>();
        HashSet<Integer> coveredPartitions = new HashSet<Integer>();

        HashMap<Integer, ArrayList<Integer>> hostsToUncoveredPartitions =
            new HashMap<Integer, ArrayList<Integer>>();
        HashMap<Integer, ArrayList<Integer>> hostsToOriginalHosts =
            new HashMap<Integer, ArrayList<Integer>>();

        for (Integer host : m_partitionsAtHost.keySet())
        {
            hostsToUncoveredPartitions.put(host, new ArrayList<Integer>());
            hostsToOriginalHosts.put(host, new ArrayList<Integer>());
        }

        /*
         * Loop through the list of hosts repeatedly. Each pass picks at most
         * one partition to distribute from each host. This ensures some load
         * balancing.
         */
        while (!coveredPartitions.containsAll(m_partitionsSeen))
        {
            Iterator<Integer> hosts = m_partitionsAtHost.keySet().iterator();

            // Track whether progress was made. If a full pass distributes
            // nothing new and coverage is still incomplete, partitions are
            // missing.
            int numPartitionsUsed = 0;
            while (hosts.hasNext())
            {
                /*
                 * Get the set of partitions at this host and remove all that
                 * were already covered.
                 */
                Integer nextHost = hosts.next();
                Set<Pair<Integer, Integer>> partitionsAndOrigHosts =
                    new HashSet<Pair<Integer, Integer>>(
                        m_partitionsAtHost.get(nextHost));
                Iterator<Pair<Integer, Integer>> removeCoveredIterator =
                    partitionsAndOrigHosts.iterator();

                List<Integer> uncoveredPartitionsAtHostList =
                    hostsToUncoveredPartitions.get(nextHost);
                ArrayList<Integer> originalHosts =
                    hostsToOriginalHosts.get(nextHost);
                while (removeCoveredIterator.hasNext())
                {
                    Pair<Integer, Integer> p = removeCoveredIterator.next();
                    if (coveredPartitions.contains(p.getFirst()))
                    {
                        removeCoveredIterator.remove();
                    }
                }

                /*
                 * If a partition is left that isn't covered, select it for
                 * distribution.
                 */
                Iterator<Pair<Integer, Integer>> candidatePartitions =
                    partitionsAndOrigHosts.iterator();
                if (candidatePartitions.hasNext())
                {
                    Pair<Integer, Integer> p = candidatePartitions.next();
                    coveredPartitions.add(p.getFirst());
                    uncoveredPartitionsAtHostList.add(p.getFirst());
                    originalHosts.add(p.getSecond());
                    numPartitionsUsed++;
                }
            }
            if (numPartitionsUsed == 0 &&
                !coveredPartitions.containsAll(m_partitionsSeen))
            {
                LOG.error("Could not find a host to distribute some partitions");
                return null;
            }
        }

        SNAP_LOG.info("Distribution plan for table " + getTableName());
        for (Integer host : m_partitionsAtHost.keySet())
        {
            List<Integer> uncoveredPartitionsAtHostList =
                hostsToUncoveredPartitions.get(host);
            ArrayList<Integer> originalHosts = hostsToOriginalHosts.get(host);

            List<Long> sitesAtHost =
                VoltDB.instance().getSiteTrackerForSnapshot().getSitesForHost(host);

            int[] originalHostsArray = new int[originalHosts.size()];
            int qq = 0;
            for (int originalHostId : originalHosts)
            {
                originalHostsArray[qq++] = originalHostId;
            }
            int[] uncoveredPartitionsAtHost =
                new int[uncoveredPartitionsAtHostList.size()];
            for (int ii = 0; ii < uncoveredPartitionsAtHostList.size(); ii++)
            {
                uncoveredPartitionsAtHost[ii] =
                    uncoveredPartitionsAtHostList.get(ii);
            }

            StringBuilder sb = new StringBuilder();
            sb.append("\tHost ").append(host).append(" will distribute partitions ");
            for (Integer partition : uncoveredPartitionsAtHostList)
            {
                sb.append(partition).append(' ');
            }
            SNAP_LOG.info(sb.toString());

            /*
             * Assign the FULL workload to each site at the host. Static
             * synchronization in the procedure at the actual host ensures
             * the work is distributed across every execution site in a
             * meaningful way.
             */
            for (Long site : sitesAtHost)
            {
                restorePlan.add(constructDistributePartitionedTableFragment(
                        site, uncoveredPartitionsAtHost, originalHostsArray, false));
            }
        }
        restorePlan.add(constructDistributePartitionedTableAggregatorFragment(false));

        return restorePlan.toArray(new SynthesizedPlanFragment[0]);
    }
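
    /*
     * Illustrative trace of the load-balancing loop above (example data,
     * not part of the restore path): with partitions 0..3 and
     *
     *   m_partitionsAtHost = {0 -> {(0, 0), (1, 0), (2, 1)},
     *                         1 -> {(2, 1), (3, 1)}}
     *
     * pass 1 takes one uncovered partition from each host (say partition 0
     * from host 0, then partition 2 from host 1); pass 2 takes another
     * (partition 1 from host 0, partition 3 from host 1). The distribution
     * work is thus spread over both hosts instead of host 0 serving three
     * of the four partitions by itself.
     */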

    private SynthesizedPlanFragment constructDistributePartitionedTableFragment(
            long distributorSiteId,          // site that will execute this plan fragment
            int[] uncoveredPartitionsAtHost, // partitions whose data will be extracted from the .vpt files as TableSaveFiles
            int[] originalHostsArray,        // used to locate the .vpt files
            boolean asReplicated)
    {
        int result_dependency_id = getNextDependencyId();
        SynthesizedPlanFragment plan_fragment = new SynthesizedPlanFragment();
        plan_fragment.fragmentId =
            (asReplicated ?
             SysProcFragmentId.PF_restoreDistributePartitionedTableAsReplicated :
             SysProcFragmentId.PF_restoreDistributePartitionedTableAsPartitioned);
        plan_fragment.multipartition = false;
        plan_fragment.siteId = distributorSiteId;
        plan_fragment.outputDepId = result_dependency_id;
        plan_fragment.inputDepIds = new int[] {};
        addPlanDependencyId(result_dependency_id);
        plan_fragment.parameters = ParameterSet.fromArrayNoCopy(
                getTableName(),
                originalHostsArray,
                uncoveredPartitionsAtHost,
                result_dependency_id,
                getIsRecoverParam());
        return plan_fragment;
    }

    private SynthesizedPlanFragment
    constructDistributePartitionedTableAggregatorFragment(boolean asReplicated)
    {
        int result_dependency_id = getNextDependencyId();
        SynthesizedPlanFragment plan_fragment = new SynthesizedPlanFragment();
        plan_fragment.fragmentId = SysProcFragmentId.PF_restoreReceiveResultTables;
        plan_fragment.multipartition = false;
        plan_fragment.outputDepId = result_dependency_id;
        plan_fragment.inputDepIds = getPlanDependencyIds();
        setRootDependencyId(result_dependency_id);
        plan_fragment.parameters = ParameterSet.fromArrayNoCopy(
                result_dependency_id,
                (asReplicated ?
                 "Aggregating partitioned-to-replicated table restore results" :
                 "Aggregating partitioned table restore results"),
                getIsRecoverParam());
        return plan_fragment;
    }

    // XXX-BLAH should this move to SiteTracker?
    public Set<Pair<Integer, Integer>> getPartitionsAtHost(int hostId)
    {
        return m_partitionsAtHost.get(hostId);
    }

    Set<Integer> getPartitionSet()
    {
        return m_partitionsSeen;
    }

    /**
     * Set of the original partition ids seen so far.
     */
    private final TreeSet<Integer> m_partitionsSeen = new TreeSet<Integer>();

    /**
     * Map from a current host id to the set of
     * (original partition id, original host id) pairs found there.
     */
    private final Map<Integer, Set<Pair<Integer, Integer>>> m_partitionsAtHost =
        new HashMap<Integer, Set<Pair<Integer, Integer>>>();

    private int m_totalPartitions = 0;
}
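
/*
 * Illustrative shape of a generated plan (example numbers, not produced by
 * this file): for a hypothetical cluster with two hosts and two sites per
 * host, the plan holds four distribution fragments, one per site, each with
 * a unique outputDepId registered via addPlanDependencyId(). The single
 * aggregator fragment appended last lists exactly those dependency ids as
 * its inputDepIds, and its own outputDepId, registered via
 * setRootDependencyId(), becomes the root dependency of the restore plan.
 */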