/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.cache.affinity.rendezvous;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.UUID;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.cache.affinity.AffinityFunction;
import org.apache.ignite.cache.affinity.AffinityFunctionContext;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.configuration.CacheConfiguration;
import org.apache.ignite.internal.processors.cache.GridCacheUtils;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.internal.A;
import org.apache.ignite.internal.util.typedef.internal.LT;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.apache.ignite.lang.IgniteBiPredicate;
import org.apache.ignite.lang.IgniteBiTuple;
import org.apache.ignite.resources.LoggerResource;
import org.jetbrains.annotations.Nullable;

/**
 * Affinity function for partitioned cache based on the Highest Random Weight algorithm.
 * This function supports the following configuration:
 * <ul>
 * <li>
 *      {@code partitions} - Number of partitions to spread across nodes.
 * </li>
 * <li>
 *      {@code excludeNeighbors} - If set to {@code true}, same-host neighbors will be excluded
 *      from being backups of each other. This flag can be ignored when the topology does not have
 *      enough nodes to assign backups.
 *      Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
 * </li>
 * <li>
 *      {@code backupFilter} - Optional filter for backup nodes. If provided, then only
 *      nodes that pass this filter will be selected as backup nodes. If not provided, then
 *      primary and backup nodes will be selected out of all nodes available for this cache.
 * </li>
 * </ul>
 * <p>
 * Cache affinity can be configured for individual caches via {@link CacheConfiguration#getAffinity()} method.
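 * <p>
 * A minimal configuration sketch (illustrative only; the cache name and partition count below are
 * arbitrary examples, not defaults mandated by this class):
 * <pre>{@code
 * CacheConfiguration<Integer, String> ccfg = new CacheConfiguration<>("myCache");
 *
 * RendezvousAffinityFunction aff = new RendezvousAffinityFunction();
 *
 * aff.setPartitions(2048);       // Power of 2, so the mask-based partition calculation is used.
 * aff.setExcludeNeighbors(true); // Do not place a backup on the same host as the primary.
 *
 * ccfg.setAffinity(aff);
 * }</pre>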
 */
public class RendezvousAffinityFunction implements AffinityFunction, Externalizable {
    /** */
    private static final long serialVersionUID = 0L;

    /** Default number of partitions. */
    public static final int DFLT_PARTITION_COUNT = 1024;

    /** Comparator. */
    private static final Comparator<IgniteBiTuple<Long, ClusterNode>> COMPARATOR = new HashComparator();

    /** Number of partitions. */
    private int parts;

    /** Mask to use in calculation when partitions count is power of 2. */
    private transient int mask = -1;

    /** Exclude neighbors flag. */
    private boolean exclNeighbors;

    /** Exclude neighbors warning. */
    private transient boolean exclNeighborsWarn;

    /** Optional backup filter. First node is primary, second node is a node being tested. */
    private IgniteBiPredicate<ClusterNode, ClusterNode> backupFilter;

    /** Optional affinity backups filter. The first node is a node being tested,
     * the second is a list of nodes that are already assigned for a given partition (the first node in the list
     * is primary). */
    private IgniteBiPredicate<ClusterNode, List<ClusterNode>> affinityBackupFilter;

    /** Logger instance. */
    @LoggerResource
    private transient IgniteLogger log;

    /**
     * Empty constructor with all defaults.
     */
    public RendezvousAffinityFunction() {
        this(false);
    }

    /**
     * Initializes affinity with a flag to exclude same-host neighbors from being backups of each other
     * and the default number of partitions.
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @param exclNeighbors {@code True} if nodes residing on the same host may not act as backups
     *      of each other.
     */
    public RendezvousAffinityFunction(boolean exclNeighbors) {
        this(exclNeighbors, DFLT_PARTITION_COUNT);
    }

    /**
     * Initializes affinity with a flag to exclude same-host neighbors from being backups of each other
     * and the specified number of partitions.
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @param exclNeighbors {@code True} if nodes residing on the same host may not act as backups
     *      of each other.
     * @param parts Total number of partitions.
     */
    public RendezvousAffinityFunction(boolean exclNeighbors, int parts) {
        this(exclNeighbors, parts, null);
    }

    /**
     * Initializes affinity with the specified number of partitions and an optional backup filter.
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @param parts Total number of partitions.
     * @param backupFilter Optional backup filter for nodes. If provided, backups will be selected
     *      from all nodes that pass this filter. First argument for this filter is primary node, and second
     *      argument is node being tested.
     */
    public RendezvousAffinityFunction(int parts,
        @Nullable IgniteBiPredicate<ClusterNode, ClusterNode> backupFilter) {
        this(false, parts, backupFilter);
    }

    /**
     * Private constructor.
     *
     * @param exclNeighbors Exclude neighbors flag.
     * @param parts Partitions count.
     * @param backupFilter Backup filter.
     */
    private RendezvousAffinityFunction(boolean exclNeighbors, int parts,
        IgniteBiPredicate<ClusterNode, ClusterNode> backupFilter) {
        A.ensure(parts > 0, "parts > 0");
        A.ensure(parts <= CacheConfiguration.MAX_PARTITIONS_COUNT,
            "parts <= " + CacheConfiguration.MAX_PARTITIONS_COUNT);

        this.exclNeighbors = exclNeighbors;

        setPartitions(parts);

        this.backupFilter = backupFilter;
    }

    /**
     * Gets total number of key partitions. To ensure that all partitions are
     * equally distributed across all nodes, please make sure that this
     * number is significantly larger than the number of nodes. Also, partition
     * size should be relatively small. Try to avoid having partitions with more
     * than a quarter million keys.
     * <p>
     * Note that for fully replicated caches this method should always
     * return {@code 1}.
     *
     * @return Total partition count.
     */
    public int getPartitions() {
        return parts;
    }

    /**
     * Sets total number of partitions. If the number of partitions is a power of two,
     * mask-based (power-of-two) hashing is used to map keys to partitions. Otherwise,
     * standard modulo-based hashing is applied.
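     * <p>
     * A sketch of the two modes, mirroring {@link #partition(Object)} (illustrative; the real
     * implementation uses an overflow-safe absolute value rather than {@code Math.abs}):
     * <pre>{@code
     * int h = key.hashCode();
     *
     * // parts = 1024, a power of 2: spread the high bits into the low bits, then mask.
     * int p1 = (h ^ (h >>> 16)) & (1024 - 1);
     *
     * // parts = 1000, not a power of 2: plain modulo of the absolute hash value.
     * int p2 = Math.abs(h % 1000);
     * }</pre>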
     *
     * @param parts Total number of partitions.
     * @return {@code this} for chaining.
     */
    public RendezvousAffinityFunction setPartitions(int parts) {
        A.ensure(parts <= CacheConfiguration.MAX_PARTITIONS_COUNT,
            "parts <= " + CacheConfiguration.MAX_PARTITIONS_COUNT);
        A.ensure(parts > 0, "parts > 0");

        this.parts = parts;

        mask = (parts & (parts - 1)) == 0 ? parts - 1 : -1;

        return this;
    }

    /**
     * Gets optional backup filter. If not {@code null}, backups will be selected
     * from all nodes that pass this filter. First node passed to this filter is primary node,
     * and second node is a node being tested.
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @return Optional backup filter.
     */
    @Nullable public IgniteBiPredicate<ClusterNode, ClusterNode> getBackupFilter() {
        return backupFilter;
    }

    /**
     * Sets optional backup filter. If provided, then backups will be selected from all
     * nodes that pass this filter. First node being passed to this filter is primary node,
     * and second node is a node being tested.
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @param backupFilter Optional backup filter.
     * @return {@code this} for chaining.
     * @deprecated Use {@code affinityBackupFilter} instead.
     */
    @Deprecated
    public RendezvousAffinityFunction setBackupFilter(
        @Nullable IgniteBiPredicate<ClusterNode, ClusterNode> backupFilter) {
        this.backupFilter = backupFilter;

        return this;
    }

    /**
     * Gets optional affinity backup filter. If not {@code null}, backups will be selected
     * from all nodes that pass this filter. First node passed to this filter is a node being tested,
     * and the second parameter is a list of nodes that are already assigned for a given partition
     * (the primary node is the first in the list).
     * <p>
     * Note that {@code affinityBackupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @return Optional affinity backup filter.
     */
    @Nullable public IgniteBiPredicate<ClusterNode, List<ClusterNode>> getAffinityBackupFilter() {
        return affinityBackupFilter;
    }

    /**
     * Sets optional affinity backup filter. If provided, then backups will be selected from all
     * nodes that pass this filter. First node being passed to this filter is a node being tested,
     * and the second parameter is a list of nodes that are already assigned for a given partition
     * (the primary node is the first in the list).
     * <p>
     * Note that {@code affinityBackupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
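     * <p>
     * A hypothetical filter sketch (the {@code "ZONE"} attribute name is an assumption for
     * illustration, not something this class defines): accept a backup candidate only if it sits in
     * a different availability zone than every node already assigned to the partition:
     * <pre>{@code
     * aff.setAffinityBackupFilter((candidate, assigned) ->
     *     assigned.stream().noneMatch(n ->
     *         java.util.Objects.equals(candidate.attribute("ZONE"), n.attribute("ZONE"))));
     * }</pre>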
     *
     * @param affinityBackupFilter Optional affinity backup filter.
     * @return {@code this} for chaining.
     */
    public RendezvousAffinityFunction setAffinityBackupFilter(
        @Nullable IgniteBiPredicate<ClusterNode, List<ClusterNode>> affinityBackupFilter) {
        this.affinityBackupFilter = affinityBackupFilter;

        return this;
    }

    /**
     * Checks flag to exclude same-host neighbors from being backups of each other (default is {@code false}).
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @return {@code True} if nodes residing on the same host may not act as backups of each other.
     */
    public boolean isExcludeNeighbors() {
        return exclNeighbors;
    }

    /**
     * Sets flag to exclude same-host neighbors from being backups of each other (default is {@code false}).
     * <p>
     * Note that {@code backupFilter} is ignored if {@code excludeNeighbors} is set to {@code true}.
     *
     * @param exclNeighbors {@code True} if nodes residing on the same host may not act as backups of each other.
     * @return {@code this} for chaining.
     */
    public RendezvousAffinityFunction setExcludeNeighbors(boolean exclNeighbors) {
        this.exclNeighbors = exclNeighbors;

        return this;
    }

    /**
     * Resolves node hash.
     *
     * @param node Cluster node.
     * @return Node hash.
     */
    public Object resolveNodeHash(ClusterNode node) {
        return node.consistentId();
    }

    /**
     * Returns collection of nodes (primary first) for specified partition.
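     * <p>
     * Conceptually this is a rendezvous (Highest Random Weight) selection; a simplified sketch of
     * what the method computes (illustrative only, ignoring backup filters and neighbor exclusion):
     * <pre>{@code
     * // Mix each node's consistent-id hash with the partition number, order nodes
     * // by the resulting hash, and take the primary plus the requested backups.
     * nodes.sort(Comparator.comparingLong(n -> hash(n.consistentId().hashCode(), part)));
     *
     * List<ClusterNode> owners = nodes.subList(0, Math.min(backups + 1, nodes.size()));
     * }</pre>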
     *
     * @param part Partition.
     * @param nodes Nodes.
     * @param backups Number of backups.
     * @param neighborhoodCache Neighborhood.
     * @return Assignment.
     */
    public List<ClusterNode> assignPartition(int part, List<ClusterNode> nodes, int backups,
        @Nullable Map<UUID, Collection<ClusterNode>> neighborhoodCache) {
        if (nodes.size() <= 1)
            return nodes;

        IgniteBiTuple<Long, ClusterNode>[] hashArr =
            (IgniteBiTuple<Long, ClusterNode>[])new IgniteBiTuple[nodes.size()];

        for (int i = 0; i < nodes.size(); i++) {
            ClusterNode node = nodes.get(i);

            Object nodeHash = resolveNodeHash(node);

            long hash = hash(nodeHash.hashCode(), part);

            hashArr[i] = F.t(hash, node);
        }

        final int primaryAndBackups = backups == Integer.MAX_VALUE ? nodes.size() :
            Math.min(backups + 1, nodes.size());

        Iterable<ClusterNode> sortedNodes = new LazyLinearSortedContainer(hashArr, primaryAndBackups);

        // REPLICATED cache case.
        if (backups == Integer.MAX_VALUE)
            return replicatedAssign(nodes, sortedNodes);

        Iterator<ClusterNode> it = sortedNodes.iterator();

        List<ClusterNode> res = new ArrayList<>(primaryAndBackups);

        Collection<ClusterNode> allNeighbors = new HashSet<>();

        ClusterNode primary = it.next();

        res.add(primary);

        if (exclNeighbors)
            allNeighbors.addAll(neighborhoodCache.get(primary.id()));

        // Select backups.
        if (backups > 0) {
            while (it.hasNext() && res.size() < primaryAndBackups) {
                ClusterNode node = it.next();

                if (exclNeighbors) {
                    if (!allNeighbors.contains(node)) {
                        res.add(node);

                        allNeighbors.addAll(neighborhoodCache.get(node.id()));
                    }
                }
                else if ((backupFilter != null && backupFilter.apply(primary, node))
                    || (affinityBackupFilter != null && affinityBackupFilter.apply(node, res))
                    || (affinityBackupFilter == null && backupFilter == null)) {
                    res.add(node);

                    if (exclNeighbors)
                        allNeighbors.addAll(neighborhoodCache.get(node.id()));
                }
            }
        }

        if (res.size() < primaryAndBackups && nodes.size() >= primaryAndBackups && exclNeighbors) {
            // Need to iterate again in case there are no nodes which pass the exclude-neighbors backups criteria.
            it = sortedNodes.iterator();

            it.next();

            while (it.hasNext() && res.size() < primaryAndBackups) {
                ClusterNode node = it.next();

                if (!res.contains(node))
                    res.add(node);
            }

            if (!exclNeighborsWarn) {
                LT.warn(log, "Affinity function excludeNeighbors property is ignored " +
                    "because the topology does not have enough nodes to assign backups.",
                    "Affinity function excludeNeighbors property is ignored " +
                    "because the topology does not have enough nodes to assign backups.");

                exclNeighborsWarn = true;
            }
        }

        assert res.size() <= primaryAndBackups;

        return res;
    }

    /**
     * Creates assignment for REPLICATED cache.
     *
     * @param nodes Topology.
     * @param sortedNodes Nodes sorted for the specified partition.
     * @return Assignment.
     */
    private List<ClusterNode> replicatedAssign(List<ClusterNode> nodes, Iterable<ClusterNode> sortedNodes) {
        ClusterNode primary = sortedNodes.iterator().next();

        List<ClusterNode> res = new ArrayList<>(nodes.size());

        res.add(primary);

        for (ClusterNode n : nodes)
            if (!n.equals(primary))
                res.add(n);

        assert res.size() == nodes.size() : "Not enough backups: " + res.size();

        return res;
    }

    /**
     * Packs the partition number and {@code nodeHash.hashCode()} into a single {@code long} and mixes
     * it with a hash function based on the Wang/Jenkins 64-bit mix.
     *
     * @param key0 Hash key.
     * @param key1 Hash key.
     * @see <a href="https://gist.github.com/badboy/6267743#64-bit-mix-functions">64 bit mix functions</a>
     * @return Long hash key.
     */
    private static long hash(int key0, int key1) {
        long key = (key0 & 0xFFFFFFFFL) | ((key1 & 0xFFFFFFFFL) << 32);

        key = (~key) + (key << 21); // key = (key << 21) - key - 1;
        key ^= (key >>> 24);
        key += (key << 3) + (key << 8); // key * 265
        key ^= (key >>> 14);
        key += (key << 2) + (key << 4); // key * 21
        key ^= (key >>> 28);
        key += (key << 31);

        return key;
    }

    /** {@inheritDoc} */
    @Override public void reset() {
        // No-op.
    }

    /** {@inheritDoc} */
    @Override public int partitions() {
        return parts;
    }

    /** {@inheritDoc} */
    @Override public int partition(Object key) {
        if (key == null)
            throw new IllegalArgumentException("Null key is passed for a partition calculation. " +
                "Make sure that an affinity key that is used is initialized properly.");

        if (mask >= 0) {
            int h;

            return ((h = key.hashCode()) ^ (h >>> 16)) & mask;
        }

        return U.safeAbs(key.hashCode() % parts);
    }

    /** {@inheritDoc} */
    @Override public List<List<ClusterNode>> assignPartitions(AffinityFunctionContext affCtx) {
        List<List<ClusterNode>> assignments = new ArrayList<>(parts);

        Map<UUID, Collection<ClusterNode>> neighborhoodCache = exclNeighbors ?
            GridCacheUtils.neighbors(affCtx.currentTopologySnapshot()) : null;

        List<ClusterNode> nodes = affCtx.currentTopologySnapshot();

        for (int i = 0; i < parts; i++) {
            List<ClusterNode> partAssignment = assignPartition(i, nodes, affCtx.backups(), neighborhoodCache);

            assignments.add(partAssignment);
        }

        return assignments;
    }

    /** {@inheritDoc} */
    @Override public void removeNode(UUID nodeId) {
        // No-op.
    }

    /** {@inheritDoc} */
    @Override public void writeExternal(ObjectOutput out) throws IOException {
        out.writeInt(parts);
        out.writeBoolean(exclNeighbors);
        out.writeObject(backupFilter);
    }

    /** {@inheritDoc} */
    @SuppressWarnings("unchecked")
    @Override public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
        setPartitions(in.readInt());

        exclNeighbors = in.readBoolean();
        backupFilter = (IgniteBiPredicate<ClusterNode, ClusterNode>)in.readObject();
    }

    /**
     * Comparator that orders node-hash tuples by hash value, breaking ties by node id.
     */
    private static class HashComparator implements Comparator<IgniteBiTuple<Long, ClusterNode>>, Serializable {
        /** */
        private static final long serialVersionUID = 0L;

        /** {@inheritDoc} */
        @Override public int compare(IgniteBiTuple<Long, ClusterNode> o1, IgniteBiTuple<Long, ClusterNode> o2) {
            return o1.get1() < o2.get1() ? -1 : o1.get1() > o2.get1() ? 1 :
                o1.get2().id().compareTo(o2.get2().id());
        }
    }

    /**
     * Sorts the initial array lazily: either fully up front, or one element at a time with a
     * linear selection pass, depending on how many sorted elements are expected to be consumed.
     */
    private static class LazyLinearSortedContainer implements Iterable<ClusterNode> {
        /** Initial node-hash array. */
        private final IgniteBiTuple<Long, ClusterNode>[] arr;

        /** Count of the sorted elements. */
        private int sorted;

        /**
         * @param arr Node / partition hash list.
         * @param needFirstSortedCnt Estimated count of elements the iterator will need to return.
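         * <p>
         * If {@code needFirstSortedCnt} is large, the whole array is sorted up front at
         * O(n log n); otherwise the iterator sorts lazily, performing one O(n) selection
         * pass per returned element, which is cheaper when only the primary and a few
         * backups are consumed.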
         */
        LazyLinearSortedContainer(IgniteBiTuple<Long, ClusterNode>[] arr, int needFirstSortedCnt) {
            this.arr = arr;

            if (needFirstSortedCnt > (int)Math.log(arr.length)) {
                Arrays.sort(arr, COMPARATOR);

                sorted = arr.length;
            }
        }

        /** {@inheritDoc} */
        @Override public Iterator<ClusterNode> iterator() {
            return new SortIterator();
        }

        /**
         * Iterator that returns nodes in hash order, selection-sorting lazily when the
         * array was not fully sorted up front.
         */
        private class SortIterator implements Iterator<ClusterNode> {
            /** Index of the first unsorted element. */
            private int cur;

            /** {@inheritDoc} */
            @Override public boolean hasNext() {
                return cur < arr.length;
            }

            /** {@inheritDoc} */
            @Override public ClusterNode next() {
                if (!hasNext())
                    throw new NoSuchElementException();

                if (cur < sorted)
                    return arr[cur++].get2();

                // Lazy path: one selection pass moves the minimum of the unsorted tail to position 'cur'.
                IgniteBiTuple<Long, ClusterNode> min = arr[cur];

                int minIdx = cur;

                for (int i = cur + 1; i < arr.length; i++) {
                    if (COMPARATOR.compare(arr[i], min) < 0) {
                        minIdx = i;

                        min = arr[i];
                    }
                }

                if (minIdx != cur) {
                    arr[minIdx] = arr[cur];

                    arr[cur] = min;
                }

                sorted = cur++;

                return min.get2();
            }

            /** {@inheritDoc} */
            @Override public void remove() {
                throw new UnsupportedOperationException("Remove is not supported.");
            }
        }
    }
}