package org.infinispan.server.hotrod;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.infinispan.AdvancedCache;
import org.infinispan.Cache;
import org.infinispan.commons.logging.LogFactory;
import org.infinispan.configuration.cache.Configuration;
import org.infinispan.distribution.DistributionManager;
import org.infinispan.distribution.ch.ConsistentHash;
import org.infinispan.distribution.ch.KeyPartitioner;
import org.infinispan.distribution.ch.impl.HashFunctionPartitioner;
import org.infinispan.distribution.group.impl.GroupingPartitioner;
import org.infinispan.distribution.group.impl.PartitionerConsistentHash;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.remoting.rpc.RpcManager;
import org.infinispan.remoting.transport.Address;
import org.infinispan.server.hotrod.Events.Event;
import org.infinispan.server.hotrod.logging.Log;
import org.infinispan.server.hotrod.transport.ExtendedByteBuf;
import org.infinispan.util.KeyValuePair;

import io.netty.buffer.ByteBuf;

/**
 * Hot Rod encoder for protocol version 1.1
 *
 * @author Galder Zamarreño
 * @since 5.1
 */
public abstract class AbstractEncoder1x implements VersionedEncoder {

   protected final Log log = LogFactory.getLog(getClass(), Log.class);
   protected final boolean trace = log.isTraceEnabled();

   @Override
   public void writeEvent(Event e, ByteBuf buf) {
      // Not implemented in this version of the protocol
   }

   @Override
   public void writeHeader(Response r, ByteBuf buf, Cache<Address, ServerAddress> addressCache, HotRodServer server) {
      AbstractTopologyResponse topologyResp = getTopologyResponse(r, addressCache, server);
      buf.writeByte(Constants.MAGIC_RES);
      ExtendedByteBuf.writeUnsignedLong(r.messageId, buf);
      buf.writeByte(r.operation.getResponseOpCode());
      buf.writeByte(r.status.getCode());
      if (topologyResp != null) {
         if (topologyResp instanceof TopologyAwareResponse) {
            TopologyAwareResponse tar = (TopologyAwareResponse) topologyResp;
            if (r.clientIntel == Constants.INTELLIGENCE_TOPOLOGY_AWARE)
               writeTopologyUpdate(tar, buf);
            else
               writeLimitedHashTopologyUpdate(tar, buf);
         } else if (topologyResp instanceof AbstractHashDistAwareResponse) {
            writeHashTopologyUpdate((AbstractHashDistAwareResponse) topologyResp, server, r, buf);
         } else {
            throw new IllegalArgumentException("Unsupported response instance: " + topologyResp);
         }
      } else {
         writeNoTopologyUpdate(buf);
      }
   }
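   // Response header layout produced by writeHeader above (sizes in bytes
   // unless noted; vLong is the Hot Rod variable-length encoding):
   //
   //   [MAGIC_RES: 1][messageId: vLong][responseOpCode: 1][status: 1][topology update: variable]
   //
   // The topology update section is one of: a single 0 byte (no change), a
   // topology-aware update, or a hash-distribution-aware update (see the
   // write*TopologyUpdate methods below).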
   @Override
   public void writeResponse(Response r, ByteBuf buf, EmbeddedCacheManager cacheManager, HotRodServer server) {
      if (r instanceof ResponseWithPrevious) {
         Optional<byte[]> prev = ((ResponseWithPrevious) r).previous;
         if (prev.isPresent())
            ExtendedByteBuf.writeRangedBytes(prev.get(), buf);
         else
            ExtendedByteBuf.writeUnsignedInt(0, buf);
      } else if (r instanceof StatsResponse) {
         Map<String, String> stats = ((StatsResponse) r).stats;
         ExtendedByteBuf.writeUnsignedInt(stats.size(), buf);
         for (Map.Entry<String, String> entry : stats.entrySet()) {
            ExtendedByteBuf.writeString(entry.getKey(), buf);
            ExtendedByteBuf.writeString(entry.getValue(), buf);
         }
      } else if (r instanceof GetWithVersionResponse) {
         GetWithVersionResponse gwvr = (GetWithVersionResponse) r;
         if (gwvr.status == OperationStatus.Success) {
            buf.writeLong(gwvr.dataVersion);
            ExtendedByteBuf.writeRangedBytes(gwvr.data, buf);
         }
      } else if (r instanceof GetWithMetadataResponse) {
         GetWithMetadataResponse gwmr = (GetWithMetadataResponse) r;
         if (gwmr.status == OperationStatus.Success) {
            byte flags = (gwmr.lifespan < 0 ? Constants.INFINITE_LIFESPAN : (byte) 0);
            flags |= (gwmr.maxIdle < 0 ? Constants.INFINITE_MAXIDLE : (byte) 0);
            buf.writeByte(flags);
            if (gwmr.lifespan >= 0) {
               buf.writeLong(gwmr.created);
               ExtendedByteBuf.writeUnsignedInt(gwmr.lifespan, buf);
            }
            if (gwmr.maxIdle >= 0) {
               buf.writeLong(gwmr.lastUsed);
               ExtendedByteBuf.writeUnsignedInt(gwmr.maxIdle, buf);
            }
            buf.writeLong(gwmr.dataVersion);
            ExtendedByteBuf.writeRangedBytes(gwmr.data, buf);
         }
      } else if (r instanceof BulkGetResponse) {
         if (trace) log.trace("About to respond to bulk get request");
         BulkGetResponse bgr = (BulkGetResponse) r;
         if (bgr.status == OperationStatus.Success) {
            int count;
            if (bgr.count != 0) {
               log.tracef("About to write (max) %d messages to the client", bgr.count);
               count = bgr.count;
            } else {
               count = Integer.MAX_VALUE;
            }
            Iterator<Map.Entry<byte[], byte[]>> iterator = bgr.entries.iterator();
            while (iterator.hasNext() && count-- > 0) {
               Map.Entry<byte[], byte[]> entry = iterator.next();
               buf.writeByte(1); // Not done
               ExtendedByteBuf.writeRangedBytes(entry.getKey(), buf);
               ExtendedByteBuf.writeRangedBytes(entry.getValue(), buf);
            }
            buf.writeByte(0); // Done
         }
      } else if (r instanceof BulkGetKeysResponse) {
         log.trace("About to respond to bulk get keys request");
         BulkGetKeysResponse bgkr = (BulkGetKeysResponse) r;
         if (bgkr.status == OperationStatus.Success) {
            while (bgkr.iterator.hasNext()) {
               byte[] key = bgkr.iterator.next();
               buf.writeByte(1); // Not done
               ExtendedByteBuf.writeRangedBytes(key, buf);
            }
            buf.writeByte(0); // Done
         }
      } else if (r instanceof GetResponse) {
         if (r.status == OperationStatus.Success)
            ExtendedByteBuf.writeRangedBytes(((GetResponse) r).data, buf);
      } else if (r instanceof QueryResponse) {
         ExtendedByteBuf.writeRangedBytes(((QueryResponse) r).result, buf);
      } else if (r instanceof ErrorResponse) {
         ExtendedByteBuf.writeString(((ErrorResponse) r).msg, buf);
      } else if (buf == null) {
         // Note: buf is never null here, so this guard cannot actually throw.
         // Responses without a payload (e.g. plain put/remove acks) intentionally
         // fall through and write nothing after the header.
         throw new IllegalArgumentException("Response received is unknown: " + r);
      }
   }

   AbstractTopologyResponse getTopologyResponse(Response r, Cache<Address, ServerAddress> addressCache, HotRodServer server) {
      // If clustered, set up a cache for topology information
      if (addressCache != null) {
         switch (r.clientIntel) {
            case Constants.INTELLIGENCE_TOPOLOGY_AWARE:
            case Constants.INTELLIGENCE_HASH_DISTRIBUTION_AWARE:
               // Use the request cache's topology id as the HotRod topologyId.
               AdvancedCache<byte[], byte[]> cache = server.getCacheInstance(r.cacheName, addressCache.getCacheManager(), false, true);
               RpcManager rpcManager = cache.getRpcManager();
               // Only send a topology update if the cache is clustered
               int currentTopologyId = rpcManager == null ? Constants.DEFAULT_TOPOLOGY_ID : rpcManager.getTopologyId();
               // AND if the client's topology id is smaller than the server's topology id
               if (currentTopologyId >= Constants.DEFAULT_TOPOLOGY_ID && r.topologyId < currentTopologyId)
                  return generateTopologyResponse(r, addressCache, server, currentTopologyId);
         }
      }
      return null;
   }
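   // Worked example for the check above: a client that last saw topologyId=3
   // sends that id with every request; if the server's RpcManager now reports
   // topologyId=5, a topology update is generated. If the ids are equal, or the
   // cache is local (rpcManager == null), no update is sent.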
   private AbstractTopologyResponse generateTopologyResponse(Response r, Cache<Address, ServerAddress> addressCache,
                                                             HotRodServer server, int currentTopologyId) {
      // If the topology cache is incomplete, we assume that a node has joined but hasn't added its Hot Rod
      // endpoint address to the topology cache yet. We delay the topology update until the next client
      // request by returning null here (so the client topology id stays the same).
      // If a new client connects while the join is in progress, though, we still have to generate a topology
      // response. The same applies when a cache manager is a member of the cluster but doesn't have a Hot Rod
      // endpoint (i.e. a storage-only node), and when a Hot Rod server shuts down.
      // Our workaround is to send a "partial" topology update when the topology cache is incomplete, but the
      // difference between the client topology id and the server topology id is 2 or more. The partial update
      // will have the topology id of the server - 1, so it won't prevent a regular topology update if/when
      // the topology cache is updated.
      AdvancedCache<byte[], byte[]> cache = server.getCacheInstance(r.cacheName, addressCache.getCacheManager(), false, true);
      List<Address> cacheMembers = cache.getRpcManager().getMembers();

      int responseTopologyId = currentTopologyId;
      if (!addressCache.keySet().containsAll(cacheMembers)) {
         // At least one cache member is missing from the topology cache
         int clientTopologyId = r.topologyId;
         if (currentTopologyId - clientTopologyId < 2) {
            // Postpone topology update
            return null;
         } else {
            // Send partial topology update
            responseTopologyId -= 1;
         }
      }

      Configuration config = cache.getCacheConfiguration();
      if (r.clientIntel == Constants.INTELLIGENCE_TOPOLOGY_AWARE || !config.clustering().cacheMode().isDistributed()) {
         return new TopologyAwareResponse(responseTopologyId, addressCache, 0);
      } else {
         // Client intelligence must be INTELLIGENCE_HASH_DISTRIBUTION_AWARE (3) and the cache distributed
         return createHashDistAwareResp(responseTopologyId, addressCache, config);
      }
   }

   protected AbstractHashDistAwareResponse createHashDistAwareResp(int topologyId,
                                                                   Map<Address, ServerAddress> serverEndpointsMap,
                                                                   Configuration cfg) {
      return new HashDistAwareResponse(topologyId, serverEndpointsMap, 0, cfg.clustering().hash().numOwners(),
            Constants.DEFAULT_CONSISTENT_HASH_VERSION_1x, Integer.MAX_VALUE);
   }
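   // Sketch of the hash id computation used by writeHashTopologyUpdate below
   // (hypothetical numbers, assuming the default HashFunctionPartitioner): if
   // segment 4 ends at hash 0x33333333 and a server is the second owner of that
   // segment (ownerIdx == 1), the hash id advertised for it is
   // (0x33333333 + 1) & Integer.MAX_VALUE. The mask clears the sign bit so the
   // id stays non-negative, presumably because 1.x clients treat hash ids as
   // positions on a non-negative hash wheel of size hashSpace.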
   void writeHashTopologyUpdate(AbstractHashDistAwareResponse h, HotRodServer server, Response r, ByteBuf buffer) {
      AdvancedCache<byte[], byte[]> cache = server.getCacheInstance(r.cacheName, server.getCacheManager(), false, true);
      DistributionManager distManager = cache.getDistributionManager();
      ConsistentHash ch = distManager.getWriteConsistentHash();

      Map<Address, ServerAddress> topologyMap = h.serverEndpointsMap;
      if (topologyMap.isEmpty()) {
         log.noMembersInHashTopology(ch, topologyMap.toString());
         buffer.writeByte(0); // Topology not changed
      } else {
         log.tracef("Write hash distribution change response header %s", h);
         // This is not quite correct, as the ownership of segments on the 1.0/1.1/1.2 clients is not exactly
         // the same as on the server. But the difference appears only for (numSegments * numOwners / MAX_INT)
         // of the keys (at the "segment borders"), so it's still much better than having no hash information.
         // The idea here is to remain compatible with clients running version 1.0 of the protocol.
         // TODO Need a check somewhere on startup, this only works with the default key partitioner
         int numSegments = ch.getNumSegments();
         KeyPartitioner keyPartitioner = ((PartitionerConsistentHash) ch).getKeyPartitioner();
         List<Integer> segmentHashIds = extractSegmentEndHashes(keyPartitioner);
         List<KeyValuePair<ServerAddress, Integer>> serverHashes = new ArrayList<>(numSegments);
         for (Map.Entry<Address, ServerAddress> entry : topologyMap.entrySet()) {
            for (int segmentIdx = 0; segmentIdx < numSegments; ++segmentIdx) {
               int ownerIdx = ch.locateOwnersForSegment(segmentIdx).indexOf(entry.getKey());
               if (ownerIdx >= 0) {
                  Integer segmentHashId = segmentHashIds.get(segmentIdx);
                  int hashId = (segmentHashId + ownerIdx) & Integer.MAX_VALUE;
                  serverHashes.add(new KeyValuePair<>(entry.getValue(), hashId));
               }
            }
         }

         // TODO: judging by the logic above this is numOwners * numSegments, which doesn't seem correct.
         // It seems totalNumServers below should be the number of unique addresses that own at least one segment.
         int totalNumServers = serverHashes.size();
         writeCommonHashTopologyHeader(buffer, h.topologyId, h.numOwners, h.hashFunction, h.hashSpace, totalNumServers);
         for (KeyValuePair<ServerAddress, Integer> serverHash : serverHashes) {
            ExtendedByteBuf.writeString(serverHash.getKey().getHost(), buffer);
            ExtendedByteBuf.writeUnsignedShort(serverHash.getKey().getPort(), buffer);
            int hashId = serverHash.getValue();
            if (trace)
               // TODO: why is the cast to Object needed?
               log.tracef("Writing hash id %d for %s:%s", (Object) hashId, serverHash.getKey().getHost(),
                     serverHash.getKey().getPort());
            buffer.writeInt(hashId);
         }
      }
   }

   List<Integer> extractSegmentEndHashes(KeyPartitioner keyPartitioner) {
      if (keyPartitioner instanceof HashFunctionPartitioner) {
         return ((HashFunctionPartitioner) keyPartitioner).getSegmentEndHashes();
      } else if (keyPartitioner instanceof GroupingPartitioner) {
         return extractSegmentEndHashes(((GroupingPartitioner) keyPartitioner).unwrap());
      } else {
         return Collections.emptyList();
      }
   }

   void writeLimitedHashTopologyUpdate(AbstractTopologyResponse t, ByteBuf buffer) {
      log.tracef("Return limited hash distribution aware header because the client is not hash distribution aware: %s", t);
      Map<Address, ServerAddress> topologyMap = t.serverEndpointsMap;
      if (topologyMap.isEmpty()) {
         log.noMembersInTopology();
         buffer.writeByte(0); // Topology not changed
      } else {
         writeCommonHashTopologyHeader(buffer, t.topologyId, 0, (byte) 0, 0, topologyMap.size());
         for (ServerAddress address : topologyMap.values()) {
            ExtendedByteBuf.writeString(address.getHost(), buffer);
            ExtendedByteBuf.writeUnsignedShort(address.getPort(), buffer);
            buffer.writeInt(0); // Address' hash id
         }
      }
   }

   void writeTopologyUpdate(TopologyAwareResponse t, ByteBuf buffer) {
      Map<Address, ServerAddress> topologyMap = t.serverEndpointsMap;
      if (topologyMap.isEmpty()) {
         log.noMembersInTopology();
         buffer.writeByte(0); // Topology not changed
      } else {
         log.tracef("Write topology change response header %s", t);
         buffer.writeByte(1); // Topology changed
         ExtendedByteBuf.writeUnsignedInt(t.topologyId, buffer);
         ExtendedByteBuf.writeUnsignedInt(topologyMap.size(), buffer);
         for (ServerAddress address : topologyMap.values()) {
            ExtendedByteBuf.writeString(address.getHost(), buffer);
            ExtendedByteBuf.writeUnsignedShort(address.getPort(), buffer);
         }
      }
   }

   void writeNoTopologyUpdate(ByteBuf buffer) {
      log.trace("Write topology response header with no change");
      buffer.writeByte(0);
   }
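   // Wire layout shared by the hash topology updates above, as produced by
   // writeCommonHashTopologyHeader below (vInt/vShort are Hot Rod
   // variable-length encodings):
   //
   //   [changed: 1 byte = 1][topologyId: vInt][numKeyOwners: vShort]
   //   [hashFunction: 1 byte][hashSpace: vInt][numServers: vInt]
   //
   // Each caller then appends one (host: string, port: vShort, hashId: int32)
   // entry per advertised server.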
   protected void writeCommonHashTopologyHeader(ByteBuf buffer, int viewId, int numOwners, byte hashFct,
                                                int hashSpace, int numServers) {
      buffer.writeByte(1); // Topology changed
      ExtendedByteBuf.writeUnsignedInt(viewId, buffer);
      ExtendedByteBuf.writeUnsignedShort(numOwners, buffer); // Num key owners
      buffer.writeByte(hashFct); // Hash function
      ExtendedByteBuf.writeUnsignedInt(hashSpace, buffer); // Hash space
      ExtendedByteBuf.writeUnsignedInt(numServers, buffer);
      log.tracef("Topology will contain %d addresses", numServers);
   }
}