package org.infinispan.server.hotrod;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import org.infinispan.Cache;
import org.infinispan.commons.logging.LogFactory;
import org.infinispan.commons.util.CloseableIterator;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.Configuration;
import org.infinispan.container.entries.CacheEntry;
import org.infinispan.container.entries.InternalCacheEntry;
import org.infinispan.distribution.ch.ConsistentHash;
import org.infinispan.factories.ComponentRegistry;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.remoting.transport.Address;
import org.infinispan.server.hotrod.logging.Log;
import org.infinispan.server.hotrod.transport.ExtendedByteBuf;
import org.infinispan.topology.CacheTopology;

import io.netty.buffer.ByteBuf;

/**
 * Hot Rod protocol 2.x response and event encoder.
 *
 * @author Galder Zamarreño
 */
class Encoder2x implements VersionedEncoder {
   private static final Log log = LogFactory.getLog(Encoder2x.class, Log.class);
   private static final boolean isTrace = log.isTraceEnabled();

   @Override
   public void writeEvent(Events.Event e, ByteBuf buf) {
      if (isTrace)
         log.tracef("Write event %s", e);

      buf.writeByte(Constants.MAGIC_RES);
      ExtendedByteBuf.writeUnsignedLong(e.messageId, buf);
      buf.writeByte(e.op.getResponseOpCode());
      buf.writeByte(OperationStatus.Success.getCode());
      buf.writeByte(0); // no topology change
      ExtendedByteBuf.writeRangedBytes(e.listenerId, buf);
      e.writeEvent(buf);
   }

   @Override
   public void writeHeader(Response r, ByteBuf buf, Cache<Address, ServerAddress> addressCache, HotRodServer server) {
      // Sometimes an error happens before we have added the cache to the knownCaches/knownCacheConfigurations map
      // If that happens, we pretend the cache is LOCAL and we skip the topology update
      String cacheName = r.cacheName.isEmpty() ? server.getConfiguration().defaultCacheName() : r.cacheName;
      ComponentRegistry cr = server.getCacheRegistry(cacheName);
      Configuration configuration = server.getCacheConfiguration(cacheName);
      CacheMode cacheMode = configuration == null ? CacheMode.LOCAL : configuration.clustering().cacheMode();

      CacheTopology cacheTopology = cacheMode.isClustered() ?
            cr.getStateTransferManager().getCacheTopology() : null;
      Optional<AbstractTopologyResponse> newTopology = getTopologyResponse(r, addressCache, cacheMode, cacheTopology);

      buf.writeByte(Constants.MAGIC_RES);
      ExtendedByteBuf.writeUnsignedLong(r.messageId, buf);
      buf.writeByte(r.operation.getResponseOpCode());
      writeStatus(r, buf, server, configuration);
      if (newTopology.isPresent()) {
         AbstractTopologyResponse topology = newTopology.get();
         if (topology instanceof TopologyAwareResponse) {
            writeTopologyUpdate((TopologyAwareResponse) topology, buf);
            if (r.clientIntel == Constants.INTELLIGENCE_HASH_DISTRIBUTION_AWARE)
               writeEmptyHashInfo(topology, buf);
         } else if (topology instanceof HashDistAware20Response) {
            writeHashTopologyUpdate((HashDistAware20Response) topology, cacheTopology, buf);
         } else {
            throw new IllegalArgumentException("Unsupported response: " + topology);
         }
      } else {
         if (isTrace)
            log.trace("Write topology response header with no change");
         buf.writeByte(0);
      }
   }

   private void writeStatus(Response r, ByteBuf buf, HotRodServer server, Configuration cfg) {
      if (server == null || Constants.isVersionPre24(r.version))
         buf.writeByte(r.status.getCode());
      else {
         OperationStatus st = OperationStatus.withCompatibility(r.status, cfg.compatibility().enabled());
         buf.writeByte(st.getCode());
      }
   }

   private void writeTopologyUpdate(TopologyAwareResponse t, ByteBuf buffer) {
      Map<Address, ServerAddress> topologyMap = t.serverEndpointsMap;
      if (topologyMap.isEmpty()) {
         log.noMembersInTopology();
         buffer.writeByte(0); // Topology not changed
      } else {
         if (isTrace)
            log.tracef("Write topology change response header %s", t);
         buffer.writeByte(1); // Topology changed
         ExtendedByteBuf.writeUnsignedInt(t.topologyId, buffer);
         ExtendedByteBuf.writeUnsignedInt(topologyMap.size(), buffer);
         for (ServerAddress address : topologyMap.values()) {
            ExtendedByteBuf.writeString(address.getHost(), buffer);
            ExtendedByteBuf.writeUnsignedShort(address.getPort(), buffer);
         }
      }
   }

   private void writeEmptyHashInfo(AbstractTopologyResponse t, ByteBuf buffer) {
      if (isTrace)
         log.tracef("Return limited hash distribution aware header because the client %s doesn't ", t);
      buffer.writeByte(0); // Hash Function Version
      ExtendedByteBuf.writeUnsignedInt(t.numSegments, buffer);
   }

   private void writeHashTopologyUpdate(HashDistAware20Response h, CacheTopology cacheTopology, ByteBuf buf) {
      // Calculate members first, in case there are no members
      ConsistentHash ch = cacheTopology.getReadConsistentHash();
      Map<Address, ServerAddress> members = h.serverEndpointsMap.entrySet().stream()
            .filter(e -> ch.getMembers().contains(e.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

      if (isTrace) {
         log.trace("Topology cache contains: " + h.serverEndpointsMap);
         log.trace("After read consistent hash filter, members are: " + members);
      }

      if (members.isEmpty()) {
         log.noMembersInHashTopology(ch, h.serverEndpointsMap.toString());
         buf.writeByte(0); // Topology not changed
      } else {
         if (isTrace)
            log.tracef("Write hash distribution change response header %s", h);
         buf.writeByte(1); // Topology changed
         ExtendedByteBuf.writeUnsignedInt(h.topologyId, buf); // Topology ID

         // Write members
         AtomicInteger indexCount = new AtomicInteger(-1);
         ExtendedByteBuf.writeUnsignedInt(members.size(), buf);
         Map<Address, Integer> indexedMembers = new HashMap<>();
         members.forEach((addr, serverAddr) -> {
            ExtendedByteBuf.writeString(serverAddr.getHost(), buf);
            ExtendedByteBuf.writeUnsignedShort(serverAddr.getPort(), buf);
            indexCount.incrementAndGet();
            indexedMembers.put(addr, indexCount.get()); // easier indexing
         });

         // Write segment information
         int numSegments = ch.getNumSegments();
         buf.writeByte(h.hashFunction); // Hash function
         ExtendedByteBuf.writeUnsignedInt(numSegments, buf);

         for (int segmentId = 0; segmentId < numSegments; ++segmentId) {
            List<Address> owners = ch.locateOwnersForSegment(segmentId).stream()
                  .filter(members::containsKey)
                  .collect(Collectors.toList());
            int ownersSize = owners.size();
            if (ownersSize == 0) {
               // When sending partial updates, number of owners could be 0,
               // in which case just take the first member in the list.
               buf.writeByte(1);
               ExtendedByteBuf.writeUnsignedInt(0, buf);
            } else {
               buf.writeByte(ownersSize);
               owners.forEach(ownerAddr -> {
                  Integer index = indexedMembers.get(ownerAddr);
                  if (index != null) {
                     ExtendedByteBuf.writeUnsignedInt(index, buf);
                  }
               });
            }
         }
      }
   }

   private Optional<AbstractTopologyResponse> getTopologyResponse(Response r, Cache<Address, ServerAddress> addressCache,
                                                                  CacheMode cacheMode, CacheTopology cacheTopology) {
      // If clustered, set up a cache for topology information
      if (addressCache != null) {
         switch (r.clientIntel) {
            case Constants.INTELLIGENCE_TOPOLOGY_AWARE:
            case Constants.INTELLIGENCE_HASH_DISTRIBUTION_AWARE: {
               // Only send a topology update if the cache is clustered
               if (cacheMode.isClustered()) {
                  // Use the request cache's topology id as the HotRod topologyId.
                  int currentTopologyId = cacheTopology.getTopologyId();
                  // AND if the client's topology id is smaller than the server's topology id
                  if (r.topologyId < currentTopologyId)
                     return generateTopologyResponse(r, addressCache, cacheMode, cacheTopology);
               }
            }
         }
      }
      return Optional.empty();
   }

   private Optional<AbstractTopologyResponse> generateTopologyResponse(Response r, Cache<Address, ServerAddress> addressCache,
                                                                       CacheMode cacheMode, CacheTopology cacheTopology) {
      // If the topology cache is incomplete, we assume that a node has joined but hasn't added its HotRod
      // endpoint address to the topology cache yet. We delay the topology update until the next client
      // request by returning an empty Optional here (so the client topology id stays the same).
      // If a new client connects while the join is in progress, though, we still have to generate a topology
      // response. The same applies if a cache manager is a member of the cluster but doesn't have a HotRod
      // endpoint (aka a storage-only node), and a HotRod server shuts down.
      // Our workaround is to send a "partial" topology update when the topology cache is incomplete, but the
      // difference between the client topology id and the server topology id is 2 or more. The partial update
      // will have the topology id of the server - 1, so it won't prevent a regular topology update if/when
      // the topology cache is updated.
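      // For example: a client at topology id 5 and a server at topology id 8 differ by 3 (>= 2), so a
      // partial update carrying topology id 7 is sent; a client already at topology id 7 would instead
      // have its update postponed until the next request.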
      int currentTopologyId = cacheTopology.getTopologyId();
      List<Address> cacheMembers = cacheTopology.getMembers();
      Map<Address, ServerAddress> serverEndpoints = new HashMap<>();
      addressCache.forEach(serverEndpoints::put);

      int topologyId = currentTopologyId;

      if (isTrace) {
         log.tracef("Check for partial topologies: members=%s, endpoints=%s, client-topology=%s, server-topology=%s",
               cacheMembers, serverEndpoints, r.topologyId, topologyId);
      }

      if (!serverEndpoints.keySet().containsAll(cacheMembers)) {
         // At least one cache member is missing from the topology cache
         int clientTopologyId = r.topologyId;
         if (currentTopologyId - clientTopologyId < 2) {
            if (isTrace)
               log.trace("Postpone topology update");
            return Optional.empty(); // Postpone topology update
         } else {
            // Send partial topology update
            topologyId -= 1;
            if (isTrace)
               log.tracef("Send partial topology update with topology id %s", topologyId);
         }
      }

      if (r.clientIntel == Constants.INTELLIGENCE_HASH_DISTRIBUTION_AWARE && !cacheMode.isInvalidation()) {
         int numSegments = cacheTopology.getReadConsistentHash().getNumSegments();
         return Optional.of(new HashDistAware20Response(topologyId, serverEndpoints, numSegments,
               Constants.DEFAULT_CONSISTENT_HASH_VERSION));
      } else {
         return Optional.of(new TopologyAwareResponse(topologyId, serverEndpoints, 0));
      }
   }

   @Override
   public void writeResponse(Response response, ByteBuf buf, EmbeddedCacheManager cacheManager, HotRodServer server) {
      switch (response.operation) {
         case GET: {
            GetResponse r = (GetResponse) response;
            if (r.status == OperationStatus.Success)
               ExtendedByteBuf.writeRangedBytes(r.data, buf);
            break;
         }
         case GET_WITH_METADATA: {
            GetWithMetadataResponse r = (GetWithMetadataResponse) response;
            if (r.status == OperationStatus.Success) {
               writeMetadata(r.lifespan, r.maxIdle, r.created, r.lastUsed, r.dataVersion, buf);
               ExtendedByteBuf.writeRangedBytes(r.data, buf);
            }
            break;
         }
         case GET_WITH_VERSION: {
            GetWithVersionResponse r = (GetWithVersionResponse) response;
            if (r.status == OperationStatus.Success) {
               buf.writeLong(r.dataVersion);
               ExtendedByteBuf.writeRangedBytes(r.data, buf);
            }
            break;
         }
         case GET_STREAM: {
            GetStreamResponse r = (GetStreamResponse) response;
            if (r.status == OperationStatus.Success) {
               writeMetadata(r.lifespan, r.maxIdle, r.created, r.lastUsed, r.dataVersion, buf);
               ExtendedByteBuf.writeRangedBytes(r.data, r.offset, buf);
            }
            break;
         }
         case PUT:
         case PUT_IF_ABSENT:
         case REPLACE:
         case REPLACE_IF_UNMODIFIED:
         case REMOVE:
         case REMOVE_IF_UNMODIFIED: {
            if (response instanceof ResponseWithPrevious) {
               ResponseWithPrevious r = (ResponseWithPrevious) response;
               if (!r.previous.isPresent())
                  ExtendedByteBuf.writeUnsignedInt(0, buf);
               else
                  ExtendedByteBuf.writeRangedBytes(r.previous.get(), buf);
            }
            break;
         }
         case STATS: {
            StatsResponse r = (StatsResponse) response;
            ExtendedByteBuf.writeUnsignedInt(r.stats.size(), buf);
            for (Map.Entry<String, String> stat : r.stats.entrySet()) {
               ExtendedByteBuf.writeString(stat.getKey(), buf);
               ExtendedByteBuf.writeString(stat.getValue(), buf);
            }
            break;
         }
         case PING:
         case CLEAR:
         case CONTAINS_KEY:
         case PUT_ALL:
         case PUT_STREAM:
         case ITERATION_END:
         case ADD_CLIENT_LISTENER:
         case REMOVE_CLIENT_LISTENER:
            // Empty response
            break;
         case SIZE: {
            SizeResponse r = (SizeResponse) response;
            ExtendedByteBuf.writeUnsignedLong(r.size, buf);
            break;
         }
         case AUTH_MECH_LIST: {
            AuthMechListResponse r = (AuthMechListResponse) response;
            ExtendedByteBuf.writeUnsignedInt(r.mechs.size(), buf);
            r.mechs.forEach(s -> ExtendedByteBuf.writeString(s, buf));
            break;
         }
         case AUTH: {
            AuthResponse r = (AuthResponse) response;
            if (r.challenge != null) {
               buf.writeBoolean(false);
               ExtendedByteBuf.writeRangedBytes(r.challenge, buf);
            } else {
               buf.writeBoolean(true);
               ExtendedByteBuf.writeUnsignedInt(0, buf);
            }
            break;
         }
         case EXEC: {
            ExecResponse r = (ExecResponse) response;
            ExtendedByteBuf.writeRangedBytes(r.result, buf);
            break;
         }
         case BULK_GET: {
            BulkGetResponse r = (BulkGetResponse) response;
            if (isTrace)
               log.trace("About to respond to bulk get request");
            if (r.status == OperationStatus.Success) {
               try (CloseableIterator<Map.Entry<byte[], byte[]>> iterator = r.entries.iterator()) {
                  int max = Integer.MAX_VALUE;
                  if (r.count != 0) {
                     if (isTrace)
                        log.tracef("About to write (max) %d messages to the client", r.count);
                     max = r.count;
                  }
                  int count = 0;
                  while (iterator.hasNext() && count < max) {
                     Map.Entry<byte[], byte[]> entry = iterator.next();
                     buf.writeByte(1); // Not done
                     ExtendedByteBuf.writeRangedBytes(entry.getKey(), buf);
                     ExtendedByteBuf.writeRangedBytes(entry.getValue(), buf);
                     count++;
                  }
                  buf.writeByte(0); // Done
               }
            }
            break;
         }
         case BULK_GET_KEYS: {
            BulkGetKeysResponse r = (BulkGetKeysResponse) response;
            if (r.status == OperationStatus.Success) {
               r.iterator.forEachRemaining(key -> {
                  buf.writeByte(1); // Not done
                  ExtendedByteBuf.writeRangedBytes(key, buf);
               });
               buf.writeByte(0); // Done
            }
            break;
         }
         case QUERY: {
            QueryResponse r = (QueryResponse) response;
            ExtendedByteBuf.writeRangedBytes(r.result, buf);
            break;
         }
         case ITERATION_START: {
            IterationStartResponse r = (IterationStartResponse) response;
            ExtendedByteBuf.writeString(r.iterationId, buf);
            break;
         }
         case ITERATION_NEXT: {
            IterationNextResponse r = (IterationNextResponse) response;
            ExtendedByteBuf.writeRangedBytes(r.iterationResult.segmentsToBytes(), buf);
            List<CacheEntry> entries = r.iterationResult.getEntries();
            ExtendedByteBuf.writeUnsignedInt(entries.size(), buf);
            Optional<Integer> projectionLength = projectionInfo(entries, r.version);
            projectionLength.ifPresent(i -> ExtendedByteBuf.writeUnsignedInt(i, buf));
            entries.forEach(cacheEntry -> {
               if (Constants.isVersionPost24(r.version)) {
                  if (r.iterationResult.isMetadata()) {
                     buf.writeByte(1);
                     InternalCacheEntry ice = (InternalCacheEntry) cacheEntry;
                     int lifespan = ice.getLifespan() < 0 ? -1 : (int) (ice.getLifespan() / 1000);
                     int maxIdle = ice.getMaxIdle() < 0 ? -1 : (int) (ice.getMaxIdle() / 1000);
                     long lastUsed = ice.getLastUsed();
                     long created = ice.getCreated();
                     long dataVersion = CacheDecodeContext.extractVersion(ice.getMetadata().version());
                     writeMetadata(lifespan, maxIdle, created, lastUsed, dataVersion, buf);
                  } else {
                     buf.writeByte(0);
                  }
               }
               Object key = cacheEntry.getKey();
               Object value = cacheEntry.getValue();
               if (r.iterationResult.isCompatEnabled()) {
                  key = r.iterationResult.unbox(key);
                  value = r.iterationResult.unbox(value);
               }
               ExtendedByteBuf.writeRangedBytes((byte[]) key, buf);
               if (value instanceof Object[]) {
                  for (Object o : (Object[]) value) {
                     ExtendedByteBuf.writeRangedBytes((byte[]) o, buf);
                  }
               } else if (value instanceof byte[]) {
                  ExtendedByteBuf.writeRangedBytes((byte[]) value, buf);
               } else {
                  throw new IllegalArgumentException("Unsupported type passed: " + value.getClass());
               }
            });
            break;
         }
         case GET_ALL: {
            GetAllResponse r = (GetAllResponse) response;
            if (r.status == OperationStatus.Success) {
               ExtendedByteBuf.writeUnsignedInt(r.entries.size(), buf);
               r.entries.forEach((k, v) -> {
                  ExtendedByteBuf.writeRangedBytes(k, buf);
                  ExtendedByteBuf.writeRangedBytes(v, buf);
               });
            }
            break;
         }
         case ERROR: {
            ErrorResponse r = (ErrorResponse) response;
            ExtendedByteBuf.writeString(r.msg, buf);
            break;
         }
         case CACHE_ENTRY_CREATED_EVENT:
         case CACHE_ENTRY_MODIFIED_EVENT:
         case CACHE_ENTRY_REMOVED_EVENT:
         case CACHE_ENTRY_EXPIRED_EVENT:
            throw new UnsupportedOperationException(response.toString());
         default:
            throw new UnsupportedOperationException(response.toString());
      }
   }

   static void writeMetadata(int lifespan, int maxIdle, long created, long lastUsed, long dataVersion, ByteBuf buf) {
      int flags = (lifespan < 0 ? Constants.INFINITE_LIFESPAN : 0) + (maxIdle < 0 ? Constants.INFINITE_MAXIDLE : 0);
      buf.writeByte(flags);
      if (lifespan >= 0) {
         buf.writeLong(created);
         ExtendedByteBuf.writeUnsignedInt(lifespan, buf);
      }
      if (maxIdle >= 0) {
         buf.writeLong(lastUsed);
         ExtendedByteBuf.writeUnsignedInt(maxIdle, buf);
      }
      buf.writeLong(dataVersion);
   }

   static Optional<Integer> projectionInfo(List<CacheEntry> entries, byte version) {
      if (!entries.isEmpty()) {
         CacheEntry entry = entries.get(0);
         if (entry.getValue() instanceof Object[]) {
            return Optional.of(((Object[]) entry.getValue()).length);
         } else if (!Constants.isVersionPre24(version)) {
            return Optional.of(1);
         }
      }
      return Optional.empty();
   }
}