/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.internal.processors.query.h2.twostep;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.Arrays;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import javax.cache.CacheException;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteClientDisconnectedException;
import org.apache.ignite.IgniteException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.cache.query.QueryCancelledException;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.events.DiscoveryEvent;
import org.apache.ignite.events.Event;
import org.apache.ignite.events.EventType;
import org.apache.ignite.internal.GridKernalContext;
import org.apache.ignite.internal.GridTopic;
import org.apache.ignite.internal.IgniteInterruptedCheckedException;
import org.apache.ignite.internal.managers.communication.GridIoPolicy;
import org.apache.ignite.internal.managers.communication.GridMessageListener;
import org.apache.ignite.internal.managers.eventstorage.GridLocalEventListener;
import org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion;
import org.apache.ignite.internal.processors.cache.GridCacheContext;
import org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState;
import org.apache.ignite.internal.processors.cache.query.GridCacheQueryMarshallable;
import org.apache.ignite.internal.processors.cache.query.GridCacheSqlQuery;
import org.apache.ignite.internal.processors.cache.query.GridCacheTwoStepQuery;
import org.apache.ignite.internal.processors.query.GridQueryCacheObjectsIterator;
import org.apache.ignite.internal.processors.query.GridQueryCancel;
import org.apache.ignite.internal.processors.query.GridRunningQueryInfo;
import org.apache.ignite.internal.processors.query.h2.IgniteH2Indexing;
import org.apache.ignite.internal.processors.query.h2.opt.GridH2QueryContext;
import org.apache.ignite.internal.processors.query.h2.sql.GridSqlSortColumn;
import org.apache.ignite.internal.processors.query.h2.sql.GridSqlType;
import org.apache.ignite.internal.processors.query.h2.twostep.messages.GridQueryCancelRequest;
import org.apache.ignite.internal.processors.query.h2.twostep.messages.GridQueryFailResponse;
import org.apache.ignite.internal.processors.query.h2.twostep.messages.GridQueryNextPageRequest;
import org.apache.ignite.internal.processors.query.h2.twostep.messages.GridQueryNextPageResponse;
import org.apache.ignite.internal.processors.query.h2.twostep.msg.GridH2QueryRequest;
import org.apache.ignite.internal.util.GridIntIterator;
import org.apache.ignite.internal.util.GridIntList;
import org.apache.ignite.internal.util.GridSpinBusyLock;
import org.apache.ignite.internal.util.typedef.CIX2;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.apache.ignite.lang.IgniteBiClosure;
import org.apache.ignite.lang.IgniteFuture;
import org.apache.ignite.lang.IgnitePredicate;
import org.apache.ignite.plugin.extensions.communication.Message;
import org.h2.command.ddl.CreateTableData;
import org.h2.engine.Session;
import org.h2.index.Cursor;
import org.h2.index.Index;
import org.h2.jdbc.JdbcConnection;
import org.h2.result.Row;
import org.h2.table.Column;
import org.h2.util.IntArray;
import org.h2.value.Value;
import org.jetbrains.annotations.Nullable;
import org.jsr166.ConcurrentHashMap8;
import static java.util.Collections.singletonList;
import static org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion.NONE;
import static org.apache.ignite.internal.processors.cache.query.GridCacheQueryType.SQL_FIELDS;
import static org.apache.ignite.internal.processors.cache.query.GridCacheSqlQuery.EMPTY_PARAMS;
import static org.apache.ignite.internal.processors.query.h2.IgniteH2Indexing.setupConnection;
import static org.apache.ignite.internal.processors.query.h2.opt.DistributedJoinMode.OFF;
import static org.apache.ignite.internal.processors.query.h2.opt.GridH2QueryType.REDUCE;
import static org.apache.ignite.internal.processors.query.h2.sql.GridSqlQuerySplitter.mergeTableIdentifier;
/**
* Reduce query executor.
*/
public class GridReduceQueryExecutor {
/** Name of the merge index used when no sort order is required on the reduce side. */
private static final String MERGE_INDEX_UNSORTED = "merge_scan";
/** Name of the merge index used for sorted merging of map query results. */
private static final String MERGE_INDEX_SORTED = "merge_sorted";
/** Sentinel marking a partition that has no affinity mapping configured (unmapped). */
private static final Set<ClusterNode> UNMAPPED_PARTS = Collections.emptySet();
/** Kernal context; assigned in {@code start}. */
private GridKernalContext ctx;
/** H2 indexing SPI; assigned in {@code start}. */
private IgniteH2Indexing h2;
/** Logger; assigned in {@code start}. */
private IgniteLogger log;
/** Generator of unique query request IDs, shared with other executors. */
private final AtomicLong qryIdGen;
/** Currently active reduce query runs, keyed by query request ID. */
private final ConcurrentMap<Long, QueryRun> runs = new ConcurrentHashMap8<>();
/** Lazily grown list of fake merge tables; replaced copy-on-write under {@link #fakeTblsLock}. */
private volatile List<GridThreadLocalTable> fakeTbls = Collections.emptyList();
/** Guards creation of new entries in {@link #fakeTbls}. */
private final Lock fakeTblsLock = new ReentrantLock();
/** Busy lock preventing message processing while the node is stopping. */
private final GridSpinBusyLock busyLock;
/** Handler that short-circuits messages addressed to the local node straight to the map query executor. */
private final CIX2<ClusterNode,Message> locNodeHnd = new CIX2<ClusterNode,Message>() {
@Override public void applyx(ClusterNode locNode, Message msg) {
h2.mapQueryExecutor().onMessage(locNode.id(), msg);
}
};
/**
 * Creates the reduce query executor. Heavy initialization is deferred to {@code start}.
 *
 * @param qryIdGen Query ID generator.
 * @param busyLock Busy lock.
 */
public GridReduceQueryExecutor(AtomicLong qryIdGen, GridSpinBusyLock busyLock) {
this.qryIdGen = qryIdGen;
this.busyLock = busyLock;
}
/**
 * Starts the executor: registers a query-topic message listener and a discovery
 * listener that marks active runs for retry when a node sourcing their pages leaves.
 *
 * @param ctx Context.
 * @param h2 H2 Indexing.
 * @throws IgniteCheckedException If failed.
 */
public void start(final GridKernalContext ctx, final IgniteH2Indexing h2) throws IgniteCheckedException {
this.ctx = ctx;
this.h2 = h2;
log = ctx.log(GridReduceQueryExecutor.class);
ctx.io().addMessageListener(GridTopic.TOPIC_QUERY, new GridMessageListener() {
@Override public void onMessage(UUID nodeId, Object msg) {
// Drop messages arriving while the node is stopping.
if (!busyLock.enterBusy())
return;
try {
// Unmarshal lazily-marshalled query messages before dispatching.
if (msg instanceof GridCacheQueryMarshallable)
((GridCacheQueryMarshallable)msg).unmarshall(ctx.config().getMarshaller(), ctx);
GridReduceQueryExecutor.this.onMessage(nodeId, msg);
}
finally {
busyLock.leaveBusy();
}
}
});
ctx.event().addLocalEventListener(new GridLocalEventListener() {
@Override public void onEvent(final Event evt) {
UUID nodeId = ((DiscoveryEvent)evt).eventNode().id();
// Any run that still expects pages from the departed node must be retried.
for (QueryRun r : runs.values()) {
for (GridMergeIndex idx : r.idxs) {
if (idx.hasSource(nodeId)) {
handleNodeLeft(r, nodeId);
break;
}
}
}
}
}, EventType.EVT_NODE_FAILED, EventType.EVT_NODE_LEFT);
}
/**
 * Marks the given run for retry at the current ready topology version after a
 * participating node left.
 *
 * @param r Query run.
 * @param nodeId Left node ID.
 */
private void handleNodeLeft(QueryRun r, UUID nodeId) {
// Will attempt to retry. If reduce query was started it will fail on next page fetching.
retry(r, h2.readyTopologyVersion(), nodeId);
}
/**
 * Dispatches an incoming query message to the matching handler.
 * Messages from nodes that already left the topology are silently dropped.
 *
 * @param nodeId Node ID.
 * @param msg Message.
 */
public void onMessage(UUID nodeId, Object msg) {
    try {
        assert msg != null;

        final ClusterNode sndr = ctx.discovery().node(nodeId);

        if (sndr == null)
            return; // Node left, ignore.

        boolean recognized = true;

        if (msg instanceof GridQueryFailResponse)
            onFail(sndr, (GridQueryFailResponse)msg);
        else if (msg instanceof GridQueryNextPageResponse)
            onNextPage(sndr, (GridQueryNextPageResponse)msg);
        else
            recognized = false;

        if (recognized && log.isDebugEnabled())
            log.debug("Processed response: " + nodeId + "->" + ctx.localNodeId() + " " + msg);
    }
    catch (Throwable th) {
        U.error(log, "Failed to process message: " + msg, th);
    }
}
/**
 * Handles a failure response from a map node by failing the corresponding run.
 *
 * @param node Node that reported the failure.
 * @param msg Failure message.
 */
private void onFail(ClusterNode node, GridQueryFailResponse msg) {
    // Run may already be gone; fail(...) tolerates null.
    fail(runs.get(msg.queryRequestId()), node.id(), msg.error(), msg.failCode());
}
/**
 * Transitions the run into a failed state with a {@link CacheException} describing
 * the remote failure. No-op if the run has already completed.
 *
 * @param r Query run (may be {@code null}).
 * @param nodeId Failed node ID.
 * @param msg Error message.
 * @param failCode Failure code from the map node.
 */
private void fail(QueryRun r, UUID nodeId, String msg, byte failCode) {
    if (r == null)
        return;

    CacheException e = new CacheException("Failed to execute map query on the node: " + nodeId + ", " + msg);

    // Attach the cancellation marker so callers can distinguish cancel from failure.
    if (failCode == GridQueryFailResponse.CANCELLED_BY_ORIGINATOR)
        e.addSuppressed(new QueryCancelledException());

    r.state(e, nodeId);
}
/**
 * Handles the next page of map query results: wraps it into a {@link GridResultPage}
 * that knows how to request the following page, registers it with the merge index,
 * and counts down the run latch when the first page from a source arrives.
 *
 * @param node Node.
 * @param msg Message.
 */
private void onNextPage(final ClusterNode node, GridQueryNextPageResponse msg) {
final long qryReqId = msg.queryRequestId();
final int qry = msg.query();
final int seg = msg.segmentId();
final QueryRun r = runs.get(qryReqId);
if (r == null) // Already finished with error or canceled.
return;
final int pageSize = r.pageSize;
GridMergeIndex idx = r.idxs.get(msg.query());
GridResultPage page;
try {
page = new GridResultPage(ctx, node.id(), msg) {
@Override public void fetchNextPage() {
// If the run has already failed or was asked to retry, surface that
// instead of requesting more data.
Object errState = r.state.get();
if (errState != null) {
CacheException err0 = errState instanceof CacheException ? (CacheException)errState : null;
// Client disconnect is rethrown as-is so callers can react to it.
if (err0 != null && err0.getCause() instanceof IgniteClientDisconnectedException)
throw err0;
CacheException e = new CacheException("Failed to fetch data from node: " + node.id());
if (err0 != null)
e.addSuppressed(err0);
throw e;
}
try {
GridQueryNextPageRequest msg0 = new GridQueryNextPageRequest(qryReqId, qry, seg, pageSize);
// Short-circuit the local node past the communication layer.
if (node.isLocal())
h2.mapQueryExecutor().onMessage(ctx.localNodeId(), msg0);
else
ctx.io().sendToGridTopic(node, GridTopic.TOPIC_QUERY, msg0, GridIoPolicy.QUERY_POOL);
}
catch (IgniteCheckedException e) {
throw new CacheException("Failed to fetch data from node: " + node.id(), e);
}
}
};
}
catch (Exception e) {
U.error(log, "Error in message.", e);
fail(r, node.id(), "Error in message.", GridQueryFailResponse.GENERAL_ERROR);
return;
}
idx.addPage(page);
if (msg.retry() != null)
retry(r, msg.retry(), node.id());
else if (msg.page() == 0) // Do count down on each first page received.
r.latch.countDown();
}
/**
 * Transitions the run state to "retry at the given topology version"; the run's
 * latch is released by the state change so the caller can re-map the query.
 *
 * @param r Query run.
 * @param retryVer Retry version.
 * @param nodeId Node ID.
 */
private void retry(QueryRun r, AffinityTopologyVersion retryVer, UUID nodeId) {
r.state(retryVer, nodeId);
}
/**
 * Checks whether any of the participating caches still has partitions being rebalanced.
 *
 * @param cctx Cache context for main space.
 * @param extraSpaces Extra spaces.
 * @return {@code true} If preloading is active.
 */
private boolean isPreloadingActive(final GridCacheContext<?, ?> cctx, List<Integer> extraSpaces) {
    if (hasMovingPartitions(cctx))
        return true;

    if (extraSpaces == null)
        return false;

    for (Integer cacheId : extraSpaces) {
        if (hasMovingPartitions(cacheContext(cacheId)))
            return true;
    }

    return false;
}
/**
 * Local caches never rebalance, hence never have moving partitions.
 *
 * @param cctx Cache context.
 * @return {@code True} If cache has partitions in {@link GridDhtPartitionState#MOVING} state.
 */
private boolean hasMovingPartitions(GridCacheContext<?, ?> cctx) {
return !cctx.isLocal() && cctx.topology().hasMovingPartitions();
}
/**
 * Resolves a cache context by cache ID via the shared cache processor.
 *
 * @param cacheId Cache ID.
 * @return Cache context.
 */
private GridCacheContext<?,?> cacheContext(Integer cacheId) {
return ctx.cache().context().cacheContext(cacheId);
}
/**
 * Maps primary data nodes to the partitions they own at the given (stable) topology
 * version. For replicated caches the partition lists are {@code null} (all data on
 * every node); for partitioned caches with an explicit {@code parts} filter only
 * the requested partitions are mapped.
 *
 * @param topVer Topology version.
 * @param cctx Cache context.
 * @param parts Partitions.
 */
private Map<ClusterNode, IntArray> stableDataNodesMap(AffinityTopologyVersion topVer,
final GridCacheContext<?, ?> cctx, @Nullable final int[] parts) {
Map<ClusterNode, IntArray> mapping = new HashMap<>();
// Explicit partitions mapping is not applicable to replicated cache.
if (cctx.isReplicated()) {
for (ClusterNode clusterNode : cctx.affinity().assignment(topVer).primaryPartitionNodes())
mapping.put(clusterNode, null);
return mapping;
}
List<List<ClusterNode>> assignment = cctx.affinity().assignment(topVer).assignment();
boolean needPartsFilter = parts != null;
// Iterate either the requested partitions or all of them.
GridIntIterator iter = needPartsFilter ? new GridIntList(parts).iterator() :
U.forRange(0, cctx.affinity().partitions());
while(iter.hasNext()) {
int partId = iter.next();
List<ClusterNode> partNodes = assignment.get(partId);
if (partNodes.size() > 0) {
// First node in the assignment is the primary.
ClusterNode prim = partNodes.get(0);
if (!needPartsFilter) {
mapping.put(prim, null);
continue;
}
IntArray partIds = mapping.get(prim);
if (partIds == null) {
partIds = new IntArray();
mapping.put(prim, partIds);
}
partIds.add(partId);
}
}
return mapping;
}
/**
 * @param isReplicatedOnly If we must only have replicated caches.
 * @param topVer Topology version.
 * @param cctx Cache context for main space.
 * @param extraSpaces Extra spaces.
 * @param parts Partitions.
 * @return Data nodes or {@code null} if repartitioning started and we need to retry.
 */
private Map<ClusterNode, IntArray> stableDataNodes(
boolean isReplicatedOnly,
AffinityTopologyVersion topVer,
final GridCacheContext<?, ?> cctx,
List<Integer> extraSpaces,
int[] parts) {
Map<ClusterNode, IntArray> map = stableDataNodesMap(topVer, cctx, parts);
// Live view of the map's keys: retainAll() below shrinks 'map' as well.
Set<ClusterNode> nodes = map.keySet();
if (F.isEmpty(map))
throw new CacheException("Failed to find data nodes for cache: " + cctx.name());
if (!F.isEmpty(extraSpaces)) {
for (int i = 0; i < extraSpaces.size(); i++) {
GridCacheContext<?,?> extraCctx = cacheContext(extraSpaces.get(i));
String extraSpace = extraCctx.name();
if (extraCctx.isLocal())
continue; // No consistency guaranties for local caches.
if (isReplicatedOnly && !extraCctx.isReplicated())
throw new CacheException("Queries running on replicated cache should not contain JOINs " +
"with partitioned tables [rCache=" + cctx.name() + ", pCache=" + extraSpace + "]");
Set<ClusterNode> extraNodes = stableDataNodesMap(topVer, extraCctx, parts).keySet();
if (F.isEmpty(extraNodes))
throw new CacheException("Failed to find data nodes for cache: " + extraSpace);
if (isReplicatedOnly && extraCctx.isReplicated()) {
// Narrow to nodes hosting every replicated cache; emptiness means no overlap.
nodes.retainAll(extraNodes);
if (map.isEmpty()) {
if (isPreloadingActive(cctx, extraSpaces))
return null; // Retry.
else
throw new CacheException("Caches have distinct sets of data nodes [cache1=" + cctx.name() +
", cache2=" + extraSpace + "]");
}
}
else if (!isReplicatedOnly && extraCctx.isReplicated()) {
// Replicated cache must cover all the nodes of the partitioned mapping.
// Note: the 'else' below binds to this inner 'if' (dangling else).
if (!extraNodes.containsAll(nodes))
if (isPreloadingActive(cctx, extraSpaces))
return null; // Retry.
else
throw new CacheException("Caches have distinct sets of data nodes [cache1=" + cctx.name() +
", cache2=" + extraSpace + "]");
}
else if (!isReplicatedOnly && !extraCctx.isReplicated()) {
// Two partitioned caches must map to exactly the same node set (collocation).
// Note: the 'else' below binds to this inner 'if' (dangling else).
if (!extraNodes.equals(nodes))
if (isPreloadingActive(cctx, extraSpaces))
return null; // Retry.
else
throw new CacheException("Caches have distinct sets of data nodes [cache1=" + cctx.name() +
", cache2=" + extraSpace + "]");
}
else
throw new IllegalStateException();
}
}
return map;
}
/**
 * Executes a two-step (map/reduce) query: maps the query onto data nodes, sends
 * map requests, merges incoming pages and runs the reduce query over the merge
 * tables. Retries the whole attempt (with backoff) on topology changes until the
 * query maps onto a stable node set.
 *
 * @param cctx Cache context.
 * @param qry Query.
 * @param keepPortable Keep portable.
 * @param enforceJoinOrder Enforce join order of tables.
 * @param timeoutMillis Timeout in milliseconds.
 * @param cancel Query cancel.
 * @param params Query parameters.
 * @param parts Partitions.
 * @return Rows iterator.
 */
public Iterator<List<?>> query(
GridCacheContext<?, ?> cctx,
GridCacheTwoStepQuery qry,
boolean keepPortable,
boolean enforceJoinOrder,
int timeoutMillis,
GridQueryCancel cancel,
Object[] params,
final int[] parts
) {
if (F.isEmpty(params))
params = EMPTY_PARAMS;
final boolean isReplicatedOnly = qry.isReplicatedOnly();
// Fail if all caches are replicated and explicit partitions are set.
for (int attempt = 0;; attempt++) {
// Linear backoff between retry attempts to let an exchange complete.
if (attempt != 0) {
try {
Thread.sleep(attempt * 10); // Wait for exchange.
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new CacheException("Query was interrupted.", e);
}
}
final long qryReqId = qryIdGen.incrementAndGet();
final String space = cctx.name();
final QueryRun r = new QueryRun(qryReqId, qry.originalSql(), space,
h2.connectionForSpace(space), qry.mapQueries().size(), qry.pageSize(),
U.currentTimeMillis(), cancel);
AffinityTopologyVersion topVer = h2.readyTopologyVersion();
List<Integer> extraSpaces = qry.extraCaches();
Collection<ClusterNode> nodes = null;
// Explicit partition mapping for unstable topology.
Map<ClusterNode, IntArray> partsMap = null;
// Explicit partitions mapping for query.
Map<ClusterNode, IntArray> qryMap = null;
// Partitions are not supported for queries over all replicated caches.
// NOTE(review): other paths guard extraSpaces with F.isEmpty()/null checks;
// confirm qry.extraCaches() cannot be null here, otherwise this loop would NPE.
if (cctx.isReplicated() && parts != null) {
boolean failIfReplicatedOnly = true;
for (Integer cacheId : extraSpaces) {
if (!cacheContext(cacheId).isReplicated()) {
failIfReplicatedOnly = false;
break;
}
}
if (failIfReplicatedOnly)
throw new CacheException("Partitions are not supported for replicated caches");
}
// Phase 1: compute the set of nodes to run map queries on.
if (qry.isLocal())
nodes = singletonList(ctx.discovery().localNode());
else {
if (isPreloadingActive(cctx, extraSpaces)) {
if (isReplicatedOnly)
nodes = replicatedUnstableDataNodes(cctx, extraSpaces);
else {
partsMap = partitionedUnstableDataNodes(cctx, extraSpaces);
if (partsMap != null) {
qryMap = narrowForQuery(partsMap, parts);
nodes = qryMap == null ? null : qryMap.keySet();
}
}
} else {
qryMap = stableDataNodes(isReplicatedOnly, topVer, cctx, extraSpaces, parts);
if (qryMap != null)
nodes = qryMap.keySet();
}
if (nodes == null)
continue; // Retry.
assert !nodes.isEmpty();
if (isReplicatedOnly || qry.explain()) {
ClusterNode locNode = ctx.discovery().localNode();
// Always prefer local node if possible.
if (nodes.contains(locNode))
nodes = singletonList(locNode);
else {
// Select random data node to run query on a replicated data or
// get EXPLAIN PLAN from a single node.
nodes = singletonList(F.rand(nodes));
}
}
}
// Phase 2: set up merge tables/indexes, one per map query.
int tblIdx = 0;
final boolean skipMergeTbl = !qry.explain() && qry.skipMergeTable();
final int segmentsPerIndex = qry.explain() || isReplicatedOnly ? 1 :
findFirstPartitioned(cctx, extraSpaces).config().getQueryParallelism();
int replicatedQrysCnt = 0;
for (GridCacheSqlQuery mapQry : qry.mapQueries()) {
GridMergeIndex idx;
if (!skipMergeTbl) {
GridMergeTable tbl;
try {
tbl = createMergeTable(r.conn, mapQry, qry.explain());
}
catch (IgniteCheckedException e) {
throw new IgniteException(e);
}
idx = tbl.getMergeIndex();
fakeTable(r.conn, tblIdx++).innerTable(tbl);
}
else
idx = GridMergeIndexUnsorted.createDummy(ctx);
// If the query has only replicated tables, we have to run it on a single node only.
if (!mapQry.isPartitioned()) {
ClusterNode node = F.rand(nodes);
mapQry.node(node.id());
replicatedQrysCnt++;
idx.setSources(singletonList(node), 1); // Replicated tables can have only 1 segment.
}
else
idx.setSources(nodes, segmentsPerIndex);
idx.setPageSize(r.pageSize);
r.idxs.add(idx);
}
// Latch is released once every (query, node, segment) source delivered its first page.
r.latch = new CountDownLatch(isReplicatedOnly ? 1 :
(r.idxs.size() - replicatedQrysCnt) * nodes.size() * segmentsPerIndex + replicatedQrysCnt);
runs.put(qryReqId, r);
try {
cancel.checkCancelled();
if (ctx.clientDisconnected()) {
throw new CacheException("Query was cancelled, client node disconnected.",
new IgniteClientDisconnectedException(ctx.cluster().clientReconnectFuture(),
"Client node disconnected."));
}
List<GridCacheSqlQuery> mapQrys = qry.mapQueries();
// For EXPLAIN, ask the map nodes for their plans instead of the data.
if (qry.explain()) {
mapQrys = new ArrayList<>(qry.mapQueries().size());
for (GridCacheSqlQuery mapQry : qry.mapQueries())
mapQrys.add(new GridCacheSqlQuery("EXPLAIN " + mapQry.query())
.parameterIndexes(mapQry.parameterIndexes()));
}
final boolean distributedJoins = qry.distributedJoins();
final Collection<ClusterNode> finalNodes = nodes;
cancel.set(new Runnable() {
@Override public void run() {
send(finalNodes, new GridQueryCancelRequest(qryReqId), null, false);
}
});
boolean retry = false;
// Always enforce join order on map side to have consistent behavior.
int flags = GridH2QueryRequest.FLAG_ENFORCE_JOIN_ORDER;
if (distributedJoins)
flags |= GridH2QueryRequest.FLAG_DISTRIBUTED_JOINS;
if (qry.isLocal())
flags |= GridH2QueryRequest.FLAG_IS_LOCAL;
if (qry.explain())
flags |= GridH2QueryRequest.FLAG_EXPLAIN;
if (isReplicatedOnly)
flags |= GridH2QueryRequest.FLAG_REPLICATED;
// Phase 3: send map query requests and wait for the first pages.
if (send(nodes,
new GridH2QueryRequest()
.requestId(qryReqId)
.topologyVersion(topVer)
.pageSize(r.pageSize)
.caches(qry.caches())
.tables(distributedJoins ? qry.tables() : null)
.partitions(convert(partsMap))
.queries(mapQrys)
.parameters(params)
.flags(flags)
.timeout(timeoutMillis),
parts == null ? null : new ExplicitPartitionsSpecializer(qryMap),
false)) {
awaitAllReplies(r, nodes, cancel);
Object state = r.state.get();
if (state != null) {
if (state instanceof CacheException) {
CacheException err = (CacheException)state;
if (err.getCause() instanceof IgniteClientDisconnectedException)
throw err;
if (wasCancelled(err))
throw new QueryCancelledException(); // Throw correct exception.
throw new CacheException("Failed to run map query remotely.", err);
}
if (state instanceof AffinityTopologyVersion) {
retry = true;
// If remote node asks us to retry then we have outdated full partition map.
h2.awaitForReadyTopologyVersion((AffinityTopologyVersion)state);
}
}
}
else // Send failed.
retry = true;
// Phase 4: run the reduce step over the merged pages.
Iterator<List<?>> resIter = null;
if (!retry) {
if (skipMergeTbl) {
List<List<?>> res = new ArrayList<>();
// Simple UNION ALL can have multiple indexes.
for (GridMergeIndex idx : r.idxs) {
Cursor cur = idx.findInStream(null, null);
while (cur.next()) {
Row row = cur.get();
int cols = row.getColumnCount();
List<Object> resRow = new ArrayList<>(cols);
for (int c = 0; c < cols; c++)
resRow.add(row.getValue(c).getObject());
res.add(resRow);
}
}
resIter = res.iterator();
}
else {
cancel.checkCancelled();
UUID locNodeId = ctx.localNodeId();
setupConnection(r.conn, false, enforceJoinOrder);
GridH2QueryContext.set(new GridH2QueryContext(locNodeId, locNodeId, qryReqId, REDUCE)
.pageSize(r.pageSize).distributedJoinMode(OFF));
try {
if (qry.explain())
return explainPlan(r.conn, space, qry, params);
GridCacheSqlQuery rdc = qry.reduceQuery();
ResultSet res = h2.executeSqlQueryWithTimer(space,
r.conn,
rdc.query(),
F.asList(rdc.parameters(params)),
false, // The statement will cache some extra thread local objects.
timeoutMillis,
cancel);
resIter = new IgniteH2Indexing.FieldsIterator(res);
}
finally {
GridH2QueryContext.clearThreadLocal();
}
}
}
if (retry) {
if (Thread.currentThread().isInterrupted())
throw new IgniteInterruptedCheckedException("Query was interrupted.");
continue;
}
return new GridQueryCacheObjectsIterator(resIter, cctx, keepPortable);
}
catch (IgniteCheckedException | RuntimeException e) {
U.closeQuiet(r.conn);
if (e instanceof CacheException) {
if (wasCancelled((CacheException)e))
throw new CacheException("Failed to run reduce query locally.", new QueryCancelledException());
throw (CacheException)e;
}
Throwable cause = e;
if (e instanceof IgniteCheckedException) {
Throwable disconnectedErr =
((IgniteCheckedException)e).getCause(IgniteClientDisconnectedException.class);
if (disconnectedErr != null)
cause = disconnectedErr;
}
throw new CacheException("Failed to run reduce query locally.", cause);
}
finally {
// Make sure any activity related to current attempt is cancelled.
cancelRemoteQueriesIfNeeded(nodes, r, qryReqId, qry.distributedJoins());
if (!runs.remove(qryReqId, r))
U.warn(log, "Query run was already removed: " + qryReqId);
if (!skipMergeTbl) {
for (int i = 0, mapQrys = qry.mapQueries().size(); i < mapQrys; i++)
fakeTable(null, i).innerTable(null); // Drop all merge tables.
}
}
}
}
/**
 * Finds the first non-local, non-replicated cache context among the main cache and
 * the extra caches.
 *
 * @param cctx Cache context for main space.
 * @param extraSpaces Extra spaces.
 * @return The first partitioned cache context.
 */
private GridCacheContext<?,?> findFirstPartitioned(GridCacheContext<?,?> cctx, List<Integer> extraSpaces) {
    if (cctx.isLocal())
        throw new CacheException("Cache is LOCAL: " + cctx.name());

    if (!cctx.isReplicated())
        return cctx;

    for (Integer cacheId : extraSpaces) {
        GridCacheContext<?, ?> candidate = cacheContext(cacheId);

        if (!candidate.isReplicated() && !candidate.isLocal())
            return candidate;
    }

    throw new IllegalStateException("Failed to find partitioned cache.");
}
/**
 * Returns true if the exception is triggered by query cancel.
 * Cancellation is signalled by a suppressed {@link QueryCancelledException}
 * (see {@code fail(...)}).
 *
 * @param e Exception.
 * @return {@code true} if exception is caused by cancel.
 */
private boolean wasCancelled(CacheException e) {
return X.hasSuppressed(e, QueryCancelledException.class);
}
/**
 * Sends a cancel request to the map nodes when the attempt left remote work behind:
 * always for distributed joins (remote resources must be cleaned up), otherwise only
 * if at least one merge index has not fetched all its pages yet.
 *
 * @param nodes Query nodes.
 * @param r Query run.
 * @param qryReqId Query id.
 * @param distributedJoins Distributed join flag.
 */
private void cancelRemoteQueriesIfNeeded(Collection<ClusterNode> nodes,
    QueryRun r,
    long qryReqId,
    boolean distributedJoins)
{
    // For distributedJoins need always send cancel request to cleanup resources.
    if (distributedJoins) {
        send(nodes, new GridQueryCancelRequest(qryReqId), null, false);

        return;
    }

    for (GridMergeIndex idx : r.idxs) {
        if (!idx.fetchedAll()) {
            send(nodes, new GridQueryCancelRequest(qryReqId), null, false);

            return;
        }
    }
}
/**
 * Waits until every expected first page arrived (run latch reaches zero), polling
 * every 500 ms to check for cancellation and for participating nodes that died.
 *
 * @param r Query run.
 * @param nodes Nodes to check periodically if they alive.
 * @param cancel Query cancel.
 * @throws IgniteInterruptedCheckedException If interrupted.
 */
private void awaitAllReplies(QueryRun r, Collection<ClusterNode> nodes, GridQueryCancel cancel)
throws IgniteInterruptedCheckedException, QueryCancelledException {
while (!U.await(r.latch, 500, TimeUnit.MILLISECONDS)) {
cancel.checkCancelled();
for (ClusterNode node : nodes) {
if (!ctx.discovery().alive(node)) {
// Marks the run for retry, which releases the latch as a side effect.
handleNodeLeft(r, node.id());
assert r.latch.getCount() == 0;
return;
}
}
}
}
/**
 * Gets or creates new fake table for index.
 *
 * Tables are created strictly sequentially (index {@code idx} may only be created
 * when exactly {@code idx} tables already exist). Creation is guarded by
 * {@link #fakeTblsLock} with a re-check inside the lock; the list itself is
 * replaced copy-on-write so readers never need the lock.
 *
 * @param c Connection.
 * @param idx Index of table.
 * @return Table.
 */
private GridThreadLocalTable fakeTable(Connection c, int idx) {
List<GridThreadLocalTable> tbls = fakeTbls;
assert tbls.size() >= idx;
if (tbls.size() == idx) { // If table for such index does not exist, create one.
fakeTblsLock.lock();
try {
if ((tbls = fakeTbls).size() == idx) { // Double check inside of lock.
// Creating the H2 table hands the instance to the engine; retrieved below.
try (Statement stmt = c.createStatement()) {
stmt.executeUpdate("CREATE TABLE " + mergeTableIdentifier(idx) +
"(fake BOOL) ENGINE \"" + GridThreadLocalTable.Engine.class.getName() + '"');
}
catch (SQLException e) {
throw new IllegalStateException(e);
}
List<GridThreadLocalTable> newTbls = new ArrayList<>(tbls.size() + 1);
newTbls.addAll(tbls);
newTbls.add(GridThreadLocalTable.Engine.getCreated());
fakeTbls = tbls = newTbls;
}
}
finally {
fakeTblsLock.unlock();
}
}
return tbls.get(idx);
}
/**
 * Calculates data nodes for replicated caches on unstable topology.
 *
 * @param cctx Cache context for main space.
 * @param extraSpaces Extra spaces.
 * @return Collection of all data nodes owning all the caches or {@code null} for retry.
 */
private Collection<ClusterNode> replicatedUnstableDataNodes(GridCacheContext<?, ?> cctx,
List<Integer> extraSpaces) {
// Index shared with the loop below: if the main cache gets replaced by
// extraSpaces.get(0), that entry must not be processed again.
int i = 0;
// The main cache is allowed to be partitioned.
if (!cctx.isReplicated()) {
assert !F.isEmpty(extraSpaces): "no extra replicated caches with partitioned main cache";
// Just replace the main cache with the first one extra.
cctx = cacheContext(extraSpaces.get(i++));
assert cctx.isReplicated(): "all the extra caches must be replicated here";
}
Set<ClusterNode> nodes = replicatedUnstableDataNodes(cctx);
if (F.isEmpty(nodes))
return null; // Retry.
if (!F.isEmpty(extraSpaces)) {
for (;i < extraSpaces.size(); i++) {
GridCacheContext<?, ?> extraCctx = cacheContext(extraSpaces.get(i));
if (extraCctx.isLocal())
continue;
if (!extraCctx.isReplicated())
throw new CacheException("Queries running on replicated cache should not contain JOINs " +
"with tables in partitioned caches [rCache=" + cctx.name() + ", pCache=" + extraCctx.name() + "]");
// Only nodes owning every replicated cache entirely qualify.
Set<ClusterNode> extraOwners = replicatedUnstableDataNodes(extraCctx);
if (F.isEmpty(extraOwners))
return null; // Retry.
nodes.retainAll(extraOwners);
if (nodes.isEmpty())
return null; // Retry.
}
}
return nodes;
}
/**
 * Looks up affinity nodes of the given cache at the given topology version.
 *
 * @param space Cache name.
 * @param topVer Topology version.
 * @return Collection of data nodes (never {@code null}).
 */
private Collection<ClusterNode> dataNodes(String space, AffinityTopologyVersion topVer) {
    Collection<ClusterNode> affNodes = ctx.discovery().cacheAffinityNodes(space, topVer);

    if (affNodes == null)
        return Collections.<ClusterNode>emptySet();

    return affNodes;
}
/**
 * Collects all the nodes owning all the partitions for the given replicated cache.
 *
 * @param cctx Cache context.
 * @return Owning nodes or {@code null} if we can't find owners for some partitions.
 */
private Set<ClusterNode> replicatedUnstableDataNodes(GridCacheContext<?,?> cctx) {
    assert cctx.isReplicated() : cctx.name() + " must be replicated";

    String space = cctx.name();

    Set<ClusterNode> owners = new HashSet<>(dataNodes(space, NONE));

    if (owners.isEmpty())
        throw new CacheException("Failed to find data nodes for cache: " + space);

    // Keep only nodes that own every single partition of the replicated cache.
    int partsCnt = cctx.affinity().partitions();

    for (int p = 0; p < partsCnt; p++) {
        List<ClusterNode> partOwners = cctx.topology().owners(p);

        if (F.isEmpty(partOwners))
            return null; // Retry.

        owners.retainAll(partOwners);

        if (owners.isEmpty())
            return null; // Retry.
    }

    return owners;
}
/**
 * Calculates partition mapping for partitioned cache on unstable topology.
 *
 * For every partition the owner set is intersected across all participating
 * partitioned caches (logical collocation) and then narrowed to nodes hosting
 * all participating replicated caches; one owner per partition is then picked.
 *
 * @param cctx Cache context for main space.
 * @param extraSpaces Extra spaces.
 * @return Partition mapping or {@code null} if we can't calculate it due to repartitioning and we need to retry.
 */
@SuppressWarnings("unchecked")
private Map<ClusterNode, IntArray> partitionedUnstableDataNodes(GridCacheContext<?,?> cctx,
List<Integer> extraSpaces) {
assert !cctx.isLocal() : cctx.name() + " must not be LOCAL";
// If the main cache is replicated, just replace it with the first partitioned.
cctx = findFirstPartitioned(cctx, extraSpaces);
final int partsCnt = cctx.affinity().partitions();
if (extraSpaces != null) { // Check correct number of partitions for partitioned caches.
for (int i = 0; i < extraSpaces.size(); i++) {
GridCacheContext<?, ?> extraCctx = cacheContext(extraSpaces.get(i));
if (extraCctx.isReplicated() || extraCctx.isLocal())
continue;
int parts = extraCctx.affinity().partitions();
if (parts != partsCnt)
throw new CacheException("Number of partitions must be the same for correct collocation [cache1=" +
cctx.name() + ", parts1=" + partsCnt + ", cache2=" + extraCctx.name() + ", parts2=" + parts + "]");
}
}
// partLocs[p] holds the candidate owner set of partition p, or UNMAPPED_PARTS.
Set<ClusterNode>[] partLocs = new Set[partsCnt];
// Fill partition locations for main cache.
for (int p = 0, parts = cctx.affinity().partitions(); p < parts; p++) {
List<ClusterNode> owners = cctx.topology().owners(p);
if (F.isEmpty(owners)) {
// Handle special case: no mapping is configured for a partition.
if (F.isEmpty(cctx.affinity().assignment(NONE).get(p))) {
partLocs[p] = UNMAPPED_PARTS; // Mark unmapped partition.
continue;
}
else if (!F.isEmpty(dataNodes(cctx.name(), NONE)))
return null; // Retry.
throw new CacheException("Failed to find data nodes [cache=" + cctx.name() + ", part=" + p + "]");
}
partLocs[p] = new HashSet<>(owners);
}
if (extraSpaces != null) {
// Find owner intersections for each participating partitioned cache partition.
// We need this for logical collocation between different partitioned caches with the same affinity.
for (int i = 0; i < extraSpaces.size(); i++) {
GridCacheContext<?, ?> extraCctx = cacheContext(extraSpaces.get(i));
// This is possible if we have replaced a replicated cache with a partitioned one earlier.
if (cctx == extraCctx)
continue;
if (extraCctx.isReplicated() || extraCctx.isLocal())
continue;
for (int p = 0, parts = extraCctx.affinity().partitions(); p < parts; p++) {
List<ClusterNode> owners = extraCctx.topology().owners(p);
if (partLocs[p] == UNMAPPED_PARTS)
continue; // Skip unmapped partitions.
if (F.isEmpty(owners)) {
if (!F.isEmpty(dataNodes(extraCctx.name(), NONE)))
return null; // Retry.
throw new CacheException("Failed to find data nodes [cache=" + extraCctx.name() + ", part=" + p + "]");
}
if (partLocs[p] == null)
partLocs[p] = new HashSet<>(owners);
else {
partLocs[p].retainAll(owners); // Intersection of owners.
if (partLocs[p].isEmpty())
return null; // Intersection is empty -> retry.
}
}
}
// Filter nodes where not all the replicated caches loaded.
for (int i = 0; i < extraSpaces.size(); i++) {
GridCacheContext<?,?> extraCctx = cacheContext(extraSpaces.get(i));
if (!extraCctx.isReplicated())
continue;
Set<ClusterNode> dataNodes = replicatedUnstableDataNodes(extraCctx);
if (F.isEmpty(dataNodes))
return null; // Retry.
for (Set<ClusterNode> partLoc : partLocs) {
if (partLoc == UNMAPPED_PARTS)
continue; // Skip unmapped partition.
partLoc.retainAll(dataNodes);
if (partLoc.isEmpty())
return null; // Retry.
}
}
}
// Collect the final partitions mapping.
Map<ClusterNode, IntArray> res = new HashMap<>();
// Here partitions in all IntArray's will be sorted in ascending order, this is important.
for (int p = 0; p < partLocs.length; p++) {
Set<ClusterNode> pl = partLocs[p];
// Skip unmapped partitions.
if (pl == UNMAPPED_PARTS)
continue;
assert !F.isEmpty(pl) : pl;
// Single owner is taken directly; otherwise one is picked at random for balance.
ClusterNode n = pl.size() == 1 ? F.first(pl) : F.rand(pl);
IntArray parts = res.get(n);
if (parts == null)
res.put(n, parts = new IntArray());
parts.add(p);
}
return res;
}
/**
 * Collects query plans for all map queries and the reduce query of a two-step query.
 *
 * @param c Connection.
 * @param space Space.
 * @param qry Query.
 * @param params Query parameters.
 * @return Iterator over single-element rows, each holding one plan string.
 * @throws IgniteCheckedException if failed.
 */
private Iterator<List<?>> explainPlan(JdbcConnection c, String space, GridCacheTwoStepQuery qry, Object[] params)
    throws IgniteCheckedException {
    List<List<?>> res = new ArrayList<>();

    // Select the PLAN column from each merge table (one per map query).
    for (int i = 0, cnt = qry.mapQueries().size(); i < cnt; i++) {
        ResultSet rs = h2.executeSqlQueryWithTimer(space, c,
            "SELECT PLAN FROM " + mergeTableIdentifier(i), null, false, 0, null);

        res.add(F.asList(getPlan(rs)));
    }

    // Recreate merge tables with actual columns so the reduce query can be explained against them.
    int idx = 0;

    for (GridCacheSqlQuery mapQry : qry.mapQueries())
        fakeTable(c, idx++).innerTable(createMergeTable(c, mapQry, false));

    GridCacheSqlQuery rdc = qry.reduceQuery();

    ResultSet rs = h2.executeSqlQueryWithTimer(space,
        c,
        "EXPLAIN " + rdc.query(),
        F.asList(rdc.parameters(params)),
        false,
        0,
        null);

    res.add(F.asList(getPlan(rs)));

    return res.iterator();
}
/**
 * Extracts the query plan from the first row of the given result set.
 *
 * @param rs Result set positioned before the plan row.
 * @return Plan.
 * @throws IgniteCheckedException If failed.
 */
private String getPlan(ResultSet rs) throws IgniteCheckedException {
    try {
        // Exactly one row containing the plan is expected; give the failure a message
        // instead of throwing a bare IllegalStateException.
        if (!rs.next())
            throw new IllegalStateException("Query plan not found in result set.");

        return rs.getString(1);
    }
    catch (SQLException e) {
        // Preserve the original SQL error as the cause.
        throw new IgniteCheckedException(e);
    }
}
/**
 * Sends the given message to the given nodes over the query topic.
 *
 * @param nodes Nodes.
 * @param msg Message.
 * @param specialize Optional closure to specialize message for each node.
 * @param runLocParallel Run local handler in parallel thread.
 * @return {@code true} If all messages sent successfully.
 */
private boolean send(
    Collection<ClusterNode> nodes,
    Message msg,
    @Nullable IgniteBiClosure<ClusterNode, Message, Message> specialize,
    boolean runLocParallel
) {
    if (log.isDebugEnabled())
        log.debug("Sending: [msg=" + msg + ", nodes=" + nodes + ", specialize=" + specialize + "]");

    // One send call covers all target nodes; the local node is handled via locNodeHnd.
    boolean sent = h2.send(
        GridTopic.TOPIC_QUERY,
        GridTopic.TOPIC_QUERY.ordinal(),
        nodes,
        msg,
        specialize,
        locNodeHnd,
        GridIoPolicy.QUERY_POOL,
        runLocParallel);

    return sent;
}
/**
 * Copies an H2 {@code IntArray} into a plain {@code int[]} of exactly the right size.
 *
 * @param ints Ints.
 * @return Array with the same elements.
 */
public static int[] toArray(IntArray ints) {
    int[] arr = new int[ints.size()];

    ints.toArray(arr);

    return arr;
}
/**
 * Converts a node-keyed partitions map into a node-ID-keyed map with primitive arrays.
 *
 * @param m Map.
 * @return Converted map, or {@code null} if the input is {@code null}.
 */
private static Map<UUID, int[]> convert(Map<ClusterNode, IntArray> m) {
    if (m == null)
        return null;

    Map<UUID, int[]> converted = U.newHashMap(m.size());

    for (Map.Entry<ClusterNode, IntArray> e : m.entrySet()) {
        ClusterNode node = e.getKey();

        converted.put(node.id(), toArray(e.getValue()));
    }

    return converted;
}
/**
 * Creates a merge table used to accumulate map query results on the reduce side.
 *
 * @param conn Connection.
 * @param qry Query whose columns and sort order define the table shape.
 * @param explain Explain mode: when {@code true} the table gets a single PLAN column.
 * @return Table.
 * @throws IgniteCheckedException If failed.
 */
@SuppressWarnings("unchecked")
private GridMergeTable createMergeTable(JdbcConnection conn, GridCacheSqlQuery qry, boolean explain)
    throws IgniteCheckedException {
    try {
        Session ses = (Session)conn.getSession();

        CreateTableData data = new CreateTableData();

        data.tableName = "T___";
        data.schema = ses.getDatabase().getSchema(ses.getCurrentSchemaName());
        data.create = true;

        if (!explain) {
            // Build table columns from the map query column aliases and SQL types.
            LinkedHashMap<String,?> colsMap = qry.columns();

            assert colsMap != null;

            ArrayList<Column> cols = new ArrayList<>(colsMap.size());

            for (Map.Entry<String,?> e : colsMap.entrySet()) {
                String alias = e.getKey();
                GridSqlType t = (GridSqlType)e.getValue();

                assert !F.isEmpty(alias);

                Column c = new Column(alias, t.type(), t.precision(), t.scale(), t.displaySize());

                cols.add(c);
            }

            data.columns = cols;
        }
        else
            data.columns = planColumns(); // Single PLAN string column.

        boolean sortedIndex = !F.isEmpty(qry.sortColumns());

        GridMergeTable tbl = new GridMergeTable(data);

        ArrayList<Index> idxs = new ArrayList<>(2);

        if (explain) {
            idxs.add(new GridMergeIndexUnsorted(ctx, tbl,
                sortedIndex ? MERGE_INDEX_SORTED : MERGE_INDEX_UNSORTED));
        }
        else if (sortedIndex) {
            // Sorted query: create a sorted merge index plus a scan index over it.
            List<GridSqlSortColumn> sortCols = (List<GridSqlSortColumn>)qry.sortColumns();

            GridMergeIndexSorted sortedMergeIdx = new GridMergeIndexSorted(ctx, tbl, MERGE_INDEX_SORTED,
                GridSqlSortColumn.toIndexColumns(tbl, sortCols));

            idxs.add(GridMergeTable.createScanIndex(sortedMergeIdx));
            idxs.add(sortedMergeIdx);
        }
        else
            idxs.add(new GridMergeIndexUnsorted(ctx, tbl, MERGE_INDEX_UNSORTED));

        tbl.indexes(idxs);

        return tbl;
    }
    catch (Exception e) {
        // Connection state is unknown after a failure - close it quietly before rethrowing.
        U.closeQuiet(conn);

        throw new IgniteCheckedException(e);
    }
}
/**
 * @return Single-element column list describing the {@code PLAN} output column.
 */
private static ArrayList<Column> planColumns() {
    ArrayList<Column> cols = new ArrayList<>(1);

    cols.add(new Column("PLAN", Value.STRING));

    return cols;
}
/**
 * Fails all currently tracked query runs when the client node gets disconnected.
 *
 * @param reconnectFut Reconnect future.
 */
public void onDisconnected(IgniteFuture<?> reconnectFut) {
    CacheException err = new CacheException("Query was cancelled, client node disconnected.",
        new IgniteClientDisconnectedException(reconnectFut, "Client node disconnected."));

    // Only the values are needed - iterate over them directly instead of the entry set.
    for (QueryRun run : runs.values())
        run.disconnected(err);
}
/**
 * Collect queries that already running more than specified duration.
 *
 * @param duration Duration to check.
 * @return Collection of IDs and statements of long running queries.
 */
public Collection<GridRunningQueryInfo> longRunningQueries(long duration) {
    long now = U.currentTimeMillis();

    Collection<GridRunningQueryInfo> longQrys = new ArrayList<>();

    // A query qualifies when it has been running longer than the given duration.
    for (QueryRun run : runs.values()) {
        GridRunningQueryInfo info = run.qry;

        if (info.longQuery(now, duration))
            longQrys.add(info);
    }

    return longQrys;
}
/**
 * Cancel specified queries.
 *
 * @param queries Queries IDs to cancel.
 */
public void cancelQueries(Collection<Long> queries) {
    for (Long qryId : queries) {
        QueryRun run = runs.get(qryId);

        // Unknown IDs (already finished queries) are silently ignored.
        if (run == null)
            continue;

        run.qry.cancel();
    }
}
/**
 * State of a single running two-step (reduce) query.
 */
private static class QueryRun {
    /** Info about the running query, used for monitoring and cancellation. */
    private final GridRunningQueryInfo qry;

    /** Merge indexes of this query; all of them are failed when the run fails (see {@link #state}). */
    private final List<GridMergeIndex> idxs;

    /** Reply latch; assigned outside this class and drained in {@link #state}. */
    private CountDownLatch latch;

    /** Reduce-side H2 connection of this query run. */
    private final JdbcConnection conn;

    /** Page size; falls back to the default for non-positive values. */
    private final int pageSize;

    /** Can be either CacheException in case of error or AffinityTopologyVersion to retry if needed. */
    private final AtomicReference<Object> state = new AtomicReference<>();

    /**
     * @param id Query ID.
     * @param qry Query text.
     * @param cache Cache where query was executed.
     * @param conn Connection.
     * @param idxsCnt Number of indexes.
     * @param pageSize Page size.
     * @param startTime Start time.
     * @param cancel Query cancel handler.
     */
    private QueryRun(Long id, String qry, String cache, Connection conn, int idxsCnt, int pageSize, long startTime, GridQueryCancel cancel) {
        this.qry = new GridRunningQueryInfo(id, qry, SQL_FIELDS, cache, startTime, cancel, false);
        this.conn = (JdbcConnection)conn;
        this.idxs = new ArrayList<>(idxsCnt);
        this.pageSize = pageSize > 0 ? pageSize : GridCacheTwoStepQuery.DFLT_PAGE_SIZE;
    }

    /**
     * Moves the run into a terminal state (error or retry). Only the first call has an effect.
     *
     * @param o Fail state object.
     * @param nodeId Node ID.
     */
    void state(Object o, @Nullable UUID nodeId) {
        assert o != null;
        assert o instanceof CacheException || o instanceof AffinityTopologyVersion : o.getClass();

        // First recorded state wins; subsequent calls are no-ops.
        if (!state.compareAndSet(null, o))
            return;

        // NOTE(review): assumes 'latch' is already assigned at this point - verify at call sites.
        while (latch.getCount() != 0) // We don't need to wait for all nodes to reply.
            latch.countDown();

        // Retry states (AffinityTopologyVersion) fail the indexes with a null cause.
        CacheException e = o instanceof CacheException ? (CacheException) o : null;

        for (GridMergeIndex idx : idxs) // Fail all merge indexes.
            idx.fail(nodeId, e);
    }

    /**
     * Fails the run with the given disconnect error.
     *
     * @param e Error.
     */
    void disconnected(CacheException e) {
        state(e, null);
    }
}
/**
 * Narrows a node-to-partitions mapping to the explicit partitions requested by a query.
 *
 * @param partsMap Full partitions mapping.
 * @param parts Explicit query partitions (sorted ascending), or {@code null} for all.
 * @return Narrowed mapping, the original mapping if {@code parts} is {@code null},
 *      or {@code null} if no node keeps any of the requested partitions.
 */
private Map<ClusterNode, IntArray> narrowForQuery(Map<ClusterNode, IntArray> partsMap, int[] parts) {
    // No explicit partitions - use the full mapping as-is.
    if (parts == null)
        return partsMap;

    Map<ClusterNode, IntArray> narrowed = U.newHashMap(partsMap.size());

    for (Map.Entry<ClusterNode, IntArray> e : partsMap.entrySet()) {
        IntArray nodeParts = e.getValue();
        IntArray kept = new IntArray(parts.length);

        for (int i = 0; i < nodeParts.size(); i++) {
            int part = nodeParts.get(i);

            // Binary search requires 'parts' to be sorted - TODO confirm at call sites.
            if (Arrays.binarySearch(parts, part) >= 0)
                kept.add(part);
        }

        // Drop nodes that own none of the requested partitions.
        if (kept.size() > 0)
            narrowed.put(e.getKey(), kept);
    }

    return narrowed.isEmpty() ? null : narrowed;
}
/** Specializes a query request per node with the explicit partitions mapped to that node. */
private static class ExplicitPartitionsSpecializer implements IgniteBiClosure<ClusterNode, Message, Message> {
    /** Partitions map. */
    private final Map<ClusterNode, IntArray> partsMap;

    /**
     * @param partsMap Partitions map.
     */
    public ExplicitPartitionsSpecializer(Map<ClusterNode, IntArray> partsMap) {
        this.partsMap = partsMap;
    }

    /** {@inheritDoc} */
    @Override public Message apply(ClusterNode node, Message msg) {
        // Copy the original request and attach the partitions for this particular node.
        GridH2QueryRequest rq = new GridH2QueryRequest((GridH2QueryRequest)msg);

        rq.queryPartitions(toArray(partsMap.get(node)));

        return rq;
    }
}
}