/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.execution; import java.util.Collection; import java.util.NavigableSet; import java.util.Set; import java.util.stream.Collectors; import org.diqube.execution.consumers.ColumnVersionBuiltConsumer; import org.diqube.execution.consumers.GenericConsumer; import org.diqube.executionenv.ExecutionEnvironment; import org.diqube.executionenv.querystats.QueryableColumnShard; /** * Helper class providing methods for handling row IDs when a {@link ColumnVersionBuiltConsumer} is in place. * * @author Bastian Gloeckle */ public class ColumnVersionBuiltHelper { /** * Finds those row IDs out of a set of row IDs whose values are all available in specific columns of an * {@link ExecutionEnvironment} and manages a set of row IDs that could not be processed yet. * * @param env * The {@link ExecutionEnvironment} that is queried for the rows that are available in the specified columns. * @param columns * The columns that should be checked in "env" for the availability of values for the given rowIds. * @param activeRowIds * The row IDs that should be worked on now as provided by any other input {@link GenericConsumer}. This set * will be adjusted accordingly. After this method returns, the activeRowIds will contain only row IDs that * are available in {@link ExecutionEnvironment}. It might contain any additional row IDs of the * notYetProcessedRowIds parameter object, as those row IDs were not processed yet, but it might be possible * to process them now as the corresponding rows became available in the {@link ExecutionEnvironment}. Those * row IDs that are added to activeRowIds from notYetProcessedRowIds are removed from the latter. * @param notYetProcessedRowIds * A set containing those row IDs that have not yet been processed. See details above. * @return The maximum row ID for which all columns contain values or -1 if not all columns are available in the given * env. */ public long publishActiveRowIds(ExecutionEnvironment env, Collection<String> columns, NavigableSet<Long> activeRowIds, NavigableSet<Long> notYetProcessedRowIds) { long maxRowId; Collection<QueryableColumnShard> cols = columns.stream().map(colName -> env.getColumnShard(colName)).collect(Collectors.toList()); if (cols.stream().anyMatch(col -> col == null)) { // at least one of the needed columns is not available at all yet. notYetProcessedRowIds.addAll(activeRowIds); activeRowIds.clear(); return -1L; } if (cols.stream().anyMatch(col -> env.getPureStandardColumnShard(col.getName()) != null)) { maxRowId = cols.stream().filter(col -> env.getPureStandardColumnShard(col.getName()) != null) .mapToLong(column -> column.getFirstRowId() + env.getPureStandardColumnShard(column.getName()).getNumberOfRowsInColumnShard() - 1) . // min().getAsLong(); } else // only ConstantColumnShards maxRowId = cols.iterator().next().getFirstRowId(); Set<Long> activeRowIdsNotAvailable = activeRowIds.tailSet(maxRowId, false); if (!activeRowIdsNotAvailable.isEmpty()) { notYetProcessedRowIds.addAll(activeRowIdsNotAvailable); activeRowIdsNotAvailable.clear(); } Set<Long> notYetProcessedAvailable = notYetProcessedRowIds.headSet(maxRowId, true); if (!notYetProcessedAvailable.isEmpty()) { activeRowIds.addAll(notYetProcessedAvailable); notYetProcessedAvailable.clear(); } return maxRowId; } }