/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.server.control;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import java.util.UUID;
import java.util.stream.Collectors;

import org.apache.thrift.TException;
import org.diqube.data.column.ColumnType;
import org.diqube.data.table.AdjustableTable;
import org.diqube.data.table.AdjustableTable.TableShardsOverlappingException;
import org.diqube.data.table.Table;
import org.diqube.data.table.TableFactory;
import org.diqube.data.table.TableShard;
import org.diqube.executionenv.TableRegistry;
import org.diqube.executionenv.TableRegistry.TableLoadImpossibleException;
import org.diqube.loader.CsvLoader;
import org.diqube.loader.DiqubeLoader;
import org.diqube.loader.JsonLoader;
import org.diqube.loader.LoadException;
import org.diqube.loader.Loader;
import org.diqube.loader.LoaderColumnInfo;
import org.diqube.server.metadata.ServerTableMetadataPublisher;
import org.diqube.server.metadata.ServerTableMetadataPublisher.MergeImpossibleException;
import org.diqube.server.queryremote.flatten.ClusterFlattenServiceHandler;
import org.diqube.thrift.base.thrift.TableMetadata;
import org.diqube.thrift.base.util.RUuidUtil;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads the table shard whose data is referred to by a .control file.
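 *
 * <p>
 * A control file is a plain Java properties file whose keys correspond to the {@code KEY_*} constants of this class.
 * The following is only an illustrative sketch: the property keys are the ones evaluated by this class, but the table
 * name, input file, column names and values are made up.
 *
 * <pre>
 * table=exampleTable
 * file=exampleTable.json
 * type=json
 * firstRowId=0
 * defaultColumnType=string
 * columnType.age=long
 * autoFlatten=address[*]
 * </pre>
 *
 * The table is registered at the {@link TableRegistry} under the name given by the "table" property; {@link #load()}
 * returns that name together with the first row IDs of the loaded table shard(s).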
 *
 * @author Bastian Gloeckle
 */
public class ControlFileLoader {
  private static final Logger logger = LoggerFactory.getLogger(ControlFileLoader.class);

  public static final String KEY_FILE = "file";
  public static final String KEY_TYPE = "type";
  public static final String KEY_TABLE = "table";
  public static final String KEY_FIRST_ROWID = "firstRowId";
  public static final String KEY_COLTYPE_PREFIX = "columnType.";
  public static final String KEY_DEFAULT_COLTYPE = "defaultColumnType";
  public static final String KEY_AUTO_FLATTEN = "autoFlatten";

  public static final String TYPE_CSV = "csv";
  public static final String TYPE_JSON = "json";
  public static final String TYPE_DIQUBE = "diqube";

  private File controlFile;
  private TableRegistry tableRegistry;
  private TableFactory tableFactory;
  private CsvLoader csvLoader;
  private JsonLoader jsonLoader;
  private DiqubeLoader diqubeLoader;
  private ClusterFlattenServiceHandler clusterFlattenServiceHandler;
  private ServerTableMetadataPublisher metadataPublisher;

  private Object tableRegistrySync = new Object();

  public ControlFileLoader(TableRegistry tableRegistry, TableFactory tableFactory, CsvLoader csvLoader,
      JsonLoader jsonLoader, DiqubeLoader diqubeLoader, ClusterFlattenServiceHandler clusterFlattenServiceHandler,
      ServerTableMetadataPublisher metadataPublisher, File controlFile) {
    this.tableRegistry = tableRegistry;
    this.tableFactory = tableFactory;
    this.csvLoader = csvLoader;
    this.jsonLoader = jsonLoader;
    this.diqubeLoader = diqubeLoader;
    this.clusterFlattenServiceHandler = clusterFlattenServiceHandler;
    this.metadataPublisher = metadataPublisher;
    this.controlFile = controlFile;
  }

  private ColumnType resolveColumnType(String controlFileString) throws LoadException {
    try {
      return ColumnType.valueOf(controlFileString.toUpperCase());
    } catch (RuntimeException e) {
      throw new LoadException(controlFileString + " is not a valid ColumnType.");
    }
  }

  /**
   * Loads the table shard synchronously.
   *
   * <p>
   * This method will automatically retry loading the .control file if it fails - this is needed if the control file
   * has not been written completely yet. If the validation/loading of the control file still fails after a few
   * attempts, a {@link LoadException} will be thrown.
   *
   * <p>
   * This method takes care of starting a flattening on the table if "autoFlatten" is set in the control file.
   *
   * <p>
   * This method takes care of calculating {@link TableMetadata} for the resulting table and publishing this
   * information in the cluster.
   *
   * <p>
   * Note that .ready files will not be created.
   *
   * @return The name of the table under which it was registered at {@link TableRegistry} and a List containing the
   *         values of {@link TableShard#getLowestRowId()} of the table shard(s) that were loaded.
   */
  public Pair<String, List<Long>> load() throws LoadException {
    Properties controlProperties;
    String fileName;
    String tableName;
    String type;
    String[] autoFlatten;
    long firstRowId;
    LoaderColumnInfo columnInfo;
    File file;

    Object sync = new Object();
    // We retry executing the following loading/validation of the control file itself. It could be that the load
    // method gets called too early, when the control file has not been fully written, therefore we'll retry.
    int maxRetries = 5;
    for (int retryNo = 0;; retryNo++) {
      try {
        controlProperties = new Properties();
        try (InputStream controlFileInputStream = new FileInputStream(controlFile)) {
          controlProperties.load(controlFileInputStream);
        } catch (IOException e) {
          throw new LoadException("Could not load information of control file " + controlFile.getAbsolutePath(), e);
        }
        fileName = controlProperties.getProperty(KEY_FILE);
        tableName = controlProperties.getProperty(KEY_TABLE);
        type = controlProperties.getProperty(KEY_TYPE);
        String firstRowIdString = controlProperties.getProperty(KEY_FIRST_ROWID);

        if (fileName == null || tableName == null || firstRowIdString == null || type == null
            || !(type.equals(TYPE_CSV) || type.equals(TYPE_JSON) || type.equals(TYPE_DIQUBE)))
          throw new LoadException("Invalid control file " + controlFile.getAbsolutePath());

        try {
          firstRowId = Long.parseLong(firstRowIdString);
        } catch (NumberFormatException e) {
          throw new LoadException(
              "Invalid control file " + controlFile.getAbsolutePath() + " (firstRowId is not a valid number)");
        }

        ColumnType defaultColumnType;
        String defaultColumnTypeString = controlProperties.getProperty(KEY_DEFAULT_COLTYPE);
        if (defaultColumnTypeString == null)
          defaultColumnType = ColumnType.STRING;
        else
          defaultColumnType = resolveColumnType(defaultColumnTypeString);

        columnInfo = new LoaderColumnInfo(defaultColumnType);

        for (Object key : controlProperties.keySet()) {
          String keyString = (String) key;
          if (keyString.startsWith(KEY_COLTYPE_PREFIX)) {
            String val = controlProperties.getProperty(keyString);
            keyString = keyString.substring(KEY_COLTYPE_PREFIX.length());
            // TODO #13 LoaderColumnInfo should be able to handle repeated columns nicely.
            columnInfo.registerColumnType(keyString, resolveColumnType(val));
          }
        }

        String autoFlattenUnsplit = controlProperties.getProperty(KEY_AUTO_FLATTEN, "");
        if (!"".equals(autoFlattenUnsplit)) {
          autoFlatten = autoFlattenUnsplit.split(",");
          for (int i = 0; i < autoFlatten.length; i++)
            autoFlatten[i] = autoFlatten[i].trim();
        } else
          autoFlatten = new String[0];

        file = controlFile.toPath().resolveSibling(fileName).toFile();
        if (!file.exists() || !file.isFile())
          throw new LoadException("File " + file.getAbsolutePath() + " does not exist or is not a file.");

        break;
      } catch (LoadException e) {
        if (retryNo == maxRetries - 1) {
          throw e;
        }
        logger.info("Was not able to load control file {}, will retry. Error: {}", controlFile.getAbsolutePath(),
            e.getMessage());
        synchronized (sync) {
          try {
            sync.wait(200);
          } catch (InterruptedException e1) {
            throw new LoadException("Interrupted while waiting to retry loading control file", e1);
          }
        }
      }
    }

    Loader loader;
    switch (type) {
    case TYPE_CSV:
      loader = csvLoader;
      break;
    case TYPE_JSON:
      loader = jsonLoader;
      break;
    case TYPE_DIQUBE:
      loader = diqubeLoader;
      break;
    default:
      throw new LoadException("Unknown input file type.");
    }

    Collection<TableShard> newTableShards = loader.load(firstRowId, file.getAbsolutePath(), tableName, columnInfo);

    synchronized (tableRegistrySync) {
      Table table = tableRegistry.getTable(tableName);
      if (table != null) {
        if (!(table instanceof AdjustableTable))
          throw new LoadException("The target table '" + tableName + "' cannot be adjusted.");

        List<TableShard> allShards = new ArrayList<>(table.getShards());
        allShards.addAll(newTableShards);
        distributeNewMetadata(tableName, allShards);

        try {
          for (TableShard newTableShard : newTableShards)
            ((AdjustableTable) table).addTableShard(newTableShard);
        } catch (TableShardsOverlappingException e) {
          // Remove all those shards that might've been added already.
          for (TableShard newTableShard : newTableShards)
            ((AdjustableTable) table).removeTableShard(newTableShard);
          throw new LoadException("Cannot load TableShard as it overlaps with an already loaded one", e);
        }
      } else {
        Collection<TableShard> newTableShardCollection = newTableShards;
        table = tableFactory.createDefaultTable(tableName, newTableShardCollection);
        distributeNewMetadata(tableName, newTableShardCollection);
        try {
          tableRegistry.addTable(tableName, table);
        } catch (TableLoadImpossibleException e) {
          throw new LoadException("Cannot load table " + table, e);
        }
      }
    }

    // For "auto-flattening" we use the clusterFlattenServiceHandler directly with an empty list of "other flatteners"
    // and a null resultAddress. This node will therefore merge new flatten requests on that table into the one we
    // start now, although those might fail (other requests probably include multiple flatteners, while this node will
    // only flatten on itself; query masters that issued the flattening should be able to cope with that).
    for (String autoFlattenField : autoFlatten) {
      UUID flattenId = UUID.randomUUID();
      try {
        // These calls start the flattening asynchronously, therefore we just trigger the computation here. If there
        // is a flattened version available in the flattenedDiskCache already, that will be used.
        clusterFlattenServiceHandler.flattenAllLocalShards(RUuidUtil.toRUuid(flattenId), tableName, autoFlattenField,
            new ArrayList<>(), null);
      } catch (TException e) {
        logger.error("Failed to flatten new table '{}' by '{}' locally with flatten ID {}.", tableName,
            autoFlattenField, flattenId, e);
      }
    }

    List<Long> firstRowIds =
        newTableShards.stream().map(shard -> shard.getLowestRowId()).sorted().collect(Collectors.toList());

    return new Pair<>(tableName, firstRowIds);
  }

  /**
   * Creates new {@link TableMetadata} for the given table with the given shards (all shards of the table, including
   * the new ones) and distributes it across the cluster. Throws a {@link LoadException} if the metadata is
   * incompatible in any way, in which case no new shards should be loaded at all.
   */
  private void distributeNewMetadata(String tableName, Collection<TableShard> allShards) throws LoadException {
    try {
      metadataPublisher.publishMetadataOfTableShards(tableName, allShards);
    } catch (MergeImpossibleException e) {
      throw new LoadException("Cannot load table '" + tableName + "' since its metadata is incompatible", e);
    }
  }
}