/** * */ package edu.washington.escience.myria.perfenforce; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.PrintWriter; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Sets; import edu.washington.escience.myria.CsvTupleReader; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.MyriaConstants; import edu.washington.escience.myria.RelationKey; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.api.encoding.PerfEnforceStatisticsEncoding; import edu.washington.escience.myria.api.encoding.PerfEnforceTableEncoding; import edu.washington.escience.myria.operator.DbInsert; import edu.washington.escience.myria.operator.DbQueryScan; import edu.washington.escience.myria.operator.EOSSource; import edu.washington.escience.myria.operator.EmptySink; import edu.washington.escience.myria.operator.RootOperator; import edu.washington.escience.myria.operator.TupleSource; import edu.washington.escience.myria.operator.network.Consumer; import edu.washington.escience.myria.operator.network.GenericShuffleProducer; import edu.washington.escience.myria.operator.network.distribute.BroadcastDistributeFunction; import edu.washington.escience.myria.operator.network.distribute.RoundRobinDistributeFunction; import edu.washington.escience.myria.parallel.ExchangePairID; import edu.washington.escience.myria.parallel.Server; /** * Methods to help prepare the data for PSLA generation */ public class PerfEnforceDataPreparation { private final Server server; private HashMap<Integer, RelationKey> factTableRelationMapper; private PerfEnforceTableEncoding factTableDescription; /** Logger. */ protected static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(PerfEnforceDataPreparation.class); /** * Constructs the PerfEnforceDataPreparation class * @param server the current instance of Server */ public PerfEnforceDataPreparation(final Server server) { this.server = server; } /** * Ingesting the fact table in a parallel sequence * * @param factTableDesc contains information about the fact table to ingest * @return a hashmap with the relationkeys for all versions of the fact table * @throws PerfEnforceException if there is an error ingesting the fact table */ public HashMap<Integer, RelationKey> ingestFact(final PerfEnforceTableEncoding factTableDesc) throws PerfEnforceException { factTableDescription = factTableDesc; factTableRelationMapper = new HashMap<Integer, RelationKey>(); ArrayList<RelationKey> relationKeysToUnion = new ArrayList<RelationKey>(); Collections.sort(PerfEnforceDriver.configurations, Collections.reverseOrder()); // Create a sequence for the largest configuration size int maxConfig = PerfEnforceDriver.configurations.get(0); Set<Integer> maxWorkerRange = PerfEnforceUtils.getWorkerRangeSet(maxConfig); /* * First, ingest the fact table under the relationKey with the union ("_U"). Then, create a materialized view with * the original relationKey name and add it to the catalog. This is what the user will be using on the MyriaL front * end. */ try { RelationKey relationKeyWithUnion = new RelationKey( factTableDesc.relationKey.getUserName(), factTableDesc.relationKey.getProgramName(), factTableDesc.relationKey.getRelationName() + maxConfig + "_U"); server.parallelIngestDataset( relationKeyWithUnion, factTableDesc.schema, factTableDesc.delimiter, null, null, null, factTableDesc.source, maxWorkerRange, null); relationKeysToUnion.add(relationKeyWithUnion); RelationKey relationKeyOriginal = new RelationKey( factTableDesc.relationKey.getUserName(), factTableDesc.relationKey.getProgramName(), factTableDesc.relationKey.getRelationName() + maxConfig); server.createMaterializedView( relationKeyOriginal.toString(MyriaConstants.STORAGE_SYSTEM_POSTGRESQL), PerfEnforceUtils.createUnionQuery(relationKeysToUnion), maxWorkerRange); server.addDatasetToCatalog( relationKeyOriginal, factTableDesc.schema, new ArrayList<Integer>(maxWorkerRange)); factTableRelationMapper.put(maxConfig, relationKeyOriginal); /* * Iterate and run this for the rest of the workers */ Set<Integer> previousWorkerRange = maxWorkerRange; RelationKey previousRelationKey = relationKeyWithUnion; for (int c = 1; c < PerfEnforceDriver.configurations.size(); c++) { // Get the next sequence of workers int currentSize = PerfEnforceDriver.configurations.get(c); Set<Integer> currentWorkerRange = PerfEnforceUtils.getWorkerRangeSet(currentSize); Set<Integer> diff = Sets.difference(previousWorkerRange, currentWorkerRange); RelationKey currentRelationKeyToUnion = new RelationKey( factTableDesc.relationKey.getUserName(), factTableDesc.relationKey.getProgramName(), factTableDesc.relationKey.getRelationName() + currentSize + "_U"); DbQueryScan scan = new DbQueryScan(previousRelationKey, factTableDesc.schema); int[] producingWorkers = PerfEnforceUtils.getRangeInclusiveArray(Collections.min(diff), Collections.max(diff)); int[] receivingWorkers = PerfEnforceUtils.getRangeInclusiveArray(1, Collections.max(currentWorkerRange)); final ExchangePairID shuffleId = ExchangePairID.newID(); RoundRobinDistributeFunction df = new RoundRobinDistributeFunction(); df.setDestinations(receivingWorkers.length, 1); GenericShuffleProducer producer = new GenericShuffleProducer( scan, new ExchangePairID[] {shuffleId}, receivingWorkers, df); Consumer consumer = new Consumer(factTableDesc.schema, shuffleId, producingWorkers); DbInsert insert = new DbInsert(consumer, currentRelationKeyToUnion, true); Map<Integer, RootOperator[]> workerPlans = new HashMap<>(currentSize); for (Integer workerID : producingWorkers) { workerPlans.put(workerID, new RootOperator[] {producer}); } for (Integer workerID : receivingWorkers) { workerPlans.put(workerID, new RootOperator[] {insert}); } server.submitQueryPlan(new EmptySink(new EOSSource()), workerPlans).get(); relationKeysToUnion.add(currentRelationKeyToUnion); RelationKey currentConfigRelationKey = new RelationKey( factTableDesc.relationKey.getUserName(), factTableDesc.relationKey.getProgramName(), factTableDesc.relationKey.getRelationName() + currentSize); server.createMaterializedView( currentConfigRelationKey.toString(MyriaConstants.STORAGE_SYSTEM_POSTGRESQL), PerfEnforceUtils.createUnionQuery(relationKeysToUnion), currentWorkerRange); server.addDatasetToCatalog( currentConfigRelationKey, factTableDesc.schema, new ArrayList<Integer>(currentWorkerRange)); factTableRelationMapper.put(currentSize, currentConfigRelationKey); previousWorkerRange = currentWorkerRange; previousRelationKey = currentConfigRelationKey; } Collections.sort(PerfEnforceDriver.configurations); return factTableRelationMapper; } catch (Exception e) { throw new PerfEnforceException("Error while ingesting fact table"); } } /** * Ingesting dimension tables for broadcasting * * @param dimTableDesc holds information about the dimension table to ingest * @throws PerfEnforceException if there is an error ingesting dimension tables */ public void ingestDimension(final PerfEnforceTableEncoding dimTableDesc) throws PerfEnforceException { Set<Integer> totalWorkers = PerfEnforceUtils.getWorkerRangeSet(Collections.max(PerfEnforceDriver.configurations)); try { TupleSource source = new TupleSource( new CsvTupleReader(dimTableDesc.schema, dimTableDesc.delimiter), dimTableDesc.source); server.ingestDataset( dimTableDesc.relationKey, new ArrayList<Integer>(totalWorkers), null, source, new BroadcastDistributeFunction()); } catch (Exception e) { throw new PerfEnforceException("Error ingesting dimension tables"); } } /** * This method analyzes the given table * * @param tableToAnalyze the table to analyze */ public void analyzeTable(final PerfEnforceTableEncoding tableToAnalyze) throws DbException, InterruptedException { /* * If this table is Fact, we need to make sure we run "analyze" on all versions of the table */ if (tableToAnalyze.type.equalsIgnoreCase("fact")) { for (Entry<Integer, RelationKey> entry : factTableRelationMapper.entrySet()) { PerfEnforceTableEncoding temp = new PerfEnforceTableEncoding( tableToAnalyze.relationKey, tableToAnalyze.type, tableToAnalyze.source, tableToAnalyze.schema, tableToAnalyze.delimiter, tableToAnalyze.keys, tableToAnalyze.corresponding_fact_key); temp.relationKey = new RelationKey( entry.getValue().getUserName(), entry.getValue().getProgramName(), entry.getValue().getRelationName()); postgresStatsAnalyzeTable(temp, PerfEnforceUtils.getWorkerRangeSet(entry.getKey())); } } else { postgresStatsAnalyzeTable( tableToAnalyze, PerfEnforceUtils.getWorkerRangeSet(Collections.max(PerfEnforceDriver.configurations))); } } /** * Helper method that runs the ANALYZE command for a set of workers * * @param tableToAnalyze the table to analyze * @param workers the set of workers to run the ANALYZE command */ public void postgresStatsAnalyzeTable( final PerfEnforceTableEncoding tableToAnalyze, Set<Integer> workers) throws DbException, InterruptedException { for (int i = 0; i < tableToAnalyze.schema.getColumnNames().size(); i++) { server.executeSQLStatement( String.format( "ALTER TABLE %s ALTER COLUMN %s SET STATISTICS 500;", tableToAnalyze.relationKey.toString(MyriaConstants.STORAGE_SYSTEM_POSTGRESQL), tableToAnalyze.schema.getColumnName(i)), workers); } server.executeSQLStatement( String.format( "ANALYZE %s;", tableToAnalyze.relationKey.toString(MyriaConstants.STORAGE_SYSTEM_POSTGRESQL)), workers); } /** * Collects statistical information about each table * @throws PerfEnforceException if there is an error collecting data statistics */ public void collectSelectivities() throws PerfEnforceException { try { /* record the stats for each configuration */ for (Integer currentConfig : PerfEnforceDriver.configurations) { Path statsWorkerPath = PerfEnforceDriver.configurationPath .resolve("PSLAGeneration") .resolve(currentConfig + "_Workers") .resolve("stats.json"); List<PerfEnforceStatisticsEncoding> statsEncodingList = new ArrayList<PerfEnforceStatisticsEncoding>(); RelationKey factRelationKey = factTableRelationMapper.get(currentConfig); long factTableTupleCount = server.getDatasetStatus(factRelationKey).getNumTuples(); statsEncodingList.add( runTableRanking( factRelationKey, factTableTupleCount, currentConfig, factTableDescription.type, factTableDescription.keys, factTableDescription.schema)); for (PerfEnforceTableEncoding t : PerfEnforceDriver.tableList) { if (t.type.equalsIgnoreCase("dimension")) { RelationKey dimensionTableKey = t.relationKey; long dimensionTableTupleCount = server.getDatasetStatus(dimensionTableKey).getNumTuples(); statsEncodingList.add( runTableRanking( dimensionTableKey, dimensionTableTupleCount, Collections.max(PerfEnforceDriver.configurations), t.type, t.keys, t.schema)); } } try (PrintWriter statsObjectWriter = new PrintWriter(new FileOutputStream(new File(statsWorkerPath.toString())))) { ObjectMapper mapper = new ObjectMapper(); mapper.writeValue(statsObjectWriter, statsEncodingList); } } } catch (Exception e) { throw new PerfEnforceException("Error collecting table statistics"); } } /** * Given the primary key, this method determines which key values will return either .001%, .01% or 10% of the data. * * @param relationKey the relationkey of the table * @param tableSize the size of the table * @param config the cluster configuration * @param type the type of the table -- can be either "fact" or "dimension" * @param keys the primary keys of the relation * @param schema the schema of the relation * @return returns the statistics metadata for a relation * @throws PerfEnforceException if there is an error computing the statistics */ public PerfEnforceStatisticsEncoding runTableRanking( final RelationKey relationKey, final long tableSize, final int config, final String type, final Set<Integer> keys, final Schema schema) throws PerfEnforceException { List<String> selectivityKeys = new ArrayList<String>(); List<Double> selectivityList = Arrays.asList(new Double[] {.001, .01, .1}); String attributeKeyString = PerfEnforceUtils.getAttributeKeyString(keys, schema); Schema attributeKeySchema = PerfEnforceUtils.getAttributeKeySchema(keys, schema); String tableName = relationKey.toString(MyriaConstants.STORAGE_SYSTEM_POSTGRESQL); try { for (int i = 0; i < selectivityList.size(); i++) { String rankingQuery = String.format( "select %s from (select %s, CAST(rank() over (order by %s asc) AS float)/%s as rank from %s) as r where r.rank >= %s LIMIT 1;", attributeKeyString, attributeKeyString, attributeKeyString, tableSize / config, tableName, selectivityList.get(i)); String[] sqlResult = server.executeSQLStatement( rankingQuery, attributeKeySchema, new HashSet<Integer>(Arrays.asList(1))); selectivityKeys.add(sqlResult[0]); } return new PerfEnforceStatisticsEncoding(tableName, tableSize, selectivityKeys); } catch (Exception e) { throw new PerfEnforceException("error running table ranks"); } } /** * This method collects features from each query generated by PSLAManager * @throws PerfEnforceException if there is an error collecting features from the generated queries */ public void collectFeaturesFromGeneratedQueries() throws PerfEnforceException { for (Integer config : PerfEnforceDriver.configurations) { Path workerPath = PerfEnforceDriver.configurationPath .resolve("PSLAGeneration") .resolve(config + "_Workers"); String currentLine = ""; try { try (PrintWriter featureWriter = new PrintWriter(workerPath.resolve("TESTING.arff").toString(), "UTF-8")) { featureWriter.write("@relation testing \n"); featureWriter.write("@attribute numberTables numeric \n"); featureWriter.write("@attribute postgesEstCostMin numeric \n"); featureWriter.write("@attribute postgesEstCostMax numeric \n"); featureWriter.write("@attribute postgesEstNumRows numeric \n"); featureWriter.write("@attribute postgesEstWidth numeric \n"); featureWriter.write("@attribute numberOfWorkers numeric \n"); featureWriter.write("@attribute realTime numeric \n"); featureWriter.write("\n"); featureWriter.write("@data \n"); try (BufferedReader br = new BufferedReader( new FileReader(workerPath.resolve("SQLQueries-Generated.txt").toString()))) { while ((currentLine = br.readLine()) != null) { currentLine = currentLine.replace( factTableDescription.relationKey.getRelationName(), factTableRelationMapper.get(config).getRelationName()); String features = PerfEnforceUtils.getMaxFeature(server, currentLine, config); featureWriter.write(features + "\n"); } } } } catch (Exception e) { throw new PerfEnforceException("Error creating table features"); } } } }