package edu.brown.workload;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.collections15.set.ListOrderedSet;
import org.apache.log4j.Logger;
import org.voltdb.VoltType;
import org.voltdb.catalog.CatalogType;
import org.voltdb.catalog.Column;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.ProcParameter;
import org.voltdb.catalog.Procedure;
import org.voltdb.catalog.Statement;
import org.voltdb.catalog.StmtParameter;
import org.voltdb.utils.Pair;

import edu.brown.catalog.CatalogUtil;
import edu.brown.hashing.AbstractHasher;
import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.mappings.ParameterMapping;
import edu.brown.mappings.ParameterMappingsSet;
import edu.brown.plannodes.PlanNodeUtil;
import edu.brown.utils.ArgumentsParser;
import edu.brown.utils.CollectionUtil;
import edu.brown.utils.Consumer;
import edu.brown.utils.PartitionEstimator;
import edu.brown.utils.Producer;
import edu.brown.utils.ThreadUtil;

/**
 * @author pavlo
 */
public class WorkloadSummarizer {
    private static final Logger LOG = Logger.getLogger(WorkloadSummarizer.class);
    private static final LoggerBoolean debug = new LoggerBoolean();
    private static final LoggerBoolean trace = new LoggerBoolean();
    static {
        LoggerUtil.attachObserver(LOG, debug, trace);
    }

    private final Database catalog_db;
    private final PartitionEstimator p_estimator;
    private final ParameterMappingsSet mappings;
    private final Collection<Procedure> target_procedures;
    private final Collection<Column> candidate_columns;
    private final Map<Statement, List<StmtParameter>> target_stmt_params = new HashMap<Statement, List<StmtParameter>>();
    private final Map<Procedure, List<ProcParameter>> target_proc_params = new HashMap<Procedure, List<ProcParameter>>();
    private Integer num_intervals;

    private class DuplicateTraceElements<CT extends CatalogType, T extends AbstractTraceElement<CT>> extends HashMap<CT, Map<String, Set<T>>> {
        private static final long serialVersionUID = 1L;
        private boolean has_duplicates = false;

        synchronized void add(CT catalog_item, String hash_str, T trace) {
            Map<String, Set<T>> m = this.get(catalog_item);
            if (m == null) {
                m = new ConcurrentHashMap<String, Set<T>>();
                this.put(catalog_item, m);
            }
            Set<T> s = m.get(hash_str);
            if (s == null) {
                s = new ListOrderedSet<T>();
                m.put(hash_str, s);
            }
            s.add(trace);
            this.has_duplicates = this.has_duplicates || s.size() > 1;
        }

        public Collection<T> getWeightedTraceElements() {
            List<T> new_elements = new ArrayList<T>();
            for (CT catalog_item : this.keySet()) {
                for (Set<T> s : this.get(catalog_item).values()) {
                    int weight = 0;
                    for (T t : s) {
                        weight += t.getWeight();
                    }
                    if (weight == 0) continue;
                    T t = CollectionUtil.first(s);
                    t.setWeight(weight);
                    new_elements.add(t);
                } // FOR
            } // FOR
            return (new_elements);
        }

        @Override
        public void clear() {
            for (Map<String, Set<T>> m : this.values()) {
                for (Set<T> s : m.values()) {
                    s.clear();
                } // FOR
            } // FOR
            this.has_duplicates = false;
        }

        public boolean hasDuplicates() {
            return (this.has_duplicates);
        }
    }
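    // Example of the bucketing above (hypothetical traces): if three TransactionTraces for
    // "NewOrder" map to the same signature string, getWeightedTraceElements() keeps only the
    // first one and sets its weight to the sum of all three (3, assuming the default weight
    // of 1 each), so downstream consumers still account for every invocation.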
    /**
     * Constructor
     * @param catalog_db
     * @param p_estimator
     * @param mappings
     * @param procedures
     * @param candidate_columns
     */
    public WorkloadSummarizer(Database catalog_db,
                              PartitionEstimator p_estimator,
                              ParameterMappingsSet mappings,
                              Collection<Procedure> procedures,
                              Collection<Column> candidate_columns) {
        assert(procedures != null);
        assert(candidate_columns != null);
        this.catalog_db = catalog_db;
        this.p_estimator = p_estimator;
        this.mappings = mappings;
        this.target_procedures = procedures;
        this.candidate_columns = candidate_columns;
        this.buildTargetParameters();
    }

    protected WorkloadSummarizer(Database catalog_db, PartitionEstimator p_estimator, ParameterMappingsSet mappings) {
        this(catalog_db,
             p_estimator,
             mappings,
             CollectionUtil.addAll(new HashSet<Procedure>(), catalog_db.getProcedures()),
             CatalogUtil.getAllColumns(catalog_db));
    }

    public void setIntervals(Integer intervals) {
        if (debug.val) LOG.debug("Compression Intervals: " + intervals);
        this.num_intervals = intervals;
    }

    /**
     * Main entry point
     * @param workload
     * @return
     */
    public Workload process(Workload workload) {
        return (this.removeDuplicateTransactions(this.removeDuplicateQueries(workload)));
    }

    protected List<StmtParameter> getTargetParameters(Statement catalog_stmt) {
        return (this.target_stmt_params.get(catalog_stmt));
    }

    protected List<ProcParameter> getTargetParameters(Procedure catalog_proc) {
        return (this.target_proc_params.get(catalog_proc));
    }

    /**
     * Construct the internal lists that identify which StmtParameters we actually care about
     * when pruning duplicate queries based on the parameter hashes
     */
    private void buildTargetParameters() {
        for (Procedure catalog_proc : catalog_db.getProcedures()) {
            if (catalog_proc.getSystemproc()) continue;

            // For each StmtParameter, look to see whether the column that it references is in our
            // list of candidate columns. If it is, then we will want to include the value's hash
            // when determining whether a QueryTrace is unique
            for (Statement catalog_stmt : catalog_proc.getStatements()) {
                List<StmtParameter> stmt_params = new ArrayList<StmtParameter>();
                for (StmtParameter catalog_param : catalog_stmt.getParameters()) {
                    Column catalog_col = PlanNodeUtil.getColumnForStmtParameter(catalog_param);
                    assert(catalog_col != null);
                    if (this.candidate_columns.contains(catalog_col)) {
                        stmt_params.add(catalog_param);
                    }
                } // FOR (parameter)
                this.target_stmt_params.put(catalog_stmt, stmt_params);
                if (debug.val)
                    LOG.debug(String.format("%s - Relevant Parameters: %s", catalog_stmt.fullName(), stmt_params));
            } // FOR (statement)

            // For each ProcParameter, get the mappings to all of the StmtParameters.
            // We can then check whether those StmtParameters are used against a column that we care about.
            // If one is, then we can put the ProcParameter in our list of relevant ProcParameters for this Procedure
            List<ProcParameter> proc_params = new ArrayList<ProcParameter>();
            for (ProcParameter catalog_param : catalog_proc.getParameters()) {
                boolean matched = false;
                for (ParameterMapping c : mappings.get(catalog_param)) {
                    assert(c.getStatementColumn() != null);
                    if (this.candidate_columns.contains(c.getStatementColumn())) {
                        matched = true;
                        break;
                    }
                } // FOR
                if (matched) proc_params.add(catalog_param);
            } // FOR (parameter)
            this.target_proc_params.put(catalog_proc, proc_params);
            if (debug.val)
                LOG.debug(String.format("%s - Relevant Parameters: %s", catalog_proc.fullName(), proc_params));
        } // FOR (procedure)
    }
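    /**
     * Compute a canonical signature for a TransactionTrace. Two transactions get the same
     * signature only if they invoke the same Procedure (within the same time interval, when
     * intervals are enabled), their relevant ProcParameters hash to the same values, and they
     * produce the same sorted set of query signatures. For example (hypothetical hash values),
     * a two-query invocation might produce "NewOrder->12|7\nGetStock->12\nGetWarehouse->7".
     */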
    protected String getTransactionTraceSignature(Procedure catalog_proc, TransactionTrace txn_trace, Integer interval) {
        SortedSet<String> queries = new TreeSet<String>();
        for (QueryTrace query_trace : txn_trace.getQueries()) {
            Statement catalog_stmt = query_trace.getCatalogItem(catalog_db);
            queries.add(this.getQueryTraceSignature(catalog_stmt, query_trace));
        } // FOR

        String signature = catalog_proc.getName() + "->";
        if (interval != null) signature += "INT[" + interval + "]";
        signature += this.getParamSignature(txn_trace, this.target_proc_params.get(catalog_proc));
        for (String q : queries) {
            signature += "\n" + q;
        } // FOR
        if (trace.val) LOG.trace(txn_trace + " ==> " + signature);
        return (signature);
    }

    protected String getQueryTraceSignature(Statement catalog_stmt, QueryTrace query_trace) {
        // int weight = (query_trace.hasWeight() ? query_trace.getWeight() : 1);
        String param_signature = this.getParamSignature(query_trace, this.target_stmt_params.get(catalog_stmt));
        // return String.format("%s[%.2f]%s", catalog_stmt.getName(), weight, param_signature);
        return String.format("%s->%s", catalog_stmt.getName(), param_signature);
    }
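    /**
     * Build the parameter portion of a trace signature. Each relevant parameter value is hashed
     * with the PartitionEstimator's AbstractHasher; scalar hashes are joined with '|' and an
     * "ABRT-" prefix marks aborted traces. Array ProcParameters are reduced to their sorted,
     * de-duplicated element hashes (joined with ','), so arrays that contain the same values in
     * a different order produce the same signature. Example (hypothetical hash values): the
     * parameters (10, "abc", {3, 1, 3}) could yield "99|4,17", with the STRING skipped entirely.
     */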
    protected String getParamSignature(AbstractTraceElement<? extends CatalogType> element, List<? extends CatalogType> target_params) {
        Object params[] = element.getParams();
        String sig = (element.aborted ? "ABRT-" : "");
        if (target_params != null) {
            AbstractHasher hasher = p_estimator.getHasher();
            boolean first = true;
            for (CatalogType catalog_param : target_params) {
                // Skip types that are always unique (and not useful for partitioning)
                VoltType vtype = VoltType.get((catalog_param instanceof StmtParameter ?
                                                  ((StmtParameter)catalog_param).getJavatype() :
                                                  ((ProcParameter)catalog_param).getType()));
                if (vtype == VoltType.STRING || vtype == VoltType.TIMESTAMP) continue;

                // Add a prefix to separate parameters
                if (first == false) sig += "|";

                // StmtParameter
                if (catalog_param instanceof StmtParameter) {
                    int idx = ((StmtParameter)catalog_param).getIndex();
                    sig += hasher.hash(params[idx]);

                // ProcParameter
                } else if (catalog_param instanceof ProcParameter) {
                    ProcParameter catalog_procparam = (ProcParameter)catalog_param;
                    int idx = catalog_procparam.getIndex();

                    // ARRAY
                    if (catalog_procparam.getIsarray()) {
                        Set<Integer> hashes = new TreeSet<Integer>();
                        for (Object o : (Object[])params[idx]) {
                            hashes.add(hasher.hash(o));
                        } // FOR
                        boolean first_hash = true;
                        for (Integer hash : hashes) {
                            if (first_hash == false) sig += ",";
                            sig += hash;
                            first_hash = false;
                        } // FOR

                    // SCALAR
                    } else {
                        sig += hasher.hash(params[idx]);
                    }
                } else {
                    assert(false) : "Unexpected: " + catalog_param;
                }
                first = false;
            } // FOR
        }
        return (sig);
    }

    /**
     * Remove duplicate transaction invocations and populate a new Workload
     * @param workload
     * @return
     */
    protected Workload removeDuplicateTransactions(final Workload workload) {
        final DuplicateTraceElements<Procedure, TransactionTrace> duplicates = new DuplicateTraceElements<Procedure, TransactionTrace>();

        // PRODUCER
        Producer<TransactionTrace, TransactionTrace> producer = new Producer<TransactionTrace, TransactionTrace>(workload) {
            @Override
            public Pair<Consumer<TransactionTrace>, TransactionTrace> transform(TransactionTrace t) {
                return this.defaultTransform(t);
            }
        };

        // CONSUMERS
        for (int i = 0, cnt = ThreadUtil.getMaxGlobalThreads(); i < cnt; i++) {
            Consumer<TransactionTrace> c = new Consumer<TransactionTrace>() {
                @Override
                public void process(TransactionTrace txn_trace) {
                    Procedure catalog_proc = txn_trace.getCatalogItem(catalog_db);
                    if (target_procedures.contains(catalog_proc) == false) return;
                    Integer interval = (num_intervals != null ?
                                            workload.getTimeInterval(txn_trace, num_intervals) : null);
                    String signature = getTransactionTraceSignature(catalog_proc, txn_trace, interval);
                    assert(signature != null);
                    assert(signature.isEmpty() == false);
                    duplicates.add(catalog_proc, signature, txn_trace);
                }
            };
            producer.addConsumer(c);
        } // FOR
        ThreadUtil.runGlobalPool(producer.getRunnablesList()); // BLOCKING

        if (duplicates.hasDuplicates() == false) return (workload);

        Workload new_workload = new Workload(this.catalog_db.getCatalog());
        for (TransactionTrace txn_trace : duplicates.getWeightedTraceElements()) {
            new_workload.addTransaction(txn_trace.getCatalogItem(catalog_db), txn_trace);
        } // FOR
        LOG.info(String.format("Reduced Workload from (%d txns / %d queries) to (%d txns / %d queries)",
                               workload.getTransactionCount(), workload.getQueryCount(),
                               new_workload.getTransactionCount(), new_workload.getQueryCount()));
        return (new_workload);
    }
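    // Note that process() applies the query-level pass below before the transaction-level
    // pass above. For example (hypothetical trace), a transaction that issues GetStock twice
    // with identical candidate-column hashes is rewritten to carry a single weight-2 GetStock
    // QueryTrace before any transaction signatures are compared.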
    /**
     * Remove duplicate query invocations within a single TransactionTrace. Duplicate QueryTraces will
     * be replaced with a single QueryTrace that is weighted. This will remove batch boundaries.
     * Returns a new workload that contains the new TransactionTrace instances with the
     * pruned list of QueryTraces.
     * @param workload
     * @return
     */
    protected Workload removeDuplicateQueries(Workload workload) {
        final Workload new_workload = new Workload(this.catalog_db.getCatalog());
        final AtomicInteger trimmed_ctr = new AtomicInteger(0);

        // PRODUCER
        Producer<TransactionTrace, TransactionTrace> producer = Producer.defaultProducer(workload);

        // CONSUMERS
        for (int i = 0, cnt = ThreadUtil.getMaxGlobalThreads(); i < cnt; i++) {
            final DuplicateTraceElements<Statement, QueryTrace> duplicates = new DuplicateTraceElements<Statement, QueryTrace>();
            Consumer<TransactionTrace> c = new Consumer<TransactionTrace>() {
                @Override
                public void process(TransactionTrace txn_trace) {
                    Procedure catalog_proc = txn_trace.getCatalogItem(catalog_db);
                    if (target_procedures.contains(catalog_proc) == false) return;

                    duplicates.clear();
                    for (QueryTrace query_trace : txn_trace.getQueries()) {
                        Statement catalog_stmt = query_trace.getCatalogItem(catalog_db);
                        String param_hashes = getQueryTraceSignature(catalog_stmt, query_trace);
                        duplicates.add(catalog_stmt, param_hashes, query_trace);
                    } // FOR (query)

                    // If this TransactionTrace has duplicate queries, then we will want to construct
                    // a new TransactionTrace that has the weighted queries. Note that this will cause
                    // us to have to remove any batches
                    if (duplicates.hasDuplicates()) {
                        TransactionTrace new_txn_trace = (TransactionTrace)txn_trace.clone();
                        new_txn_trace.setQueries(duplicates.getWeightedTraceElements());
                        new_workload.addTransaction(new_txn_trace.getCatalogItem(catalog_db), new_txn_trace);
                        trimmed_ctr.incrementAndGet();
                    } else {
                        new_workload.addTransaction(txn_trace.getCatalogItem(catalog_db), txn_trace);
                    }
                }
            };
            producer.addConsumer(c);
        } // FOR
        ThreadUtil.runGlobalPool(producer.getRunnablesList()); // BLOCKING

        if (debug.val)
            LOG.debug(String.format("Reduced Workload %d -> %d txns [%.2f] / %d -> %d queries [%.2f]",
                                    workload.getTransactionCount(), new_workload.getTransactionCount(),
                                    (workload.getTransactionCount() - new_workload.getTransactionCount()) / (double)new_workload.getTransactionCount(),
                                    workload.getQueryCount(), new_workload.getQueryCount(),
                                    (workload.getQueryCount() - new_workload.getQueryCount()) / (double)new_workload.getQueryCount()
            ));
        return (new_workload);
    }

    public static void main(String[] vargs) throws Exception {
        ArgumentsParser args = ArgumentsParser.load(vargs);
        args.require(
            ArgumentsParser.PARAM_CATALOG,
            ArgumentsParser.PARAM_WORKLOAD,
            ArgumentsParser.PARAM_WORKLOAD_OUTPUT,
            ArgumentsParser.PARAM_MAPPINGS
        );

        Integer intervals = args.getIntParam(ArgumentsParser.PARAM_DESIGNER_INTERVALS);
        LOG.info(String.format("Compressing workload based on %d partitions%s",
                               args.catalogContext.numberOfPartitions,
                               (intervals != null ? " over " + intervals + " intervals" : "")));
        LOG.info("BEFORE:\n" + args.workload.getProcedureHistogram());

        PartitionEstimator p_estimator = new PartitionEstimator(args.catalogContext);
        WorkloadSummarizer ws = new WorkloadSummarizer(args.catalog_db, p_estimator, args.param_mappings);
        if (intervals != null) ws.setIntervals(intervals);

        Workload new_workload = ws.process(args.workload);
        assert(new_workload != null);
        LOG.info("AFTER:\n" + new_workload.getProcedureHistogram());

        File output_path = args.getFileParam(ArgumentsParser.PARAM_WORKLOAD_OUTPUT);
        LOG.info("Saving compressed workload '" + output_path + "'");
        new_workload.save(output_path, args.catalog_db);
    }
}