/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.compress.cocode;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator;
import org.apache.sysml.runtime.compress.estim.CompressedSizeInfo;

public class PlanningCoCoder 
{
	//internal configurations
	private final static PartitionerType COLUMN_PARTITIONER = PartitionerType.BIN_PACKING;
	
	private static final Log LOG = LogFactory.getLog(PlanningCoCoder.class.getName());
	
	public enum PartitionerType {
		BIN_PACKING,
		STATIC,
	}
	
	/**
	 * Partitions the given columns into bins and determines co-coding groups
	 * per bin via brute-force greedy merging, either sequentially or in parallel.
	 * 
	 * @param sizeEstimator compressed size estimator
	 * @param cols list of column indexes
	 * @param colInfos compressed size information per column
	 * @param numRows number of rows in the input matrix
	 * @param k degree of parallelism
	 * @return list of column index arrays, one per co-coding group
	 * @throws DMLRuntimeException if the parallel group computation fails
	 */
	public static List<int[]> findCocodesByPartitioning(CompressedSizeEstimator sizeEstimator, 
			List<Integer> cols, CompressedSizeInfo[] colInfos, int numRows, int k) 
		throws DMLRuntimeException 
	{
		// collect groupable column info for all columns; the weight of a column
		// is the ratio of its cardinality to the number of rows
		int numCols = cols.size();
		List<Integer> groupCols = new ArrayList<Integer>();
		HashMap<Integer, GroupableColInfo> groupColsInfo = new HashMap<Integer, GroupableColInfo>();
		for (int i = 0; i < numCols; i++) {
			int colIx = cols.get(i);
			double cardinality = colInfos[colIx].getEstCard();
			double weight = cardinality / numRows;
			groupCols.add(colIx);
			groupColsInfo.put(colIx, new GroupableColInfo(weight, colInfos[colIx].getMinSize()));
		}
		
		// use column group partitioner to create partitions of columns
		List<List<Integer>> bins = createColumnGroupPartitioner(COLUMN_PARTITIONER)
				.partitionColumns(groupCols, groupColsInfo);
		
		// brute force grouping within each partition
		return (k > 1) ?
				getCocodingGroupsBruteForce(bins, groupColsInfo, sizeEstimator, numRows, k) :
				getCocodingGroupsBruteForce(bins, groupColsInfo, sizeEstimator, numRows);
	}
	
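	/**
	 * Sequential version: for each bin, wraps every column into a singleton
	 * planning group and greedily merges groups via brute-force co-coding.
	 * 
	 * @param bins partitions of column indexes
	 * @param groupColsInfo cardinality ratio and size per column
	 * @param estim compressed size estimator
	 * @param rlen number of rows
	 * @return list of column index arrays, one per co-coding group
	 */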
	private static List<int[]> getCocodingGroupsBruteForce(List<List<Integer>> bins, 
			HashMap<Integer, GroupableColInfo> groupColsInfo, CompressedSizeEstimator estim, int rlen) 
	{
		List<int[]> retGroups = new ArrayList<int[]>();
		for (List<Integer> bin : bins) {
			// building an array of singleton CoCodingGroup
			ArrayList<PlanningCoCodingGroup> sgroups = new ArrayList<PlanningCoCodingGroup>();
			for (Integer col : bin)
				sgroups.add(new PlanningCoCodingGroup(col, groupColsInfo.get(col)));
			
			// brute force co-coding
			PlanningCoCodingGroup[] outputGroups = findCocodesBruteForce(
					estim, rlen, sgroups.toArray(new PlanningCoCodingGroup[0]));
			for (PlanningCoCodingGroup grp : outputGroups)
				retGroups.add(grp.getColIndices());
		}
		
		return retGroups;
	}
	
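	/**
	 * Parallel version: processes the bins as independent tasks on a thread
	 * pool of size k and collects the resulting co-coding groups.
	 * 
	 * @param bins partitions of column indexes
	 * @param groupColsInfo cardinality ratio and size per column
	 * @param estim compressed size estimator
	 * @param rlen number of rows
	 * @param k degree of parallelism (number of worker threads)
	 * @return list of column index arrays, one per co-coding group
	 * @throws DMLRuntimeException if task creation or execution fails
	 */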
	private static List<int[]> getCocodingGroupsBruteForce(List<List<Integer>> bins, 
			HashMap<Integer, GroupableColInfo> groupColsInfo, CompressedSizeEstimator estim, int rlen, int k) 
		throws DMLRuntimeException 
	{
		List<int[]> retGroups = new ArrayList<int[]>();
		ExecutorService pool = Executors.newFixedThreadPool( k );
		try {
			// create one co-coding task per bin
			ArrayList<CocodeTask> tasks = new ArrayList<CocodeTask>();
			for (List<Integer> bin : bins) {
				// building an array of singleton CoCodingGroup
				ArrayList<PlanningCoCodingGroup> sgroups = new ArrayList<PlanningCoCodingGroup>();
				for (Integer col : bin)
					sgroups.add(new PlanningCoCodingGroup(col, groupColsInfo.get(col)));
				tasks.add(new CocodeTask(estim, sgroups, rlen));
			}
			
			// execute tasks and collect the resulting column groups
			List<Future<PlanningCoCodingGroup[]>> rtask = pool.invokeAll(tasks);
			for( Future<PlanningCoCodingGroup[]> lrtask : rtask )
				for (PlanningCoCodingGroup grp : lrtask.get())
					retGroups.add(grp.getColIndices());
		}
		catch(Exception ex) {
			throw new DMLRuntimeException(ex);
		}
		finally {
			pool.shutdown();
		}
		
		return retGroups;
	}
	
	/**
	 * Identify columns to code together. Uses a greedy approach that merges
	 * pairs of column groups into larger groups. Each phase of the greedy
	 * algorithm considers all combinations of pairs to merge.
	 * 
	 * @param estim compressed size estimator
	 * @param numRows number of rows
	 * @param singletonGroups initial singleton planning co-coding groups
	 * @return array of merged planning co-coding groups
	 */
	private static PlanningCoCodingGroup[] findCocodesBruteForce( 
			CompressedSizeEstimator estim, int numRows, PlanningCoCodingGroup[] singletonGroups) 
	{
		if( LOG.isTraceEnabled() )
			LOG.trace("Cocoding: process "+singletonGroups.length);
		
		List<PlanningCoCodingGroup> workset = 
				new ArrayList<PlanningCoCodingGroup>(Arrays.asList(singletonGroups));
		
		//establish memo table for extracted column groups
		PlanningMemoTable memo = new PlanningMemoTable();
		
		//process merging iterations until no more change
		boolean changed = true;
		while( changed && workset.size()>1 ) {
			//find best merge, incl memoization
			PlanningCoCodingGroup tmp = null;
			for( int i=0; i<workset.size(); i++ ) {
				for( int j=i+1; j<workset.size(); j++ ) {
					PlanningCoCodingGroup c1 = workset.get(i);
					PlanningCoCodingGroup c2 = workset.get(j);
					memo.incrStats(1, 0, 0);
					
					//pruning filter: skip dominated candidates
					if( -Math.min(c1.getEstSize(), c2.getEstSize()) > memo.getOptChangeInSize() )
						continue;
					
					//memoization or newly created group (incl bitmap extraction)
					PlanningCoCodingGroup c1c2 = memo.getOrCreate(c1, c2, estim, numRows);
					
					//keep best merged group only
					if( tmp == null || c1c2.getChangeInSize() < tmp.getChangeInSize()
						|| (c1c2.getChangeInSize() == tmp.getChangeInSize() 
							&& c1c2.getColIndices().length < tmp.getColIndices().length))
						tmp = c1c2;
				}
			}
			
			//modify working set
			if( tmp != null && tmp.getChangeInSize() < 0 ) {
				workset.remove(tmp.getLeftGroup());
				workset.remove(tmp.getRightGroup());
				workset.add(tmp);
				memo.remove(tmp);
				
				if( LOG.isTraceEnabled() ) {
					LOG.trace("--merge groups: "+Arrays.toString(tmp.getLeftGroup().getColIndices())+" and "
							+Arrays.toString(tmp.getRightGroup().getColIndices()));
				}
			}
			else {
				changed = false;
			}
		}
		
		if( LOG.isTraceEnabled() )
			LOG.trace("--stats: "+Arrays.toString(memo.getStats()));
		
		return workset.toArray(new PlanningCoCodingGroup[0]);
	}
	
	private static ColumnGroupPartitioner createColumnGroupPartitioner(PartitionerType type) {
		switch( type ) {
			case BIN_PACKING:
				return new ColumnGroupPartitionerBinPacking();
			case STATIC:
				return new ColumnGroupPartitionerStatic();
			default:
				throw new RuntimeException(
						"Unsupported column group partitioner: "+type.toString());
		}
	}
	
	public static class GroupableColInfo {
		public final double cardRatio;
		public final long size;

		public GroupableColInfo(double lcardRatio, long lsize) {
			cardRatio = lcardRatio;
			size = lsize;
		}
	}
	
	private static class CocodeTask implements Callable<PlanningCoCodingGroup[]> 
	{
		private CompressedSizeEstimator _estim = null;
		private ArrayList<PlanningCoCodingGroup> _sgroups = null;
		private int _rlen = -1;
		
		protected CocodeTask( CompressedSizeEstimator estim, ArrayList<PlanningCoCodingGroup> sgroups, int rlen ) {
			_estim = estim;
			_sgroups = sgroups;
			_rlen = rlen;
		}
		
		@Override
		public PlanningCoCodingGroup[] call() throws DMLRuntimeException {
			// brute force co-coding
			return findCocodesBruteForce(_estim, _rlen, 
					_sgroups.toArray(new PlanningCoCodingGroup[0]));
		}
	}
}