/*
 * Copyright (c) 2011-2015 EPFL DATA Laboratory
 * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
 *
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ch.epfl.data.squall.ewh.algorithms;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;

import ch.epfl.data.squall.ewh.data_structures.BooleanRegions;
import ch.epfl.data.squall.ewh.data_structures.ExtremePositions;
import ch.epfl.data.squall.ewh.data_structures.JoinMatrix;
import ch.epfl.data.squall.ewh.data_structures.Region;
import ch.epfl.data.squall.ewh.main.PushStatisticCollector;
import ch.epfl.data.squall.ewh.utilities.OverweightedException;
import ch.epfl.data.squall.ewh.utilities.TooSmallMaxWeightException;
import ch.epfl.data.squall.utilities.MyUtilities;

/**
 * Tiling algorithm that coarsens the join matrix into a grid of buckets and
 * then binary-searches for the smallest maximum region weight for which the
 * {@code mBucket} heuristic covers all candidate cells with at most
 * {@code j} regions.
 *
 * <p>
 * Subclasses supply the weight bounds of the binary search through
 * {@link #getWeightLowerBound(JoinMatrix, int)} and
 * {@link #getWeightUpperBound(JoinMatrix, int)}.
 */
public abstract class OkcanAlgorithm implements TilingAlgorithm {
    /**
     * Once a solution exists, the binary search stops as soon as
     * MyUtilities.computePercentage(lowerBound, upperBound) drops below this
     * value.
     */
    private static final double CLOSE_PERCENTAGE = 0.05;

    // Fixed: the logger used to be registered under
    // OkcanCandidateInputAlgorithm, which mislabelled every message.
    private static final Logger LOG = Logger.getLogger(OkcanAlgorithm.class);

    // not final: partition() clamps them to the matrix dimensions
    private int _numXBuckets, _numYBuckets;
    private final OkcanCoarsener _coarsener;
    private JoinMatrix _originalMatrix;
    private final Map _map;
    private final int _j;
    private StringBuilder _sb;

    // for monotonic joins: assumes a row has at least one candidate cell
    private Map<Integer, ExtremePositions> _rowExtremes;
    private Map<Integer, ExtremePositions> _columnExtremes;
    // never reassigned in this class, so the monotonic path is always taken
    private boolean _isMonotonic = true;
    private final boolean _isExact;

    /**
     * @param j
     *            maximum number of regions to produce
     * @param numXBuckets
     *            requested number of buckets along X (clamped in
     *            {@link #partition(JoinMatrix, StringBuilder)})
     * @param numYBuckets
     *            requested number of buckets along Y (clamped likewise)
     * @param map
     *            configuration map forwarded to the coarsener and to
     *            MyUtilities.isCandidateRegion
     * @param coarsener
     *            strategy that builds the coarsened matrix and translates
     *            coordinates between the coarsened and the original matrix
     */
    public OkcanAlgorithm(int j, int numXBuckets, int numYBuckets, Map map,
            OkcanCoarsener coarsener) {
        _map = map;
        _j = j;
        _numXBuckets = numXBuckets;
        _numYBuckets = numYBuckets;
        _coarsener = coarsener;
        _isExact = coarsener instanceof OkcanExactCoarsener;
        if (_isExact) {
            LOG.info("Setting exactCoarsener is less efficient!");
        }
    }

    // TODO: As the authors said in Section 5.1, we may align bucket boundaries,
    // as this is beneficial for "some queries"
    /**
     * Coarsens {@code joinMatrix}, tiles the coarsened matrix and translates
     * the resulting regions back to original coordinates.
     *
     * @param joinMatrix
     *            the original join matrix
     * @param sb
     *            receives human-readable progress and warning messages
     * @return whatever the coarsener returns when translating the coarsened
     *         regions; the coarsened list handed to it is {@code null} if the
     *         binary search never found a non-empty tiling
     * @throws RuntimeException
     *             if the coarsened matrix has fewer candidate cells than the
     *             requested number of regions
     */
    @Override
    public List<Region> partition(JoinMatrix joinMatrix, StringBuilder sb) {
        _sb = sb;
        _originalMatrix = joinMatrix;

        // there cannot be more buckets than elements in any of the relations
        int xSize = joinMatrix.getXSize();
        int ySize = joinMatrix.getYSize();
        if (_numXBuckets > xSize) {
            _numXBuckets = xSize;
            _sb.append("WARNING: Bucket size X reduced to the number of rows ")
                    .append(xSize).append("\n");
        }
        if (_numYBuckets > ySize) {
            _numYBuckets = ySize;
            _sb.append(
                    "WARNING: Bucket size Y reduced to the number of columns ")
                    .append(ySize).append("\n");
        }

        // creation of coarsenedMatrix
        JoinMatrix coarsenedMatrix = _coarsener.createAndFillCoarsenedMatrix(
                joinMatrix, _numXBuckets, _numYBuckets, _map);
        LOG.info("Created coarsened matrix in OkcanAlgorithm.");

        if (_isMonotonic) {
            precomputeRowExtremes();
            precomputeColumnExtremes();
        }
        LOG.info("Precomputed row extremes in OkcanAlgorithm.");

        long candidateGridCells = coarsenedMatrix.getNumElements();
        LOG.info("The number of candidate cells in the coarsenedMatrix in OkcanAlgorithm is "
                + candidateGridCells);
        _sb.append("\nThe number of candidate grid cells is ")
                .append(candidateGridCells).append(".\n");

        // one joiner must have at least one cell
        if (candidateGridCells < _j) {
            throw new RuntimeException(
                    "Too coarse-grained partitioning, not enough cells!");
        }

        LOG.info("Started binary search in OkcanAlgorithm.");
        // actual work
        List<Region> coarsenedRegions = getCoarsenedRegions(coarsenedMatrix,
                _j);
        return _coarsener.translateCoarsenedToOriginalRegions(coarsenedRegions);
    }

    /**
     * Asks the subclass for the weight bounds and runs the binary search
     * between them.
     */
    private List<Region> getCoarsenedRegions(JoinMatrix coarsenedMatrix,
            int numOfRegions) {
        int lowerBound = getWeightLowerBound(coarsenedMatrix, numOfRegions);
        int upperBound = getWeightUpperBound(coarsenedMatrix, numOfRegions);
        return binarySearch(lowerBound, upperBound, numOfRegions,
                coarsenedMatrix);
    }

    /**
     * Searches [lowerBound, upperBound] for the smallest maximum weight for
     * which {@link #mBucket(int, int, JoinMatrix)} succeeds with at most
     * {@code numOfRegions} regions. On success the search always continues in
     * the left half, so the last stored result belongs to the smallest
     * successful weight that was tried. The search stops early once a result
     * exists and the remaining range is narrower than
     * {@link #CLOSE_PERCENTAGE}.
     *
     * @return the regions of the best successful tiling, or {@code null} if
     *         no attempt produced a non-empty tiling
     */
    private List<Region> binarySearch(int lowerBound, int upperBound,
            int numOfRegions, JoinMatrix coarsenedMatrix) {
        List<Region> result = null;
        while (lowerBound <= upperBound) {
            // overflow-safe midpoint
            int middle = lowerBound + (upperBound - lowerBound) / 2;
            LOG.info("New binary search with middle = " + middle);

            if (middle <= 0) {
                // middle should never be 0: with maxWeight == 0
                // coverSubMatrix cannot advance past the current row and
                // mBucket would loop forever. The original guard only
                // covered the range [0, 1]; [0, 0] slipped through.
                lowerBound = 1;
                continue;
            }

            // to save some time, we do not try every single possibility for
            // large lowerBound, upperBound
            if ((result != null)
                    && (MyUtilities.computePercentage(lowerBound, upperBound) < CLOSE_PERCENTAGE)) {
                LOG.info("Terminated binary search for lowerBound = "
                        + lowerBound + ", upperBound = " + upperBound);
                break;
            }

            _sb.append("\nAt time ")
                    .append(PushStatisticCollector.getWallClockTime())
                    .append("\n");
            _sb.append("Binary search middle = ").append(middle).append("\n");

            BooleanRegions br = mBucket(middle, numOfRegions, coarsenedMatrix);
            if (br.isSatisfied()) {
                // keep the last good result; regions may be null in the
                // corner case when there are 0 regions
                if (br.getRegions() != null) {
                    result = br.getRegions();
                }
                // feasible with maxWeight = middle, try a smaller one
                upperBound = middle - 1;
            } else {
                // let's try with a bigger maxWeight
                lowerBound = middle + 1;
            }
        }
        return result;
    }

    /**
     * Heuristic that covers the matrix, block of rows by block of rows, with
     * regions whose weight does not exceed {@code maxWeight}.
     *
     * @param maxWeight
     *            maximum weight of a single region; must be positive,
     *            otherwise the row cursor never advances
     * @return a satisfied result holding all regions (possibly {@code null}
     *         when no candidate cell was found), or an unsatisfied result
     *         when more than {@code numOfRegions} regions are needed or
     *         {@code maxWeight} is too small
     */
    private BooleanRegions mBucket(int maxWeight, int numOfRegions,
            JoinMatrix coarsenedMatrix) {
        try {
            int currentRow = 0;
            List<Region> allRegions = null;
            while (currentRow < coarsenedMatrix.getXSize()) {
                CurrentRowRegions rr = coverSubMatrix(currentRow, maxWeight,
                        coarsenedMatrix);
                currentRow = rr.getCurrentRow();
                List<Region> newRegions = rr.getRegions();
                if (newRegions == null) {
                    // the subMatrix had no candidate cells
                    continue;
                }
                if (allRegions == null) {
                    allRegions = new ArrayList<Region>();
                }
                allRegions.addAll(newRegions);
                if (allRegions.size() > numOfRegions) {
                    return new BooleanRegions(false, allRegions);
                }
            }
            return new BooleanRegions(true, allRegions);
        } catch (TooSmallMaxWeightException e) {
            LOG.info("TooSmallWeightException " + e);
            return new BooleanRegions(false, null);
        }
    }

    /**
     * Covers "the best" block of consecutive rows starting at
     * {@code currentRow}: blocks of height 1..maxWeight are tried and the one
     * with the highest (sum of frequencies / number of regions) is kept; ties
     * go to the taller block.
     *
     * @return the first row after the chosen block together with the regions
     *         covering it ({@code null} if no candidate cell was found)
     * @throws TooSmallMaxWeightException
     *             if a single column was already too heavy before any
     *             coverage could be built
     */
    private CurrentRowRegions coverSubMatrix(int currentRow, int maxWeight,
            JoinMatrix coarsenedMatrix) throws TooSmallMaxWeightException {
        double maxScore = -1;
        // we may not find even a single candidate cell in the subMatrix;
        // in that case the whole block of maxWeight rows is skipped
        int bestRow = currentRow + maxWeight - 1;
        List<Region> bestRegions = null;
        try {
            // starts at 0 as there could be only one row left in the matrix
            for (int i = 0; i < maxWeight; i++) {
                int lastRow = currentRow + i;
                if (lastRow >= coarsenedMatrix.getXSize()) {
                    // lastRow is out of scope
                    break;
                }
                List<Region> regions = coverRows(currentRow, lastRow,
                        maxWeight, coarsenedMatrix);
                if (regions == null) {
                    continue;
                }
                // there is at least one candidate cell, so at least one
                // region was created
                double area = getSumOfFrequencies(regions);
                double score = area / regions.size();
                if (score >= maxScore) {
                    maxScore = score;
                    bestRow = lastRow;
                    bestRegions = regions;
                }
            }
        } catch (OverweightedException oe) {
            // one column was too heavy in terms of weight; it does not make
            // sense to try with even taller columns, so this acts as a break
            LOG.info("OverweightedColumn " + oe);
            // we were not able to make any coverage; because of
            // OverweightedException, we know there is something to be covered
            if (bestRegions == null) {
                throw new TooSmallMaxWeightException(maxScore, null);
            }
        }
        return new CurrentRowRegions(bestRow + 1, bestRegions);
    }

    /**
     * Covers a block of consecutive rows by partitioning it by columns. First
     * and last row are inclusive. Columns are merged greedily left to right
     * into a region until adding the next column would exceed
     * {@code maxWeight}.
     *
     * <p>
     * In the monotonic case this relies on {@code _rowExtremes} having an
     * entry for both {@code firstRow} and {@code lastRow}.
     *
     * @return the regions, or {@code null} if the block has no candidate cell
     * @throws OverweightedException
     *             if a single column of the block alone exceeds
     *             {@code maxWeight}
     */
    private List<Region> coverRows(int firstRow, int lastRow, int maxWeight,
            JoinMatrix coarsenedMatrix) throws OverweightedException {
        List<Region> regions = new ArrayList<Region>();
        Region currentRegion = null;
        boolean regionStarted = false;

        int startYIndex = 0;
        int endYIndex = coarsenedMatrix.getYSize(); // non-inclusive
        if (_isMonotonic) {
            startYIndex = _rowExtremes.get(firstRow).getMostLeft();
            // +1 because endYIndex is non-inclusive
            endYIndex = _rowExtremes.get(lastRow).getMostRight() + 1;
        }

        // my optimization: instead of going one by one, we could use convex
        // optimization to reduce running time
        // not implemented because that's not written in their paper (that's my
        // idea)
        // also not sure if it would bring performance benefits
        for (int j = startYIndex; j < endYIndex; j++) {
            BooleanRegion candidateColumn = getOuterColumnCandidates(firstRow,
                    lastRow, j, coarsenedMatrix);
            if (!candidateColumn.isSatisfied()) {
                continue;
            }
            Region candidateColumnRegion = candidateColumn.getRegion();
            if (!regionStarted) {
                currentRegion = candidateColumnRegion;
                regionStarted = true;
                if (getWeight(currentRegion) > maxWeight) {
                    // one line weight is bigger than maximum allowed weight
                    throw new OverweightedException(maxWeight, firstRow, j,
                            lastRow, j);
                }
            } else {
                Region extendedRegion = mergeRegions(currentRegion,
                        candidateColumnRegion);
                if (getWeight(extendedRegion) <= maxWeight) {
                    currentRegion = extendedRegion;
                } else {
                    // close the current region and revisit this column as
                    // the start of a new one
                    regions.add(currentRegion);
                    j--;
                    regionStarted = false;
                }
            }
        }

        if (regionStarted) {
            // the last region which contains some candidate cells, but did
            // not reach maxWeight
            regions.add(currentRegion);
        }
        return regions.isEmpty() ? null : regions;
    }

    /**
     * Returns a new region that is the bounding box of both arguments and
     * whose frequency is the sum of their frequencies.
     */
    private static Region mergeRegions(Region originalRegion, Region newRegion) {
        Region result = new Region(originalRegion);

        if (newRegion.get_x1() < originalRegion.get_x1()) {
            // move upper left corner up
            result.set_x1(newRegion.get_x1());
        }
        if (newRegion.get_y1() < originalRegion.get_y1()) {
            // move upper left corner left
            result.set_y1(newRegion.get_y1());
        }
        if (newRegion.get_x2() > originalRegion.get_x2()) {
            // move lower right corner down
            result.set_x2(newRegion.get_x2());
        }
        if (newRegion.get_y2() > originalRegion.get_y2()) {
            // move lower right corner right
            result.set_y2(newRegion.get_y2());
        }

        // TODO for this to work, the regions have to be non-overlapping
        result.setFrequency(originalRegion.getFrequency()
                + newRegion.getFrequency());
        return result;
    }

    /**
     * Builds the one-column region spanning the candidate cells of
     * {@code column} within rows [firstRow, lastRow]. All the boundaries are
     * inclusive.
     *
     * <p>
     * In the monotonic case this relies on {@code _columnExtremes} having an
     * entry for {@code column}.
     *
     * @return a satisfied result with the region, or an unsatisfied result
     *         with a {@code null} region if the column has no candidate cell
     *         in the row range
     */
    private BooleanRegion getOuterColumnCandidates(int firstRow, int lastRow,
            int column, JoinMatrix coarsenedMatrix) {
        if (!_isMonotonic) {
            int firstCandRow = -1;
            int lastCandRow = -1;
            int frequency = 0;
            for (int i = firstRow; i <= lastRow; i++) {
                int numOutputs = coarsenedMatrix.getElement(i, column);
                if (numOutputs > 0) {
                    // first is set only the first time, last is set each time
                    if (firstCandRow < 0) {
                        firstCandRow = i;
                    }
                    lastCandRow = i;
                    frequency += numOutputs;
                }
            }
            if (firstCandRow < 0) {
                return new BooleanRegion(false, null);
            }
            return new BooleanRegion(true, new Region(firstCandRow, column,
                    lastCandRow, column, frequency));
        }

        // monotonic case: intersect the requested rows with the precomputed
        // candidate range of this column
        ExtremePositions epc = _columnExtremes.get(column);
        int fromRow = Math.max(firstRow, epc.getMostLeft());
        int toRow = Math.min(lastRow, epc.getMostRight());
        if (fromRow > toRow) {
            return new BooleanRegion(false, null);
        }

        int frequency = 0;
        if (_isExact) {
            // needs to sum up actual values
            for (int i = fromRow; i <= toRow; i++) {
                frequency += coarsenedMatrix.getElement(i, column);
            }
        } else {
            // every cell in the range counts as one; no need for
            // coarsenedMatrix at all
            frequency = toRow - fromRow + 1;
        }
        return new BooleanRegion(true, new Region(fromRow, column, toRow,
                column, frequency));
    }

    /**
     * Sums the frequencies of all regions; accumulated in a {@code long} so a
     * large total cannot wrap around.
     */
    private static long getSumOfFrequencies(List<Region> regions) {
        long freq = 0;
        for (Region region : regions) {
            freq += region.getFrequency();
        }
        return freq;
    }

    // take advantage of monotonicity; necessary for performance reasons
    // We should put this in OkcanCoarsener, but it's an interface
    // We cannot put it in JoinMatrix (create a method similar to
    // joinMatrix.getNumCandidatesIterate)
    // because the coarsened JoinMatrix has no joinAttributes set.
    // Coordinates are in terms of coarsenedMatrix
    /**
     * For every coarsened row, records the first and last candidate column.
     * The scan of each row starts at the first candidate column of the last
     * row that had one and stops at the first non-candidate cell after a
     * candidate. Rows without candidates get no entry.
     */
    private void precomputeRowExtremes() {
        _rowExtremes = new HashMap<Integer, ExtremePositions>();
        int firstCandInLastRow = 0;
        for (int i = 0; i < _numXBuckets; i++) {
            boolean isFirstInRow = true;
            int x1 = _coarsener.getOriginalXCoordinate(i, false);
            int x2 = _coarsener.getOriginalXCoordinate(i, true);
            for (int j = firstCandInLastRow; j < _numYBuckets; j++) {
                int y1 = _coarsener.getOriginalYCoordinate(j, false);
                int y2 = _coarsener.getOriginalYCoordinate(j, true);
                Region region = new Region(x1, y1, x2, y2);
                boolean isCandidate = MyUtilities.isCandidateRegion(
                        _originalMatrix, region,
                        _originalMatrix.getComparisonPredicate(), _map);
                if (isCandidate) {
                    if (isFirstInRow) {
                        firstCandInLastRow = j;
                        _rowExtremes.put(i, new ExtremePositions(j, j));
                        isFirstInRow = false;
                    } else {
                        // update the last position of row i with column j,
                        // in place
                        _rowExtremes.get(i).setMostRight(j);
                    }
                } else if (!isFirstInRow) {
                    // we are right of the candidate area; the first
                    // non-candidate cell means we should switch to the next
                    // row
                    break;
                }
            }
        }
    }

    /**
     * For every coarsened column, records the first and last candidate row.
     * Mirrors {@link #precomputeRowExtremes()} with the roles of rows and
     * columns swapped. Columns without candidates get no entry.
     */
    private void precomputeColumnExtremes() {
        _columnExtremes = new HashMap<Integer, ExtremePositions>();
        int firstCandInLastColumn = 0;
        for (int j = 0; j < _numYBuckets; j++) {
            boolean isFirstInColumn = true;
            int y1 = _coarsener.getOriginalYCoordinate(j, false);
            int y2 = _coarsener.getOriginalYCoordinate(j, true);
            for (int i = firstCandInLastColumn; i < _numXBuckets; i++) {
                int x1 = _coarsener.getOriginalXCoordinate(i, false);
                int x2 = _coarsener.getOriginalXCoordinate(i, true);
                Region region = new Region(x1, y1, x2, y2);
                boolean isCandidate = MyUtilities.isCandidateRegion(
                        _originalMatrix, region,
                        _originalMatrix.getComparisonPredicate(), _map);
                if (isCandidate) {
                    if (isFirstInColumn) {
                        firstCandInLastColumn = i;
                        _columnExtremes.put(j, new ExtremePositions(i, i));
                        isFirstInColumn = false;
                    } else {
                        // update the last position of column j with row i,
                        // in place
                        _columnExtremes.get(j).setMostRight(i);
                    }
                } else if (!isFirstInColumn) {
                    // we are below the candidate area; the first
                    // non-candidate cell means we should switch to the next
                    // column
                    break;
                }
            }
        }
    }

    @Override
    public String toString() {
        return "numXBuckets = " + _numXBuckets + ", numYBuckets = "
                + _numYBuckets;
    }

    protected OkcanCoarsener getCoarsener() {
        return _coarsener;
    }

    /*
     * Minimum possible weight for a region; used in binary search
     */
    protected abstract int getWeightLowerBound(JoinMatrix coarsenedMatrix,
            int numOfRegions);

    /*
     * Maximum possible weight for a region; used in binary search
     */
    protected abstract int getWeightUpperBound(JoinMatrix coarsenedMatrix,
            int numOfRegions);

    /** Immutable pair: whether a candidate was found, and its region. */
    private static class BooleanRegion {
        private final boolean _satisfied;
        private final Region _region;

        public BooleanRegion(boolean satisfied, Region region) {
            _satisfied = satisfied;
            _region = region;
        }

        public boolean isSatisfied() {
            return _satisfied;
        }

        public Region getRegion() {
            return _region;
        }
    }

    /**
     * Immutable pair: the next row to process, and the regions covering the
     * rows consumed so far ({@code null} if there were no candidate cells).
     */
    private static class CurrentRowRegions {
        private final int _currentRow;
        private final List<Region> _regions;

        public CurrentRowRegions(int currentRow, List<Region> regions) {
            _currentRow = currentRow;
            _regions = regions;
        }

        public int getCurrentRow() {
            return _currentRow;
        }

        public List<Region> getRegions() {
            return _regions;
        }
    }
}