/*
 * Copyright (c) 2011-2015 EPFL DATA Laboratory
 * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
 *
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ch.epfl.data.squall.ewh.algorithms;

import org.apache.log4j.Logger;

import ch.epfl.data.squall.ewh.data_structures.JoinMatrix;

/* If we receive input tuples:
 *
 * Postponed algorithm:
 * ** Sort them
 * ** Come up with p delimiters on rows and columns
 * ** Go over the candidate cells, and put each output tuple in the right bucket
 * *** O(n^2) complexity
 * *** can be made cheaper, but there is no need: it is not on the critical path
 *
 * Keeping everything sorted (not on the critical path):
 * ** Advantage:
 * *** No need to sort
 * ** Disadvantages:
 * *** The data structure is complicated and replicated:
 * **** It is more complicated than two B-trees, as we need to keep the outputs as well
 * **** Rows: a sorted list (rows) of sorted lists (outputs)
 * **** Columns: a sorted list (columns) of sorted lists (outputs)
 * *** Complexity: O(n log n) to insert, and O(n log n) to join with
 * ** O(n + m) time to count the elements in all buckets; m is the number of output tuples
 *
 * If we receive output tuples:
 *
 * Postponed algorithm:
 * ** Take the input tuples of the output tuple, and do the same as in "If we receive input tuples:"
 *
 * Keeping everything sorted (not on the critical path):
 * ** The same as in "If we receive input tuples:", except that we do not compute the output tuple
 */
public class InputShallowCoarsener extends ShallowCoarsener {
    private static Logger LOG = Logger.getLogger(InputShallowCoarsener.class);

    private int _originalXSize;
    private int _originalYSize;
    private int _numXBuckets, _numYBuckets;
    private int _bucketXSize, _bucketYSize; // the last bucket is slightly bigger

    public InputShallowCoarsener(int numXBuckets, int numYBuckets) {
        _numXBuckets = numXBuckets;
        _numYBuckets = numYBuckets;
    }

    // has to be invoked before all other methods
    @Override
    public void setOriginalMatrix(JoinMatrix originalMatrix, StringBuilder sb) {
        _originalMatrix = originalMatrix;
        _originalXSize = originalMatrix.getXSize();
        _originalYSize = originalMatrix.getYSize();

        // compute the bucket sizes: the last bucket is slightly bigger
        _bucketXSize = _originalXSize / _numXBuckets;
        _bucketYSize = _originalYSize / _numYBuckets;

        // corner case: more buckets than elements in the original matrix
        if (_bucketXSize == 0) {
            _numXBuckets = _originalXSize;
            _bucketXSize = 1;
            sb.append("\nWARNING: The number of X buckets is reduced to the number of rows ")
                    .append(_originalXSize).append("\n");
        }
        if (_bucketYSize == 0) {
            _numYBuckets = _originalYSize;
            _bucketYSize = 1;
            sb.append("\nWARNING: The number of Y buckets is reduced to the number of columns ")
                    .append(_originalYSize).append("\n");
        }

        sb.append("\nFor InputCoarsener, building the rounded matrix is instantaneous!\n");
    }
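    // Worked example (the concrete numbers are illustrative, not taken from the
    // code base): with _originalXSize = 10 and _numXBuckets = 3, we get
    // _bucketXSize = 10 / 3 = 3, so the coarsened rows cover the original rows
    // [0, 2], [3, 5] and [6, 9]; the last bucket absorbs the remainder, which is
    // why it can be slightly bigger.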
    @Override
    public WeightPrecomputation getPrecomputation() {
        // this coarsener does not build a precomputation
        return null;
    }

    @Override
    public int getNumXCoarsenedPoints() {
        return _numXBuckets;
    }

    @Override
    public int getNumYCoarsenedPoints() {
        return _numYBuckets;
    }

    @Override
    public int getOriginalXCoordinate(int cx, boolean isHigher) {
        // the coarsened point cx is included; for isHigher we return the end of its bucket
        // the last bucket falls outside the formula below, as it can be larger
        if (cx == _numXBuckets - 1) {
            if (isHigher) {
                return _originalXSize - 1;
            }
        }
        // non-last bucket
        if (isHigher) {
            // the end of this bucket is the boundary
            cx++;
        }
        int x = cx * _bucketXSize;
        if (isHigher) {
            // the end of this bucket is the boundary
            x--;
        }
        return x;
    }

    @Override
    public int getOriginalYCoordinate(int cy, boolean isHigher) {
        // the coarsened point cy is included; for isHigher we return the end of its bucket
        // the last bucket falls outside the formula below, as it can be larger
        if (cy == _numYBuckets - 1) {
            if (isHigher) {
                return _originalYSize - 1;
            }
        }
        // non-last bucket
        if (isHigher) {
            // the end of this bucket is the boundary
            cy++;
        }
        int y = cy * _bucketYSize;
        if (isHigher) {
            // the end of this bucket is the boundary
            y--;
        }
        return y;
    }

    @Override
    public int getCoarsenedXCoordinate(int x) {
        // the last bucket can be larger, so any x beyond the start of the last
        // bucket maps to the last bucket
        if (x > (_numXBuckets - 1) * _bucketXSize) {
            return _numXBuckets - 1;
        } else {
            return x / _bucketXSize;
        }
    }

    @Override
    public int getCoarsenedYCoordinate(int y) {
        // the last bucket can be larger, so any y beyond the start of the last
        // bucket maps to the last bucket
        if (y > (_numYBuckets - 1) * _bucketYSize) {
            return _numYBuckets - 1;
        } else {
            return y / _bucketYSize;
        }
    }

    @Override
    public String toString() {
        return "InputShallowCoarsener [numXBuckets = " + _numXBuckets
                + ", numYBuckets = " + _numYBuckets + "]";
    }
}
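/*
 * Usage sketch (hypothetical; the bucket counts, matrix sizes and the variable
 * "joinMatrix" are assumptions chosen for illustration, not taken from the code
 * base). Assuming a JoinMatrix with 1000 rows (X) and 800 columns (Y):
 *
 *   StringBuilder sb = new StringBuilder();
 *   InputShallowCoarsener coarsener = new InputShallowCoarsener(50, 40);
 *   coarsener.setOriginalMatrix(joinMatrix, sb);             // must be called first
 *   int cx = coarsener.getCoarsenedXCoordinate(137);         // 137 / 20 = bucket 6
 *   int xLow = coarsener.getOriginalXCoordinate(cx, false);  // 6 * 20 = 120
 *   int xHigh = coarsener.getOriginalXCoordinate(cx, true);  // 7 * 20 - 1 = 139
 */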