/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2013 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/
package xxl.core.spatial.histograms.utils;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import xxl.core.functions.Functional.UnaryFunction;
import xxl.core.indexStructures.RTree;
import xxl.core.indexStructures.rtrees.GenericPartitioner;
import xxl.core.indexStructures.rtrees.GenericPartitioner.Bucket;
import xxl.core.indexStructures.rtrees.GenericPartitioner.CostFunctionArrayProcessor;
import xxl.core.indexStructures.rtrees.GenericPartitioner.DefaultArrayProcessor;
import xxl.core.spatial.rectangles.DoublePointRectangle;

/**
 * This class provides spatial histogram construction methods (for details see
 * D. Achakeev and B. Seeger: "A class of R-tree histograms for spatial databases",
 * ACM SIGSPATIAL GIS 2012).
 * The methods are applied after generating, e.g., the leaf-node MBRs of an R-tree.
 * The input of each method is an iterator of {@link SpatialHistogramBucket} objects
 * (micro-clusters). To map the leaf nodes of an {@link RTree} to
 * {@link SpatialHistogramBucket} objects, see
 * {@link SpatialHistogramUtils#getRectanglesLevel1}.
 */
public class RVHistogram {
	
	/**
	 * Processing types for the optimal-partitioning dynamic programming scheme.
	 * The default type is SOPT.
	 */
	public static enum HistType {
		SOPT, 
		GOPT, 
		NOPT
	}
	
	/**
	 * Default minimal capacity ratio for histogram buckets. Let N be the number of
	 * input micro-clusters (e.g. leaf nodes) and m the number of histogram buckets;
	 * then N/m is the average number of micro-clusters per histogram bucket. The
	 * minimal bucket capacity is set to
	 * Math.max(Math.floor(N/m * DEFAULT_MIN_CAPACITY_RATIO), 1).
	 */
	public static final double DEFAULT_MIN_CAPACITY_RATIO = 0.5;
	
	/**
	 * Default partition (chunk) size. This parameter is used if the number of input
	 * micro-clusters is too large to be processed in main memory. In that case a
	 * simple heuristic is applied: the dynamic programming scheme is run on
	 * sufficiently large partitions (chunks) of the input data.
	 */
	public static final int DEFAULT_CHUNK_SIZE = 100_000;
	
	/**
	 * Creates an array of {@link SpatialHistogramBucket} objects from an iterator.
	 * 
	 * @param iterator input iterator of micro-clusters
	 * @param size the number of objects delivered by the iterator
	 * @return an array holding the iterator's objects in input order
	 */
	public static SpatialHistogramBucket[] toWeightedArray(Iterator<SpatialHistogramBucket> iterator, int size){
		SpatialHistogramBucket[] recs = new SpatialHistogramBucket[size];
		int i = 0;
		while(iterator.hasNext()){
			recs[i] = iterator.next();
			i++;
		}
		return recs;
	}
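	/*
	 * Illustrative sketch (not part of the original API): how the minimal bucket
	 * capacity follows from the formula documented at DEFAULT_MIN_CAPACITY_RATIO.
	 * For example, N = 1000 micro-clusters and m = 100 buckets give an average
	 * load of f = 10 and a minimal capacity of max(floor(10 * 0.5), 1) = 5.
	 */
	static int exampleMinCapacity(int inputSize, int numberOfBuckets) {
		// average number of micro-clusters per histogram bucket (N/m)
		double f = inputSize / (double) numberOfBuckets;
		// minimal capacity as defined in the javadoc above
		return (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
	}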
	/**
	 * This is the basic generic method for spatial histogram generation. The input
	 * is processed in chunks; the dynamic programming scheme is applied per chunk.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param b minimal number of objects per histogram bucket
	 * @param B maximal number of objects per histogram bucket
	 * @param inputSize number of {@link SpatialHistogramBucket} objects in the input
	 * @param numberOfBuckets target number of histogram buckets (the per-chunk bucket
	 *        count is derived from {@code spaceUtil} and {@code B})
	 * @param spaceUtil target average space utilization per bucket
	 *        (see {@link #DEFAULT_MIN_CAPACITY_RATIO})
	 * @param processor array processor; a different cost function can be provided
	 * @param type processing type of the dynamic programming scheme
	 * @param chunkSize chunk size
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeHistogramOPT(
			Iterator<SpatialHistogramBucket> levelEntries, 
			int b, int B, 
			int inputSize, 
			int numberOfBuckets,
			double spaceUtil,
			CostFunctionArrayProcessor<DoublePointRectangle> processor, 
			HistType type, 
			int chunkSize){
		List<SpatialHistogramBucket> histogram = new ArrayList<SpatialHistogramBucket>();
		List<SpatialHistogramBucket> buffer = new ArrayList<SpatialHistogramBucket>(chunkSize);
		while(levelEntries.hasNext()){
			// fill the buffer with the next chunk
			for(int i = 0; i < chunkSize && levelEntries.hasNext(); i++){
				buffer.add(levelEntries.next());
			}
			// compute distribution
			int[] distribution = null;
			SpatialHistogramBucket[] processingList = toWeightedArray(buffer.iterator(), buffer.size());
			// number of buckets for this chunk, derived from the target space utilization
			int n = (int) Math.ceil(processingList.length / (spaceUtil * B));
			if (buffer.size() > B){
				processor.reset();
				switch(type){
					case GOPT : {
						Bucket[] buckets = GenericPartitioner.computeGOPT(processingList, b, B, processor);
						distribution = GenericPartitioner.getDistribution(buckets[processingList.length-1]);
					} break;
					case NOPT : {
						Bucket[][] buckets = GenericPartitioner.computeNOPT(processingList, n, processor);
						distribution = GenericPartitioner.getDistribution(buckets[n-1][processingList.length-1]);
					} break;
					default : {
						Bucket[][] buckets = GenericPartitioner.computeOPTF(processingList, b, B, n, processor);
						distribution = GenericPartitioner.getDistribution(buckets[n-1][processingList.length-1]);
					}
				}
				processor.reset();
			}
			else{
				// the chunk is small enough to form a single bucket
				distribution = new int[]{buffer.size()};
			}
			// build one histogram bucket per entry of the distribution
			int k = 0;
			for(int i : distribution){
				SpatialHistogramBucket sumRec = null;
				int weight = 0;
				for(int j = 0; j < i; j++, k++){
					if (sumRec == null){
						sumRec = new SpatialHistogramBucket(processingList[k]);
					}
					else{
						sumRec.union(processingList[k]);
					}
					weight += processingList[k].getWeight();
					sumRec.updateAverage(processingList[k].avgExtent);
				}
				sumRec.setWeight(weight);
				histogram.add(sumRec);
			}
			// clear buffer for the next chunk
			buffer.clear();
		}
		return histogram;
	}
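	/*
	 * Usage sketch for the chunked method above (not part of the original API;
	 * the input sizes and the b/B values are hypothetical). The cost function is
	 * the MBR volume, mirroring the convenience methods below.
	 */
	static List<SpatialHistogramBucket> exampleChunkedCall(Iterator<SpatialHistogramBucket> leafMBRs) {
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> volume = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				return new DoublePointRectangle(arg).area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> processor = new DefaultArrayProcessor(volume);
		// hypothetical input: 1_000_000 micro-clusters, 1000 target buckets,
		// bucket capacities b = 500 and B = 1500, space utilization 0.5
		return computeHistogramOPT(leafMBRs, 500, 1500, 1_000_000, 1000,
				0.5, processor, HistType.SOPT, DEFAULT_CHUNK_SIZE);
	}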
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input is too large to be processed in main memory.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @param chunkSize chunk size
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogramChunkHeuristic(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets, int chunkSize){
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> function = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				DoublePointRectangle rec = new DoublePointRectangle(arg);
				return rec.area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> arrayProcessor = new DefaultArrayProcessor(function);
		double f = inputSize / (double) numberOfBuckets; // average load per bucket
		int d = (int) Math.ceil(inputSize / (double) numberOfBuckets); // ceil of the average load; widens [b, B]
		int b = (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
		b = Math.max(b, 2);
		int B = b + d;
		double rat = f / (double) B; // ~2/3 if ratio = 0.5
		return computeHistogramOPT(levelEntries, b, B, inputSize, numberOfBuckets, rat, arrayProcessor, HistType.SOPT, chunkSize);
	}
	
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input is too large to be processed in main memory.
	 * {@link #DEFAULT_CHUNK_SIZE} is used as chunk size.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogramChunkHeuristic(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets){
		return computeRVHistogramChunkHeuristic(levelEntries, inputSize, numberOfBuckets, DEFAULT_CHUNK_SIZE);
	}
	
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input can be processed in main memory.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogram(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets){
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> function = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				DoublePointRectangle rec = new DoublePointRectangle(arg);
				return rec.area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> arrayProcessor = new DefaultArrayProcessor(function);
		double f = inputSize / (double) numberOfBuckets; // average load per bucket
		int d = (int) Math.ceil(inputSize / (double) numberOfBuckets); // ceil of the average load; widens [b, B]
		int b = (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
		b = Math.max(b, 2);
		int B = b + d;
		return computeHistogramOPT(levelEntries, b, B, inputSize, numberOfBuckets, arrayProcessor, HistType.SOPT);
	}
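	/*
	 * End-to-end sketch for the in-memory variant (not part of the original API;
	 * the bucket count of 200 is hypothetical). The leaf iterator would typically
	 * be produced from an RTree via SpatialHistogramUtils.getRectanglesLevel1, as
	 * noted in the class javadoc; its exact signature is not assumed here.
	 */
	static List<SpatialHistogramBucket> exampleInMemoryCall(Iterator<SpatialHistogramBucket> leaves, int leafCount) {
		return computeRVHistogram(leaves, leafCount, 200);
	}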
	/**
	 * This is the generic in-memory spatial histogram computation method.
	 * Use this method if the input can be processed in main memory; otherwise consider
	 * the chunk-based heuristic
	 * {@link #computeHistogramOPT(Iterator, int, int, int, int, double, CostFunctionArrayProcessor, HistType, int)}.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param b minimal number of objects per histogram bucket
	 * @param B maximal number of objects per histogram bucket
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @param processor array processor; a different cost function can be provided
	 * @param type processing type of the dynamic programming scheme
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeHistogramOPT(Iterator<SpatialHistogramBucket> levelEntries, 
			int b, int B, 
			int inputSize, 
			int numberOfBuckets, 
			CostFunctionArrayProcessor<DoublePointRectangle> processor, 
			HistType type){
		SpatialHistogramBucket[] processingList = toWeightedArray(levelEntries, inputSize);
		List<SpatialHistogramBucket> histogram = new ArrayList<SpatialHistogramBucket>();
		// compute distribution
		int[] distribution = null;
		processor.reset();
		switch(type){
			case GOPT : {
				Bucket[] buckets = GenericPartitioner.computeGOPT(processingList, b, B, processor);
				distribution = GenericPartitioner.getDistribution(buckets[processingList.length-1]);
			} break;
			case NOPT : {
				Bucket[][] buckets = GenericPartitioner.computeNOPT(processingList, numberOfBuckets, processor);
				distribution = GenericPartitioner.getDistribution(buckets[numberOfBuckets-1][inputSize-1]);
			} break;
			default : {
				Bucket[][] buckets = GenericPartitioner.computeOPTF(processingList, b, B, numberOfBuckets, processor);
				distribution = GenericPartitioner.getDistribution(buckets[numberOfBuckets-1][inputSize-1]);
			}
		}
		// update statistical information
		int k = 0;
		for(int i : distribution){
			SpatialHistogramBucket sumRec = null;
			int weight = 0;
			for(int j = 0; j < i; j++, k++){
				if (sumRec == null){
					sumRec = new SpatialHistogramBucket(processingList[k]);
				}
				else{
					sumRec.union(processingList[k]);
				}
				weight += processingList[k].getWeight();
				sumRec.updateAverage(processingList[k].avgExtent);
			}
			sumRec.setWeight(weight);
			histogram.add(sumRec);
		}
		processor.reset();
		return histogram;
	}
}
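/*
 * Minimal sketch of using the generic in-memory method with the NOPT processing
 * type (not part of the original file; this class and all parameter values are
 * hypothetical). With NOPT the number of buckets is fixed, and b and B are not
 * consulted by the dynamic programming call, so trivial bounds are passed.
 */
class RVHistogramNoptExample {
	static List<SpatialHistogramBucket> noptHistogram(Iterator<SpatialHistogramBucket> entries,
			int inputSize, int numberOfBuckets) {
		// cost of a candidate bucket = volume (area) of its MBR, as in RVHistogram
		UnaryFunction<DoublePointRectangle, Double> volume = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				return new DoublePointRectangle(arg).area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> processor = new DefaultArrayProcessor(volume);
		// trivial capacity bounds; NOPT partitions into exactly numberOfBuckets buckets
		return RVHistogram.computeHistogramOPT(entries, 1, inputSize, inputSize,
				numberOfBuckets, processor, RVHistogram.HistType.NOPT);
	}
}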