/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2013 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/
package xxl.core.spatial.histograms.utils;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import xxl.core.functions.Functional.UnaryFunction;
import xxl.core.indexStructures.RTree;
import xxl.core.indexStructures.rtrees.GenericPartitioner;
import xxl.core.indexStructures.rtrees.GenericPartitioner.Bucket;
import xxl.core.indexStructures.rtrees.GenericPartitioner.CostFunctionArrayProcessor;
import xxl.core.indexStructures.rtrees.GenericPartitioner.DefaultArrayProcessor;
import xxl.core.spatial.rectangles.DoublePointRectangle;

/**
 * This class provides spatial histogram construction methods (for details see
 * D. Achakeev and B. Seeger: "A class of R-tree histograms for spatial databases",
 * ACM SIGSPATIAL GIS 2012).
 * The methods are applied after generating, e.g., the leaf-node MBRs of an R-tree.
 * The input of each method is an iterator of {@link SpatialHistogramBucket} objects
 * (micro-clusters). To map the leaf nodes of an {@link RTree} to
 * {@link SpatialHistogramBucket} objects, see
 * {@link SpatialHistogramUtils#getRectanglesLevel1}.
 */
public class RVHistogram {
	
	/**
	 * Processing types for the optimal-partitioning dynamic programming scheme.
	 * The default type is SOPT.
	 */
	public static enum HistType {
		SOPT, 
		GOPT, 
		NOPT
	}
	
	/**
	 * Default minimal capacity ratio for histogram buckets. Let N be the number of
	 * input micro-clusters (e.g. leaf nodes) and m the number of histogram buckets;
	 * then N/m is the average number of micro-clusters per histogram bucket. The
	 * minimal bucket capacity is set to
	 * Math.max(Math.floor(N/m * DEFAULT_MIN_CAPACITY_RATIO), 1).
	 */
	public static final double DEFAULT_MIN_CAPACITY_RATIO = 0.5;
	
	/**
	 * Default partition (chunk) size. This parameter is used if the number of input
	 * micro-clusters is too large to be processed in main memory. In that case a
	 * simple heuristic is applied: the dynamic programming scheme is run on
	 * sufficiently large partitions (chunks) of the input data.
	 */
	public static final int DEFAULT_CHUNK_SIZE = 100_000;
	
	/**
	 * Creates an array of {@link SpatialHistogramBucket} objects from an iterator.
	 * 
	 * @param iterator input iterator of micro-clusters
	 * @param size the number of objects delivered by the iterator
	 * @return an array holding the iterator's objects in input order
	 */
	public static SpatialHistogramBucket[] toWeightedArray(Iterator<SpatialHistogramBucket> iterator, int size){
		SpatialHistogramBucket[] recs = new SpatialHistogramBucket[size];
		int i = 0;
		while(iterator.hasNext()){
			recs[i] = iterator.next();
			i++;
		}
		return recs;
	}
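	/*
	 * Illustrative sketch (not part of the original API): how the minimal bucket
	 * capacity follows from the formula documented at DEFAULT_MIN_CAPACITY_RATIO.
	 * For example, N = 1000 micro-clusters and m = 100 buckets give an average
	 * load of f = 10 and a minimal capacity of max(floor(10 * 0.5), 1) = 5.
	 */
	static int exampleMinCapacity(int inputSize, int numberOfBuckets) {
		// average number of micro-clusters per histogram bucket (N/m)
		double f = inputSize / (double) numberOfBuckets;
		// minimal capacity as defined in the javadoc above
		return (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
	}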
	/**
	 * This is the basic generic method for spatial histogram generation. The input
	 * is processed in chunks; the dynamic programming scheme is applied per chunk.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param b minimal number of objects per histogram bucket
	 * @param B maximal number of objects per histogram bucket
	 * @param inputSize number of {@link SpatialHistogramBucket} objects in the input
	 * @param numberOfBuckets target number of histogram buckets (the per-chunk bucket
	 *        count is derived from {@code spaceUtil} and {@code B})
	 * @param spaceUtil target average space utilization per bucket
	 *        (see {@link #DEFAULT_MIN_CAPACITY_RATIO})
	 * @param processor array processor; a different cost function can be provided
	 * @param type processing type of the dynamic programming scheme
	 * @param chunkSize chunk size
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeHistogramOPT(
			Iterator<SpatialHistogramBucket> levelEntries, 
			int b, int B, 
			int inputSize, 
			int numberOfBuckets,
			double spaceUtil,
			CostFunctionArrayProcessor<DoublePointRectangle> processor, 
			HistType type, 
			int chunkSize){
		List<SpatialHistogramBucket> histogram = new ArrayList<SpatialHistogramBucket>();
		List<SpatialHistogramBucket> buffer = new ArrayList<SpatialHistogramBucket>(chunkSize);
		while(levelEntries.hasNext()){
			// fill the buffer with the next chunk
			for(int i = 0; i < chunkSize && levelEntries.hasNext(); i++){
				buffer.add(levelEntries.next());
			}
			// compute distribution
			int[] distribution = null;
			SpatialHistogramBucket[] processingList = toWeightedArray(buffer.iterator(), buffer.size());
			// number of buckets for this chunk, derived from the target space utilization
			int n = (int) Math.ceil(processingList.length / (spaceUtil * B));
			if (buffer.size() > B){
				processor.reset();
				switch(type){
					case GOPT : {
						Bucket[] buckets = GenericPartitioner.computeGOPT(processingList, b, B, processor);
						distribution = GenericPartitioner.getDistribution(buckets[processingList.length-1]);
					} break;
					case NOPT : {
						Bucket[][] buckets = GenericPartitioner.computeNOPT(processingList, n, processor);
						distribution = GenericPartitioner.getDistribution(buckets[n-1][processingList.length-1]);
					} break;
					default : {
						Bucket[][] buckets = GenericPartitioner.computeOPTF(processingList, b, B, n, processor);
						distribution = GenericPartitioner.getDistribution(buckets[n-1][processingList.length-1]);
					}
				}
				processor.reset();
			}
			else{
				// the chunk is small enough to form a single bucket
				distribution = new int[]{buffer.size()};
			}
			// build one histogram bucket per entry of the distribution
			int k = 0;
			for(int i : distribution){
				SpatialHistogramBucket sumRec = null;
				int weight = 0;
				for(int j = 0; j < i; j++, k++){
					if (sumRec == null){
						sumRec = new SpatialHistogramBucket(processingList[k]);
					}
					else{
						sumRec.union(processingList[k]);
					}
					weight += processingList[k].getWeight();
					sumRec.updateAverage(processingList[k].avgExtent);
				}
				sumRec.setWeight(weight);
				histogram.add(sumRec);
			}
			// clear buffer for the next chunk
			buffer.clear();
		}
		return histogram;
	}
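	/*
	 * Usage sketch for the chunked method above (not part of the original API;
	 * the input sizes and the b/B values are hypothetical). The cost function is
	 * the MBR volume, mirroring the convenience methods below.
	 */
	static List<SpatialHistogramBucket> exampleChunkedCall(Iterator<SpatialHistogramBucket> leafMBRs) {
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> volume = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				return new DoublePointRectangle(arg).area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> processor = new DefaultArrayProcessor(volume);
		// hypothetical input: 1_000_000 micro-clusters, 1000 target buckets,
		// bucket capacities b = 500 and B = 1500, space utilization 0.5
		return computeHistogramOPT(leafMBRs, 500, 1500, 1_000_000, 1000,
				0.5, processor, HistType.SOPT, DEFAULT_CHUNK_SIZE);
	}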
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input is too large to be processed in main memory.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @param chunkSize chunk size
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogramChunkHeuristic(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets, int chunkSize){
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> function = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				DoublePointRectangle rec = new DoublePointRectangle(arg);
				return rec.area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> arrayProcessor = new DefaultArrayProcessor(function);
		double f = inputSize / (double) numberOfBuckets; // average load per bucket
		int d = (int) Math.ceil(inputSize / (double) numberOfBuckets); // ceil of the average load; widens [b, B]
		int b = (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
		b = Math.max(b, 2);
		int B = b + d;
		double rat = f / (double) B; // ~2/3 if ratio = 0.5
		return computeHistogramOPT(levelEntries, b, B, inputSize, numberOfBuckets, rat, arrayProcessor, HistType.SOPT, chunkSize);
	}
	
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input is too large to be processed in main memory.
	 * {@link #DEFAULT_CHUNK_SIZE} is used as chunk size.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogramChunkHeuristic(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets){
		return computeRVHistogramChunkHeuristic(levelEntries, inputSize, numberOfBuckets, DEFAULT_CHUNK_SIZE);
	}
	
	/**
	 * Default method for computing a spatial histogram. The cost function is the sum
	 * of MBR volumes; OPT partitioning is used.
	 * Use this method if the input can be processed in main memory.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeRVHistogram(Iterator<SpatialHistogramBucket> levelEntries, 
			int inputSize, int numberOfBuckets){
		// cost of a candidate bucket = volume (area) of its MBR
		UnaryFunction<DoublePointRectangle, Double> function = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				DoublePointRectangle rec = new DoublePointRectangle(arg);
				return rec.area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> arrayProcessor = new DefaultArrayProcessor(function);
		double f = inputSize / (double) numberOfBuckets; // average load per bucket
		int d = (int) Math.ceil(inputSize / (double) numberOfBuckets); // ceil of the average load; widens [b, B]
		int b = (int) Math.max(Math.floor(f * DEFAULT_MIN_CAPACITY_RATIO), 1);
		b = Math.max(b, 2);
		int B = b + d;
		return computeHistogramOPT(levelEntries, b, B, inputSize, numberOfBuckets, arrayProcessor, HistType.SOPT);
	}
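	/*
	 * End-to-end sketch for the in-memory variant (not part of the original API;
	 * the bucket count of 200 is hypothetical). The leaf iterator would typically
	 * be produced from an RTree via SpatialHistogramUtils.getRectanglesLevel1, as
	 * noted in the class javadoc; its exact signature is not assumed here.
	 */
	static List<SpatialHistogramBucket> exampleInMemoryCall(Iterator<SpatialHistogramBucket> leaves, int leafCount) {
		return computeRVHistogram(leaves, leafCount, 200);
	}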
	/**
	 * This is the generic in-memory spatial histogram computation method.
	 * Use this method if the input can be processed in main memory; otherwise consider
	 * the chunk-based heuristic
	 * {@link #computeHistogramOPT(Iterator, int, int, int, int, double, CostFunctionArrayProcessor, HistType, int)}.
	 * 
	 * @param levelEntries iterator containing {@link SpatialHistogramBucket} objects
	 * @param b minimal number of objects per histogram bucket
	 * @param B maximal number of objects per histogram bucket
	 * @param inputSize number of input objects
	 * @param numberOfBuckets target number of histogram buckets
	 * @param processor array processor; a different cost function can be provided
	 * @param type processing type of the dynamic programming scheme
	 * @return a list of {@link SpatialHistogramBucket}
	 */
	public static List<SpatialHistogramBucket> computeHistogramOPT(Iterator<SpatialHistogramBucket> levelEntries, 
			int b, int B, 
			int inputSize, 
			int numberOfBuckets, 
			CostFunctionArrayProcessor<DoublePointRectangle> processor, 
			HistType type){
		SpatialHistogramBucket[] processingList = toWeightedArray(levelEntries, inputSize);
		List<SpatialHistogramBucket> histogram = new ArrayList<SpatialHistogramBucket>();
		// compute distribution
		int[] distribution = null;
		processor.reset();
		switch(type){
			case GOPT : {
				Bucket[] buckets = GenericPartitioner.computeGOPT(processingList, b, B, processor);
				distribution = GenericPartitioner.getDistribution(buckets[processingList.length-1]);
			} break;
			case NOPT : {
				Bucket[][] buckets = GenericPartitioner.computeNOPT(processingList, numberOfBuckets, processor);
				distribution = GenericPartitioner.getDistribution(buckets[numberOfBuckets-1][inputSize-1]);
			} break;
			default : {
				Bucket[][] buckets = GenericPartitioner.computeOPTF(processingList, b, B, numberOfBuckets, processor);
				distribution = GenericPartitioner.getDistribution(buckets[numberOfBuckets-1][inputSize-1]);
			}
		}
		// update statistical information
		int k = 0;
		for(int i : distribution){
			SpatialHistogramBucket sumRec = null;
			int weight = 0;
			for(int j = 0; j < i; j++, k++){
				if (sumRec == null){
					sumRec = new SpatialHistogramBucket(processingList[k]);
				}
				else{
					sumRec.union(processingList[k]);
				}
				weight += processingList[k].getWeight();
				sumRec.updateAverage(processingList[k].avgExtent);
			}
			sumRec.setWeight(weight);
			histogram.add(sumRec);
		}
		processor.reset();
		return histogram;
	}
}
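/*
 * Minimal sketch of using the generic in-memory method with the NOPT processing
 * type (not part of the original file; this class and all parameter values are
 * hypothetical). With NOPT the number of buckets is fixed, and b and B are not
 * consulted by the dynamic programming call, so trivial bounds are passed.
 */
class RVHistogramNoptExample {
	static List<SpatialHistogramBucket> noptHistogram(Iterator<SpatialHistogramBucket> entries,
			int inputSize, int numberOfBuckets) {
		// cost of a candidate bucket = volume (area) of its MBR, as in RVHistogram
		UnaryFunction<DoublePointRectangle, Double> volume = new UnaryFunction<DoublePointRectangle, Double>() {
			@Override
			public Double invoke(DoublePointRectangle arg) {
				return new DoublePointRectangle(arg).area();
			}
		};
		CostFunctionArrayProcessor<DoublePointRectangle> processor = new DefaultArrayProcessor(volume);
		// trivial capacity bounds; NOPT partitions into exactly numberOfBuckets buckets
		return RVHistogram.computeHistogramOPT(entries, 1, inputSize, inputSize,
				numberOfBuckets, processor, RVHistogram.HistType.NOPT);
	}
}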