package org.streaminer.stream.quantile; import org.streaminer.util.distance.CosineDistance; import org.streaminer.util.distance.LinearDistance; import org.streaminer.util.distance.SquaredDistance; import java.io.Serializable; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.Random; /** * Source code: <https://bitbucket.org/cbockermann/stream-mining>. */ public class EnsembleQuantiles implements IQuantiles<Double> { private GKQuantiles activeBlock; private LinkedList<Double> sampleBlock; private int chunkSize; private int maxEnsembleSize; private double sampleRatio; private LinkedList<SingleModel> ensemble; private String updateMode; private String similarityMeasure; private double epsilon; /** * Use this String at {@link #setUpdateMode(String)} for always replacing the oldest model * with a new model. */ public static final String REPLACE_OLDEST_MODEL = "replaceOldest"; /** * Use this String at {@link #setUpdateMode(String)} for replacing the most dissimilar model * compared to the new model. Squared distance is used to determine similarity. */ public static final String REPLACE_MOST_DISSIMILAR_MODEL = "replaceMostDissimilar"; /** * Use this String at {@link #setUpdateMode(String)} for replacing a random model. */ public static final String REPLACE_RANDOM_MODEL = "replaceRandom"; /** * Use this String at {@link #setUpdateMode(String)} for using a sample of the stream to determine * which model of the ensemble will be replaced. The sample is used to create quantiles of parts of * the stream that are handled in different chunks. You can set the sample rate with {@link #setSampleRatio(double)}. * The similarity is computed by squared distance. */ public static final String REPLACE_SAMPLED_MOST_DISSIMILAR_MODEL = "replaceSampledMostDissimilar"; /** * use this String at {@link #setUpdateMode(String)} for merging the eldest and second eldest model * in order to decrease the ensemble size by one. */ public static final String MERGE_OLDEST_MODELS = "mergeOldest"; /** * use this String at {@link #setUpdateMode(String)} for merging the most similar models of the ensemble * instead of deleting one. Similarity is defined by squared distance. */ public static final String MERGE_MOST_SIMILAR_MODELS = "mergeMostSimilar"; /** * use this String at {@link #setUpdateMode(String)} for merging the most dissimilar models of the ensemble. * Similarity is defined by squared distance. */ public static final String MERGE_MOST_DISSIMILAR_MODELS = "mergeMostDissimilar"; /** * this strategy merges the new, full bucket into the next one that is, the one * which has not been merged for the longest time */ public static final String MERGE_ROUND_ROBIN = "mergeRoundRobin"; /* ----------------------------- * Similarity Measures * TODO write JavaDOC */ public static final String EUCLIDEAN_DISTANCE = "euclidean distance"; public static final String COSINE_DISTANCE = "cosine distance"; public static final String MANHATTAN_DISTANCE = "manhattan distance"; public EnsembleQuantiles(){ this( 0.01f ); } /** * @param epsilon <code>double</code> that represents the error bound. */ public EnsembleQuantiles(double epsilon) { if (epsilon <= 0 || epsilon >= 1) { throw new RuntimeException("An appropriate epsilon value must lay between 0 and 1."); } this.epsilon = epsilon; this.activeBlock = new GKQuantiles(epsilon); this.sampleBlock = new LinkedList<Double>(); this.ensemble = new LinkedList<SingleModel>(); setChunkSize(250000); setEnsembleSize(5); this.updateMode = REPLACE_MOST_DISSIMILAR_MODEL; this.sampleRatio = 0.5d; this.similarityMeasure = EUCLIDEAN_DISTANCE; } public void setEpsilon( Double epsilon ){ this.epsilon = epsilon; this.activeBlock = new GKQuantiles(epsilon); this.sampleBlock = new LinkedList<Double>(); this.ensemble = new LinkedList<SingleModel>(); } @Override public void offer(Double value) { activeBlock.offer(value); if (updateMode.equals(REPLACE_SAMPLED_MOST_DISSIMILAR_MODEL) && addToSampleBlock()) { sampleBlock.addLast(value); if (sampleBlock.size() > chunkSize){ sampleBlock.removeFirst(); } } if (activeBlock.getCount() == chunkSize){ updateEnsemble(); activeBlock = new GKQuantiles(epsilon); } } @Override public Double getQuantile(double q) throws QuantilesException { LinkedList<Double> summary = getSummary(); Double wantedRank = Math.floor(q * summary.size()); try { return summary.get(wantedRank.intValue()); } catch (Exception e) { return Double.NaN; } } /** * Resets the size of a single chunk. * @param chunkSize - an appropriate <code>int</code> value */ public void setChunkSize(Integer chunkSize) { this.chunkSize = chunkSize; } /** * Returns current chunk size. * @return the size of a single chunk */ public int getChunkSize() { return chunkSize; } /** * Resets the maximum number of chunks in the ensemble. Excessive chunks will be removed. * @param maxEnsembleSize - an <code>int</code> value */ public void setEnsembleSize(Integer maxEnsembleSize) { this.maxEnsembleSize = maxEnsembleSize; while (ensemble.size() > maxEnsembleSize){ ensemble.removeFirst(); } } /** * Returns the number of chunks that are managed in a ensemble * @return the maximum number of chunks stored in the ensemble */ public int getEnsembleSize() { return maxEnsembleSize; } /** * Returns the currently used update mode * @return a {@link String} that contains the classifier of the current update mode */ public String getUpdateMode() { return updateMode; } /** * Use this method to change the default kind of update (i.e. replacing the model with greatest squared distance to the new * model). There are seven different kinds of updating the ensemble (represented by <code>final static String</code>s): * <ul> * <li> {@link #REPLACE_MOST_DISSIMILAR_MODEL}: determining which model has furthest squared distance to the new one and replacing it (default) * <li> {@link #REPLACE_OLDEST_MODEL}: replacing the oldest value * <li> {@link #REPLACE_RANDOM_MODEL}: replacing a random model * <li> {@link #REPLACE_SAMPLED_MOST_DISSIMILAR_MODEL}: sampling the stream to get a reference model that contains elements that are handled in different chunks. again * the squared distance is used to determine which model will be replaced * <li> {@link #MERGE_OLDEST_MODELS}: merging the oldest model to decrease the number of models of the ensemble * <li> {@link #MERGE_MOST_SIMILAR_MODELS}: merging the most similar models in respect to squared distance * <li> {@link #MERGE_MOST_DISSIMILAR_MODELS}: merging the most dissimilar models in respect to squared distance * </ul> * @param updateMode Please use final static strings listed above to enable an update mode. */ public void setUpdateMode(String um ) { String updateMode = um; if (um.indexOf( "_" ) > 0 ) { String[] tok = um.split( "_" ); String mode = tok[0].toLowerCase(); for (int i = 1; i < tok.length; i++) { String cur = tok[i].toLowerCase(); mode = mode + cur.substring( 0, 1).toUpperCase() + cur.substring( 1 ); } updateMode = mode; } if (updateMode.equals(REPLACE_OLDEST_MODEL) || updateMode.equals(REPLACE_RANDOM_MODEL) || updateMode.equals(REPLACE_MOST_DISSIMILAR_MODEL) || updateMode.equals(REPLACE_SAMPLED_MOST_DISSIMILAR_MODEL) || updateMode.equals(MERGE_OLDEST_MODELS) || updateMode.equals(MERGE_MOST_SIMILAR_MODELS) || updateMode.equals(MERGE_MOST_DISSIMILAR_MODELS ) || updateMode.equals( MERGE_ROUND_ROBIN ) ) { this.updateMode = updateMode; } //TODO Runtime Exception? else { //System.out.println("Wrong parameter value '" + updateMode + "'! Haven't change the upate mode."); } } /** * Returns current sample rate * @return <code>double</code> value specifying the current sample rate. */ public double getSampleRatio() { return sampleRatio; } /** * Specify a {@link Double} value to set the sampling rate. Please note that a sample rate of 0 will * result in no sampling, so each time the reference vector will be the new model. If you set the sample * rate to a value greater or equal to 1 the elements will rather be saved than sampled. * @param sampleRatio */ public void setSampleRatio(Double sampleRatio) { if (sampleRatio < 0) { sampleRatio = 0.0d; } if (sampleRatio > 1) { sampleRatio = 1.0d; } this.sampleRatio = sampleRatio; } public void setSimilarityMeasure (String similarityMeasure){ if (similarityMeasure.equals(COSINE_DISTANCE) || similarityMeasure.equals(EUCLIDEAN_DISTANCE) || similarityMeasure.equals(MANHATTAN_DISTANCE)){ this.similarityMeasure = similarityMeasure; } //TODO Runtime Exception? } public String getSimilarityMeasure(){ return similarityMeasure; } /** * Every time a chunk gets full, the ensemble must be updated. If the ensemble doesn't consist of * <code>maxEnsembleSize</code> chunks, the newest chunk will be added to the ensemble without removing * any, of course.<br> * For more details on update methods see {@link #setUpdateMode(String)}. */ private void updateEnsemble() { SingleModel newModel = this.getNewModel(); if (this.ensemble.size() < this.maxEnsembleSize) { this.ensemble.addLast(newModel); } else { if( this.updateMode.equals( MERGE_ROUND_ROBIN ) ){ this.mergeRoundRobinModels( newModel ); return; } if (this.updateMode.equals(REPLACE_OLDEST_MODEL)){ this.replaceOldestModel(newModel); } if (this.updateMode.equals(REPLACE_RANDOM_MODEL)){ this.replaceRandomModel(newModel); } if (this.updateMode.equals(REPLACE_MOST_DISSIMILAR_MODEL)){ this.replaceMostDissimilarModel(newModel); } if (this.updateMode.equals(REPLACE_SAMPLED_MOST_DISSIMILAR_MODEL)){ this.replaceSampledMostDissimilarModel(newModel); } if (this.updateMode.equals(MERGE_OLDEST_MODELS)){ this.mergeOldestModels(newModel); } if (this.updateMode.equals(MERGE_MOST_SIMILAR_MODELS)){ this.mergeMostSimilarModels(newModel); } if (this.updateMode.equals(MERGE_MOST_DISSIMILAR_MODELS)){ this.mergeMostDissimilarModels(newModel); } } } /** * Updating by replacing the oldest model * @param newModel */ private void replaceOldestModel(SingleModel newModel){ this.ensemble.removeFirst(); this.ensemble.addLast(newModel); } /** * Updating by replacing a random model * @param newModel */ private void replaceRandomModel(SingleModel newModel){ Random random = new Random(); this.ensemble.remove( random.nextInt(this.ensemble.size()) ); this.ensemble.addLast(newModel); } /** * Updating by replacing the most dissimilar model * @param newModel */ private void replaceMostDissimilarModel(SingleModel newModel){ SingleModel worstModel = this.getModelWithLowestSimilarityTo(newModel); this.ensemble.remove(worstModel); this.ensemble.addLast(newModel); } /** * Updating by replacing the most similar model in respect to a sample of the stream * @param newModel */ private void replaceSampledMostDissimilarModel(SingleModel newModel){ LinkedList<Double> quantiles = new LinkedList<Double>(); Double phi = epsilon; while (phi < 1) { Double nextQuantile = phi * this.sampleBlock.size(); try { quantiles.add(this.sampleBlock.get(nextQuantile.intValue())); } catch (IndexOutOfBoundsException e) { phi = 1.0d; quantiles = newModel.getQuantiles(); } phi += epsilon; } SingleModel sample = new SingleModel(quantiles); SingleModel worstModel = this.getModelWithLowestSimilarityTo(sample); this.ensemble.remove(worstModel); this.ensemble.addLast(newModel); } /** * Updating by merging the eldest and second eldest model * @param newModel */ private void mergeOldestModels(SingleModel newModel){ SingleModel mergedModel = this.mergeModels(this.ensemble.get(0), this.ensemble.get(1)); this.ensemble.removeFirst(); this.ensemble.removeFirst(); this.ensemble.addFirst(mergedModel); this.ensemble.addLast(newModel); } /** * Updating by merging the eldest and second eldest model * @param newModel */ private void mergeRoundRobinModels(SingleModel newModel){ SingleModel mergedModel = this.mergeModels(this.ensemble.get(0), this.ensemble.get(1)); this.ensemble.removeFirst(); this.ensemble.removeFirst(); this.ensemble.addLast(mergedModel); this.ensemble.addLast(newModel); } /** * Updating by merging the most similar models * @param newModel */ private void mergeMostSimilarModels(SingleModel newModel){ LinkedList<LinkedList<Double>> allModels = new LinkedList<LinkedList<Double>>(); for (int i = 0; i < this.ensemble.size(); i++){ allModels.add(this.ensemble.get(i).getQuantiles()); } LinkedList<LinkedList<Double>> mergePair = (LinkedList<LinkedList<Double>>) SquaredDistance.getPairWithSmallestDistance(allModels); if (this.similarityMeasure.equals(COSINE_DISTANCE)){ mergePair = (LinkedList<LinkedList<Double>>) CosineDistance.getPairWithSmallestDistance(allModels); } if (this.similarityMeasure.equals(MANHATTAN_DISTANCE)){ mergePair = (LinkedList<LinkedList<Double>>) LinearDistance.getPairWithSmallestDistance(allModels); } SingleModel mergedOne = new SingleModel(mergePair.getFirst()); SingleModel mergedTwo = new SingleModel(mergePair.getLast()); SingleModel mergedModel = this.mergeModels(mergedOne, mergedTwo); //int removed = 0; Iterator<SingleModel> it = ensemble.iterator(); while (it.hasNext()) { SingleModel cur = it.next(); if( cur.equals( mergedOne ) || cur.equals( mergedTwo ) ){ it.remove(); //removed++; } } /* for (int i = 0; i < this.ensemble.size(); i++){ if (this.ensemble.get(i).equals(mergedOne) || this.ensemble.get(i).equals(mergedTwo)){ //this.ensemble.remove(i); removed++; } } */ this.ensemble.add(mergedModel); this.ensemble.add(newModel); } /** * Updating by merging the most dissimilar models * @param newModel */ private void mergeMostDissimilarModels(SingleModel newModel){ LinkedList<LinkedList<Double>> allModels = new LinkedList<LinkedList<Double>>(); for (int i = 0; i < this.ensemble.size(); i++){ allModels.add(this.ensemble.get(i).getQuantiles()); } LinkedList<LinkedList<Double>> mergePair = (LinkedList<LinkedList<Double>>) SquaredDistance.getPairWithFurthestDistance(allModels); if (this.similarityMeasure.equals(COSINE_DISTANCE)) { mergePair = (LinkedList<LinkedList<Double>>) CosineDistance.getPairWithFurthestDistance(allModels); } if (this.similarityMeasure.equals(MANHATTAN_DISTANCE)) { mergePair = (LinkedList<LinkedList<Double>>) LinearDistance.getPairWithFurthestDistance(allModels); } SingleModel mergedOne = new SingleModel(mergePair.getFirst()); SingleModel mergedTwo = new SingleModel(mergePair.getLast()); SingleModel mergedModel = this.mergeModels(mergedOne, mergedTwo); for (int i = 0; i < this.ensemble.size(); i++) { if (this.ensemble.get(i).equals(mergedOne) || this.ensemble.get(i).equals(mergedTwo)){ this.ensemble.remove(i); } } this.ensemble.add(mergedModel); this.ensemble.add(newModel); } /** * The maximum error parameter for a chunk is bounded by epsilon. I.e. a estimated rank must not differ more than * epsilon * (number of elements in the chunk) ranks from the exact rank. We achieve this goal by constructing a * summary that contains all equidistant quantiles, starting with epsilon, 2*epsilon, 3*epsilon, and so on. * @return a {@link SingleModel} containing the equidistant quantiles from epsilon to 1 */ private SingleModel getNewModel(){ LinkedList<Double> quantiles = new LinkedList<Double>(); Double phi = epsilon; while (phi < 1) { quantiles.add(this.activeBlock.getQuantile(phi)); phi += epsilon; } return (new SingleModel(quantiles)); } /** * Computes the similarity of all chunks in the ensemble compared to the <code>comparator</code>. * @param comparator * @return {@link SingleModel} containing the worst chunk in the ensemble in respect to squared * distance to <code>comparator</code> */ private SingleModel getModelWithLowestSimilarityTo(SingleModel comparator){ LinkedList<LinkedList<Double>> oldQuantiles = new LinkedList<LinkedList<Double>>(); for (int i = 0; i < this.ensemble.size(); i++) { oldQuantiles.add(ensemble.get(i).getQuantiles()); } LinkedList<Double> toBeRemovedQuantiles = SquaredDistance.getFarestVector(oldQuantiles, comparator.getQuantiles()); if (this.similarityMeasure.equals(COSINE_DISTANCE)) { toBeRemovedQuantiles = CosineDistance.getFarestVector(oldQuantiles, comparator.getQuantiles()); } if (this.similarityMeasure.equals(MANHATTAN_DISTANCE)) { toBeRemovedQuantiles = LinearDistance.getFarestVector(oldQuantiles, comparator.getQuantiles()); } SingleModel toBeRemoved = new SingleModel(toBeRemovedQuantiles); for (int i = 0; i < this.ensemble.size(); i++) { if (ensemble.get(i).equals(toBeRemoved)){ return ensemble.get(i); } } return toBeRemoved; } /** * If sampling is enabled (i.e. an update mode using sampling is active) this method determines * whether a new stream element gets added to the sample block or not. * @return <code>true</code> if a new element should be part of the sample block or else <code>false</code> */ private boolean addToSampleBlock(){ Random random = new Random(); if (random.nextDouble() <= this.sampleRatio) { return true; } return false; } /** * Merges two given models to a single model. In detail {@link Vector#mean(java.util.List, java.util.List)} * is used. * @param first - {@link SingleModel} containing a vector. * @param second - {@link SingleModel} containing a vector. * @return {@link SingleModel} that containing the resulting vector. */ private SingleModel mergeModels(SingleModel first, SingleModel second){ LinkedList<Double> fst = first.getQuantiles(); LinkedList<Double> snd = second.getQuantiles(); LinkedList<Double> merged = new LinkedList<Double>(); for(int i = 0; i < fst.size() && i < snd.size(); i++) merged.add(0.5 * (fst.get(i) + snd.get(i))); return new SingleModel(merged); } public String toString(){ StringBuffer s = new StringBuffer(); s.append( getClass().getCanonicalName() ); s.append( " {" ); s.append( " updateMode=" + this.getUpdateMode() ); s.append( ", epsilon=" + epsilon ); s.append( ", chunkSize=" + this.getChunkSize() ); s.append( ", ensembleSize=" + this.getEnsembleSize() ); s.append( " }" ); return s.toString(); } /** * Constructs a summary taking into account the ensemble and the current active block. * @return a {@link List} of {@link Double} containing a sorted list of quantiles. */ private LinkedList<Double> getSummary(){ LinkedList<Double> summary = new LinkedList<Double>(); for (int i = 0; i < this.ensemble.size(); i++) { summary.addAll(this.ensemble.get(i).getQuantiles()); } Double phi = epsilon; // Due to avoid Double.NaN and to many absolute min/max values in the summary if (this.activeBlock.getCount() > 1 / epsilon) { while (phi < 1){ summary.add(this.activeBlock.getQuantile(phi)); phi += epsilon; } } while (summary.contains(null)) { summary.remove(null); } Collections.sort(summary); return summary; } /** * Wrapper to avoid ugly <code>LinkedList<LinkedList<Double>></code> constructs. */ private class SingleModel implements Serializable { private static final long serialVersionUID = -8462870855147396071L; private LinkedList<Double> quantiles; public SingleModel (LinkedList<Double> quantiles) { this.quantiles = quantiles; } public LinkedList<Double> getQuantiles() { return this.quantiles; } public boolean equals(SingleModel model) { for (int i = 0; i < this.quantiles.size(); i++) { if (!(this.quantiles.get(i).equals(model.getQuantiles().get(i)))) { return false; } } return true; } } }