package brickhouse.analytics.uniques;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;
public class SketchSet implements ICountDistinct {
static final int SIZEOF_LONG = 64;
public static int DEFAULT_MAX_ITEMS = 5000;
private int maxItems = DEFAULT_MAX_ITEMS;
private TreeMap<Long, String> sortedMap;
private static HashFunction HASH = Hashing.md5();
public SketchSet() {
sortedMap = new TreeMap<Long, String>();
}
public SketchSet(int max) {
this.maxItems = max;
sortedMap = new TreeMap<Long, String>();
}
public void addHashItem(long hash, String str) {
if (sortedMap.size() < maxItems) {
sortedMap.put(hash, str);
} else {
Long hashLong = hash;
if (!sortedMap.containsKey(hashLong)) {
long maxHash = sortedMap.lastKey();
if (hash < maxHash) {
sortedMap.remove(maxHash);
sortedMap.put(hashLong, str);
}
}
}
}
/**
* for testing
*
* @param hash
*/
public void addHash(long hash) {
addHashItem(hash, Long.toString(hash));
}
public void addItem(String str) {
HashCode hc = HASH.hashUnencodedChars(str);
this.addHashItem(hc.asLong(), str);
}
public List<String> getMinHashItems() {
return new ArrayList(this.sortedMap.values());
}
public SortedMap<Long, String> getHashItemMap() {
return this.sortedMap;
}
public List<Long> getMinHashes() {
return new ArrayList(this.sortedMap.keySet());
}
public void clear() {
this.sortedMap.clear();
}
public int getMaxItems() {
return maxItems;
}
public long lastHash() {
return sortedMap.lastKey();
}
public String lastItem() {
return sortedMap.lastEntry().getValue();
}
public double estimateReach() {
if (sortedMap.size() < maxItems) {
return sortedMap.size();
}
long maxHash = sortedMap.lastKey();
return EstimatedReach(maxHash, maxItems);
}
static public double EstimatedReach(String lastItem, int maxItems) {
long maxHash = HASH.hashUnencodedChars(lastItem).asLong();
return EstimatedReach(maxHash, maxItems);
}
static public double EstimatedReach(long maxHash, int maxItems) {
BigDecimal maxHashShifted = new BigDecimal(BigInteger.valueOf(maxHash).add(BigInteger.valueOf(Long.MAX_VALUE)));
BigDecimal bigMaxItems = new BigDecimal(maxItems * 2).multiply(BigDecimal.valueOf(Long.MAX_VALUE));
BigDecimal ratio = bigMaxItems.divide(maxHashShifted, RoundingMode.HALF_EVEN);
return ratio.doubleValue();
}
public long calculateSimHash() {
int[] sumTable = new int[SIZEOF_LONG];
Iterator<Long> hashes = getHashItemMap().keySet().iterator();
while (hashes.hasNext()) {
long hash = hashes.next();
long mask = 1l;
for (int pos = 0; pos < SIZEOF_LONG; ++pos) {
if ((hash & mask) != 0l) {
sumTable[pos]++;
} else {
sumTable[pos]--;
}
mask <<= 1;
}
}
long simHash = 0l;
long mask = 1l;
for (int pos = 0; pos < SIZEOF_LONG; ++pos) {
if (sumTable[pos] > 0) {
simHash |= mask;
}
mask <<= 1;
}
return simHash;
}
public void combine(SketchSet other) {
for (Entry<Long, String> entry : other.sortedMap.entrySet()) {
addHashItem(entry.getKey(), entry.getValue());
}
}
}