package tr.gov.ulakbim.jDenetX.clusterers.clustree;
//import cluster.Cluster;
//import cluster.Clustering;
//import cluster.StreamClusterer;
import tr.gov.ulakbim.jDenetX.cluster.Clustering;
import tr.gov.ulakbim.jDenetX.clusterers.AbstractClusterer;
import tr.gov.ulakbim.jDenetX.clusterers.clustree.util.Budget;
import tr.gov.ulakbim.jDenetX.clusterers.clustree.util.SimpleBudget;
import tr.gov.ulakbim.jDenetX.core.Measurement;
import tr.gov.ulakbim.jDenetX.options.IntOption;
import weka.core.Instance;
import java.util.LinkedList;
/**
* A representation of a tree.
*
* @author sanchez
*/
public class ClusTree extends AbstractClusterer {
public IntOption timeWindowOption = new IntOption("timeWindow",
't', "Rang of the window.", 1000);
public IntOption maxHeightOption = new IntOption(
"maxHeight", 'h',
"The maximal height of the tree", 8);
private static int INSERTIONS_BETWEEN_CLEANUPS = 10000;
/**
* The root node of the tree.
*/
private Node root;
// Information about the data represented in this tree.
/**
* Dimensionality of the data points managed by this tree.
*/
private int numberDimensions;
/**
* Parameter for the weighting function use to weight the entries.
*/
private double negLambda;
/**
* The current height of the tree. Should always be smaller than maxHeight.
*/
private int height;
/**
* The maximal height of the tree.
*/
private int maxHeight;
/**
* This variable is used to keep the inverse height that is stored in every
* node correct.
*/
private int numRootSplits;
/**
* The thresholf for the weighting of an Entry. An Entry is irrelevant, if
* it is in a leaf and the weightedN of the data Cluster is smaller than
* this threshold.
*
* @see Entry#data
*/
private double weightThreshold = 0.01;
/**
* Number of points inserted into the tree.
*/
private int numberInsertions;
private long timestamp;
@Override
public void resetLearningImpl() {
negLambda = (1.0 / (double) timeWindowOption.getValue())
* (Math.log(weightThreshold) / Math.log(2));
maxHeight = maxHeightOption.getValue();
numberDimensions = -1;
root = null;
timestamp = 0;
height = 0;
numRootSplits = 0;
numberInsertions = 0;
}
@Override
protected Measurement[] getModelMeasurementsImpl() {
return null;
}
public boolean isRandomizable() {
return false;
}
@Override
public void getModelDescription(StringBuilder out, int indent) {
}
public double[] getVotesForInstance(Instance inst) {
return null;
}
@Override
public boolean implementsMicroClusterer() {
return true;
}
@Override
public void trainOnInstanceImpl(Instance instance) {
timestamp++;
//TODO check if instance contains label
if (root == null) {
numberDimensions = instance.numAttributes();
root = new Node(numberDimensions, 0);
} else {
if (numberDimensions != instance.numAttributes())
System.out.println("Wrong dimensionality, expected:" + numberDimensions + "found:" + instance.numAttributes());
}
ClusKernel newPointAsKernel = new ClusKernel(instance.toDoubleArray(), numberDimensions);
insert(newPointAsKernel, new SimpleBudget(1000), timestamp);
}
/**
* Insert a new point in the <code>Tree</code>. The point should be
* represented as a cluster with a single data point(i.e. N = 1). A
* <code>Budget</code> class is also given, which is informed of the number
* of operation the tree does, and informs the tree when it does not have
* time left and should stop the insertion.
*
* @param newPoint The point to be inserted.
* @param budget The budget and statistics recollector for the insertion.
* @param timestamp The moment at which this point is inserted.
* @see Kernel
* @see Budget
*/
public void insert(ClusKernel newPoint, Budget budget, long timestamp) {
Entry rootEntry = new Entry(this.numberDimensions,
root, timestamp);
ClusKernel carriedBuffer = new ClusKernel(this.numberDimensions);
Entry toInsertHere = insert(newPoint, carriedBuffer, root, rootEntry,
budget, timestamp);
if (toInsertHere != null) {
this.numRootSplits++;
this.height += this.height < this.maxHeight ? 1 : 0;
Node newRoot = new Node(this.numberDimensions,
toInsertHere.getChild().getRawLevel() + 1);
newRoot.addEntry(rootEntry, timestamp);
newRoot.addEntry(toInsertHere, timestamp);
this.root = newRoot;
}
this.numberInsertions++;
if (this.numberInsertions % INSERTIONS_BETWEEN_CLEANUPS == 0) {
cleanUp(this.root, 0);
}
}
// TODO: Expand all function that work on entries to work with the Budget.
private Entry insert(ClusKernel pointToInsert, ClusKernel carriedBuffer,
Node currentNode, Entry parentEntry, Budget budget, long timestamp) {
assert (currentNode != null);
assert (currentNode.isLeaf()
|| currentNode.getEntries()[0].getChild() != null);
currentNode.makeOlder(timestamp, this.negLambda);
// This variable will be changed from to null to an actual reference
// in the following if-else block if we have to insert something here,
// either because this is a leaf, or because of split propagation.
Entry toInsertHere = null;
if (currentNode.isLeaf()) {
// At the end of the function the entry will be inserted.
toInsertHere = new Entry(this.numberDimensions,
pointToInsert, timestamp);
} else {
Entry bestEntry = currentNode.nearestEntry(pointToInsert);
bestEntry.aggregateCluster(pointToInsert, timestamp,
this.negLambda);
boolean isCarriedBufferEmpty = carriedBuffer.isEmpty();
Entry bestBufferEntry = null;
if (!isCarriedBufferEmpty) {
bestBufferEntry = currentNode.nearestEntry(carriedBuffer);
bestBufferEntry.aggregateCluster(carriedBuffer, timestamp,
this.negLambda);
}
if (!budget.hasMoreTime()) {
bestEntry.aggregateToBuffer(pointToInsert, timestamp,
this.negLambda);
if (!isCarriedBufferEmpty) {
bestBufferEntry.aggregateToBuffer(carriedBuffer,
timestamp, this.negLambda);
}
return null;
}
// If the way of the buffer differs from the way of the point to
// be inserted, leave the buffer here.
if (!isCarriedBufferEmpty && (bestEntry != bestBufferEntry)) {
bestBufferEntry.aggregateToBuffer(carriedBuffer, timestamp,
this.negLambda);
carriedBuffer.clear();
}
// Take the buffer of the best entry for the point to be inserted
// along.
ClusKernel takeAlongBuffer = bestEntry.emptyBuffer(timestamp,
this.negLambda);
carriedBuffer.add(takeAlongBuffer);
// Recursive call.
toInsertHere = insert(pointToInsert, carriedBuffer,
bestEntry.getChild(), bestEntry, budget, timestamp);
}
// If the above block has a new Entry for this place insert it.
if (toInsertHere != null) {
return this.insertHere(toInsertHere, currentNode, parentEntry,
carriedBuffer, budget, timestamp);
}
// If nothing else needs to be done in all the above levels
// return null to signalize it.
return null;
}
// XXX: Document the insertion when the final implementation is done.
private Entry insertHere(Entry newEntry, Node currentNode,
Entry parentEntry, ClusKernel carriedBuffer, Budget budget,
long timestamp) {
int numFreeEntries = currentNode.numFreeEntries();
// Insert the buffer that we carry.
if (!carriedBuffer.isEmpty()) {
Entry bufferEntry = new Entry(this.numberDimensions,
carriedBuffer, timestamp);
if (numFreeEntries <= 1) {
// Distance from buffer to entries.
Entry nearestEntryToCarriedBuffer =
currentNode.nearestEntry(newEntry);
double distanceNearestEntryToBuffer =
nearestEntryToCarriedBuffer.calcDistance(newEntry);
// Distance between buffer and point to insert.
double distanceBufferNewEntry =
newEntry.calcDistance(carriedBuffer);
// Best distance between Entrys in the Node.
BestMergeInNode bestMergeInNode =
calculateBestMergeInNode(currentNode);
// See what the minimal distance is and do the correspoding
// action.
if (distanceNearestEntryToBuffer <= distanceBufferNewEntry
&& distanceNearestEntryToBuffer <= bestMergeInNode.distance) {
// Aggregate buffer entry to nearest entry in node.
nearestEntryToCarriedBuffer.aggregateEntry(bufferEntry,
timestamp, this.negLambda);
} else if (distanceBufferNewEntry <= distanceNearestEntryToBuffer
&& distanceBufferNewEntry <= bestMergeInNode.distance) {
newEntry.mergeWith(bufferEntry);
} else {
currentNode.mergeEntries(bestMergeInNode.entryPos1,
bestMergeInNode.entryPos2);
currentNode.addEntry(bufferEntry, timestamp);
}
} else {
assert (currentNode.isLeaf());
currentNode.addEntry(bufferEntry, timestamp);
}
}
// Normally the insertion of the carries buffer does not change the
// number of free entries, but in case of future changes we calculate
// the number again.
numFreeEntries = currentNode.numFreeEntries();
// Search for an Entry with a weight under the threshold.
Entry irrelevantEntry = currentNode.getIrrelevantEntry(this.weightThreshold);
if (currentNode.isLeaf() && irrelevantEntry != null) {
irrelevantEntry.overwriteOldEntry(newEntry);
} else if (numFreeEntries >= 1) {
currentNode.addEntry(newEntry, timestamp);
} else {
if (currentNode.isLeaf() && (this.hasMaximalSize()
|| !budget.hasMoreTime())) {
mergeEntryWithoutSplit(currentNode, newEntry,
timestamp);
} else {
// We have to split.
return split(newEntry, currentNode, parentEntry, timestamp);
}
}
return null;
}
/**
* Inserts an <code>Entry</code> into a <code>Node</code> without inducing
* a split.
*
* @param node The node at which the entry is to be inserted.
* @param newEntry The entry to be inserted.
* @param timestamp The moment at which this occurs.
*/
private void mergeEntryWithoutSplit(Node node,
Entry newEntry, long timestamp) {
Entry nearestEntryToCarriedBuffer =
node.nearestEntry(newEntry);
double distanceNearestEntryToBuffer =
nearestEntryToCarriedBuffer.calcDistance(newEntry);
BestMergeInNode bestMergeInNode =
calculateBestMergeInNode(node);
if (distanceNearestEntryToBuffer < bestMergeInNode.distance) {
nearestEntryToCarriedBuffer.aggregateEntry(newEntry, timestamp,
this.negLambda);
} else {
node.mergeEntries(bestMergeInNode.entryPos1,
bestMergeInNode.entryPos2);
node.addEntry(newEntry, timestamp);
}
}
/**
* Calculates the best merge possible between two nodes in a node. This
* means that the pair with the smallest distance is found.
*
* @param node The node in which these two entries have to be found.
* @return An object which encodes the two position of the entries with the
* smallest distance in the node and the distance between them.
* @see BestMergeInNode
* @see Entry#calcDistance(tree.Entry)
*/
private BestMergeInNode calculateBestMergeInNode(Node node) {
assert (node.numFreeEntries() == 0);
Entry[] entries = node.getEntries();
int toMerge1 = -1;
int toMerge2 = -1;
double distanceBetweenMergeEntries = Double.NaN;
double minDistance = Double.MAX_VALUE;
for (int i = 0; i < entries.length; i++) {
Entry e1 = entries[i];
for (int j = i + 1; j < entries.length; j++) {
Entry e2 = entries[j];
double distance = e1.calcDistance(e2);
if (distance < minDistance) {
toMerge1 = i;
toMerge2 = j;
distanceBetweenMergeEntries = distance;
}
}
}
assert (toMerge1 != -1 && toMerge2 != -1);
if (Double.isNaN(distanceBetweenMergeEntries)) {
throw new RuntimeException("The minimal distance between two "
+ "Entrys in a Node was Double.MAX_VAUE. That can hardly "
+ "be right.");
}
return new BestMergeInNode(toMerge1, toMerge2,
distanceBetweenMergeEntries);
}
private boolean hasMaximalSize() {
// TODO: Improve hasMaximalSize(). For now it just works somehow for testing.
return this.height == this.maxHeight;
}
/**
* Performs a (2,2) split on the given node with the given entry. This
* implementation only works if the nodes have three entries each. The split
* will generate two new nodes. One of them will be put where the old node
* was, and for the other a new <code>Entry</code> will be generated and
* returned.
*
* @param newEntry The entry to be added to the node.
* @param node The node that is going to be splitted.
* @param parentEntry The entry in the tree that points at the node that
* is going to be splitted.
* @param timestamp The moment at which this split occurs.
* @return An entry which points at the second node created in the split.
* This entry has to be introduced later in the tree.
*/
private Entry split(Entry newEntry, Node node, Entry parentEntry,
long timestamp) {
// The implemented split function only works in trees where node
// have three entries.
// Splitting only makes sense on full nodes.
assert (node.numFreeEntries() == 0);
assert (parentEntry.getChild() == node);
// All the entries we have to separate in two nodes.
Entry[] allEntries = new Entry[4];
Entry[] nodeEntries = node.getEntries();
for (int i = 0; i < nodeEntries.length; i++) {
allEntries[i] = new Entry(nodeEntries[i]);
}
allEntries[3] = newEntry;
// Clear the given node, since we are going to refill it later.
node = new Node(this.numberDimensions, node.getRawLevel());
// Calculate the distance of all the possible pairings, since we want
// to do a (2,2) split.
double select01 = allEntries[0].calcDistance(allEntries[1])
+ allEntries[2].calcDistance(allEntries[3]);
double select02 = allEntries[0].calcDistance(allEntries[2])
+ allEntries[1].calcDistance(allEntries[3]);
double select03 = allEntries[0].calcDistance(allEntries[3])
+ allEntries[1].calcDistance(allEntries[2]);
// See which of the pairings is minimal and distribute the entries
// accordingly.
Node residualNode = new Node(this.numberDimensions,
node.getRawLevel());
if (select01 < select02) {
if (select01 < select03) {//select01 smallest
node.addEntry(allEntries[0], timestamp);
node.addEntry(allEntries[1], timestamp);
residualNode.addEntry(allEntries[2], timestamp);
residualNode.addEntry(allEntries[3], timestamp);
} else {//select03 smallest
node.addEntry(allEntries[0], timestamp);
node.addEntry(allEntries[3], timestamp);
residualNode.addEntry(allEntries[1], timestamp);
residualNode.addEntry(allEntries[2], timestamp);
}
} else {
if (select02 < select03) {//select02 smallest
node.addEntry(allEntries[0], timestamp);
node.addEntry(allEntries[2], timestamp);
residualNode.addEntry(allEntries[1], timestamp);
residualNode.addEntry(allEntries[3], timestamp);
} else {//select03 smallest
node.addEntry(allEntries[0], timestamp);
node.addEntry(allEntries[3], timestamp);
residualNode.addEntry(allEntries[1], timestamp);
residualNode.addEntry(allEntries[2], timestamp);
}
}
// Set the other node into the tree.
parentEntry.setChild(node);
parentEntry.recalculateData();
// Generate a new entry for the residual node.
Entry residualEntry = new Entry(this.numberDimensions,
residualNode, timestamp);
return residualEntry;
}
/**
* Return the number of time the tree has grown in size. If the tree grows
* and is then cutted from a certain depth, it also counts.
*
* @return The number of times the root node was splitted.
*/
public int getNumRootSplits() {
return numRootSplits;
}
/**
* Return the current height of the tree. This should never be greater than
* <code>maxHeight</code>.
*
* @return The height of the tree.
* @see #maxHeight
*/
public int getHeight() {
assert (height <= maxHeight);
return height;
}
private void cleanUp(Node currentNode, int level) {
if (currentNode == null) {
return;
}
Entry[] entries = currentNode.getEntries();
if (level == this.maxHeight) {
for (int i = 0; i < entries.length; i++) {
Entry e = entries[i];
e.setChild(null);
}
} else {
for (int i = 0; i < entries.length; i++) {
Entry e = entries[i];
cleanUp(e.getChild(), level + 1);
}
}
}
/**
* @param currentTime The current time
* @return The kernels at the leaf level as a clustering
*/
@Override
public Clustering getMicroClusteringResult() {
return getClustering(timestamp, -1);
}
@Override
public Clustering getClusteringResult() {
return null;
}
/**
* @param currentTime The current time
* @return The kernels at the given level as a clustering.
*/
public Clustering getClustering(long currentTime, int targetLevel) {
if (root == null) {
return null;
}
Clustering clusters = new Clustering();
LinkedList<Node> queue = new LinkedList<Node>();
queue.add(root);
while (!queue.isEmpty()) {
Node current = queue.remove();
int currentLevel = current.getLevel(this);
boolean isLeaf = (current.isLeaf() && currentLevel <= maxHeight)
|| currentLevel == maxHeight;
if (currentLevel == targetLevel
|| (targetLevel == -1 && isLeaf)) {
assert (currentLevel <= maxHeight);
Entry[] entries = current.getEntries();
for (int i = 0; i < entries.length; i++) {
Entry entry = entries[i];
if (entry == null || entry.isEmpty()) {
continue;
}
long diff = currentTime - entry.getTimestamp();
ClusKernel gaussKernel = new ClusKernel(entry.getData());
if (diff > 0) {
gaussKernel.makeOlder(diff, negLambda);
}
clusters.add(gaussKernel);
}
} else if (!current.isLeaf()) {
Entry[] entries = current.getEntries();
for (int i = 0; i < entries.length; i++) {
Entry entry = entries[i];
if (entry.isEmpty()) {
continue;
}
if (entry.isIrrelevant(weightThreshold)) {
continue;
}
queue.add(entry.getChild());
}
}
}
return clusters;
}
/**************************************************************************
* LOCAL CLASSES
**************************************************************************/
/**
* A class to code the return value of searching the smallest merge in a
* node.
*/
class BestMergeInNode {
/**
* The position of the first entry in the array of the node.
*/
public int entryPos1;
/**
* The position of the second entry in the array of the node.
*/
public int entryPos2;
/**
* The distance between the two entries.
*/
public double distance;
/**
* The constructor of this return value. It will automatically make
* sure that the first position is the smaller one of the two.
*
* @param pos1 One of the position.
* @param pos2 One of the position.
* @param distance The distance between the entries at these positions.
*/
public BestMergeInNode(int pos1, int pos2,
double distance) {
assert (pos1 != pos2);
this.distance = distance;
if (pos1 < pos2) {
this.entryPos1 = pos1;
this.entryPos2 = pos2;
} else {
this.entryPos1 = pos2;
this.entryPos2 = pos1;
}
}
}
}