/*
* This file is part of Caliph & Emir.
*
* Caliph & Emir is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Caliph & Emir is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Caliph & Emir; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright statement:
* --------------------
* (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
* http://www.juggle.at, http://caliph-emir.sourceforge.net
*/
package at.lux.retrieval.clustering.suffixtree;
import at.lux.retrieval.clustering.AbstractSuffixTree;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
/**
* @author Mathias Lux, mathias@juggle.at
* Date: 03.06.2004
* Time: 15:30:41
*/
public class StcNode {
private HashMap<String, StcNode> children = null;
private HashSet<StcDocument> documents = null;
// using as cache:
private HashSet<StcDocument> allChildrensDocuments = null;
public StcNode() {
children = new HashMap<String, StcNode>();
documents = new HashSet<StcDocument>();
allChildrensDocuments = new HashSet<StcDocument>();
}
public void add(List<String> phrase, StcDocument stcDocument, int suffix, String prefix, AbstractSuffixTree tree) {
// save state of being a base cluster:
boolean isBaseClusterBeforeInsertion = tree.getNodeToClusterIndex().containsKey(this);
if (isBaseClusterBeforeInsertion == true) {
// Cluster has already been BaseCluster, so delete it before changes occur!!
BaseCluster cluster = tree.getNodeToClusterIndex().remove(this); // remove from index and retrieve BaseCluster
assert cluster != null;
tree.getSimilarityMatrix().removeBaseCluster(cluster);
boolean clusterRemove = tree.getBaseClusters().remove(cluster);
assert clusterRemove; // remove from tree
tree.getTouchedClusters().remove(cluster);
}
// add new phrase:
if (phrase.size() > 0) {
String temp = phrase.get(0);
String newPrefix = new String(prefix);
if (newPrefix.length() > 0) {
newPrefix += " " + phrase.remove(0).trim();
} else {
newPrefix += phrase.remove(0).trim();
}
if (children.containsKey(temp)) {
// edge with string already exists -> add document
children.get(temp).add(phrase, stcDocument, suffix, newPrefix, tree);
} else {
// edge does not exist -> create -> add document
StcNode n = new StcNode();
n.add(phrase, stcDocument, suffix, newPrefix, tree);
children.put(temp, n);
}
// afterwards: update the childrensDocuments:
allChildrensDocuments.addAll(children.get(temp).getAllChildrensDocuments());
// allChildrensDocuments.addAll(children.get(temp).documents);
} else { // add new StcDocument ...
StcDocument doc = stcDocument;
documents.add(doc);
}
// check if being a base cluster has been changed:
if (isBaseClusterCandidate() == true && prefix.length() > 0) {
// the node has changed and became a base cluster.
// so we have to put it into the newly created base cluster treeset :)
BaseCluster cluster = new BaseCluster(getAllChildrensDocuments(), this, prefix, tree.getIndex(), tree.getStopwords());
if (cluster.getScore() > 0.0) {
boolean addSuccessful = tree.getBaseClusters().add(cluster);
assert addSuccessful;
BaseCluster previousValue = tree.getNodeToClusterIndex().put(this, cluster);
assert previousValue == null;
boolean addToTouchedClusters = tree.getTouchedClusters().add(cluster);
assert addToTouchedClusters;
// we have to add it to the similarity matrix, so the values can be recalculated:
tree.getSimilarityMatrix().addBaseCluster(cluster);
}
} else {
// this is only a consistency check though there shouldn't be changes from being
// a base cluster to being none.
BaseCluster wasCluster = tree.getNodeToClusterIndex().get(this);
assert wasCluster == null;
}
}
/**
* Returns all documents of itself and all descending nodes.
*
* @return
*/
private HashSet<StcDocument> getAllChildrensDocuments() {
// obviously we don't want this being null, cause this would kill performance :)
if (allChildrensDocuments == null) {
allChildrensDocuments = new HashSet<StcDocument>();
// allChildrensDocuments.addAll(documents);
Iterator<String> childrenIterator = children.keySet().iterator();
while (childrenIterator.hasNext()) {
String s = childrenIterator.next();
StcNode n = children.get(s);
allChildrensDocuments.addAll(n.getAllChildrensDocuments());
// allChildrensDocuments.addAll(n.documents);
}
}
HashSet<StcDocument> result = new HashSet<StcDocument>(allChildrensDocuments);
result.addAll(documents);
return result;
}
/**
* Checks wether a node can be a base cluster
*
* @return true if it is a base cluster, false if not.
*/
private boolean isBaseClusterCandidate() {
boolean result = false;
if (allChildrensDocuments.size() + documents.size() > 1 && !allChildrensDocuments.containsAll(documents) && (children.size() + documents.size() > 1)) {
result = true;
}
return result;
}
}