Cluster.java example

Explorer

Syntactic-master
- src
  - main
    - java
      - syntaxLearner
        Cluster.java
        ClusterContext.java
        Learner.java
        LearnerMain.java
        Recorder.java
        UI
        Console.java
        Report.java
        corpus
        Context.java
        Corpus.java
        Vocabulary.java
        VocabularyContext.java
        Word.java
        source
        CorpusSource.java
        PlainTextFile.java
        WikiDump.java

package syntaxLearner;

import java.util.Set;
import java.util.TreeSet;

import syntaxLearner.corpus.Vocabulary;
import syntaxLearner.corpus.Word;

/**
 * A group of vocabulary word indices together with aggregate counters and a
 * cached context distribution.
 *
 * <p>The cluster keeps {@link #wordCount} and {@link #totalSize} in step with
 * {@link #words} through {@link #add(int)}, {@link #add(Cluster)},
 * {@link #remove(int)} and {@link #reset()}. Code that mutates the public
 * fields directly bypasses that bookkeeping.
 */
public class Cluster {

	/** Cluster identifier: -1 for a "ground" cluster, otherwise the value handed out by the learner. */
	public final short ID;
	/** Sum of the frequencies of the member words (size of this portion of the corpus). */
	public int totalSize;
	/** Number of individual member words. */
	public int wordCount;
	/** Indices (as understood by the vocabulary) of the member words. */
	public Set<Integer> words;
	/** Last distribution computed by {@link #clusterDistribution()}; null until the first computation. */
	private double[][] distribution;
	private final Vocabulary vocab;
	private final Learner learner;
	private boolean isNew = false;

	/**
	 * Creates an empty, non-ground cluster whose ID is obtained from the learner.
	 *
	 * @param v vocabulary used to resolve word indices
	 * @param l learner that owns this cluster
	 */
	public Cluster(Vocabulary v, Learner l){
		this(v, l, false);
	}

	/**
	 * Creates an empty cluster.
	 *
	 * @param v vocabulary used to resolve word indices
	 * @param l learner that owns this cluster
	 * @param isGround when true the cluster gets the fixed ID -1 and no ID is
	 *                 requested from the learner
	 */
	public Cluster (Vocabulary v, Learner l, boolean isGround){
		this.vocab=v;
		this.learner=l;
		this.ID = isGround? -1 : learner.newClusterID();
		words = new TreeSet<Integer>();
		reset();
	}

	/**
	 * Calculates the distribution of the entire cluster, similar to
	 * calculating a centroid with k-means.
	 *
	 * <p>If the learner reports this cluster as up to date, the previously
	 * computed matrix is returned. Otherwise a new matrix is built in which
	 * every member word contributes its own per-context value scaled by
	 * {@code frequency / (totalSize * wordCount)}.
	 *
	 * @return a square matrix with {@code NUMBER_OF_CLUSTERS + 1} rows and
	 *         columns, indexed by {@code [type1 + 1][type2 + 1]} of each
	 *         cluster context. The internal array is returned, not a copy.
	 */
	public double[][] clusterDistribution(){
		if (learner.isClusterUpdated(this)) {
			return distribution;
		}

		/* Initialize */
		distribution = new double[learner.NUMBER_OF_CLUSTERS+1][learner.NUMBER_OF_CLUSTERS+1];
		Set<ClusterContext> clusterContexts = learner.getClusterContexts();

		// Multiply in double: the int product totalSize*wordCount overflows
		// once it exceeds Integer.MAX_VALUE, which corrupts every weight.
		final double normaliser = (double) totalSize * wordCount;

		/* Sum every context in every word */
		for (int i : words){
			Word w = vocab.getWord(i);
			double weight = w.frequency / normaliser;
			for (ClusterContext cc : clusterContexts){
				// The +1 shifts indices up by one; presumably this makes room
				// for the ground cluster's ID of -1 - confirm against ClusterContext.
				distribution[cc.type1 + 1][cc.type2 + 1]+= (w.clusterDistribution(cc)*weight);
			}
			// NOTE(review): registered once per word, so an empty cluster is
			// never marked as updated. Left where it was because the learner's
			// contract is not visible here - confirm whether it belongs after the loop.
			learner.registerClusterUpdate(this);
		}
		return distribution;
	}

	/**
	 * Merges another cluster into this one and sets parenthood of its words.
	 *
	 * <p>The two clusters are expected to be disjoint; overlapping words would
	 * be counted twice in {@link #wordCount} and {@link #totalSize}. The
	 * source cluster {@code c} is not modified.
	 *
	 * @param c cluster whose words are absorbed
	 */
	public void add(Cluster c){
		for (int i: c.words){
			vocab.getWord(i).setParent(this);
			learner.setParent(i,this.ID);
		}
		if (c == this) {
			// Merging a cluster into itself adds no words; skipping the
			// accounting keeps the counters from being doubled.
			return;
		}
		words.addAll(c.words);
		totalSize+=c.totalSize;
		wordCount+=c.wordCount;
	}

	/**
	 * Adds a single word, counts its values and sets parenthood.
	 *
	 * <p>The counters are only updated when the word was not already a member,
	 * so adding the same index twice does not inflate them.
	 *
	 * @param i index of the word in the vocabulary
	 */
	public void add(int i){
		boolean inserted = words.add(i);
		Word w = vocab.getWord(i);
		if (inserted) {
			wordCount++;
			totalSize+=w.frequency;
		}
		w.setParent(this);
	}

	/**
	 * Removes a word and subtracts it from the counters. Does nothing when the
	 * word is not a member. The word's parent reference is left untouched.
	 *
	 * @param i index of the word in the vocabulary
	 */
	public void remove(int i){
		// Set.remove reports whether the element was present, so a separate
		// contains() lookup is unnecessary.
		if (words.remove(Integer.valueOf(i))){
			wordCount--;
			totalSize-=vocab.getWord(i).frequency;
			assert (totalSize>=0 && wordCount>=0);
		}
	}

	/**
	 * Empties the word set and zeroes both counters.
	 * Done this way instead of destructing the object altogether in order to
	 * keep a constant number of clusters in the count. The ID, the cached
	 * distribution and the "new" flag are not touched.
	 */
	public void reset(){
		words.clear();
		wordCount = 0;
		totalSize=0;
	}

	/** @return this cluster's identifier (same value as {@link #ID}) */
	public short getID(){
		return ID;
	}

	/** @return the flag last stored with {@link #setNew(boolean)}; false initially */
	public boolean isNew(){
		return isNew;
	}

	/** @param b new value of the "new" flag */
	public void setNew(boolean b){
		isNew=b;
	}
}