package syntaxLearner.corpus;

import java.util.Map;
import java.util.TreeMap;

import syntaxLearner.*;

/**
 * A word type observed in the corpus: its raw context counts in vocabulary
 * space, plus lazily-computed smoothed counts in cluster-context space.
 *
 * <p>Ordering ({@link #compareTo}) is by frequency, with ties broken by name.
 * Not thread-safe.
 */
public class Word implements Comparable<Word> {

    /* Identifiers */
    public final String name;
    public final Vocabulary vocab;
    public final int ID;

    /* Data structures */
    // # of contexts in V*xV* space; NEVER updates once counted.
    public Map<VocabularyContext, Integer> vocabContexts;
    // Counts folded into (K+1)x(K+1) cluster-context space; rebuilt each turn.
    private int[][] clusterContexts;

    /* Empirical tools */
    public int frequency;
    // The number to be added to zero-count cells when smoothing.
    private double smoothingFactor = 0;
    // The coefficient multiplying non-zero cells when smoothing.
    private double smoothingCoefficient = 1;

    public Word(String name, Vocabulary v) {
        this.name = name;
        this.vocab = v;
        ID = vocab.newID();
        vocabContexts = new TreeMap<VocabularyContext, Integer>();
        frequency = 1;
    }

    /** Records one more occurrence of this word in context {@code c}. */
    public void addContext(VocabularyContext c) {
        // merge() replaces the original containsKey/get/put sequence in one call.
        vocabContexts.merge(c, 1, Integer::sum);
    }

    /**
     * @return The smoothed distribution of the word in (K+1)^2 space.
     */
    public double clusterDistribution(ClusterContext c) {
        calculateSmoothingFactor();
        int value = clusterContexts[c.type1 + 1][c.type2 + 1];
        return (value != 0)
                ? value * smoothingCoefficient / frequency
                : smoothingFactor;
    }

    /**
     * Lazily recomputes the smoothing parameters from the current cluster
     * assignment, then registers the word as updated so the work is done at
     * most once per turn.
     *
     * <p>BUGFIX: the original registered the update only in the "no zeros"
     * branch, so any word that actually needed smoothing was recounted in
     * full on every {@link #clusterDistribution} call.
     */
    private void calculateSmoothingFactor() {
        if (vocab.isWordUpdated(ID)) {
            return;
        }
        clusterWords();

        // Count non-zero cells; track the minimum count and its multiplicity.
        int nonZeros = 0;
        int min = 0;       // 0 doubles as "not yet initialized" (counts are >= 1)
        int minValues = 0; // number of cells holding the minimum
        for (int[] row : clusterContexts) {
            for (int count : row) {
                if (count != 0) {
                    nonZeros++;
                    if (min == 0 || min > count) {
                        min = count;
                        minValues = 1;
                    } else if (count == min) {
                        minValues++;
                    }
                }
            }
        }

        // zeros = (K+1)^2 - occupied contexts
        int zeros = vocab.getCorpus().getLearner().getClusterContexts().size() - nonZeros;
        if (zeros != 0) {
            // Shave mass off observed cells and spread it over the empty ones.
            smoothingCoefficient = 1 - (minValues * 1.0 / frequency);
            smoothingFactor = minValues * 1.0 / (frequency * zeros);
        } else {
            // If there are no zeros, don't smooth.
            smoothingFactor = 0;
            smoothingCoefficient = 1;
        }
        // Register in the vocabulary as calculated — unconditionally (moved
        // out of the else branch; see BUGFIX note above).
        vocab.registerWordUpdate(ID);
    }

    /**
     * To be updated at every turn: folds the immutable vocabulary-context
     * counts into the current (K+1)x(K+1) cluster-context grid, using each
     * context word's present cluster assignment.
     */
    private void clusterWords() {
        if (!vocab.isWordUpdated(ID)) {
            Learner learner = vocab.getCorpus().getLearner();
            int numOfClusters = learner.getNumOfClusters();
            clusterContexts = new int[numOfClusters + 1][numOfClusters + 1];
            for (Map.Entry<VocabularyContext, Integer> e : vocabContexts.entrySet()) {
                clusterContexts[learner.getParent(e.getKey().type1) + 1]
                               [learner.getParent(e.getKey().type2) + 1] += e.getValue();
            }
        }
    }

    /** Assigns this word to cluster {@code c}; parenthood lives in the Learner. */
    public void setParent(Cluster c) {
        vocab.getCorpus().getLearner().setParent(this.ID, c.getID());
    }

    /** @return the Cluster this word is currently assigned to. */
    public Cluster getParent() {
        short parentID = vocab.getCorpus().getLearner().getParent(this.ID);
        return vocab.getCorpus().getLearner().getCluster(parentID);
    }

    /** @return the ID of the cluster this word is currently assigned to. */
    public int getParentID() {
        return vocab.getCorpus().getLearner().getParent(this.ID);
    }

    /** Adds {@code i} occurrences to this word's token frequency. */
    public void increase(int i) {
        frequency += i;
    }

    /**
     * Orders by frequency, breaking ties by name.
     *
     * <p>BUGFIX: the original returned {@code -Math.abs(name.compareTo(...))}
     * on frequency ties, which is negative for BOTH orderings of two distinct
     * names and therefore violates the antisymmetry required by the
     * {@link Comparable} contract. Note: still not consistent with equals
     * (identity), since two distinct Words with equal name and frequency
     * compare as 0.
     */
    @Override
    public int compareTo(Word w) {
        if (this.frequency != w.frequency) {
            return Integer.compare(this.frequency, w.frequency);
        }
        return this.name.compareTo(w.name);
    }
}