package project.utils.collocation.impl; import java.awt.Point; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import project.client.persistence.Message; import project.client.persistence.Word; import project.utils.collocation.CollocationExtractor; import project.utils.collocation.Matrix; import project.utils.collocation.WordStatistics; public class CollocationImplVer1 implements CollocationExtractor { private static final int MAX_DISTANCE = 10; private int distance = MAX_DISTANCE; private static Map<String,Integer> wordIndices = new HashMap<String, Integer> (); private static int crtIdx = -1; public CollocationImplVer1 () {} public CollocationImplVer1 (int distance) { this.distance = distance; } public Matrix<WordStatistics> calculateCollocationMatrix(Message message, int maxDistance) { this.distance = maxDistance; return calculateCollocationMatrix(message); } public Matrix<WordStatistics> calculateCollocationMatrix(Message message) { /** * Algorithm is as follows * * 1. Create the matrix. This can become quite memory consuming, * since it's stored as N(words) ^ 2 * 4 bytes (float) * * 2. Loop through all words, from 1 to N. * - for each one, loop from distance/2 (behing) to distance/2 (ahead) * - for the given pair, calculate mean and variance :) - 2 matrices * * 3. Return the matrix */ //System.out.println ("Analyzing " + sentence.getWords().size() + " words"); //System.out.println (words); String[] words = message.getFormattedContent().split(";"); //Iterator<Word> i = sentence.getWords().iterator(); //while (i.hasNext()) { for (String w : words) { //Word w = i.next(); if (!wordIndices.containsKey(w)) { crtIdx ++; Integer wIdx = new Integer (crtIdx); wordIndices.put(w, wIdx); } } //System.out.println ("Matrix has " + counts.size() + " x " + counts.size() + " elements"); // Allocate the matrix Matrix<WordStatistics> m = new Matrix<WordStatistics> (); //Word[] vecWords = sentence.getWords().toArray(new Word [sentence.getWords().size()]); //Loop through all words for (int j = 0; j < words.length; j++) { String w1 = words [j]; Integer p1 = wordIndices.get(w1); for (int k = j + 1; k < (j + 1 + distance) ; k++) { if (k >= words.length) break; String w2 = words [k]; Integer p2 = wordIndices.get(w2); float dist = k - j; WordStatistics stats = m.getValue(p1, p2); if (stats == null) { stats = new WordStatistics (); stats.setW1(w1); stats.setW2(w2); } stats.addOffset((float) dist); stats.updateStats(); m.setValue(p1, p2, stats); } } /* Iterator<Point> j = m.getValues().keySet().iterator(); while (j.hasNext()) { Point p = j.next(); WordStatistics stats = m.getValues().get(p); stats.updateStats(); } */ //System.out.println ("Matrix has " + m.getSize() + " non-null elements (out of " + (counts.size() * counts.size()) + ")"); return m; } public float determineCollocation(Matrix<WordStatistics> collocation, Message message, Word w1, Word w2) throws NumberFormatException { return 0; } }