package doser.word2vec.semanticCategories; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.Map; import org.apache.commons.math.stat.descriptive.SummaryStatistics; import word2vec.tools.Word2VecModel; public class ComputeSimilarities { public ComputeSimilarities() { super(); } public void analzye(Word2VecModel m) { File file = new File("/home/zwicklbauer/samplingoutput.dat"); BufferedReader reader = null; HashMap<Integer, LinkedList<EntityPair>> vals = new HashMap<Integer, LinkedList<EntityPair>>(); try { reader = new BufferedReader(new FileReader(file)); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\\t"); Integer val = Integer.valueOf(splitter[0]); if (vals.containsKey(val)) { LinkedList<EntityPair> sims = vals.get(val); double sim = m.computeSimilarity(splitter[1], splitter[2]); EntityPair pair = new EntityPair(splitter[1], splitter[2], splitter[3], splitter[4]); if (sim > -2) { pair.setSim(sim); sims.add(pair); } } else { LinkedList<EntityPair> sims = new LinkedList<EntityPair>(); double sim = m.computeSimilarity(splitter[1], splitter[2]); EntityPair pair = new EntityPair(splitter[1], splitter[2], splitter[3], splitter[4]); if (sim > -2) { pair.setSim(sim); sims.add(pair); vals.put(val, sims); } } } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } File f = new File("/home/zwicklbauer/sampling/distances"); PrintWriter writer = null; try { writer = new PrintWriter(f); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } int maxLength = 0; for (Map.Entry<Integer, LinkedList<EntityPair>> entry : vals.entrySet()) { LinkedList<EntityPair> value = entry.getValue(); if(maxLength < value.size()) { maxLength = value.size(); } } for (int i = 0; i < maxLength; i++) { StringBuilder builder = new StringBuilder(); for (int j = 0; j < vals.size(); j++) { LinkedList<EntityPair> list = vals.get(j); if(list.size() > i) { builder.append(list.get(i).getSim()+"\t"); } else { builder.append(" \t"); } } writer.println(builder.toString()); } // for (Map.Entry<Integer, HashSet<EntityPair>> entry : vals.entrySet()) { // SummaryStatistics stats = new SummaryStatistics(); // Integer key = entry.getKey(); // File f = new File("/home/zwicklbauer/sampling/distance" + key); // PrintWriter writer = null; // try { // writer = new PrintWriter(f); // HashSet<EntityPair> value = entry.getValue(); // for (EntityPair pair : value) { // writer.println(key + "\t" + pair.getSim() + "\t" // + pair.getCategory1() + "\t" + pair.getCategory2()); // stats.addValue(pair.getSim()); // } // System.out.println("DISTANCE: " + key + " AVG: " // + stats.getMean() + " StandardDeviation: " // + stats.getStandardDeviation() + " Variance: " // + stats.getVariance() + " Min: " + stats.getMin() // + " Max: " + stats.getMax()); // } catch (FileNotFoundException e) { // e.printStackTrace(); // } finally { // if (writer != null) { // writer.close(); // } // } // } } public static void main(String[] args) { ComputeSimilarities sims = new ComputeSimilarities(); Word2VecModel model = Word2VecModel .createWord2VecModel("/mnt/ssd1/disambiguation/word2vec/wikientitymodel_min5.seq"); sims.analzye(model); } }