/**
 * Copyright (c) 2014, the LESK-WSD-DSM AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.wsd.dsm;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Reads word vectors from a binary file and keeps them in an in-memory map
 * keyed by word.
 *
 * @author pierpaolo
 */
public class DataVectorStore implements VectorStore {

    /** Word to vector map; created (and replaced) by {@link #init(File)}. */
    private Map<String, float[]> vectors;

    /** Number of float components stored for every vector. */
    private int dimension;

    /** Vector type string as read from the file header. */
    private String vectorType;

    private static final Logger logger = Logger.getLogger(DataVectorStore.class.getName());

    /**
     * Loads all the vectors stored in the given file, replacing any vectors
     * previously held by this store.
     * <p>
     * The file starts with a UTF string. When that string contains the
     * <code>-dim</code>, <code>-seed</code> and <code>-type</code> flags it is
     * parsed as a tab-separated list of key/value pairs; otherwise the vector
     * type (UTF string) and the dimension (int) are read from the stream right
     * after it. Each following record is a UTF key followed by
     * <code>dimension</code> floats.
     * <p>
     * As a side effect this method also sets <code>ObjectVector.vecLength</code>
     * to the dimension that was read.
     *
     * @param inputFile the binary file containing the vectors
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalArgumentException if the header contains the expected
     * flags but is not a list of key/value pairs
     */
    public void init(File inputFile) throws IOException {
        vectors = new HashMap<>();
        // try-with-resources guarantees that the stream is closed even when the
        // header is malformed or a read fails half way through the file.
        try (DataInputStream input = new DataInputStream(
                new BufferedInputStream(new FileInputStream(inputFile)))) {
            String header = input.readUTF();
            Properties props = readHeader(header);
            if (props == null) {
                // No flag-style header: type and dimension follow as raw fields.
                vectorType = input.readUTF();
                dimension = input.readInt();
            } else {
                dimension = Integer.parseInt(props.getProperty("-dim"));
                vectorType = props.getProperty("-type");
            }
            ObjectVector.vecLength = dimension;
            int c = 0;
            while (input.available() > 0) {
                String key = input.readUTF();
                float[] v = new float[dimension];
                for (int i = 0; i < dimension; i++) {
                    v[i] = input.readFloat();
                }
                vectors.put(key, v);
                c++;
                // Coarse progress indicator on standard output.
                if (c % 10000 == 0) {
                    System.out.print(c + " ");
                }
            }
        }
        logger.log(Level.INFO, "Loaded {0} vectors.", vectors.size());
    }

    /**
     * Returns the vector stored for the given key.
     *
     * @param key the word to look up
     * @return the stored vector (the internal array, not a copy), or
     * <code>null</code> when the key is unknown
     */
    public float[] getVector(String key) {
        return vectors.get(key);
    }

    /**
     * Scores every stored vector against the vector of the given word using
     * <code>VectorUtils.scalarProduct</code> and returns a selection of the
     * scored entries, sorted by the natural ordering of <code>SpaceResult</code>.
     * <p>
     * Candidates are kept in a priority queue holding at most <code>n</code>
     * entries: once the queue is full its head is evicted before every
     * insertion, and one more head is removed after the scan. Which entries
     * survive therefore depends on the natural ordering of
     * <code>SpaceResult</code>, which is defined outside this file.
     *
     * @param word the query word
     * @param n the maximum size of the candidate queue
     * @return the selected results, or an empty list when the word has no
     * vector
     */
    public List<SpaceResult> findSimilar(String word, int n) {
        float[] v1 = vectors.get(word);
        if (v1 == null) {
            // Log through this class's own logger (it used to borrow the logger
            // of an unrelated vector store implementation).
            logger.log(Level.WARNING, "No vector for term: {0}", word);
            return new ArrayList<>();
        }
        PriorityQueue<SpaceResult> queue = new PriorityQueue<>();
        for (Map.Entry<String, float[]> entry : vectors.entrySet()) {
            float score = VectorUtils.scalarProduct(v1, entry.getValue());
            if (queue.size() >= n) {
                // Queue is full: drop the current head to make room.
                queue.poll();
            }
            queue.offer(new SpaceResult(entry.getKey(), score));
        }
        // NOTE(review): the head is discarded unconditionally, so at most n - 1
        // results are returned - confirm this is intended.
        queue.poll();
        List<SpaceResult> list = new ArrayList<>(queue);
        Collections.sort(list);
        return list;
    }

    /**
     * Parses a flag-style header line made of tab-separated key/value pairs.
     *
     * @param line the first UTF string read from the vector file
     * @return the parsed properties, or <code>null</code> when the line does
     * not contain all of the <code>-dim</code>, <code>-seed</code> and
     * <code>-type</code> flags
     * @throws IllegalArgumentException if the line contains the flags but has
     * an odd number of tab-separated tokens
     */
    private Properties readHeader(String line) throws IllegalArgumentException {
        boolean flagHeader = line.contains("-dim") && line.contains("-seed") && line.contains("-type");
        if (!flagHeader) {
            return null;
        }
        String[] split = line.split("\t");
        if (split.length % 2 != 0) {
            throw new IllegalArgumentException("Not valid header: " + line);
        }
        Properties props = new Properties();
        for (int i = 0; i < split.length; i += 2) {
            props.setProperty(split[i], split[i + 1]);
        }
        return props;
    }
}