/** * Copyright (c) 2014, the Temporal Random Indexing AUTHORS. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the University of Bari nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 * */
package di.uniba.it.tri.space;

import di.uniba.it.tri.space.clustering.Clusters;
import di.uniba.it.tri.shell.TriShell;
import di.uniba.it.tri.vectors.FileVectorReader;
import di.uniba.it.tri.vectors.MapVectorReader;
import di.uniba.it.tri.vectors.MemoryVectorReader;
import di.uniba.it.tri.vectors.ObjectVector;
import di.uniba.it.tri.vectors.RealVector;
import di.uniba.it.tri.vectors.ReverseObjectVectorComparator;
import di.uniba.it.tri.vectors.Vector;
import di.uniba.it.tri.vectors.VectorFactory;
import di.uniba.it.tri.vectors.VectorReader;
import di.uniba.it.tri.vectors.VectorStoreUtils;
import di.uniba.it.tri.vectors.VectorType;
import static di.uniba.it.tri.vectors.VectorUtils.getNearestVector;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Random;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Utils for managing WordSpaces
 *
 * @author pierpaolo
 */
public class TemporalSpaceUtils {

    /**
     * File name suffix of the files in which WordSpaces are stored.
     */
    private static final String VECTORS_SUFFIX = ".vectors";

    /**
     * Matches the first run of digits in a file name; compiled once instead
     * of on every call.
     */
    private static final Pattern YEAR_PATTERN = Pattern.compile("[0-9]+");

    /**
     * Maps a year (as it appears in the file name) to its vector file. It is
     * cleared and re-populated by every call to
     * {@link #getAvailableYears(File, int, int)} and read by
     * {@link #getVectorFile(File, String)}. This is unsynchronized static
     * state shared by all callers.
     */
    private static final Map<String, File> mapSpaceFile = new HashMap<>();

    /**
     * Combine two or more WordSpaces using vectors sum. Every resulting
     * vector is normalized.
     * <p>
     * The first vector found for a key is stored by reference (not copied)
     * and the following ones are superposed on it, so vector instances
     * belonging to the input spaces are modified by this method.
     *
     * @param spaces WordSpaces
     * @return The WordSpace as a Map that is the combination of given
     * WordSpaces
     */
    @SafeVarargs
    public static Map<String, Vector> combineSpaces(Map<String, Vector>... spaces) {
        Map<String, Vector> newSpace = new HashMap<>();
        for (Map<String, Vector> space : spaces) {
            for (Entry<String, Vector> entry : space.entrySet()) {
                accumulate(newSpace, entry.getKey(), entry.getValue());
            }
        }
        normalizeAll(newSpace);
        return newSpace;
    }

    /**
     * Combine two or more VectorReaders using vectors sum. Every resulting
     * vector is normalized.
     *
     * @param readers VectorReaders
     * @return The WordSpace as a Map that is the combination of given
     * VectorReaders
     * @throws IOException
     */
    public static Map<String, Vector> combineVectorReader(VectorReader... readers) throws IOException {
        Map<String, Vector> newSpace = sumReaders(false, readers);
        normalizeAll(newSpace);
        return newSpace;
    }

    /**
     * Combine two or more VectorReaders using vectors sum. Every resulting
     * vector is normalized. A dot is printed on standard output for each
     * processed reader.
     *
     * @param readers VectorReaders
     * @return The VectorReader that is the combination of given VectorReaders
     * @throws IOException
     */
    public static VectorReader combineAndBuildVectorReader(VectorReader... readers) throws IOException {
        Map<String, Vector> newSpace = sumReaders(true, readers);
        normalizeAll(newSpace);
        return new MapVectorReader(newSpace);
    }

    /**
     * Combine two or more VectorReaders using vectors sum and save the result
     * in a File. A dot is printed on standard output for each processed
     * reader.
     * <p>
     * The dimension written in the header is taken from the first combined
     * vector reporting a non-zero dimension (0 when there is none).
     *
     * @param outputFile The File
     * @param readers VectorReaders
     * @throws IOException
     */
    public static void combineAndSaveVectorReader(File outputFile, VectorReader... readers) throws IOException {
        Map<String, Vector> newSpace = sumReaders(true, readers);
        int dimension = 0;
        for (Vector v : newSpace.values()) {
            if (dimension == 0) {
                dimension = v.getDimension();
            }
            v.normalize();
        }
        // try-with-resources: the stream is closed even when a write fails.
        try (DataOutputStream outputStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(outputFile)))) {
            String header = VectorStoreUtils.createHeader(VectorType.REAL, dimension, -1);
            outputStream.writeUTF(header);
            for (Entry<String, Vector> entry : newSpace.entrySet()) {
                outputStream.writeUTF(entry.getKey());
                entry.getValue().writeToStream(outputStream);
            }
        }
    }

    /**
     * Sum, key by key, all the vectors supplied by the given readers.
     *
     * @param showProgress If true, an empty line is printed on standard
     * output before and after the scan and a dot is printed for each reader
     * @param readers VectorReaders
     * @return The (not normalized) sum of the vectors, indexed by key
     * @throws IOException
     */
    private static Map<String, Vector> sumReaders(boolean showProgress, VectorReader... readers) throws IOException {
        Map<String, Vector> newSpace = new HashMap<>();
        if (showProgress) {
            System.out.println();
        }
        for (VectorReader reader : readers) {
            if (showProgress) {
                System.out.print(".");
            }
            Iterator<ObjectVector> allVectors = reader.getAllVectors();
            while (allVectors.hasNext()) {
                ObjectVector ov = allVectors.next();
                accumulate(newSpace, ov.getKey(), ov.getVector());
            }
        }
        if (showProgress) {
            System.out.println();
        }
        return newSpace;
    }

    /**
     * Add a vector to the running sum stored under the given key; the first
     * vector seen for a key is stored as is (by reference).
     *
     * @param space The map of running sums
     * @param key The key
     * @param vector The vector to add
     */
    private static void accumulate(Map<String, Vector> space, String key, Vector vector) {
        Vector sum = space.get(key);
        if (sum != null) {
            sum.superpose(vector, 1, null);
        } else {
            space.put(key, vector);
        }
    }

    /**
     * Normalize every vector of a WordSpace.
     *
     * @param space The WordSpace
     */
    private static void normalizeAll(Map<String, Vector> space) {
        for (Vector v : space.values()) {
            v.normalize();
        }
    }

    /**
     * Return the list of the n nearest vectors given a word
     *
     * @param store The VectorReader that contains vectors
     * @param word The word
     * @param n The number of nearest vectors
     * @return The list of the n nearest vectors, empty if the store has no
     * vector for the word
     * @throws IOException
     */
    public static List<ObjectVector> getNearestVectors(VectorReader store, String word, int n) throws IOException {
        Vector vector = store.getVector(word);
        if (vector == null) {
            return new ArrayList<>();
        }
        return getNearestVectors(store, vector, n);
    }

    /**
     * Return the list of the n nearest vectors given a vector. Each returned
     * ObjectVector has its score set to the overlap with the given vector.
     * When the store contains n vectors or fewer, all of them are returned.
     *
     * @param store The VectorReader that contains vectors
     * @param vector The vector
     * @param n The number of nearest vectors
     * @return The list of the n nearest vectors, sorted with
     * ReverseObjectVectorComparator
     * @throws IOException
     */
    public static List<ObjectVector> getNearestVectors(VectorReader store, Vector vector, int n) throws IOException {
        PriorityQueue<ObjectVector> queue = new PriorityQueue<>();
        Iterator<ObjectVector> allVectors = store.getAllVectors();
        while (allVectors.hasNext()) {
            ObjectVector ov = allVectors.next();
            ov.setScore(ov.getVector().measureOverlap(vector));
            offerBounded(queue, ov, n);
        }
        return toSortedList(queue);
    }

    /**
     * Insert a candidate in the queue keeping at most n elements: when the
     * bound is exceeded the head of the queue is discarded.
     * <p>
     * NOTE(review): this assumes that the natural ordering of ObjectVector
     * puts the lowest score at the head of the queue - confirm against
     * ObjectVector.compareTo.
     *
     * @param queue The queue of the best candidates found so far
     * @param candidate The candidate (its score must be already set)
     * @param n The maximum number of elements to keep
     */
    private static void offerBounded(PriorityQueue<ObjectVector> queue, ObjectVector candidate, int n) {
        queue.offer(candidate);
        if (queue.size() > n) {
            queue.poll();
        }
    }

    /**
     * Copy the content of the queue in a list sorted with
     * ReverseObjectVectorComparator.
     *
     * @param queue The queue
     * @return The sorted list
     */
    private static List<ObjectVector> toSortedList(PriorityQueue<ObjectVector> queue) {
        List<ObjectVector> list = new ArrayList<>(queue);
        Collections.sort(list, new ReverseObjectVectorComparator());
        return list;
    }

    /**
     * Given a directory of stored vector file readers returns the list of files
     * belonging to a specified time period. Only files whose name ends with
     * ".vectors" are considered; the year is the text that follows the last
     * underscore of the name (suffix excluded).
     *
     * @param startDir The directory
     * @param start The begin of the time period
     * @param end The end of the time period
     * @return The list of files, empty if the directory cannot be listed
     * @throws NumberFormatException if the name of a ".vectors" file does not
     * end with an integer
     */
    public static List<File> getFileTemporalRange(File startDir, int start, int end) {
        List<File> list = new ArrayList<>();
        File[] listFiles = startDir.listFiles();
        if (listFiles == null) {
            // Not a directory or I/O error: nothing to list.
            return list;
        }
        for (File file : listFiles) {
            String fileName = file.getName();
            if (fileName.endsWith(VECTORS_SUFFIX)) {
                // Literal suffix removal: a regex replace of ".vectors" would
                // also match any character followed by "vectors".
                String name = fileName.substring(0, fileName.length() - VECTORS_SUFFIX.length());
                String[] split = name.split("_");
                int year = Integer.parseInt(split[split.length - 1]);
                if (year >= start && year <= end) {
                    list.add(file);
                }
            }
        }
        return list;
    }

    /**
     * Return a list of available years given the directory where file readers
     * are stored and the time period. The year of a ".vectors" file is the
     * first run of digits in its name.
     * <p>
     * As a side effect, the internal year-to-file map used by
     * {@link #getVectorFile(File, String)} is cleared and filled with the
     * files found.
     *
     * @param startDir The directory
     * @param start The begin of the time period
     * @param end The end of the time period
     * @return The list of available years, empty if the directory cannot be
     * listed
     */
    public static List<String> getAvailableYears(File startDir, int start, int end) {
        mapSpaceFile.clear();
        File[] listFiles = startDir.listFiles();
        if (listFiles != null) {
            for (File file : listFiles) {
                if (file.getName().endsWith(VECTORS_SUFFIX)) {
                    Matcher matcher = YEAR_PATTERN.matcher(file.getName());
                    if (matcher.find()) {
                        String stringYear = matcher.group();
                        int year = Integer.parseInt(stringYear);
                        if (year >= start && year <= end) {
                            mapSpaceFile.put(stringYear, file);
                        }
                    }
                }
            }
        }
        return new ArrayList<>(mapSpaceFile.keySet());
    }

    /**
     * Index the file of elemental vectors
     *
     * @param stardDir The directory containing the WordSpaces
     * @return The index of elemental vectors
     * @throws IOException
     */
    public static IndexReader indexElemental(File stardDir) throws IOException {
        return index(getElementalFile(stardDir));
    }

    /**
     * Index a VectorReader in order to search words. Each key of the reader
     * is stored in an in-memory index as a document with a single "word"
     * field.
     *
     * @param vreader The VectorReader
     * @return The index
     * @throws IOException
     */
    public static IndexReader index(VectorReader vreader) throws IOException {
        Iterator<String> keys = vreader.getKeys();
        RAMDirectory ramDir = new RAMDirectory();
        IndexWriterConfig iwconfig = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer(CharArraySet.EMPTY_SET));
        IndexWriter writer = new IndexWriter(ramDir, iwconfig);
        try {
            while (keys.hasNext()) {
                Document doc = new Document();
                doc.add(new StringField("word", keys.next(), Field.Store.YES));
                writer.addDocument(doc);
            }
        } finally {
            // The writer is released also when adding a document fails.
            writer.close();
        }
        return DirectoryReader.open(ramDir);
    }

    /**
     * Index a file in order to search words
     *
     * @param file The file
     * @return The index
     * @throws IOException
     */
    public static IndexReader index(File file) throws IOException {
        FileVectorReader vreader = new FileVectorReader(file);
        vreader.init();
        try {
            return index(vreader);
        } finally {
            // The reader is released also when indexing fails.
            vreader.close();
        }
    }

    /**
     * Get the elemental vectors file
     *
     * @param startDir The directory containing the WordSpaces
     * @return The file "vectors.elemental" under the absolute path of the
     * directory
     */
    public static File getElementalFile(File startDir) {
        return new File(startDir.getAbsolutePath() + "/vectors.elemental");
    }

    /**
     * Get the file in which vectors of a specified year are stored
     *
     * @param startDir The directory containing the WordSpaces
     * @param year The year
     * @return The file
     */
    public static File getVectorFile(File startDir, int year) {
        return getVectorFile(startDir, String.valueOf(year));
    }

    /**
     * Get the file in which vectors of a specified year are stored. The file
     * is looked up in the map filled by the last call to
     * {@link #getAvailableYears(File, int, int)}; the directory argument is
     * not used.
     *
     * @param startDir The directory containing the WordSpaces
     * @param year The year
     * @return The file, or null if the year is not in the map
     */
    public static File getVectorFile(File startDir, String year) {
        return mapSpaceFile.get(year);
    }

    /**
     * Get the VectorReader in which vectors of a specified year are stored.
     * The returned reader is already initialized.
     *
     * @param startDir The directory containing the WordSpaces
     * @param year The year
     * @param mem If true a MemoryVectorReader is built, otherwise a
     * FileVectorReader
     * @return The VectorReader
     * @throws java.io.IOException if no file is registered for the year or
     * the reader cannot be initialized
     */
    public static VectorReader getVectorReader(File startDir, String year, boolean mem) throws IOException {
        File file = getVectorFile(startDir, year);
        if (file == null) {
            // Fail with an explicit message rather than handing a null file
            // to the reader.
            throw new IOException("No vector file available for year " + year
                    + " (getAvailableYears must be called first)");
        }
        VectorReader vr = mem ? new MemoryVectorReader(file) : new FileVectorReader(file);
        vr.init();
        return vr;
    }

    /**
     * Get the VectorReader in which vectors of a specified year are stored
     *
     * @param startDir The directory containing the WordSpaces
     * @param year The year
     * @param mem If true a MemoryVectorReader is built, otherwise a
     * FileVectorReader
     * @return The VectorReader
     * @throws java.io.IOException
     */
    public static VectorReader getVectorReader(File startDir, int year, boolean mem) throws IOException {
        return getVectorReader(startDir, String.valueOf(year), mem);
    }

    /**
     * Return the less n similar vectors in two WordSpaces. For each key of
     * the first store that also occurs in the second one, the score is
     * 1 - overlap between the two vectors; only scores in [min, max] are
     * considered. When n candidates or fewer pass the filter, all of them
     * are returned. A dot is printed on the shell every 1000 scanned vectors.
     *
     * @param store1 The first Vector Reader
     * @param store2 The second Vector Reader
     * @param n The number of vectors
     * @param min min threshold
     * @param max max threshold
     * @return The list of less n similar vectors, sorted with
     * ReverseObjectVectorComparator
     * @throws IOException
     */
    public static List<ObjectVector> sims(VectorReader store1, VectorReader store2, int n, double min, double max) throws IOException {
        PriorityQueue<ObjectVector> queue = new PriorityQueue<>();
        Iterator<ObjectVector> allVectors = store1.getAllVectors();
        int c = 0;
        TriShell.println("");
        while (allVectors.hasNext()) {
            ObjectVector ov = allVectors.next();
            Vector vector = store2.getVector(ov.getKey());
            if (vector != null) {
                double distance = 1 - ov.getVector().measureOverlap(vector);
                if (distance >= min && distance <= max) {
                    ov.setScore(distance);
                    offerBounded(queue, ov, n);
                }
            }
            c++;
            if (c % 1000 == 0) {
                TriShell.print(".");
            }
        }
        TriShell.println("");
        return toSortedList(queue);
    }

    /**
     * Count the number of vectors in a VectorReader
     *
     * @param reader The VectorReader
     * @return The number of vectors
     * @throws IOException
     */
    public static int countVectors(VectorReader reader) throws IOException {
        Iterator<ObjectVector> allVectors = reader.getAllVectors();
        int counter = 0;
        while (allVectors.hasNext()) {
            allVectors.next();
            counter++;
        }
        return counter;
    }

    /**
     * Load stop words from a file (one stop word per line)
     *
     * @param filename The file name
     * @return The set of stop words
     * @throws IOException
     */
    public static Set<String> loadStopWord(String filename) throws IOException {
        return loadStopWord(new File(filename));
    }

    /**
     * Load stop words from a file (one stop word per line). Each line is
     * lower-cased and trimmed.
     *
     * @param file The file
     * @return The set of stop words
     * @throws IOException
     */
    public static Set<String> loadStopWord(File file) throws IOException {
        Set<String> set = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            // readLine() returning null is the end-of-file signal; ready()
            // only tells whether a read would block.
            while ((line = reader.readLine()) != null) {
                set.add(line.toLowerCase().trim());
            }
        }
        return set;
    }

    /**
     * Cluster the given vectors in k clusters: starting from a random
     * assignment, centroids (normalized sum of the members) are recomputed
     * and each vector is reassigned to the centroid chosen by
     * getNearestVector until no assignment changes.
     *
     * @param vr The VectorReader, used only to get the vector dimension
     * @param objectVectors The vectors to cluster
     * @param k The number of clusters
     * @return The clusters (cluster index of each vector and centroids)
     * @throws IOException
     * @throws IllegalArgumentException if k is zero and there is at least one
     * vector to cluster
     */
    public static Clusters kMeansCluster(VectorReader vr, List<ObjectVector> objectVectors, int k) throws IOException {
        Clusters clusters = new Clusters(new int[objectVectors.size()], new Vector[k]);
        int[] mappings = clusters.getClusterMappings();
        Vector[] centroids = clusters.getCentroids();
        Random rand = new Random();
        // Initialize cluster mappings randomly.
        for (int i = 0; i < objectVectors.size(); ++i) {
            mappings[i] = rand.nextInt(k);
        }
        // Loop that computes centroids and reassigns members.
        boolean changed = true;
        while (changed) {
            // Clear centroid register.
            for (int i = 0; i < centroids.length; ++i) {
                centroids[i] = VectorFactory.createZeroVector(VectorType.REAL, vr.getDimension());
            }
            // Generate new cluster centroids.
            for (int i = 0; i < objectVectors.size(); ++i) {
                centroids[mappings[i]].superpose(objectVectors.get(i).getVector(), 1, null);
            }
            for (int i = 0; i < k; ++i) {
                centroids[i].normalize();
            }
            // Map items to clusters.
            changed = false;
            for (int i = 0; i < objectVectors.size(); i++) {
                int nearest = getNearestVector(objectVectors.get(i).getVector(), centroids);
                if (nearest != mappings[i]) {
                    changed = true;
                    mappings[i] = nearest;
                }
            }
        }
        return clusters;
    }

    /**
     * Compute the mean of all the vectors of a VectorReader.
     *
     * @param vr The VectorReader
     * @return The mean vector; the zero vector if the reader has no vectors
     * @throws IOException
     */
    public static Vector computeMeanVector(VectorReader vr) throws IOException {
        Iterator<ObjectVector> allVectors = vr.getAllVectors();
        Vector mean = VectorFactory.createZeroVector(VectorType.REAL, vr.getDimension());
        // int counter: a float counter stops increasing at 2^24.
        int count = 0;
        while (allVectors.hasNext()) {
            mean.superpose(allVectors.next().getVector(), 1, null);
            count++;
        }
        if (count == 0) {
            // Avoid 0/0 (NaN coordinates) on an empty reader.
            return mean;
        }
        float[] mv = ((RealVector) mean).getCoordinates();
        for (int i = 0; i < mv.length; i++) {
            mv[i] /= count;
        }
        return new RealVector(mv);
    }
}