/* * File: InfluenceSpread.java * Authors: Tu-Thach Quach * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright 2016, Sandia Corporation. * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive * license for use of this work by or on behalf of the U.S. Government. * Export of this program may require a license from the United States * Government. See CopyrightHistory.txt for complete details. * */ package examples; import gov.sandia.cognition.annotation.PublicationReference; import gov.sandia.cognition.annotation.PublicationType; import gov.sandia.cognition.graph.DirectedNodeEdgeGraph; import gov.sandia.cognition.graph.DirectedWeightedNodeEdgeGraph; import gov.sandia.cognition.graph.WeightedDenseMemoryGraph; import gov.sandia.cognition.graph.inference.GraphWrappingEnergyFunction; import gov.sandia.cognition.graph.inference.SumProductDirectedPropagation; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; /** * Use this to calculate the influence spread of seed sets. See the main method * for all the parameters to pass into the program. * * @author Tu-Thach Quach * */ @PublicationReference(author = "Tu-Thach Quach and Jeremy D. Wendt", title = "A diffusion model for maximizing influence spread in large networks", type = PublicationType.Conference, publication = "Proceedings of the International Conference on Social Informatics", year = 2016) public class InfluenceSpread { public static final int MAX_NUM_ITERATIONS = 20; /** * Seed file is one line per seed set. Each line is * <p> * movie_name node_1 node_2 ... node_n * </p> * * @param filename * @return * @throws IOException */ public static ArrayList<ArrayList<String>> readSeeds(String filename) throws IOException { ArrayList<ArrayList<String>> seeds = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { String line = reader.readLine(); while (line != null) { String[] items = line.split("\\s+"); ArrayList<String> list = new ArrayList<>(); for (int i = 0; i < items.length; i++) { list.add(items[i].trim()); } seeds.add(list); line = reader.readLine(); } } return seeds; } /** * Graph file stores edges and potentials per line: * <p> * node_i node_j potential_ij potential_ji * </p> * * @param filename * @return * @throws IOException */ public static DirectedWeightedNodeEdgeGraph<String> createFromEdgeList( String filename) throws IOException { DirectedWeightedNodeEdgeGraph<String> graph = new WeightedDenseMemoryGraph<>(); try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { String line; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.isEmpty()) { continue; } else if (line.startsWith("#")) { continue; } String[] items = line.split("\\s+"); String nodei = items[0].trim(); String nodej = items[1].trim(); double pij = Double.parseDouble(items[2]); double pji = Double.parseDouble(items[3]); graph.addEdge(nodei, nodej, pij); graph.addEdge(nodej, nodei, pji); } } return graph; } /** * Unary file stores unary potentials per line: * <p> * node_i potential_i * </p> * * @param filename * @return * @throws IOException */ public static Map<String, Double> readUnaryPotentials(String filename) throws IOException { Map<String, Double> unaryPotentials = new HashMap<>(); try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { String line = reader.readLine(); while (line != null) { String[] items = line.split("\\s+"); String node = items[0].trim(); double p = Double.parseDouble(items[1]); unaryPotentials.put(node, p); line = reader.readLine(); } } return unaryPotentials; } private static double propagate( GraphWrappingEnergyFunction<Integer, String> f, SumProductDirectedPropagation<Integer> solver, ArrayList<String> seeds, int numCores) { // Important to clear all seeds and stored costs. f.clearLabels(); // Set seeds: the first element is the propagation name, not a seed // node. for (int i = 1; i < seeds.size(); i++) { String node = seeds.get(i); f.setLabel(node, 1); } boolean converged = solver.solve(); if (!converged) { System.out.println("Warning: seeds for propagation " + seeds.get(0) + " did not converge."); } double prop = 0; for (int i = 0; i < f.numNodes(); i++) { prop += solver.getBelief(i, 1); } return prop; } /** * Arguments for this example */ private static class Arguments { String inputFilename; String unaryPotentialsFilename; String seedsFilename; double minimumInfluenceProbability; int numCores; int batchSize; int sequence; int offset; String outputFilename; public Arguments() { // The graph filename. Each line is source, destination, influence_src_to_dst, influence_dst_to_src inputFilename = "example.txt"; // This file defines how likely each node is to adopt independent of incoming messages unaryPotentialsFilename = "unary.txt"; // This specifies which nodes to set as seeds. Each line is a whitespace-delimited list for a separate run seedsFilename = "seeds.txt"; // If any entry in unaryPotentialsFilename is less than this, it's increased to this minimumInfluenceProbability = 0.001; // The number of cores to run the parallel code on numCores = 4; // The number of rows of seeds to run this run batchSize = 3; // Which number to start on within the current batch offset = 0; // Which batch to run (for if there are lots of seed rows and you're running different seed tests on different machines) sequence = 0; // Where to write results to outputFilename = "results.txt"; } }; /** * The program takes the following parameters (we will use flixster as an * example): * <p> * 1: the graph file storing directed edges with edge potentials p_ij in the * paper. This file has the following format on each line: nodei nodej p_ij * p_ji * </p> * <p> * 2: the unary file storing the unary potential \rho_i in the paper. This * file has the following format on each line: nodei \rho_i * </p> * <p> * 3: the file storing the propagations to propagate. Each line as the * following format: movie_id node_1 node_2 node_3 ... movie_n etc. The * nodes correspond to the seed nodes that are first to review the movie * among their friends. The program will then calculate the spread of * influence starting with these seed nodes. * </p> * <p> * 4: the minimum edge weight p_ij just in case some edge potentials are too * small. This is usually not needed and can be set to 0. * </p> * <p> * 5: the number of compute cores to use to compute the spread. * </p> * <p> * 6: the batch size. The program can be ran in parallel across many * machines using sbatch or whatever your favorite parallel job management * utility. Each machine is assigned a batch number and a size. A typical * batch size is the number of lines in s divided by the number of machines. * For a small network that does not need to be ran in parallel, set bs to * the number of lines in s and off (see below) to 0. * </p> * <p> * 7: the offset for this batch. It determines which line in s to start * processing. For example, if bs = 10 and off = 100, then this batch will * start processing line 100 and do so for the next 10 lines. * </p> * <p> * 8: the output file to write out. The format of each line of the output * file will be: movie_id predicted_spread. * </p> * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { Arguments a = new Arguments(); File graphFile = new File(a.inputFilename); File unaryFile = new File(a.unaryPotentialsFilename); File seedFile = new File(a.seedsFilename); if (!graphFile.exists()) { System.out.println(graphFile.getPath() + " does not exist."); System.exit(0); } if (!unaryFile.exists()) { System.out.println(unaryFile.getPath() + " does not exist."); System.exit(0); } if (!seedFile.exists()) { System.out.println(seedFile.getPath() + " does not exist."); System.exit(0); } InfluencePotentialHandler.MINIMUM_EDGE_POTENTIAL = a.minimumInfluenceProbability; int start = a.offset + a.sequence * a.batchSize; System.out.println("Parameters:"); System.out.println("Graph file: " + graphFile.getPath()); System.out.println("Unary file: " + unaryFile.getPath()); System.out.println("Seed file: " + seedFile.getPath()); System.out.println("Minimum influence probability: " + InfluencePotentialHandler.MINIMUM_EDGE_POTENTIAL); System.out.println("Number of threads: " + a.numCores); System.out.println("Output file: " + a.outputFilename); System.out.println("Sequence ID: " + a.sequence); System.out.println("Batch size: " + a.batchSize); System.out.println("Offset: " + a.offset); System.out.println("Start: " + start); System.out.println("Reading unary potentials..."); Map<String, Double> unaryPotentials = readUnaryPotentials( unaryFile.getPath()); System.out.println("Number of unary potentials: " + unaryPotentials.size()); // We originally use Integer instead of String to store the nodes. I // think that is more efficient (memory and possibly speed), but using // String is more flexible. System.out.println("Reading graph..."); DirectedNodeEdgeGraph<String> graph = createFromEdgeList( graphFile.getPath()); graph.getEdgeEndpointIds(0); System.out.println("Number of nodes: " + graph.getNumNodes()); System.out.println("Number of edges: " + graph.getNumEdges()); GraphWrappingEnergyFunction<Integer, String> f = new GraphWrappingEnergyFunction<>(graph, new InfluencePotentialHandler(graph, unaryPotentials)); // Every node must have a unary potential. assert (unaryPotentials.size() == graph.getNumNodes()); System.out.println("Reading seeds..."); ArrayList<ArrayList<String>> allSeeds = readSeeds(seedFile.getPath()); System.out.println("Number of seeds: " + allSeeds.size()); BufferedWriter writer = new BufferedWriter(new FileWriter(new File( a.outputFilename))); SumProductDirectedPropagation<Integer> solver = new SumProductDirectedPropagation<>(MAX_NUM_ITERATIONS, 0.001, a.numCores); solver.init(f); try { for (int i = 0; i < a.batchSize; i++) { int index = start + i; if (index < allSeeds.size()) { ArrayList<String> propagationSeeds = allSeeds.get(index); // The first entry is the propagation name, e.g., // product/movie name. String propagationName = propagationSeeds.get(0); System.out.println("Propagating " + propagationName + " with " + (propagationSeeds.size() - 1) + " seeds"); double tic = System.currentTimeMillis(); double prop = propagate(f, solver, propagationSeeds, a.numCores); double toc = System.currentTimeMillis(); System.out.println("Time elapsed: " + (toc - tic) / 1000); System.out.println(propagationName + '\t' + String.valueOf( prop)); writer.write(propagationName + '\t' + String.valueOf(prop)); writer.newLine(); writer.flush(); } } } finally { writer.close(); } System.out.println("Done"); System.out.println("Memory usage: " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory())); } }