/*
 * This file is part of Gradoop.
 *
 * Gradoop is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Gradoop is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Gradoop. If not, see <http://www.gnu.org/licenses/>.
 */

package org.gradoop.examples.sna;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.io.IOUtils;
import org.apache.flink.api.common.ProgramDescription;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.gradoop.examples.AbstractRunner;
import org.gradoop.examples.utils.ExampleOutput;
import org.gradoop.flink.algorithms.labelpropagation.GellyLabelPropagation;
import org.gradoop.flink.model.api.functions.TransformationFunction;
import org.gradoop.flink.model.impl.LogicalGraph;
import org.gradoop.flink.model.impl.operators.aggregation.ApplyAggregation;
import org.gradoop.flink.model.impl.operators.aggregation.functions.count.EdgeCount;
import org.gradoop.flink.model.impl.operators.aggregation.functions.count.VertexCount;
import org.gradoop.flink.model.impl.operators.combination.ReduceCombination;
import org.gradoop.flink.util.FlinkAsciiGraphLoader;
import org.gradoop.flink.util.GradoopFlinkConfig;

import java.nio.charset.StandardCharsets;

/**
 * The program executes the following workflow:
 *
 * 1) Extract subgraph with:
 *    - vertex predicate: must be of type 'Person'
 *    - edge predicate: must be of type 'knows'
 * 2) Transform vertices and edges to necessary information
 * 3) Compute communities using Gelly label propagation
 * 4) Compute vertex count per community
 * 5) Select communities with a vertex count greater than a given threshold
 * 6) Combine the remaining graphs to a single graph
 * 7) Group the graph using the vertex attributes 'city' and 'gender' and
 *    - count the number of vertices represented by each super vertex
 *    - count the number of edges represented by each super edge
 * 8) Aggregate the grouped graph:
 *    - add the total vertex count as new graph property
 *    - add the total edge count as new graph property
 *
 * The program can be either executed using external data (for benchmarking) or
 * demo data ({@link #main(String[])}).
 */
public class SNABenchmark2
  extends AbstractRunner
  implements ProgramDescription {

  /**
   * Runs the benchmark program.
   *
   * The program can be executed using either external data or demo data.
   *
   * If no arguments are given, the program is executed on a demo social network
   * which is described in 'resources/data/gdl/sna.gdl'.
   *
   * For using external data, the following arguments are mandatory:
   *
   * 1) (possibly HDFS) input directory that contains
   *    - nodes.json
   *    - edges.json
   *    - graphs.json
   *
   * 2) (possibly HDFS) output directory to write the resulting graph to
   *
   * 3) Threshold for community selection depending on the dataset size:
   *
   *    Scale - Threshold (recommended)
   *    1     -     1,000
   *    10    -     7,500
   *    100   -    50,000
   *    1K    -   350,000
   *    10K   - 2,450,000
   *
   * @param args args[0]: input dir, args[1]: output dir, args[2]: threshold
   * @throws Exception on failure of the Flink job or data source access
   */
  @SuppressWarnings({
    "unchecked",
    "Duplicates"
  })
  public static void main(String[] args) throws Exception {
    // any argument switches from the bundled demo data to external data
    boolean useExternalData = args.length > 0;

    if (useExternalData) {
      executeWithExternalData(args);
    } else {
      executeWithDemoData(GradoopFlinkConfig
        .createConfig(ExecutionEnvironment.getExecutionEnvironment()));
    }
  }

  /**
   * Runs the benchmark program with external data (e.g. from HDFS)
   *
   * @param args args[0]: input dir, args[1]: output dir, args[2]: threshold
   * @throws Exception on failure of the Flink job or data source access
   */
  @SuppressWarnings("unchecked")
  private static void executeWithExternalData(String[] args) throws Exception {
    Preconditions.checkArgument(
      args.length == 3, "input dir, output dir and threshold required");
    String inputDir  = args[0];
    String outputDir = args[1];
    int threshold    = Integer.parseInt(args[2]);

    LogicalGraph epgmDatabase = readLogicalGraph(inputDir);

    writeLogicalGraph(execute(epgmDatabase, threshold), outputDir);
    // trigger the lazily-built Flink dataflow
    getExecutionEnvironment().execute();
  }

  /**
   * Runs the benchmark with demo data.
   *
   * @param gradoopConf gradoop config
   * @throws Exception on failure of the Flink job or resource loading
   */
  private static void executeWithDemoData(GradoopFlinkConfig gradoopConf)
    throws Exception {
    ExampleOutput out = new ExampleOutput();

    FlinkAsciiGraphLoader loader = new FlinkAsciiGraphLoader(gradoopConf);

    // read the GDL resource explicitly as UTF-8; the single-argument
    // IOUtils.toString overload is deprecated and uses the platform charset
    String graphDefinition = IOUtils.toString(SNABenchmark2.class
      .getResourceAsStream("/data/gdl/sna.gdl"), StandardCharsets.UTF_8);

    loader.initDatabaseFromString(graphDefinition);

    LogicalGraph inputGraph = loader.getLogicalGraphByVariable("db");

    out.add("Input Graph", inputGraph);

    // demo data is tiny, so a community must only have more than 2 members
    LogicalGraph outputGraph = execute(inputGraph, 2);

    out.add("Output Graph", outputGraph);

    out.print();
  }

  /**
   * The actual computation.
   *
   * @param socialNetwork social network graph
   * @param threshold     used in community selection predicate
   * @return summarized, aggregated graph
   */
  private static LogicalGraph execute(
    LogicalGraph socialNetwork, final int threshold) {

    final int maxIterations = 4;

    final String vertexCount = "vertexCount";
    final String edgeCount   = "edgeCount";
    final String person      = "person";
    final String knows       = "knows";
    final String city        = "city";
    final String gender      = "gender";
    final String birthday    = "birthday";
    final String label       = "label";

    return socialNetwork
      // 1) extract subgraph
      // equalsIgnoreCase avoids locale-sensitive lowercasing (e.g. Turkish-i)
      // and the temporary string created by toLowerCase()
      .subgraph(
        vertex -> vertex.getLabel().equalsIgnoreCase(person),
        edge -> edge.getLabel().equalsIgnoreCase(knows))
      // project to necessary information
      .transform(
        // keep graph heads
        TransformationFunction.keep(),
        // keep necessary vertex properties; the birthday value seeds the
        // 'label' property consumed by label propagation below
        (current, transformed) -> {
          transformed.setLabel(current.getLabel());
          transformed.setProperty(city, current.getPropertyValue(city));
          transformed.setProperty(gender, current.getPropertyValue(gender));
          transformed.setProperty(label, current.getPropertyValue(birthday));
          return transformed;
        },
        // keep only edge label
        (current, transformed) -> {
          transformed.setLabel(current.getLabel());
          return transformed;
        })
      // 3a) compute communities
      .callForGraph(new GellyLabelPropagation(maxIterations, label))
      // 3b) separate communities
      .splitBy(label)
      // 4) compute vertex count per community
      .apply(new ApplyAggregation(new VertexCount()))
      // 5) select graphs with more than minClusterSize vertices
      .select(g -> g.getPropertyValue(vertexCount).getLong() > threshold)
      // 6) reduce filtered graphs to a single graph using combination
      .reduce(new ReduceCombination())
      // 7) group that graph by vertex properties
      .groupBy(Lists.newArrayList(city, gender))
      // 8a) count vertices of grouped graph
      .aggregate(new VertexCount())
      // 8b) count edges of grouped graph
      .aggregate(new EdgeCount());
  }

  @Override
  public String getDescription() {
    return SNABenchmark2.class.getName();
  }
}