GroupingBenchmark.java example

Explorer
gradoop-master
/*
 * This file is part of Gradoop.
 *
 * Gradoop is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Gradoop is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Gradoop. If not, see <http://www.gnu.org/licenses/>.
 */

package org.gradoop.benchmark.grouping;

import com.google.common.collect.Lists;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.ProgramDescription;
import org.gradoop.examples.AbstractRunner;
import org.gradoop.flink.model.impl.LogicalGraph;
import org.gradoop.flink.model.impl.operators.grouping.Grouping;
import org.gradoop.flink.model.impl.operators.grouping.GroupingStrategy;
import org.gradoop.flink.model.impl.operators.grouping.functions.aggregation.CountAggregator;
import org.gradoop.flink.model.impl.operators.grouping.functions.aggregation.MaxAggregator;
import org.gradoop.flink.model.impl.operators.grouping.functions.aggregation.MinAggregator;
import org.gradoop.flink.model.impl.operators.grouping.functions.aggregation.PropertyValueAggregator;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
 * A dedicated program for parametrized graph grouping benchmark.
 */
public class GroupingBenchmark extends AbstractRunner
  implements ProgramDescription {

  /**
   * Short name of the option holding the path of the input graph
   */
  private static final String OPTION_INPUT_PATH = "i";
  /**
   * Short name of the option holding the path the grouped graph is written to
   */
  private static final String OPTION_OUTPUT_PATH = "o";
  /**
   * Short name of the option selecting the grouping strategy
   */
  private static final String OPTION_GROUPING_STRATEGY = "s";
  /**
   * Short name of the option holding the comma separated vertex grouping keys
   */
  private static final String OPTION_VERTEX_GROUPING_KEY = "vgk";
  /**
   * Short name of the option holding the comma separated edge grouping keys
   */
  private static final String OPTION_EDGE_GROUPING_KEY = "egk";
  /**
   * Short name of the flag that enables grouping on vertex labels
   */
  private static final String OPTION_USE_VERTEX_LABELS = "uvl";
  /**
   * Short name of the flag that enables grouping on edge labels
   */
  private static final String OPTION_USE_EDGE_LABELS = "uel";
  /**
   * Short name of the option holding the path of the CSV log file
   */
  private static final String OPTION_CSV_PATH = "csv";
  /**
   * Short name of the option listing the vertex aggregator functions
   * (min, max, count, none)
   */
  private static final String OPTION_VERTEX_AGGREGATION_FUNCS = "vagg";
  /**
   * Short name of the option listing the property keys the vertex
   * aggregators read
   */
  private static final String OPTION_VERTEX_AGGREGATION_KEYS = "vak";
  /**
   * Short name of the option listing the property keys the vertex
   * aggregation results are stored under
   */
  private static final String OPTION_VERTEX_AGGREGATION_RESULT_KEYS = "vark";
  /**
   * Short name of the option listing the edge aggregator functions
   * (min, max, count, none)
   */
  private static final String OPTION_EDGE_AGGREGATION_FUNCS = "eagg";
  /**
   * Short name of the option listing the property keys the edge aggregators
   * read
   */
  private static final String OPTION_EDGE_AGGREGATION_KEYS = "eak";
  /**
   * Short name of the option listing the property keys the edge aggregation
   * results are stored under
   */
  private static final String OPTION_EDGE_AGGREGATION_RESULT_KEYS = "eark";
  /**
   * Grouping strategy, GROUP_REDUCE unless the command line selects another
   */
  private static GroupingStrategy STRATEGY = GroupingStrategy.GROUP_REDUCE;
  /**
   * Raw (comma separated) vertex grouping keys, null if none were given
   */
  private static String VERTEX_GROUPING_KEYS;
  /**
   * Raw (comma separated) edge grouping keys, null if none were given
   */
  private static String EDGE_GROUPING_KEYS;
  /**
   * Path of the CSV log file
   */
  private static String CSV_PATH;
  /**
   * Path the input graph is read from
   */
  private static String INPUT_PATH;
  /**
   * Path the grouped graph is written to
   */
  private static String OUTPUT_PATH;
  /**
   * Raw (comma separated) vertex aggregator functions
   */
  private static String VERTEX_AGGREGATORS;
  /**
   * Raw (comma separated) vertex aggregator keys
   */
  private static String VERTEX_AGGREGATOR_KEYS;
  /**
   * Raw (comma separated) vertex aggregator result keys
   */
  private static String VERTEX_AGGREGATOR_RESULT_KEYS;
  /**
   * Raw (comma separated) edge aggregator functions
   */
  private static String EDGE_AGGREGATORS;
  /**
   * Raw (comma separated) edge aggregator keys
   */
  private static String EDGE_AGGREGATOR_KEYS;
  /**
   * Raw (comma separated) edge aggregator result keys
   */
  private static String EDGE_AGGREGATOR_RESULT_KEYS;
  /**
   * True, if vertices are grouped on their label
   */
  private static boolean USE_VERTEX_LABELS;
  /**
   * True, if edges are grouped on their label
   */
  private static boolean USE_EDGE_LABELS;
  /**
   * Separator (a comma) between the tokens of list valued option values
   */
  private static final Pattern TOKEN_SEPARATOR = Pattern.compile(",");


  // Registers every command line option of the benchmark on OPTIONS, which is
  // not declared in this file (presumably inherited from AbstractRunner).
  // The boolean argument states whether the option expects a value.
  static {
    // the long name is kept for compatibility with existing invocations; the
    // value is the path of the whole input graph, not of a vertex file
    OPTIONS.addOption(OPTION_INPUT_PATH, "vertex-input-path", true,
      "Path to input graph");
    OPTIONS.addOption(OPTION_OUTPUT_PATH, "output-path", true,
      "Path to write output files to");
    OPTIONS.addOption(OPTION_GROUPING_STRATEGY, "strategy", true,
      "Grouping strategy (GR, GC)");
    OPTIONS.addOption(OPTION_USE_VERTEX_LABELS, "use-vertex-labels", false,
      "Group on vertex labels");
    OPTIONS.addOption(OPTION_USE_EDGE_LABELS, "use-edge-labels", false,
      "Group on edge labels");
    OPTIONS.addOption(OPTION_VERTEX_GROUPING_KEY, "vertex-grouping-key", true,
      "EPGMProperty key to group vertices on.");
    OPTIONS.addOption(OPTION_EDGE_GROUPING_KEY, "edge-grouping-key", true,
      "EPGMProperty key to group edges on.");
    OPTIONS.addOption(OPTION_CSV_PATH, "csv-path", true, "Path of the " +
      "generated CSV-File");
    OPTIONS.addOption(OPTION_VERTEX_AGGREGATION_FUNCS, "vertex-aggregator",
      true, "Applied aggregation function on vertices");
    OPTIONS.addOption(OPTION_VERTEX_AGGREGATION_KEYS,
      "vertex-aggregation-keys", true, "keys for vertex aggregation");
    OPTIONS.addOption(OPTION_VERTEX_AGGREGATION_RESULT_KEYS,
      "vertex-aggregation-result-keys", true, "keys for aggregation result");
    OPTIONS.addOption(OPTION_EDGE_AGGREGATION_FUNCS, "edge-aggregator", true,
      "Applied aggregation function on edges");
    OPTIONS.addOption(OPTION_EDGE_AGGREGATION_KEYS, "edge-aggregation-keys",
      true, "keys for edge aggregation");
    OPTIONS.addOption(OPTION_EDGE_AGGREGATION_RESULT_KEYS,
      "edge-aggregation-result-keys", true, "keys for aggregation result");
  }

  /**
   * Entry point of the benchmark. Parses and validates the command line,
   * reads the input graph, groups it as configured and, if grouping yields a
   * graph, writes that graph and logs the run via {@code writeCSV()}.
   *
   * @param args program arguments, i.e. the registered options
   * @throws Exception if any step of the benchmark fails
   */
  @SuppressWarnings("unchecked")
  public static void main(String[] args) throws Exception {
    final CommandLine commandLine =
      parseArguments(args, GroupingBenchmark.class.getName());
    if (commandLine == null) {
      // no command line available, nothing to run
      return;
    }

    // reject incomplete invocations, then copy the option values into the
    // static configuration fields
    performSanityCheck(commandLine);
    readCMDArguments(commandLine);

    final LogicalGraph inputGraph = readLogicalGraph(INPUT_PATH);

    // property keys to group on; empty when the option was not given
    final List<String> vertexGroupingKeys = VERTEX_GROUPING_KEYS == null ?
      Lists.<String>newArrayList() : getKeys(VERTEX_GROUPING_KEYS);
    final List<String> edgeGroupingKeys = EDGE_GROUPING_KEYS == null ?
      Lists.<String>newArrayList() : getKeys(EDGE_GROUPING_KEYS);

    // aggregators are only built when aggregation keys were passed
    final List<PropertyValueAggregator> vertexAggregators =
      commandLine.hasOption(OPTION_VERTEX_AGGREGATION_KEYS) ?
        getAggregators(VERTEX_AGGREGATORS, VERTEX_AGGREGATOR_KEYS,
          VERTEX_AGGREGATOR_RESULT_KEYS) :
        Lists.<PropertyValueAggregator>newArrayList();
    final List<PropertyValueAggregator> edgeAggregators =
      commandLine.hasOption(OPTION_EDGE_AGGREGATION_KEYS) ?
        getAggregators(EDGE_AGGREGATORS, EDGE_AGGREGATOR_KEYS,
          EDGE_AGGREGATOR_RESULT_KEYS) :
        Lists.<PropertyValueAggregator>newArrayList();

    final Grouping groupingOperator = getOperator(STRATEGY,
      vertexGroupingKeys, edgeGroupingKeys, USE_VERTEX_LABELS,
      USE_EDGE_LABELS, vertexAggregators, edgeAggregators);

    // apply the operator to the whole input graph
    final LogicalGraph groupedGraph =
      inputGraph.callForGraph(groupingOperator);
    if (groupedGraph == null) {
      System.err.println("wrong parameter constellation");
      return;
    }

    writeLogicalGraph(groupedGraph, OUTPUT_PATH);
    writeCSV();
  }


  /**
   * Verifies that the mandatory options are present: input path, CSV path,
   * a vertex grouping criterion (grouping key or vertex labels) and both
   * aggregator function lists. Only the first violation found is reported.
   *
   * @param cmd command line
   * @throws IllegalArgumentException if a mandatory option is missing
   */
  private static void performSanityCheck(final CommandLine cmd) {
    final String violation;

    if (!cmd.hasOption(OPTION_INPUT_PATH)) {
      violation = "Define a graph input directory.";
    } else if (!cmd.hasOption(OPTION_CSV_PATH)) {
      violation = "Path to CSV-File need to be set";
    } else if (!(cmd.hasOption(OPTION_VERTEX_GROUPING_KEY) ||
      cmd.hasOption(OPTION_USE_VERTEX_LABELS))) {
      violation =
        "Chose at least one vertex grouping key or use vertex labels.";
    } else if (!cmd.hasOption(OPTION_VERTEX_AGGREGATION_FUNCS)) {
      violation = "Vertex aggregator need to be set! " +
        "(max, min, count, none (or list of these)";
    } else if (!cmd.hasOption(OPTION_EDGE_AGGREGATION_FUNCS)) {
      violation = "Edge aggregator need to be set! " +
        "(max, min, count, none (or list of these)";
    } else {
      violation = null;
    }

    if (violation != null) {
      throw new IllegalArgumentException(violation);
    }
  }

  /**
   * Copies the option values of the command line into the static
   * configuration fields of this class.
   *
   * @param cmd command line
   */
  private static void readCMDArguments(final CommandLine cmd) {
    // paths
    INPUT_PATH = cmd.getOptionValue(OPTION_INPUT_PATH);
    OUTPUT_PATH = cmd.getOptionValue(OPTION_OUTPUT_PATH);
    CSV_PATH = cmd.getOptionValue(OPTION_CSV_PATH);

    // strategy: only "GC" (in any case) changes the current value
    if (cmd.hasOption(OPTION_GROUPING_STRATEGY) && cmd
      .getOptionValue(OPTION_GROUPING_STRATEGY).toUpperCase().equals("GC")) {
      STRATEGY = GroupingStrategy.GROUP_COMBINE;
    }

    // grouping keys, null when the respective option is absent
    if (cmd.hasOption(OPTION_VERTEX_GROUPING_KEY)) {
      VERTEX_GROUPING_KEYS = cmd.getOptionValue(OPTION_VERTEX_GROUPING_KEY);
    } else {
      VERTEX_GROUPING_KEYS = null;
    }
    if (cmd.hasOption(OPTION_EDGE_GROUPING_KEY)) {
      EDGE_GROUPING_KEYS = cmd.getOptionValue(OPTION_EDGE_GROUPING_KEY);
    } else {
      EDGE_GROUPING_KEYS = null;
    }

    // label flags
    USE_VERTEX_LABELS = cmd.hasOption(OPTION_USE_VERTEX_LABELS);
    USE_EDGE_LABELS = cmd.hasOption(OPTION_USE_EDGE_LABELS);

    // vertex aggregation; keys and result keys are only read together
    VERTEX_AGGREGATORS = cmd.getOptionValue(OPTION_VERTEX_AGGREGATION_FUNCS);
    if (cmd.hasOption(OPTION_VERTEX_AGGREGATION_KEYS)) {
      VERTEX_AGGREGATOR_KEYS =
        cmd.getOptionValue(OPTION_VERTEX_AGGREGATION_KEYS);
      VERTEX_AGGREGATOR_RESULT_KEYS =
        cmd.getOptionValue(OPTION_VERTEX_AGGREGATION_RESULT_KEYS);
    }

    // edge aggregation, same scheme as for vertices
    EDGE_AGGREGATORS = cmd.getOptionValue(OPTION_EDGE_AGGREGATION_FUNCS);
    if (cmd.hasOption(OPTION_EDGE_AGGREGATION_KEYS)) {
      EDGE_AGGREGATOR_KEYS = cmd.getOptionValue(OPTION_EDGE_AGGREGATION_KEYS);
      EDGE_AGGREGATOR_RESULT_KEYS =
        cmd.getOptionValue(OPTION_EDGE_AGGREGATION_RESULT_KEYS);
    }
  }

  /**
   * Splits a comma separated key string into its keys. All whitespace is
   * removed first, so "a, b" yields the keys "a" and "b".
   *
   * @param keys keys string
   * @return keys as fixed-size list
   */
  private static List<String> getKeys(String keys) {
    // replaceAll, not replace: String#replace would look for the literal
    // two-character text "\s" instead of treating it as a regex
    String withoutWhitespace = keys.replaceAll("\\s", "");
    return Arrays.asList(TOKEN_SEPARATOR.split(withoutWhitespace));
  }

  /**
   * Builds the aggregators described by comma separated option values.
   *
   * The three strings are aligned by position: the i-th function uses the
   * i-th key and the i-th result key. "count" needs neither of them.
   * Whitespace inside the strings is ignored. A lone "none" yields an empty
   * list. Function names other than count, max and min yield a {@code null}
   * entry; {@code getOperator} skips such entries.
   *
   * @param aggs        aggregators as whole string
   * @param keys        aggregator keys as whole string, may be null
   * @param resultKeys  aggregator result keys as whole string, may be null
   * @return List of PropertyValueAggregators, may contain null entries
   * @throws IllegalArgumentException if a max or min function has no key or
   *                                  no result key at its position
   */
  private static List<PropertyValueAggregator> getAggregators(String
    aggs, String keys, String resultKeys) {

    List<PropertyValueAggregator> aggregatorList = Lists.newArrayList();

    String[] aggTokens = splitAggregatorTokens(aggs);
    String[] keyTokens = splitAggregatorTokens(keys);
    String[] resultKeyTokens = splitAggregatorTokens(resultKeys);

    // a lone "none" switches aggregation off
    if (aggTokens.length == 1 && "none".equals(aggTokens[0])) {
      return aggregatorList;
    }

    for (int i = 0; i < aggTokens.length; i++) {
      switch (aggTokens[i]) {
      case "count" :
        aggregatorList.add(new CountAggregator());
        break;
      case "max" :
        aggregatorList.add(new MaxAggregator(
          aggregatorToken(keyTokens, i, "key", "max"),
          aggregatorToken(resultKeyTokens, i, "result key", "max")));
        break;
      case "min" :
        aggregatorList.add(new MinAggregator(
          aggregatorToken(keyTokens, i, "key", "min"),
          aggregatorToken(resultKeyTokens, i, "result key", "min")));
        break;
      default:
        // unknown function (e.g. "none" inside a list): keep a placeholder
        aggregatorList.add(null);
        break;
      }
    }
    return aggregatorList;
  }

  /**
   * Removes all whitespace from the value and splits it at the separator.
   *
   * @param value comma separated option value, may be null
   * @return tokens of the value, empty array if the value is null
   */
  private static String[] splitAggregatorTokens(String value) {
    if (value == null) {
      return new String[0];
    }
    // replaceAll, not replace: "\\s" has to be interpreted as a regex
    return TOKEN_SEPARATOR.split(value.replaceAll("\\s", ""));
  }

  /**
   * Returns the token at the given position or fails with a message naming
   * the missing piece.
   *
   * @param tokens   tokens to read from
   * @param index    position of the aggregator function
   * @param kind     what the token is, used in the error message
   * @param function aggregator function name, used in the error message
   * @return token at the given position
   * @throws IllegalArgumentException if there is no token at that position
   */
  private static String aggregatorToken(String[] tokens, int index,
    String kind, String function) {
    if (index >= tokens.length) {
      throw new IllegalArgumentException("Missing aggregation " + kind +
        " for aggregator '" + function + "' at position " + index);
    }
    return tokens[index];
  }

  /**
   * Configures a grouping builder with the given strategy, label flags,
   * aggregators and grouping keys and returns the operator it builds.
   * {@code null} entries in the aggregator lists are skipped.
   *
   * @param strategy        grouping strategy to use
   * @param vertexKeys      vertex property keys used for grouping
   * @param edgeKeys        edge property keys used for grouping
   * @param useVertexLabels use vertex label for grouping, true/false
   * @param useEdgeLabels   use edge label for grouping, true/false
   * @param vAggs           used vertex aggregators
   * @param eAggs           used edge aggregators
   * @return grouping operator implementation
   */
  private static Grouping getOperator(GroupingStrategy strategy,
    List<String> vertexKeys, List<String> edgeKeys,
    boolean useVertexLabels, boolean useEdgeLabels,
    List<PropertyValueAggregator> vAggs, List<PropertyValueAggregator> eAggs) {

    final Grouping.GroupingBuilder groupingBuilder =
      new Grouping.GroupingBuilder()
        .setStrategy(strategy)
        .useVertexLabel(useVertexLabels)
        .useEdgeLabel(useEdgeLabels);

    // aggregators first, vertices before edges
    for (PropertyValueAggregator vertexAggregator : vAggs) {
      if (vertexAggregator == null) {
        continue;
      }
      groupingBuilder.addVertexAggregator(vertexAggregator);
    }
    for (PropertyValueAggregator edgeAggregator : eAggs) {
      if (edgeAggregator == null) {
        continue;
      }
      groupingBuilder.addEdgeAggregator(edgeAggregator);
    }

    // then the grouping keys, again vertices before edges
    for (String vertexKey : vertexKeys) {
      groupingBuilder.addVertexGroupingKey(vertexKey);
    }
    for (String edgeKey : edgeKeys) {
      groupingBuilder.addEdgeGroupingKey(edgeKey);
    }

    return groupingBuilder.build();
  }


  /**
   * Logs the current run in the CSV file at {@code CSV_PATH}. The line holds
   * the parallelism, the configuration fields and the net runtime in seconds
   * of the last executed job, separated by '|'. If the file already exists
   * the line is appended, otherwise the file is created with a header line.
   * Both paths write UTF-8.
   *
   * @throws IOException if the CSV file cannot be written
   */
  private static void writeCSV() throws IOException {

    String head = String.format("%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s%n",
      "Parallelism", "dataset", "vertexKeys", "edgeKeys", "USE_VERTEX_LABELS",
      "USE_EDGE_LABELS", "Vertex Aggregators", "Vertex-Aggregator-Keys",
      "EPGMEdge-Aggregators", "EPGMEdge-Aggregator-Keys", "Runtime(s)");

    String tail = String.format("%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s%n",
      getExecutionEnvironment().getParallelism(), INPUT_PATH,
      VERTEX_GROUPING_KEYS, EDGE_GROUPING_KEYS, USE_VERTEX_LABELS,
      USE_EDGE_LABELS, VERTEX_AGGREGATORS, VERTEX_AGGREGATOR_KEYS,
      EDGE_AGGREGATORS, EDGE_AGGREGATOR_KEYS,
      getExecutionEnvironment().getLastJobExecutionResult()
        .getNetRuntime(TimeUnit.SECONDS));

    File f = new File(CSV_PATH);
    if (f.exists() && !f.isDirectory()) {
      // append with the same encoding the file was created with
      FileUtils.writeStringToFile(f, tail, "UTF-8", true);
    } else {
      // try-with-resources closes the writer even if printing fails
      try (PrintWriter writer = new PrintWriter(CSV_PATH, "UTF-8")) {
        writer.print(head);
        writer.print(tail);
        // PrintWriter swallows I/O errors, so ask for them explicitly
        if (writer.checkError()) {
          throw new IOException("Could not write CSV file " + CSV_PATH);
        }
      }
    }
  }

  /**
   * Describes this program by the fully qualified name of this class.
   *
   * @return name of {@link GroupingBenchmark}
   */
  @Override
  public String getDescription() {
    final String description = GroupingBenchmark.class.getName();
    return description;
  }
}