/*
* This file is part of Gradoop.
*
* Gradoop is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Gradoop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Gradoop. If not, see <http://www.gnu.org/licenses/>.
*/
package org.gradoop.examples.io;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.DataSetUtils;
import org.gradoop.common.model.impl.properties.Properties;
import org.gradoop.flink.io.impl.csv.CSVConstants;
import org.gradoop.flink.io.impl.csv.CSVDataSink;
import org.gradoop.flink.io.impl.graph.GraphDataSource;
import org.gradoop.flink.io.impl.graph.tuples.ImportEdge;
import org.gradoop.flink.io.impl.graph.tuples.ImportVertex;
import org.gradoop.flink.model.impl.LogicalGraph;
import org.gradoop.flink.util.GradoopFlinkConfig;
import java.util.Calendar;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Example program that reads the Pokec social network from its tab-separated
 * text representation into a {@link LogicalGraph}, extracts and discretizes
 * selected user attributes (e.g. birth decade, height and weight groups) and
 * stores the resulting graph as CSV files.
 *
 * The dataset is available at https://snap.stanford.edu/data/soc-pokec.html.
 */
public class PokecExample {
/**
* Filename that contains Pokec profiles
*/
private static final String PROFILES = "/soc-pokec-profiles.txt";
/**
* Filename that contains Pokec relationships
*/
private static final String RELATIONSHIPS = "/soc-pokec-relationships.txt";
  /**
   * In the dataset, a missing value is denoted by this string.
   */
private static final String NULL_STRING = "null";
/**
* Vertex label to use during import
*/
private static final String VERTEX_LABEL = "Person";
/**
* Edge label to use during import
*/
private static final String EDGE_LABEL = "knows";
/**
* Position of the "gender" attribute in the profiles CSV
*/
private static final int CSV_IDX_GENDER = 3;
/**
* Position of the "region" attribute in the profiles CSV
*/
private static final int CSV_IDX_REGION = 4;
/**
* Position of the "age" attribute in the profiles CSV
*/
private static final int CSV_IDX_AGE = 7;
/**
* Position of the "body" attribute in the profile CSV
*/
private static final int CSV_IDX_BODY = 8;
  /**
   * Position of the "I_am_working_in_field" attribute in the profiles CSV
   */
  private static final int CSV_IDX_WORKING_FIELD = 9;
  /**
   * Position of the "eye_color" attribute in the profiles CSV
   */
  private static final int CSV_IDX_EYE_COLOR = 16;
  /**
   * Position of the "hair_color" attribute in the profiles CSV
   */
  private static final int CSV_IDX_HAIR_COLOR = 17;
/**
* Property key to use for the gender attribute
*/
private static final String PROP_KEY_GENDER = "gender";
/**
* Property key to use for the region attribute
*/
private static final String PROP_KEY_REGION = "region";
/**
* Property key to use for the decade attribute
*/
private static final String PROP_KEY_DECADE = "decade";
/**
* Property key to use for the height attribute
*/
private static final String PROP_KEY_HEIGHT = "height";
/**
* Property key to use for the height group attribute
*/
private static final String PROP_KEY_HEIGHT_GROUP = "height_group";
/**
* Property key to use for the weight attribute
*/
private static final String PROP_KEY_WEIGHT = "weight";
/**
* Property key to use for the weight group attribute
*/
private static final String PROP_KEY_WEIGHT_GROUP = "weight_group";
/**
* Property key to use for the working field attribute
*/
private static final String PROP_KEY_WORKING_FIELD = "working_field";
/**
* Property key to use for the eye color attribute
*/
private static final String PROP_KEY_EYE_COLOR = "eye_color";
/**
* Property key to use for the hair color attribute
*/
private static final String PROP_KEY_HAIR_COLOR = "hair_color";
  /**
   * Reads the Pokec network from a given directory. The input may reside in
   * the local file system or in HDFS.
   *
   * args[0]: path to the directory that contains the Pokec files (e.g. hdfs:///pokec/)
   * args[1]: path to write the output graph to (e.g. hdfs:///output/)
   *
   * @param args program arguments as described above
   * @throws Exception if reading, transforming or writing the graph fails
   */
public static void main(String[] args) throws Exception {
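    // fail fast with a usage hint if the two expected paths are missing
    if (args.length != 2) {
      throw new IllegalArgumentException(
        "Usage: PokecExample <input-dir> <output-dir>");
    }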
final String inputDir = args[0];
final String outputDir = args[1];
// init Flink execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// create default Gradoop config
final GradoopFlinkConfig config = GradoopFlinkConfig.createConfig(env);
final String profiles = inputDir + PROFILES;
final String relations = inputDir + RELATIONSHIPS;
//--------------------------------------------------------------------------
// Read vertices
//--------------------------------------------------------------------------
// read profiles from file and create import vertices
DataSet<ImportVertex<Long>> importVertices = env
.readTextFile(profiles)
.map(new MapFunction<String, ImportVertex<Long>>() {
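        // the patterns and the vertex instance are created once per task
        // and reused for every input line to avoid per-record allocations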
private Pattern splitPattern = Pattern.compile("\\t");
private Pattern numberPattern = Pattern.compile("(\\d+)");
private int year = Calendar.getInstance().get(Calendar.YEAR);
private ImportVertex<Long> importVertex = new ImportVertex<>();
@SuppressWarnings("Duplicates")
@Override
public ImportVertex<Long> map(String line) throws Exception {
          // split via the precompiled pattern instead of recompiling it per line
          String[] fields = splitPattern.split(line);
importVertex.setId(Long.parseLong(fields[0])); // user-id
importVertex.setLabel(VERTEX_LABEL);
Properties properties = Properties.create();
// set gender if existing
String field = fields[CSV_IDX_GENDER];
if (!field.equals(NULL_STRING)) {
int gender = Integer.parseInt(field);
properties.set(PROP_KEY_GENDER, gender == 1 ? "male" : "female");
}
// set region if existing
if (!fields[CSV_IDX_REGION].equals(NULL_STRING)) {
properties.set(PROP_KEY_REGION, fields[CSV_IDX_REGION]);
}
          // compute the year of birth from the user's age (if existing)
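          // e.g. a 25-year-old in 2018 has yob 1993, which maps to decade 1990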
field = fields[CSV_IDX_AGE];
if (!field.equals(NULL_STRING) && !field.equals("0")) {
int yob = year - Integer.parseInt(field);
properties.set(PROP_KEY_DECADE, yob - yob % 10);
}
// try to get the height and the weight of the user
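          // the body field typically holds free text such as "180cm, 70kg";
          // the first number found is taken as height, the second as weight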
field = fields[CSV_IDX_BODY];
if (!field.equals(NULL_STRING)) {
Matcher matcher = numberPattern.matcher(field);
if (matcher.find()) {
try {
int height = Integer.parseInt(matcher.group(1));
int heightGroup = height - height % 10;
properties.set(PROP_KEY_HEIGHT, height);
properties.set(PROP_KEY_HEIGHT_GROUP, heightGroup);
} catch (NumberFormatException ignored) { }
}
if (matcher.find()) {
try {
int weight = Integer.parseInt(matcher.group(1));
int weightGroup = weight - weight % 10;
properties.set(PROP_KEY_WEIGHT, weight);
properties.set(PROP_KEY_WEIGHT_GROUP, weightGroup);
} catch (NumberFormatException ignored) { }
}
}
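          // free-text values containing the CSV value delimiter are skipped,
          // since they would break the CSVDataSink output format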
// set working field if existing
          field = fields[CSV_IDX_WORKING_FIELD];
          if (!field.equals(NULL_STRING) && !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_WORKING_FIELD, field);
          }
// set eye color if existing
          field = fields[CSV_IDX_EYE_COLOR];
          if (!field.equals(NULL_STRING) && !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_EYE_COLOR, field);
          }
// set hair color if existing
          field = fields[CSV_IDX_HAIR_COLOR];
          if (!field.equals(NULL_STRING) && !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_HAIR_COLOR, field);
          }
importVertex.setProperties(properties);
return importVertex;
}
});
//--------------------------------------------------------------------------
// Read edges
//--------------------------------------------------------------------------
// read relationships from file (edges)
// each edge is represented by a Tuple2 (source-id, target-id)
DataSource<Tuple2<Long, Long>> edges = env
.readCsvFile(relations)
.fieldDelimiter("\t")
.includeFields(true, true)
.types(Long.class, Long.class);
// assign a unique long id to each edge tuple
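    // (the ids are unique across parallel partitions, but not necessarily consecutive)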
DataSet<Tuple2<Long, Tuple2<Long, Long>>> edgesWithId =
DataSetUtils.zipWithUniqueId(edges);
// transform to ImportEdge
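    // the forwarded-fields annotation below declares which input fields are
    // moved unchanged to output positions, which helps the Flink optimizer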
final DataSet<ImportEdge<Long>> importEdges = edgesWithId.map(
new MapFunction<Tuple2<Long, Tuple2<Long, Long>>, ImportEdge<Long>>() {
private final ImportEdge<Long> importEdge = new ImportEdge<>();
@Override
public ImportEdge<Long> map(
Tuple2<Long, Tuple2<Long, Long>> value) throws Exception {
importEdge.setId(value.f0);
importEdge.setSourceId(value.f1.f0);
importEdge.setTargetId(value.f1.f1);
importEdge.setLabel(EDGE_LABEL);
importEdge.setProperties(Properties.create());
return importEdge;
}
}).withForwardedFields("f0;f1.f0->f1;f1.f1->f2");
    //--------------------------------------------------------------------------
    // Create and store the logical graph
    //--------------------------------------------------------------------------
    // create an EPGM logical graph from the import vertices and edges
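    // GraphDataSource assigns new GradoopIds to all elements and resolves the
    // source and target ids of the import edges against the import vertices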
new GraphDataSource<>(importVertices, importEdges, config)
.getLogicalGraph()
      // store the result as CSV files
.writeTo(new CSVDataSink(
outputDir,
config
));
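    // writeTo only registers the sink; execute() triggers the actual Flink job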
env.execute();
}
}