/*
 * This file is part of Gradoop.
 *
 * Gradoop is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Gradoop is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Gradoop. If not, see <http://www.gnu.org/licenses/>.
 */

package org.gradoop.examples.io;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.DataSetUtils;
import org.gradoop.common.model.impl.properties.Properties;
import org.gradoop.flink.io.impl.csv.CSVConstants;
import org.gradoop.flink.io.impl.csv.CSVDataSink;
import org.gradoop.flink.io.impl.graph.GraphDataSource;
import org.gradoop.flink.io.impl.graph.tuples.ImportEdge;
import org.gradoop.flink.io.impl.graph.tuples.ImportVertex;
import org.gradoop.flink.model.impl.LogicalGraph;
import org.gradoop.flink.util.GradoopFlinkConfig;

import java.util.Calendar;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Example program that reads the Pokec social network from a CSV
 * representation into a {@link LogicalGraph}, attaches selected profile
 * attributes as vertex properties and stores the result into CSV files.
 *
 * The dataset is available under https://snap.stanford.edu/data/soc-pokec.html.
 */
public class PokecExample {

  /**
   * Filename that contains the Pokec profiles
   */
  private static final String PROFILES = "/soc-pokec-profiles.txt";

  /**
   * Filename that contains the Pokec relationships
   */
  private static final String RELATIONSHIPS = "/soc-pokec-relationships.txt";

  /**
   * In the dataset, a missing value is denoted by the string {@code "null"}.
   */
  private static final String NULL_STRING = "null";

  /**
   * Vertex label to use during import
   */
  private static final String VERTEX_LABEL = "Person";

  /**
   * Edge label to use during import
   */
  private static final String EDGE_LABEL = "knows";

  /**
   * Position of the "gender" attribute in the profiles CSV
   */
  private static final int CSV_IDX_GENDER = 3;

  /**
   * Position of the "region" attribute in the profiles CSV
   */
  private static final int CSV_IDX_REGION = 4;

  /**
   * Position of the "age" attribute in the profiles CSV
   */
  private static final int CSV_IDX_AGE = 7;

  /**
   * Position of the "body" attribute in the profiles CSV
   */
  private static final int CSV_IDX_BODY = 8;

  /**
   * Position of the "I_am_working_in_field" attribute in the profiles CSV
   */
  private static final int CSV_WORKING_FIELD = 9;

  /**
   * Position of the "eye_color" attribute in the profiles CSV
   */
  private static final int CSV_EYE_COLOR = 16;

  /**
   * Position of the "hair_color" attribute in the profiles CSV
   */
  private static final int CSV_HAIR_COLOR = 17;

  /**
   * Property key to use for the gender attribute
   */
  private static final String PROP_KEY_GENDER = "gender";

  /**
   * Property key to use for the region attribute
   */
  private static final String PROP_KEY_REGION = "region";

  /**
   * Property key to use for the decade attribute
   */
  private static final String PROP_KEY_DECADE = "decade";

  /**
   * Property key to use for the height attribute
   */
  private static final String PROP_KEY_HEIGHT = "height";

  /**
   * Property key to use for the height group attribute
   */
  private static final String PROP_KEY_HEIGHT_GROUP = "height_group";

  /**
   * Property key to use for the weight attribute
   */
  private static final String PROP_KEY_WEIGHT = "weight";

  /**
   * Property key to use for the weight group attribute
   */
  private static final String PROP_KEY_WEIGHT_GROUP = "weight_group";

  /**
   * Property key to use for the working field attribute
   */
  private static final String PROP_KEY_WORKING_FIELD = "working_field";

  /**
   * Property key to use for the eye color attribute
   */
  private static final String PROP_KEY_EYE_COLOR = "eye_color";

  /**
   * Property key to use for the hair color attribute
   */
  private static final String PROP_KEY_HAIR_COLOR = "hair_color";
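
  // Illustrative sketch of the profiles input (values are hypothetical, not
  // taken from the dataset): soc-pokec-profiles.txt is tab-separated with one
  // user per line and the attribute positions given by the constants above,
  // e.g. a shortened record
  //
  //   1 <TAB> ... <TAB> 1 <TAB> zilinsky kraj, zilina <TAB> ... <TAB> 26 <TAB> 176cm, 75kg <TAB> ...
  //
  // where index 3 holds the gender, index 4 the region, index 7 the age and
  // index 8 the body description; missing values are encoded as the literal
  // string "null".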

  /**
   * Reads the Pokec network from a given directory. The graph can be stored
   * in the local file system or in HDFS.
   *
   * args[0]: path to the directory that contains the Pokec files
   *          (e.g. hdfs:///pokec/)
   * args[1]: path to write the output graph to (e.g. hdfs:///output/)
   *
   * @param args program arguments
   * @throws Exception if the program execution fails
   */
  public static void main(String[] args) throws Exception {
    final String inputDir = args[0];
    final String outputDir = args[1];

    // init Flink execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // create default Gradoop config
    final GradoopFlinkConfig config = GradoopFlinkConfig.createConfig(env);

    final String profiles = inputDir + PROFILES;
    final String relations = inputDir + RELATIONSHIPS;

    //--------------------------------------------------------------------------
    // Read vertices
    //--------------------------------------------------------------------------

    // read profiles from file and create import vertices
    DataSet<ImportVertex<Long>> importVertices = env
      .readTextFile(profiles)
      .map(new MapFunction<String, ImportVertex<Long>>() {

        private Pattern splitPattern = Pattern.compile("\\t");

        private Pattern numberPattern = Pattern.compile("(\\d+)");

        private int year = Calendar.getInstance().get(Calendar.YEAR);

        private ImportVertex<Long> importVertex = new ImportVertex<>();

        @SuppressWarnings("Duplicates")
        @Override
        public ImportVertex<Long> map(String line) throws Exception {
          String[] fields = splitPattern.split(line);

          importVertex.setId(Long.parseLong(fields[0])); // user-id
          importVertex.setLabel(VERTEX_LABEL);

          Properties properties = Properties.create();

          // set gender if existing
          String field = fields[CSV_IDX_GENDER];
          if (!field.equals(NULL_STRING)) {
            int gender = Integer.parseInt(field);
            properties.set(PROP_KEY_GENDER, gender == 1 ? "male" : "female");
          }

          // set region if existing
          if (!fields[CSV_IDX_REGION].equals(NULL_STRING)) {
            properties.set(PROP_KEY_REGION, fields[CSV_IDX_REGION]);
          }

          // compute year of birth from the user's age (if existing)
          field = fields[CSV_IDX_AGE];
          if (!field.equals(NULL_STRING) && !field.equals("0")) {
            int yob = year - Integer.parseInt(field);
            properties.set(PROP_KEY_DECADE, yob - yob % 10);
          }

          // try to get the height and the weight of the user
          field = fields[CSV_IDX_BODY];
          if (!field.equals(NULL_STRING)) {
            Matcher matcher = numberPattern.matcher(field);
            if (matcher.find()) {
              try {
                int height = Integer.parseInt(matcher.group(1));
                int heightGroup = height - height % 10;
                properties.set(PROP_KEY_HEIGHT, height);
                properties.set(PROP_KEY_HEIGHT_GROUP, heightGroup);
              } catch (NumberFormatException ignored) {
              }
            }
            if (matcher.find()) {
              try {
                int weight = Integer.parseInt(matcher.group(1));
                int weightGroup = weight - weight % 10;
                properties.set(PROP_KEY_WEIGHT, weight);
                properties.set(PROP_KEY_WEIGHT_GROUP, weightGroup);
              } catch (NumberFormatException ignored) {
              }
            }
          }

          // set working field if existing
          field = fields[CSV_WORKING_FIELD];
          if (!field.equals(NULL_STRING) &&
            !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_WORKING_FIELD, field);
          }

          // set eye color if existing
          field = fields[CSV_EYE_COLOR];
          if (!field.equals(NULL_STRING) &&
            !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_EYE_COLOR, field);
          }

          // set hair color if existing
          field = fields[CSV_HAIR_COLOR];
          if (!field.equals(NULL_STRING) &&
            !field.contains(CSVConstants.VALUE_DELIMITER)) {
            properties.set(PROP_KEY_HAIR_COLOR, field);
          }

          importVertex.setProperties(properties);

          return importVertex;
        }
      });
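
    // note on the body field parsed above: numberPattern extracts the first
    // two numbers that occur in the field, so a (hypothetical) value such as
    // "176cm, 75kg" yields height 176 (height_group 176 - 176 % 10 = 170) and
    // weight 75 (weight_group 70); the first number found is always
    // interpreted as the height.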

    //--------------------------------------------------------------------------
    // Read edges
    //--------------------------------------------------------------------------

    // read relationships from file; each edge is represented by a Tuple2
    // (source-id, target-id)
    DataSource<Tuple2<Long, Long>> edges = env
      .readCsvFile(relations)
      .fieldDelimiter("\t")
      .includeFields(true, true)
      .types(Long.class, Long.class);

    // assign a unique long id to each edge tuple
    DataSet<Tuple2<Long, Tuple2<Long, Long>>> edgesWithId =
      DataSetUtils.zipWithUniqueId(edges);

    // transform to ImportEdge
    final DataSet<ImportEdge<Long>> importEdges = edgesWithId.map(
      new MapFunction<Tuple2<Long, Tuple2<Long, Long>>, ImportEdge<Long>>() {

        private final ImportEdge<Long> importEdge = new ImportEdge<>();

        @Override
        public ImportEdge<Long> map(
          Tuple2<Long, Tuple2<Long, Long>> value) throws Exception {
          importEdge.setId(value.f0);
          importEdge.setSourceId(value.f1.f0);
          importEdge.setTargetId(value.f1.f1);
          importEdge.setLabel(EDGE_LABEL);
          importEdge.setProperties(Properties.create());
          return importEdge;
        }
      }).withForwardedFields("f0;f1.f0->f1;f1.f1->f2");

    //--------------------------------------------------------------------------
    // Create graph and write output
    //--------------------------------------------------------------------------

    // create an EPGM logical graph from the import vertices and edges
    new GraphDataSource<>(importVertices, importEdges, config)
      .getLogicalGraph()
      // store the result into CSV files
      .writeTo(new CSVDataSink(outputDir, config));

    env.execute();
  }
}
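
// A hypothetical invocation via the Flink CLI (the jar name and paths are
// assumptions, not part of this example):
//
//   bin/flink run -c org.gradoop.examples.io.PokecExample \
//     gradoop-examples.jar hdfs:///pokec/ hdfs:///output/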