/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.graph;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;

import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsFirstExcept;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.io.DelimitedInputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;

/**
 * Implementation of the triangle enumeration example Pact program.
 * The program expects a file with RDF triples as input. Each record holds one triple whose subject, predicate and
 * object are separated by single spaces; fields may optionally be enclosed in double quotes.
 * Triples must be separated by line breaks.
 *
 * The program filters for foaf:knows predicates to identify relationships between two entities (typically persons).
 * Relationships are interpreted as edges in a social graph. Then the program enumerates all triangles which are built
 * by edges in that graph.
 *
 * Usually, triangle enumeration is used as a pre-processing step to identify highly connected subgraphs.
 * The algorithm was published as MapReduce job by J. Cohen in "Graph Twiddling in a MapReduce World".
 * The Pact version was described in "MapReduce and PACT - Comparing Data Parallel Programming Models" (BTW 2011).
 */
public class EnumTrianglesRdfFoaf implements Program, ProgramDescription {

	private static final long serialVersionUID = 1L;

	/**
	 * Reads RDF triples and filters on the foaf:knows RDF predicate.
	 * The foaf:knows RDF predicate indicates that the RDF subject and object (typically of type foaf:person) know each
	 * other.
	 * Therefore, knowing connections between people are extracted and handled as graph edges.
	 * The EdgeInFormat drops every triple whose predicate is not foaf:knows. For the remaining triples the subject
	 * and object URLs are compared.
	 * The lexicographically smaller URL is set as the first field of the output record, the greater one as the second
	 * field.
	 */
	public static class EdgeInFormat extends DelimitedInputFormat {

		private static final long serialVersionUID = 1L;

		/** The only predicate that is accepted; all other triples are dropped. */
		private static final String FOAF_KNOWS_PREDICATE = "<http://xmlns.com/foaf/0.1/knows>";

		/** Field separator inside a triple. */
		private static final char FIELD_DELIMITER = ' ';

		/** Character that may enclose a field which itself contains the field separator. */
		private static final char ENCAPSULATOR = '"';

		// reusable holders for the three parts of a triple
		private final StringValue rdfSubj = new StringValue();
		private final StringValue rdfPred = new StringValue();
		private final StringValue rdfObj = new StringValue();

		/**
		 * Parses one triple and turns it into an edge record.
		 *
		 * @param target   the record to fill
		 * @param bytes    buffer holding the raw record
		 * @param offset   position of the first byte of the record in {@code bytes}
		 * @param numBytes length of the record in bytes
		 * @return {@code target} with the smaller of subject/object in field 0 and the other one in field 1, or
		 *         {@code null} if the record cannot be parsed or its predicate is not foaf:knows
		 */
		@Override
		public Record readRecord(Record target, byte[] bytes, int offset, int numBytes) {
			final int limit = offset + numBytes;
			int startPos = offset;

			// read RDF subject
			startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, FIELD_DELIMITER, rdfSubj,
				ENCAPSULATOR);
			if (startPos < 0) {
				// invalid record, exit
				return null;
			}

			// read RDF predicate
			startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, FIELD_DELIMITER, rdfPred,
				ENCAPSULATOR);
			if (startPos < 0 || !rdfPred.getValue().equals(FOAF_KNOWS_PREDICATE)) {
				// invalid record or predicate is not a foaf-knows predicate, exit
				return null;
			}

			// read RDF object
			startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, FIELD_DELIMITER, rdfObj,
				ENCAPSULATOR);
			if (startPos < 0) {
				// invalid record, exit
				return null;
			}

			// compare RDF subject and object
			if (rdfSubj.compareTo(rdfObj) <= 0) {
				// subject is smaller, subject becomes first attribute, object second
				target.setField(0, rdfSubj);
				target.setField(1, rdfObj);
			} else {
				// object is smaller, object becomes first attribute, subject second
				target.setField(0, rdfObj);
				target.setField(1, rdfSubj);
			}

			return target;
		}

		/**
		 * Utility method to efficiently parse encapsulated, variable length strings.
		 *
		 * A field is treated as encapsulated if its first byte is {@code encaps}. An encapsulated field ends at the
		 * first {@code encaps} byte after the opening one that is directly followed by {@code delim}; the stored
		 * value includes both encapsulation characters. An encapsulated field without such a terminator inside the
		 * record is rejected. A non-encapsulated field ends at the first {@code delim} or at the end of the record.
		 *
		 * The method never reads bytes at or beyond {@code limit}.
		 *
		 * @param bytes    buffer holding the raw record
		 * @param startPos position of the first byte of the field
		 * @param limit    position one past the last byte of the record
		 * @param delim    field delimiter
		 * @param field    receives the parsed value
		 * @param encaps   encapsulation character
		 * @return the start position of the next field ({@code limit + 1} if the field ran up to the end of the
		 *         record), or {@code -1} if no field could be parsed
		 */
		private int parseVarLengthEncapsulatedStringField(byte[] bytes, int startPos, int limit, char delim,
				StringValue field, char encaps) {

			// A preceding field that ended at the end of the record reports limit + 1 as the next position.
			// There is nothing left to parse then, and bytes[startPos] may not even exist.
			if (startPos > limit) {
				return -1;
			}

			// check whether string is encapsulated (an empty remainder cannot be)
			final boolean isEncaps = startPos < limit && bytes[startPos] == encaps;

			if (isEncaps) {
				// Look for the closing encapsulator that is directly followed by the delimiter.
				// The scan starts behind the opening encapsulator so that it is not mistaken for the closing one,
				// and it stops early enough that bytes[i + 1] stays inside the record.
				for (int i = startPos + 1; i + 1 < limit; i++) {
					if (bytes[i] == encaps && bytes[i + 1] == delim) {
						field.setValueAscii(bytes, startPos, i - startPos + 1);
						return i + 2;
					}
				}
				return -1;
			}

			// string is not encapsulated
			for (int i = startPos; i < limit; i++) {
				if (bytes[i] == delim) {
					field.setValueAscii(bytes, startPos, i - startPos);
					return i + 1;
				}
			}

			// no delimiter found: the field extends to the end of the record
			field.setValueAscii(bytes, startPos, limit - startPos);
			return limit + 1;
		}
	}

	/**
	 * Builds triads (open triangles) from all pairs of edges that share a vertex.
	 * The common vertex is the first field of the input edges, which is also the field the edges are grouped on.
	 * Each emitted triad holds the common vertex in field 0 and the two other vertices in fields 1 and 2, with
	 * field 1 not being greater than field 2.
	 */
	@ConstantFields(0)
	public static class BuildTriads extends ReduceFunction implements Serializable {

		private static final long serialVersionUID = 1L;

		// list of non-matching vertices
		private final ArrayList<StringValue> otherVertices = new ArrayList<StringValue>(32);

		// matching vertex
		private final StringValue matchVertex = new StringValue();

		// mutable output record
		private final Record result = new Record();

		// initialize list of non-matching vertices for one vertex
		public BuildTriads() {
			this.otherVertices.add(new StringValue());
		}

		/**
		 * Emits one triad for every pair of edges in the group.
		 *
		 * @param records all edges that share the same first vertex
		 * @param out     collector receiving the triads
		 */
		@Override
		public void reduce(Iterator<Record> records, Collector<Record> out) throws Exception {
			// read the first edge
			final Record rec = records.next();
			// read the matching vertex
			rec.getFieldInto(0, this.matchVertex);
			// read the non-matching vertex and add it to the list
			rec.getFieldInto(1, this.otherVertices.get(0));

			// set the matching vertex in the output record
			this.result.setField(0, this.matchVertex);

			int numEdges = 1;
			// while there are more edges
			while (records.hasNext()) {

				// read the next edge
				final Record next = records.next();

				final StringValue myVertex;
				// obtain an object to store the non-matching vertex
				if (numEdges >= this.otherVertices.size()) {
					// we need an additional vertex object
					// create the object
					myVertex = new StringValue();
					// and put it in the list
					this.otherVertices.add(myVertex);
				} else {
					// we reuse a previously created object from the list
					myVertex = this.otherVertices.get(numEdges);
				}
				// read the non-matching vertex into the obtained object
				next.getFieldInto(1, myVertex);

				// combine the current edge with all vertices in the non-matching vertex list
				for (int i = 0; i < numEdges; i++) {
					// get the other non-matching vertex
					final StringValue otherVertex = this.otherVertices.get(i);
					// add my and other vertex to the output record depending on their ordering
					if (otherVertex.compareTo(myVertex) < 0) {
						this.result.setField(1, otherVertex);
						this.result.setField(2, myVertex);
						out.collect(this.result);
					} else {
						// the current edge already holds (matching vertex, my vertex) in fields 0 and 1,
						// so it only needs the other vertex appended as field 2
						next.setField(2, otherVertex);
						out.collect(next);
					}
				}

				numEdges++;
			}
		}
	}

	/**
	 * Matches all missing edges with existing edges from input.
	 * If the missing edge for a triad is found, the triad is transformed to a triangle by adding the missing edge.
	 */
	@ConstantFieldsFirstExcept({})
	public static class CloseTriads extends JoinFunction implements Serializable {

		private static final long serialVersionUID = 1L;

		/**
		 * Forwards the triad unchanged; being called at all means the closing edge exists.
		 *
		 * @param triad       the open triangle
		 * @param missingEdge the edge that closes the triangle (not read)
		 * @param out         collector receiving the triangle
		 */
		@Override
		public void join(Record triad, Record missingEdge, Collector<Record> out) throws Exception {
			// emit triangle (the triad already contains the vertices of the missing edge)
			out.collect(triad);
		}
	}

	/**
	 * Assembles the Plan of the triangle enumeration example Pact program.
	 *
	 * @param args optional job parameters: [0] default parallelism (defaults to 1), [1] path of the input triples
	 *             (defaults to ""), [2] path of the output (defaults to "")
	 * @return the plan: edges -> Build Triads -> Close Triads (joined with the edges again) -> output
	 */
	@Override
	public Plan getPlan(String... args) {

		// parse job parameters
		int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
		String edgeInput = (args.length > 1 ? args[1] : "");
		String output = (args.length > 2 ? args[2] : "");

		FileDataSource edges = new FileDataSource(new EdgeInFormat(), edgeInput, "BTC Edges");

		// group the edges on their first vertex
		ReduceOperator buildTriads = ReduceOperator.builder(new BuildTriads(), StringValue.class, 0)
			.name("Build Triads")
			.build();

		// match fields (1, 2) of a triad against fields (0, 1) of an edge
		JoinOperator closeTriads = JoinOperator.builder(new CloseTriads(), StringValue.class, 1, 0)
			.keyField(StringValue.class, 2, 1)
			.name("Close Triads")
			.build();
		closeTriads.setParameter("INPUT_LEFT_SHIP_STRATEGY", "SHIP_REPARTITION_HASH");
		closeTriads.setParameter("INPUT_RIGHT_SHIP_STRATEGY", "SHIP_REPARTITION_HASH");
		closeTriads.setParameter("LOCAL_STRATEGY", "LOCAL_STRATEGY_HASH_BUILD_SECOND");

		FileDataSink triangles = new FileDataSink(new CsvOutputFormat(), output, "Output");
		CsvOutputFormat.configureRecordFormat(triangles)
			.recordDelimiter('\n')
			.fieldDelimiter(' ')
			.field(StringValue.class, 0)
			.field(StringValue.class, 1)
			.field(StringValue.class, 2);

		triangles.setInput(closeTriads);
		closeTriads.setSecondInput(edges);
		closeTriads.setFirstInput(buildTriads);
		buildTriads.setInput(edges);

		Plan plan = new Plan(triangles, "Enumerate Triangles");
		plan.setDefaultParallelism(numSubTasks);
		return plan;
	}

	/**
	 * Returns a short description of the parameters accepted by {@link #getPlan(String...)}.
	 *
	 * @return the parameter description
	 */
	@Override
	public String getDescription() {
		return "Parameters: [numSubStasks] [inputRDFTriples] [outputTriangles]";
	}
}