package edu.cmu.graphchi.hadoop;

import edu.cmu.graphchi.ChiLogger;
import edu.cmu.graphchi.preprocessing.EdgeProcessor;
import edu.cmu.graphchi.preprocessing.FastSharder;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.*;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.tools.pigstats.PigStatusReporter;

import java.io.IOException;
import java.util.logging.Logger;

/**
 * Special Pig loader that wraps a GraphChi application. This
 * allows GraphChi programs to be executed under Hadoop/Pig.
 *
 * Generally, if you have a GraphChi application extending PigGraphChiBase,
 * you can invoke it from Pig with:
 *
 *    results = LOAD '$GRAPH' USING my.app.GraphChiApp() as (...)
 *
 * Above, $GRAPH is a path where you have stored a file in edge-list format:
 *
 *    mydata = FOREACH mygraph GENERATE from_id, to_id, edge_value;
 *    STORE mydata INTO '$GRAPH'
 *
 * For an example, see
 * @see edu.cmu.graphchi.apps.pig.PigPagerank
 */
public abstract class PigGraphChiBase extends LoadFunc implements LoadMetadata {

    private static final Logger logger = ChiLogger.getLogger("pig-graphchi-base");

    private String location;
    private boolean activeNode = false;
    private Job job;
    private boolean ready = false;
    private String status = "initializing";

    protected PigGraphChiBase() {
    }

    // Example: "(vertex:int, value:float)"
    protected abstract String getSchemaString();

    @Override
    public ResourceSchema getSchema(String str, Job job) throws IOException {
        return null;
    }

    @Override
    public ResourceStatistics getStatistics(String s, Job job) throws IOException {
        return null;
    }

    @Override
    public String[] getPartitionKeys(String s, Job job) throws IOException {
        return null; // Disable partitioning.
    }

    @Override
    public void setPartitionFilter(Expression expression) throws IOException {
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new PigTextInputFormat();
    }

    protected abstract int getNumShards();

    protected String getGraphName() {
        return "pigudfgraph";
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        logger.info("Set HDFS location for GraphChi Pig: " + location);
        PigTextInputFormat.setInputPaths(job, location);
        this.location = location;
        this.job = job;
    }

    public void setStatusString(String status) {
        this.status = status;
    }

    protected abstract void runGraphChi() throws Exception;

    protected abstract FastSharder createSharder(String graphName, int numShards) throws IOException;

    @Override
    public void prepareToRead(final RecordReader recordReader, final PigSplit pigSplit) throws IOException {
        try {
            int j = 0;
            for (String s : pigSplit.getLocations()) {
                System.out.println((j++) + ": Split: " + s);
            }
            System.out.println("Num paths: " + pigSplit.getNumPaths());
            System.out.println("Conf: " + pigSplit.getConf());
            System.out.println("Split index: " + pigSplit.getSplitIndex());

            // Report progress periodically so Hadoop does not time out the task
            // while GraphChi is preprocessing or running.
            Thread progressThread = new Thread(new Runnable() {
                public void run() {
                    int i = 0;
                    while (!ready) {
                        PigStatusReporter.getInstance().progress();
                        PigStatusReporter.getInstance().setStatus("GraphChi running (" + (i++) + "): " + getStatusString());
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException ioe) {}
                    }
                }
            });
            progressThread.start();

            if (pigSplit.getSplitIndex() > 0) {
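                // Only the mapper for the first split runs GraphChi; every other mapper
                // reports itself as redundant and exits by throwing (expected, not an error).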
PigStatusReporter.getInstance().setStatus("Redundant GraphChi-mapper - will die"); throw new RuntimeException("Split index > 0 -- this mapper will die (expected, not an error)."); } activeNode = true; Thread chiThread = new Thread(new Runnable() { public void run() { try { setStatusString("Preprocessing: reading data from HDFS: " + location); final FastSharder sharder = createSharder(getGraphName(), getNumShards()); HDFSGraphLoader hdfsLoader = new HDFSGraphLoader(location, new EdgeProcessor<Float>() { long counter = 0; public Float receiveEdge(int from, int to, String token) { try { sharder.addEdge(from, to, token); counter++; if (counter % 100000 == 0) { setStatusString("Preprocessing, read " + counter + " edges"); } } catch (IOException e) { throw new RuntimeException(e); } return null; } }); hdfsLoader.load(pigSplit.getConf()); setStatusString("Sharding..."); sharder.process(); logger.info("Starting to run GraphChi"); setStatusString("Start GraphChi engine"); runGraphChi(); logger.info("Ready."); } catch (Exception err) { err.printStackTrace(); } ready = true; }}); chiThread.start(); } catch (Exception e) { e.printStackTrace(); } } protected String getStatusString() { return this.status; } protected abstract Tuple getNextResult(TupleFactory tupleFactory) throws ExecException; @Override public Tuple getNext() throws IOException { if (!activeNode) return null; while (!ready) { logger.info("GraphChi-Java running: waiting for graphchi-engine to finish: " + this.getStatusString()); PigStatusReporter.getInstance().setStatus(getStatusString()); PigStatusReporter.getInstance().progress(); try { Thread.sleep(5000); } catch (InterruptedException ioe) { } } return getNextResult(TupleFactory.getInstance()); } }