package edu.cmu.graphchi.hadoop;
import edu.cmu.graphchi.ChiLogger;
import edu.cmu.graphchi.preprocessing.EdgeProcessor;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
/**
 * Loads a graph from HDFS edge by edge and calls
 * a callback for each edge. Used by the Pig-integration:
 * <p>
 * The configured location may be a single file or a directory. For a
 * directory, only entries whose name starts with {@code part-} are read.
 * Each input line is split on runs of tabs, spaces or commas; the first two
 * tokens are parsed as integer vertex ids and handed to the
 * {@link EdgeProcessor}.
 *
 * @see edu.cmu.graphchi.hadoop.PigGraphChiBase
 */
public class HDFSGraphLoader {

    private static final Logger logger = ChiLogger.getLogger("hdfs-graph-loader");

    /** Token separator: a run of tabs, a run of spaces, or a run of commas. Compiled once. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("(\t)+|( )+|(,)+");

    private final EdgeProcessor edgeProcessor;
    private final String hdfsLocation;

    /**
     * @param hdfsLocation  path of the file or directory to read
     * @param edgeProcessor callback that receives every successfully parsed edge
     */
    public HDFSGraphLoader(String hdfsLocation, EdgeProcessor edgeProcessor) {
        this.edgeProcessor = edgeProcessor;
        this.hdfsLocation = hdfsLocation;
    }

    /**
     * Reads the configured location and feeds every parsed edge to the edge processor.
     *
     * @param conf Hadoop configuration used to obtain the {@link FileSystem}
     * @throws Exception if the file system cannot be obtained, the location cannot
     *                   be listed, or reading a file fails
     */
    public void load(Configuration conf) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path inputPath = new Path(hdfsLocation);

        if (fs.isFile(inputPath)) {
            processFile(fs, inputPath);
            return;
        }

        FileStatus[] dirFiles = fs.listStatus(inputPath);
        if (dirFiles == null) {
            // NOTE(review): some Hadoop releases appear to return null here instead of
            // throwing when the path is missing; fail with a clear message, not an NPE.
            throw new IOException("Cannot list HDFS location: " + inputPath);
        }
        for (FileStatus status : dirFiles) {
            String entryName = status.getPath().getName();
            logger.info("Dir entry: " + status.toString() + " " + entryName);
            if (entryName.startsWith("part-")) {
                processFile(fs, status.getPath());
            }
        }
    }

    /**
     * Reads one file line by line and forwards each edge to the edge processor.
     * <p>
     * Lines starting with {@code #} and lines with fewer than two tokens are
     * skipped. Lines whose first two tokens are not integers are logged as a
     * warning and skipped. The third token is forwarded only when the line has
     * exactly three tokens; otherwise {@code null} is passed.
     *
     * @param fs        file system to open the file on
     * @param inputPath file to read
     * @throws IOException if opening or reading the file fails
     */
    private void processFile(FileSystem fs, Path inputPath) throws IOException {
        logger.info("Process: " + inputPath);
        FSDataInputStream in = fs.open(inputPath);
        try {
            // Explicit charset so decoding does not depend on the JVM's platform default.
            BufferedReader rd = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String ln;
            while ((ln = rd.readLine()) != null) {
                if (ln.startsWith("#")) {
                    continue;
                }
                String[] tok = TOKEN_PATTERN.split(ln);
                if (tok.length < 2) {
                    continue;
                }
                try {
                    int from = Integer.parseInt(tok[0]);
                    int to = Integer.parseInt(tok[1]);
                    edgeProcessor.receiveEdge(from, to, tok.length == 3 ? tok[2] : null);
                } catch (NumberFormatException nfe) {
                    // Route the stack trace through the logger instead of stderr.
                    logger.log(Level.WARNING, "Number format exceptions on line: " + ln, nfe);
                }
            }
        } finally {
            // Always release the HDFS stream, including when reading or the callback fails.
            in.close();
        }
    }
}