package edu.isi.karma.mapreduce.driver;
import java.io.File;
import java.io.FileInputStream;
import java.util.Properties;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class InputFileDirectoryLoader extends Configured implements Tool{
private static Logger logger = LoggerFactory.getLogger(InputFileDirectoryLoader.class);
public void configure(Properties p) throws Exception {
Configuration conf = getConf();
conf.setIfUnset("fs.default.name", p.getProperty("fs.default.name"));
}
public int run(String[] args) throws Exception {
// Configuration processed by ToolRunner
Properties p = new Properties();
p.load(new FileInputStream(new File(args[0])));
configure(p);
String inputDirectoryName = p.getProperty("input.directory");
File f = new File(inputDirectoryName);
if(!f.exists() || !f.isDirectory()){
logger.error("Invalid input directory: " + inputDirectoryName);
return -1;
}
String outputFileName = p.getProperty("output.file");
Path outputPath = new Path(outputFileName);
SequenceFile.Writer writer = SequenceFile.createWriter(getConf(),Writer.keyClass(Text.class),
Writer.valueClass(Text.class), Writer.file(outputPath));
for(File document : f.listFiles())
{
if(document.isFile())
{
String contents = FileUtils.readFileToString(document);
writer.append(new Text(document.getName()), new Text(contents));
}
}
writer.close();
return 0;
}
public static void main(String[] args) throws Exception {
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new InputFileDirectoryLoader(), args);
System.exit(res);
}
}