package edu.isi.karma.mapreduce.function;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import edu.isi.karma.rdf.CommandLineArgumentParser;
/**
 * Command-line tool that dumps the values of Hadoop SequenceFiles as JSON text files.
 *
 * <p>Every non-empty file found (recursively) under {@code -filepath} is read as a
 * SequenceFile whose values are {@link Text}; the values are written either as one JSON
 * array per output file (output type {@code "0"}, the default) or as one value per line
 * (any other output type). With {@code -splits N} the records of each input file are
 * distributed round-robin over N output files.
 */
public class CreateJSONFromSequenceFile {

    /** Input directory (or file), taken from the {@code -filepath} option. */
    static String filePath = null;

    /** Output directory; falls back to {@link #filePath} when {@code -outputpath} is absent. */
    static String outputPath = null;

    /** {@code "0"} writes a JSON array (default); any other value writes one record per line. */
    static String outputtype = "0";

    /** Marker meaning that {@code -splits} was not supplied: one output file per input file. */
    private static final int NO_SPLITS = 0;

    /**
     * Parses the command line and converts every non-empty file below {@code -filepath}.
     *
     * @param args command-line arguments, see {@link #createCommandLineOptions()}
     * @throws IOException if listing, reading or writing fails
     * @throws IllegalArgumentException if {@code -filepath} is missing or {@code -splits} is
     *         not a positive integer
     * @throws InstantiationException if a key class cannot be instantiated
     * @throws IllegalAccessException if a key class has no accessible no-arg constructor
     * @throws ClassNotFoundException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InstantiationException, IllegalAccessException, ClassNotFoundException {
        Options options = createCommandLineOptions();
        CommandLine cl = CommandLineArgumentParser.parse(args, options, CreateJSONFromSequenceFile.class.getSimpleName());
        if (cl == null) {
            return;
        }

        filePath = cl.getOptionValue("filepath");
        if (filePath == null) {
            // Fail with a clear message instead of the "null string" error from Path.
            throw new IllegalArgumentException("Missing required option -filepath");
        }
        outputPath = cl.hasOption("outputpath") ? cl.getOptionValue("outputpath") : filePath;
        if (cl.hasOption("outputtype")) {
            outputtype = cl.getOptionValue("outputtype");
        }

        // Parse -splits once; it does not change between input files.
        int splits = NO_SPLITS;
        if (cl.hasOption("splits")) {
            splits = Integer.parseInt(cl.getOptionValue("splits"));
            if (splits < 1) {
                throw new IllegalArgumentException("-splits must be a positive integer, got " + splits);
            }
        }

        FileSystem hdfs = FileSystem.get(new Configuration());

        // Snapshot the listing before creating any output: when the output directory is the
        // input directory, freshly created .json files must not be picked up as inputs.
        List<LocatedFileStatus> inputs = new ArrayList<>();
        RemoteIterator<LocatedFileStatus> listing = hdfs.listFiles(new Path(filePath), true);
        while (listing.hasNext()) {
            LocatedFileStatus status = listing.next();
            if (status.getLen() > 0) {
                inputs.add(status);
            }
        }

        for (LocatedFileStatus status : inputs) {
            // Path.SEPARATOR is the Hadoop path separator; File.separator is the local OS one.
            String outputBaseName = outputPath + Path.SEPARATOR + status.getPath().getName();
            List<FSDataOutputStream> streams = openOutputStreams(hdfs, outputBaseName, splits);
            // The callee closes the streams, on success as well as on failure.
            createJSONFromSequenceFileFrom(status.getPath(), streams);
        }
    }

    /**
     * Creates the output stream(s) for one input file. If creating one of them fails, the
     * streams that were already opened are closed before the exception is rethrown.
     *
     * @param fs file system to create the files on
     * @param baseName output path without the {@code .json} / {@code .<i>.json} suffix
     * @param splits number of output files, or {@link #NO_SPLITS} for a single file
     * @return the opened streams, in split order
     * @throws IOException if a file cannot be created
     */
    private static List<FSDataOutputStream> openOutputStreams(FileSystem fs, String baseName, int splits) throws IOException {
        List<FSDataOutputStream> streams = new ArrayList<>();
        try {
            if (splits == NO_SPLITS) {
                streams.add(fs.create(new Path(baseName + ".json")));
            } else {
                for (int i = 0; i < splits; i++) {
                    streams.add(fs.create(new Path(baseName + "." + i + ".json")));
                }
            }
            return streams;
        } catch (IOException | RuntimeException e) {
            for (FSDataOutputStream stream : streams) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // The creation failure is the error worth reporting.
                }
            }
            throw e;
        }
    }

    /**
     * Writes the values of one SequenceFile to the given streams as UTF-8 text, distributing
     * the records round-robin over the streams. All streams are closed before this method
     * returns, whether it completes normally or throws.
     *
     * <p>When {@link #outputtype} is {@code "0"} each stream receives a JSON array
     * ({@code [}, comma-separated values, {@code ]}); otherwise each value is written on its
     * own line without brackets or commas.
     *
     * @param input path of the SequenceFile to read; its values must be {@link Text}
     * @param streams output streams to write to; must not be empty
     * @throws IOException if reading fails or any output stream reports a write error
     * @throws IllegalArgumentException if {@code streams} is empty
     * @throws InstantiationException if the key class cannot be instantiated
     * @throws IllegalAccessException if the key class has no accessible no-arg constructor
     * @throws ClassNotFoundException not thrown by this implementation; kept in the
     *         signature so existing callers continue to compile
     */
    public static void createJSONFromSequenceFileFrom(Path input, List<FSDataOutputStream> streams) throws IOException, InstantiationException, IllegalAccessException, ClassNotFoundException {
        if (streams.isEmpty()) {
            throw new IllegalArgumentException("At least one output stream is required");
        }
        boolean jsonArray = "0".equals(outputtype);
        Configuration conf = new Configuration();

        // Encode explicitly as UTF-8; PrintWriter(OutputStream) would use the platform charset.
        List<PrintWriter> writers = new ArrayList<>(streams.size());
        for (FSDataOutputStream stream : streams) {
            writers.add(new PrintWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8)));
        }
        int writerCount = writers.size();

        boolean writeFailed;
        try {
            if (jsonArray) {
                for (PrintWriter writer : writers) {
                    writer.write("[\n");
                }
            }

            try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
                // Instantiate the key class reported by the reader directly; looking it up
                // again by canonical name breaks for nested classes.
                Writable key = (Writable) reader.getKeyClass().newInstance();
                Text value = new Text();
                long recordIndex = 0;
                while (reader.next(key, value)) {
                    PrintWriter writer = writers.get((int) (recordIndex % writerCount));
                    // The first record of each writer gets no separator; later ones do.
                    if (jsonArray && recordIndex >= writerCount) {
                        writer.write(",\n");
                    }
                    writer.write(value.toString());
                    writer.write("\n");
                    recordIndex++;
                }
            }

            if (jsonArray) {
                for (PrintWriter writer : writers) {
                    writer.write("\n]\n");
                }
            }
        } finally {
            writeFailed = closeWriters(writers);
        }

        if (writeFailed) {
            throw new IOException("Failed to write JSON output for " + input);
        }
    }

    /**
     * Closes every writer and reports whether any of them recorded an I/O error.
     * {@link PrintWriter} never throws on write or close, so its error flag is the only
     * way to notice a failed write.
     *
     * @param writers writers to close
     * @return {@code true} if at least one writer reported an error
     */
    private static boolean closeWriters(List<PrintWriter> writers) {
        boolean anyError = false;
        for (PrintWriter writer : writers) {
            writer.close();
            if (writer.checkError()) {
                anyError = true;
            }
        }
        return anyError;
    }

    /**
     * Builds the command-line options understood by {@link #main(String[])}:
     * {@code filepath}, {@code outputpath}, {@code splits}, {@code outputtype} and
     * {@code help}.
     *
     * @return the option set
     */
    private static Options createCommandLineOptions() {
        Options options = new Options();
        options.addOption(new Option("filepath", "filepath", true, "location of the input file directory"));
        options.addOption(new Option("outputpath", "outputpath", true, "location of output file directory"));
        options.addOption(new Option("splits", "splits", true, "number of splits per file"));
        options.addOption(new Option("outputtype", "outputtype", true, "0 for JSON Array or 1 for json lines"));
        options.addOption(new Option("help", "help", false, "print this message"));
        return options;
    }
}