package edu.isi.karma.mapreduce.function; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.SequenceFile.Writer; import org.apache.hadoop.io.Text; import org.json.JSONException; import org.json.JSONObject; import org.json.JSONTokener; import edu.isi.karma.rdf.CommandLineArgumentParser; public class CreateSequenceFile { Map<String, Writer> writers = new ConcurrentHashMap<>(); boolean useKey = true; boolean outputFileName = false; String filePath = null; String outputPath = null; public static void main(String[] args) throws IOException { CreateSequenceFile csf = new CreateSequenceFile(); csf.setup(args); csf.execute(); } public void execute() throws IOException, FileNotFoundException { ExecutorService executor = Executors.newFixedThreadPool(4); FileSystem hdfs = FileSystem.get(new Configuration()); RemoteIterator<LocatedFileStatus> itr = hdfs.listFiles(new Path(filePath), true); List<Future<Boolean>> results = new LinkedList<>(); while (itr.hasNext()) { LocatedFileStatus status = itr.next(); String fileName = status.getPath().getName(); if (fileName.substring(fileName.lastIndexOf(".") + 1).contains("json")) { results.add(executor.submit(getNewJSONProcessor(hdfs, status, fileName))); } } for(Future<Boolean> result : results) { try { result.get(5, TimeUnit.MINUTES); } catch (InterruptedException | ExecutionException | TimeoutException e) { // TODO Auto-generated catch block e.printStackTrace(); } } executor.shutdown(); for(SequenceFile.Writer writer : writers.values()) { writer.close(); } } protected JSONFileProcessor getNewJSONProcessor(FileSystem hdfs, LocatedFileStatus status, String fileName) throws IOException { return new JSONFileProcessor(hdfs.open(status.getPath()), fileName); } public void setup(String[] args) { Options options = createCommandLineOptions(); CommandLine cl = CommandLineArgumentParser.parse(args, options, CreateSequenceFile.class.getSimpleName()); if(cl == null) { return; } filePath = (String) cl.getOptionValue("filepath"); outputPath = filePath; useKey = Boolean.parseBoolean((String) cl.getOptionValue("usekey")); outputFileName = Boolean.parseBoolean((String) cl.getOptionValue("outputfilename")); if (cl.hasOption("outputpath")) { outputPath = (String) cl.getOptionValue("outputpath"); } } protected class JSONFileProcessor implements Callable<Boolean> { protected InputStream stream; protected String fileName; protected String defaultOutputFileName; public JSONFileProcessor(InputStream stream, String fileName) { this.stream = stream; this.fileName = fileName; defaultOutputFileName = outputPath + File.separator +fileName.substring(0, fileName.lastIndexOf(".")) + ".seq"; } @Override public Boolean call() throws Exception { JSONTokener tokener = new JSONTokener(new InputStreamReader(stream, "UTF-8")); addValuesToSequenceFile(tokener); return true; } public SequenceFile.Writer createSequenceFile(Path outputPath) throws IOException { SequenceFile.Writer writer = null; if(useKey) { writer = SequenceFile.createWriter(new Configuration(),Writer.keyClass(Text.class), Writer.valueClass(Text.class), Writer.file(outputPath),Writer.compression(CompressionType.NONE)); } else { writer = SequenceFile.createWriter(new Configuration(),Writer.keyClass(BytesWritable.class), Writer.valueClass(Text.class), Writer.file(outputPath),Writer.compression(CompressionType.NONE)); } return writer; } public void addValuesToSequenceFile(JSONTokener tokener) throws JSONException, IOException { char c = tokener.nextClean(); if (c == '[') { while (!tokener.end()) { Object o = tokener.nextValue(); if (o instanceof JSONObject) { JSONObject obj = (JSONObject) o; SequenceFile.Writer writer = getWriter(obj); if(useKey) { if(outputFileName) { writer.append(new Text(fileName), new Text(obj.toString())); } else { writer.append(new Text(obj.getString("@id")), new Text(obj.toString())); } } else { writer.append(new BytesWritable(), new Text(obj.toString())); } } char tmp = tokener.nextClean(); if (tmp == ']') break; } } else if(c == '{') { tokener.back(); Object o = tokener.nextValue(); if (o instanceof JSONObject) { JSONObject obj = (JSONObject) o; SequenceFile.Writer writer = getWriter(obj); if(useKey) { if(outputFileName) { writer.append(new Text(fileName), new Text(obj.toString())); } else { writer.append(new Text(obj.getString("@id")), new Text(obj.toString())); } } else { writer.append(new BytesWritable(), new Text(obj.toString())); } } } } public SequenceFile.Writer getWriter(JSONObject obj) throws IOException { if(!writers.containsKey(defaultOutputFileName)) { Path outputPath = new Path(defaultOutputFileName); synchronized(writers) { writers.put(defaultOutputFileName, createSequenceFile(outputPath)); } } return writers.get(defaultOutputFileName); } } private static Options createCommandLineOptions() { Options options = new Options(); options.addOption(new Option("filepath", "filepath", true, "location of the input file directory")); options.addOption(new Option("usekey", "usekey", true,"whether use key for sequence file")); options.addOption(new Option("outputfilename", "outputfilename",true, "whether output file name as key")); options.addOption(new Option("outputpath", "outputpath", true, "location of output file directory")); options.addOption(new Option("help", "help", false, "print this message")); return options; } }