package hip.ch3.json;

import hip.util.HadoopCompat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * An InputFormat for line-oriented JSON: each input line must contain exactly
 * one JSON object. Keys are the byte offsets supplied by the underlying
 * {@link LineRecordReader}; values are {@link MapWritable} instances mapping
 * each top-level JSON field name to its string value.
 */
public class JsonInputFormat extends FileInputFormat<LongWritable, MapWritable> {

  @Override
  public RecordReader<LongWritable, MapWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    return new JsonRecordReader();
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    // Compressed files can't be processed in parallel chunks (the codec must
    // decompress from the start of the stream), so only allow splitting when
    // no compression codec applies to this file.
    CompressionCodec codec =
        new CompressionCodecFactory(HadoopCompat.getConfiguration(context))
            .getCodec(file);
    return codec == null;
  }

  public static class JsonRecordReader
      extends RecordReader<LongWritable, MapWritable> {

    private static final Logger LOG =
        LoggerFactory.getLogger(JsonRecordReader.class);

    // Delegate line reading (split boundaries, offsets, decompression) to
    // Hadoop's stock LineRecordReader and layer JSON decoding on top of it.
    private final LineRecordReader reader = new LineRecordReader();
    private final MapWritable value = new MapWritable();
    private final JSONParser jsonParser = new JSONParser();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      reader.initialize(split, context);
    }

    @Override
    public synchronized void close() throws IOException {
      reader.close();
    }

    @Override
    public LongWritable getCurrentKey()
        throws IOException, InterruptedException {
      return reader.getCurrentKey();
    }

    @Override
    public MapWritable getCurrentValue()
        throws IOException, InterruptedException {
      return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return reader.getProgress();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      // Skip lines that fail to parse rather than aborting the task; the
      // failures are logged in decodeLineToJson.
      while (reader.nextKeyValue()) {
        value.clear();
        if (decodeLineToJson(jsonParser, reader.getCurrentValue(), value)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Parses a single line of JSON and copies each top-level field into the
     * supplied MapWritable as Text key/value pairs. Null field values are
     * stored as empty Text.
     *
     * @return true if the line was valid JSON, false otherwise
     */
    public static boolean decodeLineToJson(JSONParser parser, Text line,
                                           MapWritable value) {
      LOG.debug("Got string '{}'", line);
      try {
        JSONObject jsonObj = (JSONObject) parser.parse(line.toString());
        for (Object key : jsonObj.keySet()) {
          Text mapKey = new Text(key.toString());
          Text mapValue = new Text();
          if (jsonObj.get(key) != null) {
            mapValue.set(jsonObj.get(key).toString());
          }
          value.put(mapKey, mapValue);
        }
        return true;
      } catch (ParseException e) {
        LOG.warn("Could not json-decode string: " + line, e);
        return false;
      } catch (NumberFormatException e) {
        LOG.warn("Could not parse field into number: " + line, e);
        return false;
      }
    }
  }
}
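
/*
 * Minimal usage sketch (not part of the original class): wires
 * JsonInputFormat into a map-only job. The JsonInputFormatExample and
 * FieldMapper names, and the positional input/output path arguments, are
 * hypothetical stand-ins added here only for illustration. For an input line
 * such as {"name": "alice", "age": 33}, FieldMapper would emit the pairs
 * (name, alice) and (age, 33).
 */
class JsonInputFormatExample {

  // Hypothetical mapper: emits one (field-name, field-value) pair per
  // top-level JSON field of every record. MapWritable keys/values are Text
  // here because that is what JsonRecordReader puts into the map.
  static class FieldMapper extends
      org.apache.hadoop.mapreduce.Mapper<LongWritable, MapWritable, Text, Text> {
    @Override
    protected void map(LongWritable key, MapWritable value, Context context)
        throws IOException, InterruptedException {
      for (java.util.Map.Entry<org.apache.hadoop.io.Writable,
          org.apache.hadoop.io.Writable> entry : value.entrySet()) {
        context.write((Text) entry.getKey(), (Text) entry.getValue());
      }
    }
  }

  public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf =
        new org.apache.hadoop.conf.Configuration();
    org.apache.hadoop.mapreduce.Job job =
        org.apache.hadoop.mapreduce.Job.getInstance(conf, "json-example");
    job.setJarByClass(JsonInputFormatExample.class);
    job.setInputFormatClass(JsonInputFormat.class);
    job.setMapperClass(FieldMapper.class);
    job.setNumReduceTasks(0); // map-only: mapper output is written directly
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
        .setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}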