package com.esri.json.hadoop;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Enumerates records from an unenclosed JSON file - use either the Esri JSON or GeoJSON subclass.
 *
 * <p>A "record" is a top-level <code>{ ... }</code> object; records are located purely by
 * counting braces outside of string literals (no JSON parser is involved). The class implements
 * both the <code>mapreduce</code> and the legacy <code>mapred</code> record-reader interfaces.
 *
 * <p>Positions tracked by this class ({@link #readerPosition} and the record keys) are counted in
 * characters read from the decoded stream, while {@link #start} and {@link #end} are taken from
 * the split's byte offsets; the two only coincide exactly for single-byte content.
 */
public abstract class UnenclosedBaseJsonRecordReader extends RecordReader<LongWritable, Text> implements
        org.apache.hadoop.mapred.RecordReader<LongWritable, Text> {

    static final Log LOG = LogFactory.getLog(UnenclosedBaseJsonRecordReader.class.getName());

    /** Character stream over the whole file; opened in {@link #commonInit(Path, Configuration)}. */
    protected BufferedReader inputReader;

    /** Key/value holders used by the mapreduce-style accessors ({@link #nextKeyValue()}). */
    protected LongWritable mkey = null;
    protected Text mval = null;

    /** Number of characters consumed so far, measured from the beginning of the file. */
    protected long readerPosition;

    /** Split boundaries: {@code start} inclusive, {@code end} = start + split length. */
    protected long start, end;

    /**
     * True when the opening '{' of the next record has already been read from the stream.
     * Presumably also set by {@link #moveToRecordStart()} implementations - confirm in subclasses.
     */
    protected boolean firstBraceConsumed = false;

    /**
     * Constructor for the mapreduce API; the split is supplied later through
     * {@link #initialize(InputSplit, TaskAttemptContext)}.
     *
     * @throws IOException declared for subclasses; not thrown by this implementation
     */
    protected UnenclosedBaseJsonRecordReader() throws IOException {
        mkey = createKey();
        mval = createValue();
    }

    /**
     * Constructor for the legacy mapred API: opens the split's file immediately.
     *
     * @param split must be an {@code org.apache.hadoop.mapred.FileSplit}
     * @param conf  configuration used to resolve the file system
     * @throws IOException if the file cannot be opened or positioned
     */
    protected UnenclosedBaseJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
                                             Configuration conf) throws IOException {
        org.apache.hadoop.mapred.FileSplit fileSplit = (org.apache.hadoop.mapred.FileSplit) split;
        start = fileSplit.getStart();
        end = fileSplit.getLength() + start;
        Path filePath = fileSplit.getPath();
        commonInit(filePath, conf);
    }

    /** Closes the underlying reader, if one was opened. */
    @Override
    public void close() throws IOException {
        if (inputReader != null) {
            inputReader.close();
        }
    }

    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    @Override
    public Text createValue() {
        return new Text();
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return mkey;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return mval;
    }

    /** @return the number of characters consumed so far (see {@link #readerPosition}) */
    @Override
    public long getPos() throws IOException {
        return readerPosition;
    }

    /**
     * Reports how far through the split the reader has advanced.
     *
     * @return a value in the range 0.0 to 1.0; 0.0 for an empty (zero-length) split
     */
    @Override
    public float getProgress() throws IOException {
        if (end == start) {
            // Zero-length split: avoid 0/0 (NaN) or x/0 (Infinity).
            return 0.0f;
        }
        // The last record of a split may run past 'end', so cap the ratio at 1.0.
        return Math.min(1.0f, (readerPosition - start) / (float) (end - start));
    }

    /**
     * mapreduce-API initialization: records the split boundaries and opens the file.
     *
     * @param split       must be a {@code org.apache.hadoop.mapreduce.lib.input.FileSplit}
     * @param taskContext supplies the configuration used to resolve the file system
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext taskContext)
            throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        start = fileSplit.getStart();
        end = fileSplit.getLength() + start;
        Path filePath = fileSplit.getPath();
        commonInit(filePath, taskContext.getConfiguration());
    }

    /**
     * Reads the next record into {@code value} and stores the character offset of its opening
     * '{' in {@code key}.
     *
     * @param key   receives the offset of the record's first '{'
     * @param value receives the record text, from the opening '{' through the matching '}'
     * @return true if a complete record was read; false at end of split, at end of file, or
     *         when an unmatched '}' is encountered
     * @throws IOException if reading from the underlying stream fails
     */
    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        /*
         * NOTE : we are not using a JSONParser, so this will not validate JSON structure aside from correct counts of '{' and '}'
         * The fact that it may handle some invalid JSON, does not imply that we support invalid JSON;
         * rather, updates to the code may require valid JSON in order to locate record boundaries.
         *
         * We will count '{' and '}' to find the beginning and end of each record, while ignoring braces in string literals.
         */
        int chr = 0;
        int brace_depth = 0;
        char lit_char = 0;   // 0 when outside a string literal, else the quote char that opened it
        boolean first_brace_found = false;

        // The case of split point exactly at whitespace between records,
        // is handled by forcing the record following to the split following,
        // in the interest of better balancing the splits, by consuming the
        // whitespace before checking the end of the split.
        if (!firstBraceConsumed) { // That should only ever be true on the very first read in the split
            chr = getNonWhite();
            firstBraceConsumed = (chr == '{');
        }

        // A record belongs to this split only if its opening brace lies before 'end'.
        if (readerPosition + (firstBraceConsumed ? 0 : 1) > end) {
            return false;
        }

        StringBuilder sb = new StringBuilder(2000);

        if (firstBraceConsumed) {
            // first open brace was consumed already;
            // update initial state accordingly
            brace_depth = 1;
            sb.append("{");
            first_brace_found = true;
            firstBraceConsumed = false;
            key.set(readerPosition - 1);
        }

        boolean inEscape = false;   // true when the previous char was an unescaped '\' inside a literal

        while (brace_depth > 0 || !first_brace_found) {
            chr = getChar();

            if (chr < 0) {
                if (first_brace_found) {
                    // last record was invalid
                    LOG.error("Parsing error : EOF occured before record ended");
                }
                return false;
            }

            switch (chr) {
                case '\\':
                    // A backslash only starts an escape inside a literal; "\\\\" toggles it back off.
                    inEscape = (lit_char != 0 && !inEscape);
                    break;
                case '"':
                case '\'':
                    if (lit_char == 0) {
                        lit_char = (char) chr; // mark start literal (double/single quote)
                    } else if (inEscape) {
                        inEscape = false;
                    } else if (lit_char == chr) {
                        lit_char = 0; // mark end literal (double/single-quote)
                    }
                    // ignored because we found a ' inside a " " block quote (or vice versa)
                    break;
                case '{':
                    if (inEscape) {
                        inEscape = false;
                    } else if (lit_char == 0) { // not in string literal,
                        brace_depth++; // so increase brace depth
                        if (!first_brace_found) {
                            first_brace_found = true;
                            key.set(readerPosition - 1); // set record key to the char offset of the first '{'
                        }
                    }
                    break;
                case '}':
                    if (inEscape) {
                        inEscape = false;
                    } else if (lit_char == 0) { // not in string literal,
                        brace_depth--; // so decrease brace depth
                    }
                    break;
                default:
                    inEscape = false;
                    break;
            }

            if (brace_depth < 0) {
                // found more '}'s than we did '{'s
                LOG.error("Parsing error : no '{' - unmatched '}' in record");
                return false;
            }

            // Characters before the record's opening brace are discarded.
            if (first_brace_found) {
                sb.append((char) chr);
            }
        }

        // no '{' found before EOF. Not an error as this could mean that there is extra white-space at the end
        if (!first_brace_found) {
            return false;
        }

        value.set(sb.toString());
        return true;
    }

    /** mapreduce-API iteration step; delegates to {@link #next(LongWritable, Text)}. */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return next(mkey, mval);
    }

    /**
     * Opens {@code filePath} and positions the reader for this split. Expects {@link #start}
     * to have been set by the caller.
     *
     * @param filePath file backing the split
     * @param conf     configuration used to resolve the file system
     * @throws IOException if the file cannot be opened, skipped or scanned
     */
    private void commonInit(Path filePath, Configuration conf) throws IOException {
        readerPosition = start;

        FileSystem fs = filePath.getFileSystem(conf);

        // Decode explicitly as UTF-8 rather than relying on the JVM's platform default charset.
        inputReader = new BufferedReader(new InputStreamReader(fs.open(filePath), "UTF-8"));

        if (start != 0) {
            // split starts inside the json
            inputReader.skip(start);
            moveToRecordStart();
        }
    }

    /**
     * Reads one character and advances {@link #readerPosition}; the position is advanced
     * even when end of stream is reached.
     *
     * @return the character read, or -1 at end of stream
     */
    protected int getChar() throws IOException {
        int ch = inputReader.read();
        readerPosition++;
        return ch;
    }

    /**
     * Skips whitespace and returns the first non-whitespace character.
     *
     * @return the first non-whitespace character, or -1 at end of stream
     */
    protected int getNonWhite() throws IOException {
        int ch;
        do {
            ch = getChar();
        } while (Character.isWhitespace((char) ch));
        return ch;
    }

    /**
     * Given an arbitrary byte offset into an unenclosed JSON document,
     * find the start of the next record in the document.  Discard trailing
     * bytes from the previous record if we happened to seek to the middle
     * of it.
     *
     * @return implementation-defined; this base class ignores the result
     * @throws IOException
     */
    protected abstract boolean moveToRecordStart() throws IOException;
}