package com.esri.json.hadoop;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
/**
*
* Enumerates records from an Unenclosed GeoJSON file
*
*/
/*
* The JSON will look like this (white-space ignored)
*
* { // start record 1
* "type" : ""
* "properties" : {}
* "geometry" : {}
* } // end record 1
* { // start record 2
* "type" : ""
* "properties" : {}
* "geometry" : {}
* } // end record 2
*/
public class UnenclosedGeoJsonRecordReader extends UnenclosedBaseJsonRecordReader {
static final Log LOG = LogFactory.getLog(UnenclosedGeoJsonRecordReader.class.getName());
public UnenclosedGeoJsonRecordReader() throws IOException { // explicit just to declare exception
super();
}
public UnenclosedGeoJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
Configuration conf) throws IOException {
//attrLabel = "properties";
super(split, conf);
}
// Record boundary defined as : \{\s*"(properties|geometry)"\s*:\s*\{
protected boolean moveToRecordStart() throws IOException {
// The case of split point exactly at whitespace between records, is
// handled by forcing it to the split following, in the interest of
// better balancing the splits, by consuming the whitespace in next().
// The alternative of forcing it to the split preceding, could be
// done like what is commented here.
// while (next != '{' || skipDup > 0) { // skipDup>0 => record already consumed
// next = getChar();
// if (next < 0) return false; // end of stream, no good
// if (next == '}') skipDup = -1; // Definitely not
// else if (skipDup == 0) skipDup = 1; // no info - Maybe so until refuted by '}'
// }
final int START=0, BRACE=1, TYPE=2, FOUND=3, FAIL=4;
int next = 0, state = START;
long resetPosition = readerPosition;
boolean inEscape = false;
String fieldName = "";
while (true) {
switch(state) {
case START:
// scan until we reach a {
while (next != '{') {
next = getChar();
// end of stream, no good
if (next < 0) {
return false;
}
}
resetPosition = readerPosition;
inputReader.mark(100);
// ok last char was '{', skip till we get to a '"'
next = getNonWhite();
if (next < 0) { // end of stream, no good
return false;
}
if (next == '"') {
state = BRACE;
} // else redo START
break;
case BRACE:
fieldName = "";
// Next should be a field name of "geometry" or "properties" or "type".
// If we see another opening brace, the previous one must have been inside
// a quoted string literal (after which the double quote we found, was a
// closing quote mark rather than the opening quote mark) - start over.
while (next != '{') {
next = getChar();
if (next < 0) { // end of stream, no good
return false;
}
inEscape = (!inEscape && next == '\\');
if (!inEscape && next == '"') {
break;
}
fieldName += (char)next;
}
if (fieldName.equals("properties") || fieldName.equals("geometry")) {
// ok last char was '"', skip till we get to a ':'
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
if (next != ':') {
state = START;
} else {
// and finally, if the next char is a {, we know for sure that this is a valid record
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
state = (next == '{') ? FOUND : START;
}
} else if (fieldName.equals("type")) {
state = TYPE;
} else {
// not a field name we were expecting, start over
state = START;
}
break;
case TYPE: // expect :"Feature","
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
if (next != ':') {
state = START;
} else {
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
if (next != '"') {
state = START;
} else {
fieldName = "";
while (true) {
if ((next = getChar()) < 0) return false; // end of stream, no good
inEscape = (!inEscape && next == '\\');
if (!inEscape && next == '"') break;
fieldName += (char)next;
}
if (!"feature".equals(fieldName.toLowerCase())) {
state = START;
} else {
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
if (next != ',') {
state = START;
} else {
if ((next = getNonWhite()) < 0) { // end of stream, no good
return false;
}
state = (next == '"') ? BRACE : START;
}
}
}
}
break;
case FOUND:
inputReader.reset();
readerPosition = resetPosition;
firstBraceConsumed = true;
return true;
case FAIL:
return false;
default:
throw new RuntimeException("Internal error");
}
}
}
}