package com.esri.json.hadoop;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;

/**
 * Enumerates records from an Esri Unenclosed JSON file.
 *
 * <p>The JSON will look like this (white-space ignored):
 * <pre>
 * {                       // start record 1
 *   "attributes" : {}
 *   "geometry" : {}
 * }                       // end record 1
 * {                       // start record 2
 *   "attributes" : {}
 *   "geometry" : {}
 * }                       // end record 2
 * </pre>
 */
public class UnenclosedEsriJsonRecordReader extends UnenclosedBaseJsonRecordReader {
    static final Log LOG = LogFactory.getLog(UnenclosedEsriJsonRecordReader.class.getName());

    /** Field names that may open a record. */
    private static final String ATTRIBUTES_FIELD = "attributes";
    private static final String GEOMETRY_FIELD = "geometry";

    /** No accepted field name is longer than this, so longer candidates need not be stored. */
    private static final int MAX_FIELD_NAME_LENGTH =
            Math.max(ATTRIBUTES_FIELD.length(), GEOMETRY_FIELD.length());

    /**
     * Read-ahead limit passed to {@code inputReader.mark()} after an opening brace.
     * NOTE(review): the later {@code reset()} relies on this mark still being valid; the
     * value is assumed to cover the field name plus surrounding whitespace - confirm
     * against the reader type used by the base class.
     */
    private static final int MARK_READ_AHEAD_LIMIT = 100;

    /**
     * Creates a reader without a split; delegates to the base-class no-arg constructor.
     *
     * @throws IOException declared explicitly because the base-class constructor may throw it
     */
    public UnenclosedEsriJsonRecordReader() throws IOException {
        // explicit just to declare exception
        super();
    }

    /**
     * Creates a reader for the given split; delegates to the base-class constructor.
     *
     * @param split the input split to read
     * @param conf  the job configuration
     * @throws IOException if the base-class constructor fails
     */
    public UnenclosedEsriJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
                                          Configuration conf) throws IOException {
        //attrLabel = "attributes";
        super(split, conf);
    }

    /**
     * Given an arbitrary byte offset into an unenclosed JSON document,
     * find the start of the next record in the document.  Discard trailing
     * bytes from the previous record if we happened to seek to the middle
     * of it.
     *
     * <p>Record boundary defined as : \{\s*"(attributes|geometry)"\s*:\s*\{
     *
     * <p>On success the reader is reset to the mark taken just after the record's
     * opening brace, {@code readerPosition} is restored to the value it had at that
     * mark, and {@code firstBraceConsumed} is set to {@code true}.
     *
     * @return {@code true} if a record start was found; {@code false} if the end of
     *         the stream was reached first
     * @throws IOException if reading from, marking, or resetting the underlying reader fails
     */
    protected boolean moveToRecordStart() throws IOException {
        int next = 0;
        long resetPosition = readerPosition;

        // The case of split point exactly at whitespace between records, is
        // handled by forcing it to the split following, in the interest of
        // better balancing the splits, by consuming the whitespace in next().
        // The alternative of forcing it to the split preceding, could be
        // done like what is commented here.
        // while (next != '{' || skipDup > 0) { // skipDup>0 => record already consumed
        //     next = getChar();
        //     if (next < 0) return false;  // end of stream, no good
        //     if (next == '}') skipDup = -1;  // Definitely not
        //     else if (skipDup == 0) skipDup = 1;  // no info - Maybe so until refuted by '}'
        // }

        while (true) {
            // Scan until we reach a '{'.  If the previous attempt stopped on a '{',
            // this loop is skipped and that brace becomes the new candidate.
            while (next != '{') {
                next = getChar();
                if (next < 0) {
                    // end of stream, no good
                    return false;
                }
            }

            // Remember the position just after the candidate brace so it can be restored.
            resetPosition = readerPosition;
            inputReader.mark(MARK_READ_AHEAD_LIMIT);

            // ok last char was '{', the next non-white char must be a '"'
            next = getNonWhite();
            if (next < 0) {
                // end of stream, no good
                return false;
            }
            if (next != '"') {
                continue;
            }

            // Next should be a field name of attributes or geometry.
            // If we see another opening brace, the previous one must have been inside
            // a quoted string literal (after which the double quote we found, was a
            // closing quote mark rather than the opening quote mark) - start over.
            StringBuilder fieldName = new StringBuilder();
            boolean escaped = false;  // true when the previous char was an unescaped backslash
            while (next != '{') {
                next = getChar();
                if (next < 0) {
                    // end of stream, no good
                    return false;
                }

                // The escape state must be the one left by the PREVIOUS character;
                // recomputing it from the current character before testing for the
                // quote would make an escaped quote (\") look like a closing quote.
                if (escaped) {
                    escaped = false;
                } else if (next == '\\') {
                    escaped = true;
                } else if (next == '"') {
                    break;
                }

                // Keep consuming, but stop storing once the candidate is already longer
                // than any accepted name: it can no longer match, and this avoids
                // accumulating an arbitrarily long string literal.
                if (fieldName.length() <= MAX_FIELD_NAME_LENGTH) {
                    fieldName.append((char) next);
                }
            }

            String name = fieldName.toString();
            if (!(name.equals(ATTRIBUTES_FIELD) || name.equals(GEOMETRY_FIELD))) {
                // not the field name we were expecting, start over
                continue;
            }

            // ok last char was '"', the next non-white char must be a ':'
            next = getNonWhite();
            if (next < 0) {
                // end of stream, no good
                return false;
            }
            if (next != ':') {
                continue;
            }

            // and finally, if the next char is a {, we know for sure that this is a valid record
            next = getNonWhite();
            if (next < 0) {
                // end of stream, no good
                return false;
            }
            if (next == '{') {
                // at this point we can be sure that we have found the record boundary
                break;
            }
        }

        // Rewind to just after the record's opening brace.
        inputReader.reset();
        readerPosition = resetPosition;
        firstBraceConsumed = true;

        return true;
    }
}