package com.esri.json.hadoop;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Enumerates records from an Unenclosed JSON file - use either Esri JSON or GeoJSON subclass.
 *
 * <p>Implements both the mapreduce and the (deprecated) mapred RecordReader APIs so it can be
 * used from either framework. Records are located by counting '{' and '}' characters while
 * skipping braces that occur inside string literals; no JSON validation beyond brace balance
 * is performed.
 */
public abstract class UnenclosedBaseJsonRecordReader extends RecordReader<LongWritable, Text>
		implements org.apache.hadoop.mapred.RecordReader<LongWritable, Text> {

	static final Log LOG = LogFactory.getLog(UnenclosedBaseJsonRecordReader.class.getName());

	protected BufferedReader inputReader;
	protected LongWritable mkey = null;
	protected Text mval = null;
	// Character offset (from file start) of the next character to be read.
	// NOTE(review): offsets are counted in chars read from the Reader, while split
	// start/end come from FileSplit in bytes — these only agree for single-byte
	// characters; multi-byte UTF-8 input would skew positions (pre-existing limitation).
	protected long readerPosition;
	// Split boundaries: this reader emits records whose first '{' lies in [start, end).
	protected long start, end;
	// True when the opening '{' of the next record was already consumed while
	// probing past whitespace (see next()); only ever set on the first read of a split.
	protected boolean firstBraceConsumed = false;

	protected UnenclosedBaseJsonRecordReader() throws IOException {
		mkey = createKey();
		mval = createValue();
	}

	protected UnenclosedBaseJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
			Configuration conf) throws IOException {
		org.apache.hadoop.mapred.FileSplit fileSplit = (org.apache.hadoop.mapred.FileSplit) split;
		start = fileSplit.getStart();
		end = fileSplit.getLength() + start;
		Path filePath = fileSplit.getPath();
		commonInit(filePath, conf);
	}

	@Override
	public void close() throws IOException {
		if (inputReader != null)
			inputReader.close();
	}

	@Override
	public LongWritable createKey() {
		return new LongWritable();
	}

	@Override
	public Text createValue() {
		return new Text();
	}

	@Override
	public LongWritable getCurrentKey() throws IOException, InterruptedException {
		return mkey;
	}

	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return mval;
	}

	@Override
	public long getPos() throws IOException {
		return readerPosition;
	}

	@Override
	public float getProgress() throws IOException {
		// Guard against an empty split: the original expression divided by zero
		// and returned NaN when end == start.
		if (end == start) {
			return 0.0f;
		}
		return (float) (readerPosition - start) / (end - start);
	}

	@Override
	public void initialize(InputSplit split, TaskAttemptContext taskContext)
			throws IOException, InterruptedException {
		FileSplit fileSplit = (FileSplit) split;
		start = fileSplit.getStart();
		end = fileSplit.getLength() + start;
		Path filePath = fileSplit.getPath();
		commonInit(filePath, taskContext.getConfiguration());
	}

	/**
	 * Reads the next JSON record from the split.
	 *
	 * @param key set to the character offset of the record's opening '{'
	 * @param value set to the record text, from '{' through its matching '}'
	 * @return true if a record was read; false at end of split, EOF, or on a parse error
	 */
	@Override
	public boolean next(LongWritable key, Text value) throws IOException {
		/*
		 * NOTE : we are not using a JSONParser, so this will not validate JSON structure aside
		 * from correct counts of '{' and '}' * The fact that it may handle some invalid JSON,
		 * does not imply that we support invalid JSON; rather, updates to the code may require
		 * valid JSON in order to locate record boundaries.
		 *
		 * We will count '{' and '}' to find the beginning and end of each record, while
		 * ignoring braces in string literals.
		 */
		int chr = 0;
		int brace_depth = 0;
		char lit_char = 0; // the quote character that opened the current string literal, or 0
		boolean first_brace_found = false;

		// The case of split point exactly at whitespace between records, is handled by forcing
		// the record following to the split following, in the interest of better balancing the
		// splits, by consuming the whitespace before checking the end of the split.
		if (!firstBraceConsumed) {
			// That should only ever be true on the very first read in the split
			chr = getNonWhite();
			firstBraceConsumed = (chr == '{');
		}

		// Stop if the record's opening brace falls at or beyond the split end;
		// the next split will pick that record up.
		if (readerPosition + (firstBraceConsumed ? 0 : 1) > end) {
			return false;
		}

		StringBuilder sb = new StringBuilder(2000);
		if (firstBraceConsumed) {
			// first open brace was consumed already; update initial state accordingly
			brace_depth = 1;
			sb.append("{");
			first_brace_found = true;
			firstBraceConsumed = false;
			key.set(readerPosition - 1);
		}

		boolean inEscape = false;
		while (brace_depth > 0 || !first_brace_found) {
			chr = getChar();
			if (chr < 0) {
				if (first_brace_found) {
					// last record was invalid
					LOG.error("Parsing error : EOF occured before record ended");
				}
				return false;
			}
			switch (chr) {
			case '\\':
				// A backslash toggles escape state, but only inside a string literal.
				inEscape = (lit_char != 0 && !inEscape);
				break;
			case '"':
			case '\'':
				if (lit_char == 0) {
					lit_char = (char) chr; // mark start literal (double/single quote)
				} else if (inEscape) {
					inEscape = false;
				} else if (lit_char == chr) {
					lit_char = 0; // mark end literal (double/single-quote)
				}
				// ignored because we found a ' inside a " " block quote (or vice versa)
				break;
			case '{':
				if (inEscape) {
					inEscape = false;
				} else if (lit_char == 0) { // not in string literal,
					brace_depth++; // so increase brace depth
					if (!first_brace_found) {
						first_brace_found = true;
						// set record key to the char offset of the first '{'
						key.set(readerPosition - 1);
					}
				}
				break;
			case '}':
				if (inEscape) {
					inEscape = false;
				} else if (lit_char == 0) { // not in string literal,
					brace_depth--; // so decrease brace depth
				}
				break;
			default:
				inEscape = false;
				break;
			}
			if (brace_depth < 0) {
				// found more '}'s than we did '{'s
				LOG.error("Parsing error : no '{' - unmatched '}' in record");
				return false;
			}
			if (first_brace_found) {
				sb.append((char) chr);
			}
		}

		// no '{' found before EOF. Not an error as this could mean that there is
		// extra white-space at the end
		if (!first_brace_found) {
			return false;
		}
		value.set(sb.toString());
		return true;
	}

	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		return next(mkey, mval);
	}

	/**
	 * Shared setup for both constructors/initialize paths: opens the file and
	 * positions the reader at the start of the first record in this split.
	 */
	private void commonInit(Path filePath, Configuration conf) throws IOException {
		readerPosition = start;
		FileSystem fs = filePath.getFileSystem(conf);
		// Explicit UTF-8: the original relied on the platform-default charset,
		// which is non-portable for JSON input.
		inputReader = new BufferedReader(
				new InputStreamReader(fs.open(filePath), StandardCharsets.UTF_8));
		if (start != 0) {
			// split starts inside the json.
			// BufferedReader.skip may skip fewer characters than requested, so loop
			// until the full offset is consumed (the original issued a single skip()).
			long remaining = start;
			while (remaining > 0) {
				long skipped = inputReader.skip(remaining);
				if (skipped <= 0) {
					break; // EOF reached before the split start
				}
				remaining -= skipped;
			}
			moveToRecordStart();
		}
	}

	/** Reads one character, advancing {@link #readerPosition}; returns -1 at EOF. */
	protected int getChar() throws IOException {
		int ch = inputReader.read();
		readerPosition++;
		return ch;
	}

	/** Reads characters until the first non-whitespace character (or EOF) and returns it. */
	protected int getNonWhite() throws IOException {
		int ch;
		do {
			ch = getChar();
		} while (Character.isWhitespace((char) ch));
		return ch;
	}

	/**
	 * Given an arbitrary byte offset into an unenclosed JSON document, find the start of the
	 * next record in the document. Discard trailing bytes from the previous record if we
	 * happened to seek to the middle of it.
	 *
	 * @throws IOException
	 */
	protected abstract boolean moveToRecordStart() throws IOException;
}