/* * Copyright 2011 Internet Archive * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.archive.bacon.io; import java.io.*; import java.util.*; import org.json.JSONArray; import org.json.JSONObject; import org.json.JSONException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.pig.LoadFunc; import org.apache.pig.ResourceSchema; import org.apache.pig.StoreFuncInterface; import org.apache.pig.PigException; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; import org.apache.pig.builtin.PigStorage; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.data.BagFactory; /** * Simple store function that writes out Pig 'map' objects as JSON * strings. * * It leverages PigStorage to support writing compressed files with * various encodings: gzip, bzip2, etc. * * In fact, this is really just a hack that expects to receive a Pig * 'map' object, converts it to a JSON string, then passes that string * on to PigStorage(). * */ public class JSONStorage extends LoadFunc implements StoreFuncInterface { // Use a '\0' as the delimiter. It should never appear, so if it // does show up, we should notice it! PigStorage ps = new PigStorage("\0"); RecordReader reader; TupleFactory mTupleFactory = TupleFactory.getInstance(); BagFactory mBagFactory = BagFactory.getInstance(); public JSONStorage( ) { } /** * StoreFuncInterface */ @Override public String relToAbsPathForStoreLocation( String location, Path curDir ) throws IOException { return this.ps.relToAbsPathForStoreLocation( location, curDir ); } @Override public OutputFormat getOutputFormat( ) { return this.ps.getOutputFormat( ); } @Override public void setStoreLocation( String location, Job job ) throws IOException { this.ps.setStoreLocation( location, job ); } @Override public void checkSchema(ResourceSchema s) throws IOException { this.ps.checkSchema( s ); } @Override public void prepareToWrite(RecordWriter writer) { this.ps.prepareToWrite( writer ); } @Override public void putNext( Tuple tuple ) throws IOException { int size = tuple.size(); try { JSONObject json; // If the tuple to serialize as JSON has one field which is a // Map, then serialize that Map, not the tuple. Otherwise, // serialize the tuple. if ( tuple.size() == 1 && DataType.findType( tuple.get(0) ) == DataType.MAP ) { json = (JSONObject) JSON.toJSON( tuple.get(0) ); } else { json = (JSONObject) JSON.toJSON( tuple ); } String jstring = json.toString(); Tuple output = mTupleFactory.newTuple( jstring ); this.ps.putNext( output ); } catch ( JSONException je ) { throw new IOException( je ); } } @Override public void setStoreFuncUDFContextSignature(String signature) { this.ps.setStoreFuncUDFContextSignature( signature ); } @Override public void cleanupOnFailure(String location, Job job) throws IOException { this.ps.cleanupOnFailure( location, job ); } /** * LoadFunc */ @Override public void setLocation(String location, Job job) throws IOException { this.ps.setLocation( location, job ); } @Override public InputFormat getInputFormat() throws IOException { return this.ps.getInputFormat(); } @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = reader; // FIXME: Do we still even need to call PigStorage.prepareToRead()? this.ps.prepareToRead( reader, split ); } @Override public Tuple getNext() throws IOException { try { if ( ! this.reader.nextKeyValue() ) { return null; } Text text = (Text) this.reader.getCurrentValue( ); if ( text == null ) return null; JSONObject json = new JSONObject( text.toString() ); Object o = JSON.fromJSON( json ); Tuple tuple; if ( o instanceof Map ) { tuple = mTupleFactory.newTuple( o ); } else { tuple = (Tuple) o; } return tuple; } catch ( JSONException je ) { throw new IOException( je ); } catch ( InterruptedException e ) { // From the Pig example/howto code. int errCode = 6018; String errMsg = "Error while reading input"; throw new ExecException(errMsg, errCode,PigException.REMOTE_ENVIRONMENT, e); } } }