/* * Copyright 2011 Internet Archive * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.archive.bacon; import java.io.*; import java.net.*; import org.apache.pig.EvalFunc; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.util.WrappedIOException; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * ReBag: take the elements from a tuple and put them into a bag. * * This was motivated by the fact that the built-in functions STRSPLIT * and TOKENIZE don't work the way I want them to. STRSPLIT returns a * tuple, and TOKENIZE does not allow for custom delimiters. Sheesh. * * So, in addition to writing my own tokenizer, which acts like * STRSPLIT but returns a bag rather thana tuple; I have also written * this function to take the elements of a tuple and put them into a * bag. I strongly suspect that this doesn't handle all the corner * cases nor follow all the Pig "good housekeeping" rules. * * I'm leaving this here as an experiment. */ public class ReBag extends EvalFunc<DataBag> { TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = BagFactory .getInstance(); /** * Re-bag the tuple elements. Somewhat strangely, the incoming * tuple is wrapped inside another tuple, so we have to look inside * te nested tuple for the actual elements. That is, the incoming * tuple looks like: * ((foo,bar,baz)) */ public DataBag exec( Tuple input ) throws IOException { try { if ( input == null ) return null; DataBag output = bagFactory.newDefaultBag(); for ( Object o : input.getAll() ) { if ( o instanceof Tuple ) { Tuple inner = (Tuple) o; for ( Object p : inner.getAll() ) { output.add( tupleFactory.newTuple( p ) ); } } } return output; } catch ( Exception e ) { throw WrappedIOException.wrap("Caught exception processing input row ", e); } } /** * Generate an output schema based on the type of the first element * in the tuple being re-bagged. We assume that the type of all the * elements are the same and can thus just look at the first one. */ public Schema outputSchema( Schema input ) { try { if ( input == null ) { return Schema.generateNestedSchema( DataType.BAG, DataType.NULL ); } Schema elementSchema = new Schema(); for ( Schema.FieldSchema fs : input.getFields() ) { if ( fs.type == DataType.TUPLE ) { if ( fs.schema == null ) { break ; } for ( Schema.FieldSchema ifs : fs.schema.getFields() ) { // The type of all the elements in the output bag // are assumed to be the same as the type of the // first element of the tuple being re-bagged. elementSchema.add( ifs ); break ; } } } Schema bagSchema = new Schema( new Schema.FieldSchema( getSchemaName( this.getClass().getName().toLowerCase(), input ), elementSchema, DataType.BAG ) ); return bagSchema; } catch (Exception e) { e.printStackTrace( System.err ); return null; } } }