/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.types;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.schema.ColumnType;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;

/**
 * This class extracts a subfield from a column or subcolumn that is stored
 * in its entirety on disk. It should be used only by readers whose
 * serializers do not support projection.
 */
public class SubColumnExtraction {
  static class SubColumn {
    Schema physical;
    Projection projection;
    ArrayList<SplitColumn> exec = null;
    SplitColumn top = null;  // directly associated with the physical schema
    SplitColumn leaf = null; // target tuple corresponding to the projection
    // SplitColumns whose children need MAP fields created for each row
    ArrayList<SplitColumn> sclist = new ArrayList<SplitColumn>();

    SubColumn(Schema physical, Projection projection)
        throws ParseException, ExecException {
      this.physical = physical;
      this.projection = projection;
      top = new SplitColumn(Partition.SplitType.RECORD);
      exec = new ArrayList<SplitColumn>();
      exec.add(top); // breadth-first
      SplitColumn sc;
      leaf = new SplitColumn(Partition.SplitType.RECORD);
      Schema.ColumnSchema fs;
      Schema.ParsedName pn = new Schema.ParsedName();
      String name;
      int j;
      HashSet<String> keySet;
      for (int i = 0; i < projection.getSchema().getNumColumns(); i++) {
        fs = projection.getColumnSchema(i);
        if (fs == null)
          continue;
        name = fs.getName();
        if (name == null)
          continue;
        if (projection.getKeys() != null)
          keySet = projection.getKeys().get(fs);
        else
          keySet = null;
        pn.setName(name);
        fs = physical.getColumnSchema(pn);
        if (keySet != null)
          pn.setDT(ColumnType.MAP);
        if (fs == null)
          continue; // skip a non-existing field
        j = fs.getIndex();
        ColumnType ct = pn.getDT();
        if (ct == ColumnType.MAP || ct == ColumnType.RECORD
            || ct == ColumnType.COLLECTION) {
          // a record/map subfield is expected
          sc = new SplitColumn(j, ct);
          if (ct == ColumnType.MAP)
            sclist.add(sc);
          exec.add(sc); // breadth-first
          // (i, j) represents the mapping between projection and physical schema
          buildSplit(sc, fs, pn, i,
              (projection.getKeys() == null ? null : keySet));
        } else {
          // (i, j) represents the mapping between projection and physical schema
          sc = new SplitColumn(j, i, leaf, null, Partition.SplitType.NONE);
          // no split on a leaf
        }
        top.addChild(sc);
      }
    }
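
    // Illustrative note, not from the original sources; the schema and
    // projection names are hypothetical. For a physical column
    // "r:record(f1:int, f2:string)" projected as "r.f1", the loop above
    // roughly performs
    //
    //   sc = new SplitColumn(j, ColumnType.RECORD); // split physical column "r"
    //   exec.add(sc);
    //   buildSplit(sc, fs, pn, i, null);            // attach a leaf for subfield "f1"
    //   top.addChild(sc);
    //
    // so that splitColumns() can later copy "f1" straight into the target tuple.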

    /**
     * build the split executions
     */
    private void buildSplit(SplitColumn parent, Schema.ColumnSchema fs,
        final Schema.ParsedName pn, final int projIndex, HashSet<String> keys)
        throws ParseException, ExecException {
      // recursive call to get the next-level schema
      ColumnType ct = pn.getDT();
      if (ct != fs.getType())
        throw new ParseException(fs.getName() + " is not of proper type.");

      String prefix;
      int fieldIndex;
      SplitColumn sc;
      Partition.SplitType callerDT =
          (ct == ColumnType.MAP ? Partition.SplitType.MAP :
            (ct == ColumnType.RECORD ? Partition.SplitType.RECORD :
              (ct == ColumnType.COLLECTION ? Partition.SplitType.COLLECTION :
                Partition.SplitType.NONE)));
      prefix = pn.parseName(fs);
      Schema schema = fs.getSchema();
      if (callerDT == Partition.SplitType.RECORD
          || callerDT == Partition.SplitType.COLLECTION) {
        if (keys != null)
          throw new AssertionError("Internal Logical Error: empty key map expected.");
        if ((fieldIndex = schema.getColumnIndex(prefix)) == -1)
          return; // skip non-existing fields
        fs = schema.getColumn(fieldIndex);
      } else {
        parent.setKeys(keys); // the map keys are set on the parent, which is of type MAP
        fs = schema.getColumn(0); // a MAP value is a singleton type
        fieldIndex = 0;
      }

      ct = pn.getDT();
      if (ct != ColumnType.ANY) {
        // a record subfield is expected
        sc = new SplitColumn(fieldIndex, ct);
        if (ct == ColumnType.MAP)
          sclist.add(sc);
        exec.add(sc); // breadth-first
        buildSplit(sc, fs, pn, projIndex, null);
      } else {
        sc = new SplitColumn(fieldIndex, projIndex, leaf, null, Partition.SplitType.NONE);
        // no split on a leaf
      }
      parent.addChild(sc);
    }

    /**
     * dispatch the source tuple read from disk
     */
    void dispatchSource(Tuple src) {
      top.dispatch(src);
    }

    /**
     * dispatch the target tuple
     */
    private void dispatch(Tuple tgt) throws ExecException {
      leaf.dispatch(tgt);
      createMaps();
      leaf.setBagFields();
    }

    /**
     * the execution: run all splits breadth-first into the target tuple
     */
    void splitColumns(Tuple dest) throws ExecException, IOException {
      int i;
      dispatch(dest);
      clearMaps();
      int execSize = exec.size();
      for (i = 0; i < execSize; i++) {
        SplitColumn execElement = exec.get(i);
        if (execElement != null) {
          // a split is necessary
          execElement.split();
        }
      }
    }

    /**
     * create MAP fields if necessary
     */
    private void createMaps() throws ExecException {
      for (int i = 0; i < sclist.size(); i++)
        sclist.get(i).createMap();
    }

    /**
     * clear MAP fields if necessary
     */
    private void clearMaps() throws ExecException {
      for (int i = 0; i < sclist.size(); i++)
        sclist.get(i).clearMap();
    }
  }
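
  /*
   * Usage sketch (an assumption, not taken from the original sources; the
   * variables "physicalSchema", "projection", "diskTuple", and "projTuple"
   * are hypothetical): a reader whose serializer cannot project would drive
   * this class roughly as follows for every row it reads.
   *
   *   SubColumnExtraction.SubColumn sub =
   *       new SubColumnExtraction.SubColumn(physicalSchema, projection);
   *   sub.dispatchSource(diskTuple);  // the fully deserialized source tuple
   *   sub.splitColumns(projTuple);    // fills only the projected (sub)fields
   */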

  /**
   * helper class to represent one split execution
   */
  private static class SplitColumn {
    int fieldIndex = -1; // field index relative to the parent
    int projIndex = -1;  // index in the projection: only used by leaves
    ArrayList<SplitColumn> children = null;
    int index = -1;      // index in the logical schema
    Object field = null;
    SplitColumn leaf = null; // the leaf holds the target tuple
    Partition.SplitType st = Partition.SplitType.NONE;
    HashSet<String> keys;
    Schema scratchSchema; // a temporary 1-column schema used to create a tuple
                          // for a COLLECTION column
    ArrayList<Integer> bagFieldIndices;

    void dispatch(Object field) {
      this.field = field;
    }

    void setKeys(HashSet<String> keys) {
      this.keys = keys;
    }

    SplitColumn(Partition.SplitType st) {
      this.st = st;
    }

    SplitColumn(ColumnType ct) {
      if (ct == ColumnType.MAP)
        st = Partition.SplitType.MAP;
      else if (ct == ColumnType.RECORD)
        st = Partition.SplitType.RECORD;
      else if (ct == ColumnType.COLLECTION) {
        st = Partition.SplitType.COLLECTION;
        try {
          scratchSchema = new Schema("foo");
        } catch (ParseException e) {
          // no-op: a one-column schema should never fail to parse
        }
      } else {
        st = Partition.SplitType.NONE;
      }
    }

    SplitColumn(int fieldIndex, ColumnType ct) {
      this(ct);
      this.fieldIndex = fieldIndex;
    }

    SplitColumn(int fieldIndex, Partition.SplitType st) {
      this.fieldIndex = fieldIndex;
      this.st = st;
    }

    SplitColumn(int fieldIndex, HashSet<String> keys, Partition.SplitType st) {
      this.fieldIndex = fieldIndex;
      this.keys = keys;
      this.st = st;
    }

    SplitColumn(int fieldIndex, int projIndex, SplitColumn leaf,
        HashSet<String> keys, Partition.SplitType st) {
      this(fieldIndex, keys, st);
      this.projIndex = projIndex;
      this.leaf = leaf;
    }

    /**
     * the split op
     */
    @SuppressWarnings("unchecked")
    void split() throws IOException, ExecException {
      if (children == null)
        return;
      int size = children.size();
      if (st == Partition.SplitType.RECORD) {
        for (int i = 0; i < size; i++) {
          SplitColumn child = children.get(i);
          if (child.projIndex != -1) // a leaf: set the projection directly
            ((Tuple) child.leaf.field).set(child.projIndex,
                ((Tuple) field).get(child.fieldIndex));
          else
            child.field = ((Tuple) field).get(child.fieldIndex);
        }
      } else if (st == Partition.SplitType.COLLECTION) {
        DataBag srcBag, tgtBag;
        srcBag = (DataBag) field;
        Tuple tuple;
        for (int i = 0; i < size; i++) {
          SplitColumn child = children.get(i);
          if (child.projIndex != -1) { // a leaf: set the projection directly
            tgtBag = (DataBag) ((Tuple) child.leaf.field).get(child.projIndex);
          } else {
            tgtBag = (DataBag) child.field;
            tgtBag.clear();
          }
          for (Iterator<Tuple> it = srcBag.iterator(); it.hasNext();) {
            tuple = TypesUtils.createTuple(scratchSchema);
            tuple.set(0, it.next().get(child.fieldIndex));
            tgtBag.add(tuple);
          }
        }
      } else if (st == Partition.SplitType.MAP && keys != null) {
        String key;
        Iterator<String> it;
        Object value;
        for (int i = 0; i < size; i++) {
          SplitColumn child = children.get(i);
          if (child.projIndex != -1) { // a leaf: set the projection directly
            for (it = keys.iterator(); it.hasNext();) {
              key = it.next();
              value = ((Map<String, Object>) field).get(key);
              if (value == null)
                continue;
              ((Map<String, Object>) (((Tuple) child.leaf.field)
                  .get(child.projIndex))).put(key, value);
            }
          } else {
            for (it = keys.iterator(); it.hasNext();) {
              key = it.next();
              child.field = ((Map<String, Object>) field).get(key);
            }
          }
        }
      }
    }

    /**
     * add a child that needs a subfield of this (sub)column
     */
    void addChild(SplitColumn child) throws ExecException {
      if (children == null)
        children = new ArrayList<SplitColumn>();
      children.add(child);
      if (st == Partition.SplitType.COLLECTION) {
        if (child.projIndex != -1) {
          child.leaf.addBagFieldIndex(child.projIndex);
        } else {
          ((Tuple) child.field).set(child.fieldIndex, TypesUtils.createBag());
        }
      }
    }

    /**
     * add a bag field index
     */
    void addBagFieldIndex(int i) {
      if (bagFieldIndices == null)
        bagFieldIndices = new ArrayList<Integer>();
      bagFieldIndices.add(i);
    }

    /**
     * set bag fields if necessary
     */
    void setBagFields() throws ExecException {
      if (bagFieldIndices == null)
        return;
      for (int i = 0; i < bagFieldIndices.size(); i++) {
        ((Tuple) field).set(bagFieldIndices.get(i), TypesUtils.createBag());
      }
    }

    /**
     * create MAP fields for children
     */
    void createMap() throws ExecException {
      if (st == Partition.SplitType.MAP) {
        int size = children.size();
        for (int i = 0; i < size; i++) {
          if (children.get(i).projIndex != -1)
            ((Tuple) children.get(i).leaf.field).set(children.get(i).projIndex,
                new HashMap<String, Object>());
          else
            children.get(i).field = new HashMap<String, Object>();
        }
      }
    }
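
    // Illustrative note, not from the original sources (the key names are
    // hypothetical): for a MAP column projected with keys {"k1", "k2"},
    // createMap() above installs a fresh HashMap in the target slot for each
    // row, and split() then copies only the requested entries, effectively
    //
    //   tgtMap.put("k1", srcMap.get("k1"));
    //   tgtMap.put("k2", srcMap.get("k2"));
    //
    // Keys missing from the source map are simply skipped.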

    /**
     * clear map for children
     */
    @SuppressWarnings("unchecked")
    void clearMap() throws ExecException {
      if (st == Partition.SplitType.MAP) {
        int size = children.size();
        for (int i = 0; i < size; i++) {
          if (children.get(i).projIndex != -1)
            ((Map) ((Tuple) children.get(i).leaf.field)
                .get(children.get(i).projIndex)).clear();
          else
            ((Map) children.get(i).field).clear();
        }
      }
    }
  }
}