/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.types;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;

import junit.framework.Assert;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.hadoop.zebra.schema.ColumnType;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.schema.Schema.ColumnSchema;
import org.apache.hadoop.zebra.parser.ParseException;

/**
 * Utility methods manipulating Table types (specifically, Tuple objects).
 */
public class TypesUtils {
  //static TupleFactory tf = ZebraTupleFactory.getInstance();
  static TupleFactory tf = ZebraTupleFactory.getZebraTupleFactoryInstance();

  /**
   * Create a tuple based on a schema.
   *
   * @param schema
   *          The schema that the tuple will conform to.
   * @return A suitable Tuple object that can be used to read or write a Table
   *         with the same input or output schema.
   */
  public static Tuple createTuple(Schema schema) throws IOException {
    Tuple tuple = tf.newTuple(schema.getNumColumns());
    for (int i = 0; i < schema.getNumColumns(); ++i) {
      tuple.set(i, null);
    }
    return tuple;
  }

  /**
   * Create a tuple based on the number of columns.
   */
  public static Tuple createTuple(int size) throws IOException {
    Tuple tuple = tf.newTuple(size);
    for (int i = 0; i < size; ++i) {
      tuple.set(i, null);
    }
    return tuple;
  }

  /**
   * Create a Pig Bag object.
   *
   * @return A Pig DataBag object.
   */
  public static DataBag createBag() {
    return new DefaultDataBag();
  }

  public static DataBag createBag(Schema schema) {
    return new DefaultDataBag();
  }

  /**
   * Reset the Tuple so that every field is a NULL field. This is different
   * from clearing the tuple, in which case the size of the tuple would become
   * zero.
   *
   * @param tuple
   *          Input tuple.
   */
  public static void resetTuple(Tuple tuple) {
    try {
      int tupleSize = tuple.size();
      for (int i = 0; i < tupleSize; ++i) {
        tuple.set(i, null);
      }
    } catch (Exception e) {
      throw new RuntimeException("Internal error: " + e.toString());
    }
  }
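
  /*
   * Illustrative usage of createTuple() and resetTuple() above (editorial
   * sketch, not part of the original class; the schema string "f1:int,
   * f2:string" is hypothetical, and Schema is assumed to parse such a
   * string):
   *
   *   Schema schema = new Schema("f1:int, f2:string");
   *   Tuple row = TypesUtils.createTuple(schema); // all fields start as null
   *   row.set(0, 42);
   *   row.set(1, "hello");
   *   TypesUtils.resetTuple(row); // fields back to null; size stays 2
   */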
  private static void checkTypeError(ColumnSchema cs, ColumnType type)
      throws IOException {
    throw new IOException("Incompatible Tuple object - datum is " + type
        + ", but schema says " + cs.getType());
  }

  private static void checkColumnType(ColumnSchema cs, ColumnType type)
      throws IOException {
    switch (type) {
    case BOOL:
    case DOUBLE:
    case STRING:
    case BYTES:
    case MAP:
    case COLLECTION:
    case RECORD:
      if (cs.getType() != type) {
        checkTypeError(cs, type);
      }
      break;
    case FLOAT:
      if (cs.getType() != ColumnType.FLOAT && cs.getType() != ColumnType.DOUBLE) {
        checkTypeError(cs, type);
      }
      break;
    case LONG:
      if (cs.getType() != ColumnType.LONG && cs.getType() != ColumnType.FLOAT
          && cs.getType() != ColumnType.DOUBLE) {
        checkTypeError(cs, type);
      }
      break;
    case INT:
      if (cs.getType() != ColumnType.INT && cs.getType() != ColumnType.LONG
          && cs.getType() != ColumnType.FLOAT
          && cs.getType() != ColumnType.DOUBLE) {
        checkTypeError(cs, type);
      }
      break;
    }
  }

  @SuppressWarnings("unchecked")
  private static void checkColumn(Object d, ColumnSchema cs) throws IOException {
    if (d instanceof Boolean) {
      checkColumnType(cs, ColumnType.BOOL);
    } else if (d instanceof Integer) {
      checkColumnType(cs, ColumnType.INT);
    } else if (d instanceof Long) {
      checkColumnType(cs, ColumnType.LONG);
    } else if (d instanceof Float) {
      checkColumnType(cs, ColumnType.FLOAT);
    } else if (d instanceof Double) {
      checkColumnType(cs, ColumnType.DOUBLE);
    } else if (d instanceof String) {
      checkColumnType(cs, ColumnType.STRING);
    } else if (d instanceof DataByteArray) {
      checkColumnType(cs, ColumnType.BYTES);
    } else if (d instanceof Map) {
      checkMapColumn((Map<String, Object>) d, cs);
    } else if (d instanceof DataBag) {
      checkCollectionColumn((DataBag) d, cs);
    } else if (d instanceof Tuple) {
      checkRecordColumn((Tuple) d, cs);
    } else {
      throw new IOException("Unknown data type");
    }
  }

  private static void checkMapColumn(Map<String, Object> m, ColumnSchema cs)
      throws IOException {
    checkColumnType(cs, ColumnType.MAP);
    Schema schema = cs.getSchema();
    Assert.assertTrue(schema.getNumColumns() == 1);
    ColumnSchema tempColumnSchema = schema.getColumn(0);
    if (tempColumnSchema.getType() == ColumnType.BYTES) {
      // We do not check inside the map if its value type is BYTES;
      // this is for Pig, since it supports only BYTES as the map value type.
      return;
    }
    for (Map.Entry<String, Object> e : m.entrySet()) {
      Object d = e.getValue();
      if (d != null) {
        checkColumn(d, tempColumnSchema);
        return; // We only check the first non-null value in the map.
      }
    }
  }
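
  /*
   * Illustrative note (editorial, not in the original source): the numeric
   * cases in checkColumnType() above implement a widening rule,
   * INT < LONG < FLOAT < DOUBLE. A datum is accepted by any column whose
   * declared type is at least as wide as the datum's type:
   *
   *   Integer datum -> column may be int, long, float or double;
   *   Long datum    -> column may be long, float or double;
   *   Float datum   -> column may be float or double;
   *   Double datum  -> column must be double.
   *
   * Any other mismatch raises "Incompatible Tuple object - datum is ...".
   */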
  private static void checkCollectionColumn(DataBag bag, ColumnSchema cs)
      throws IOException {
    checkColumnType(cs, ColumnType.COLLECTION);
    Schema schema = cs.getSchema();
    Assert.assertTrue(schema.getNumColumns() == 1);
    Iterator<Tuple> iter = bag.iterator();
    while (iter.hasNext()) {
      // A collection has to contain records.
      Tuple tempTuple = iter.next();
      if (tempTuple != null) {
        checkRecordColumn(tempTuple, schema.getColumn(0));
        return; // We only check the first non-null record in the collection.
      }
    }
  }

  private static void checkRecordColumn(Tuple d, ColumnSchema cs)
      throws IOException {
    checkColumnType(cs, ColumnType.RECORD);
    checkNumberColumnCompatible(d, cs.getSchema());
    for (int i = 0; i < d.size(); i++) {
      if (d.get(i) != null) { // "null" can match any type.
        checkColumn(d.get(i), cs.getSchema().getColumn(i));
      }
    }
  }

  /**
   * Check whether the input row object is compatible with the expected schema.
   *
   * @param tuple
   *          Input Tuple object
   * @param schema
   *          Table schema
   * @throws IOException
   */
  public static void checkCompatible(Tuple tuple, Schema schema)
      throws IOException {
    // Create a dummy record ColumnSchema since we do not have one.
    ColumnSchema dummy = new ColumnSchema("dummy", schema);
    checkRecordColumn(tuple, dummy);
  }

  /**
   * Check whether the input row object is compatible with the expected schema
   * in the number of columns.
   *
   * @param tuple
   *          Input Tuple object
   * @param schema
   *          Table schema
   * @throws IOException
   */
  public static void checkNumberColumnCompatible(Tuple tuple, Schema schema)
      throws IOException {
    if (tuple.size() != schema.getNumColumns()) {
      throw new IOException("Incompatible Tuple object - tuple has "
          + tuple.size() + " columns, but schema says "
          + schema.getNumColumns() + " columns");
    }
  }

  /**
   * Reading a tuple from disk with projection.
   */
  public static class TupleReader {
    private Tuple tuple;
    //@SuppressWarnings("unused")
    private Schema physical;
    private Projection projection;
    SubColumnExtraction.SubColumn subcolextractor = null;

    /**
     * Constructor - create a TupleReader that can parse the serialized Tuple
     * with the specified physical schema, and produce Tuples based on the
     * projection.
     *
     * @param physical
     *          The physical schema of the on-disk data.
     * @param projection
     *          The logical schema of the tuples the user expects.
     */
    public TupleReader(Schema physical, Projection projection)
        throws IOException, ParseException {
      tuple = createTuple(physical);
      this.physical = physical;
      this.projection = projection;
      subcolextractor = new SubColumnExtraction.SubColumn(physical, projection);
      subcolextractor.dispatchSource(tuple);
    }

    public Schema getSchema() {
      return physical;
    }

    public Projection getProjection() {
      return projection;
    }

    /**
     * Read a tuple from the stream, and perform projection.
     *
     * @param in
     *          The input stream
     * @param row
     *          The tuple to be filled in; it should conform to the projection
     *          schema.
     * @throws IOException
     */
    public void get(DataInputStream in, Tuple row) throws IOException,
        ParseException {
      checkNumberColumnCompatible(row, projection.getSchema());
      tuple.readFields(in);
      TypesUtils.resetTuple(row);
      try {
        subcolextractor.splitColumns(row);
      } catch (ExecException e) {
        // Not going to happen.
      }
    }
  }
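
  /*
   * Illustrative usage of TupleReader above (editorial sketch; the schema
   * and projection strings are hypothetical, and Projection is assumed to be
   * constructible from the physical schema plus a projection string):
   *
   *   Schema physical = new Schema("f1:int, f2:string, f3:double");
   *   Projection proj = new Projection(physical, "f2, f3");
   *   TupleReader reader = new TupleReader(physical, proj);
   *   Tuple row = TypesUtils.createTuple(proj.getSchema());
   *   reader.get(in, row); // deserializes one tuple from "in" into "row"
   */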
  /**
   * Writing a tuple to disk.
   */
  public static class TupleWriter {
    private Schema physical;

    /**
     * The constructor.
     *
     * @param physical
     *          The physical schema of the tuple.
     */
    public TupleWriter(Schema physical) {
      this.physical = physical;
    }

    /**
     * Write a tuple to the output stream.
     *
     * @param out
     *          The output stream
     * @param row
     *          The user tuple that should conform to the physical schema.
     * @throws IOException
     */
    public void put(DataOutputStream out, Tuple row) throws IOException {
      row.write(out);
    }
  }

  /**
   * Check and format an input tuple to conform to the input schema.<br>
   *
   * The current implementation always creates a new tuple because Pig expects
   * Slice.next(tuple) to always return a brand-new tuple.
   *
   * @param tuple
   * @throws IOException
   */
  public static void formatTuple(Tuple tuple, int ncols) throws IOException {
    Tuple one = createTuple(ncols);
    tuple.reference(one);
    return;

    /*
     * Dead code below.
     */
    // int n = schema.getNumColumns();
    // if (tuple.size() == n) return;
    // if (tuple.size() == 0) {
    //   for (int i = 0; i < schema.getNumColumns(); ++i) {
    //     tuple.append(null);
    //   }
    //   return;
    // }
    // throw new IOException("Tuple already formatted with " + tuple.size()
    //     + " fields");
  }
}
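
/*
 * Illustrative round trip through TupleWriter and TupleReader above
 * (editorial sketch; the streams, "physical" schema, "row" and
 * "projectedRow" tuples are hypothetical):
 *
 *   TypesUtils.TupleWriter writer = new TypesUtils.TupleWriter(physical);
 *   writer.put(out, row);         // "row" must conform to "physical"
 *   ...
 *   reader.get(in, projectedRow); // later, read it back with projection
 */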