/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package com.linkedin.cubert.memory;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.AbstractTuple;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.TupleStoreBlock;
import com.linkedin.cubert.operator.PhaseContext;
import com.linkedin.cubert.utils.SortAlgo;
import com.linkedin.cubert.utils.TupleStore;

/**
 * Columnar Tuple Store
 * <p>
 * ColumnarTupleStore is a columnar implementation of a store of Pig tuples in Cubert. Currently, this data
 * structure only supports flat schemas. For primitive data types, the store uses a segmented implementation
 * of primitive arrays, which makes it more memory efficient.
 *
 * @author spyne
 * Created on 1/12/15.
 */
public class ColumnarTupleStore implements TupleStore
{
    private final BlockSchema schema;
    private final int nColumns;

    /* Number of rows in the store */
    private int size;

    /* Data Store */
    private final SegmentedArrayList[] dataStore;

    /* Start index for sub-range (view) stores */
    private int start = 0;

    public ColumnarTupleStore(BlockSchema schema)
    {
        this(schema, false);
    }

    public ColumnarTupleStore(BlockSchema schema, boolean encodeStrings)
    {
        // TODO: set up the valid schema test here
        this.schema = schema;
        nColumns = schema.getNumColumns();
        dataStore = new SegmentedArrayList[nColumns];
        prepareDataStore(encodeStrings);
    }

    /**
     * Creates a view over a sub-range of an existing store. The underlying column arrays are shared; only
     * the start offset and the number of rows differ.
     */
    public ColumnarTupleStore(ColumnarTupleStore store, int start, int size)
    {
        this.start = start;
        this.size = size;
        dataStore = store.dataStore;
        schema = store.getSchema();
        nColumns = schema.getNumColumns();
    }

    private void prepareDataStore(boolean encodeStrings)
    {
        // segment (batch) size passed to each per-column segmented array list
        final int batchSize = 1 << 10;
        ColumnType[] columnTypes = schema.getColumnTypes();
        for (int i = 0; i < columnTypes.length; i++)
        {
            ColumnType colType = columnTypes[i];
            switch (colType.getType())
            {
            case INT:
                dataStore[i] = new IntArrayList(batchSize);
                break;
            case LONG:
                dataStore[i] = new LongArrayList(batchSize);
                break;
            case DOUBLE:
                dataStore[i] = new DoubleArrayList(batchSize);
                break;
            case FLOAT:
                dataStore[i] = new FloatArrayList(batchSize);
                break;
            case STRING:
                if (encodeStrings)
                    dataStore[i] = new DictEncodedArrayList(batchSize);
                else
                    dataStore[i] = new ObjectArrayList(batchSize);
                break;
            case BAG:
                dataStore[i] =
                        new BagArrayList(colType.getColumnSchema().getColumnType(0).getColumnSchema(),
                                         encodeStrings);
                break;
            default:
                dataStore[i] = new ObjectArrayList(batchSize);
                break;
            }
        }
    }

    @Override
    public void addToStore(Tuple tuple) throws IOException
    {
        for (int i = 0; i < nColumns; ++i)
        {
            dataStore[i].add(tuple.get(i));
        }
        ++size;
    }

    @Override
    public void clear()
    {
        for (int i = 0; i < nColumns; ++i)
        {
            dataStore[i].clear();
        }
        size = 0;
    }
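
    /**
     * Returns an iterator over the tuples in this store. For efficiency, the iterator reuses a single
     * {@link ColumnStoreTuple} and only advances its row id on each call to {@code next()}; callers that
     * need to retain a tuple beyond the next call should copy its fields out first.
     */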
    @Override
    public Iterator<Tuple> iterator()
    {
        return new Iterator<Tuple>()
        {
            private int count = 0;
            private int index = start;
            private ColumnStoreTuple tuple = new ColumnStoreTuple(start);

            @Override
            public boolean hasNext()
            {
                return count < size;
            }

            @Override
            public Tuple next()
            {
                tuple.rowid = index;
                ++index;
                ++count;
                return tuple;
            }

            @Override
            public void remove()
            {
                throw new UnsupportedOperationException("Not available in a Tuple Store");
            }
        };
    }

    @Override
    public Tuple getTuple(int index, Tuple reuse)
    {
        if (reuse == null)
        {
            return new ColumnStoreTuple(index);
        }
        else if (reuse instanceof ColumnStoreTuple)
        {
            ((ColumnStoreTuple) reuse).rowid = index;
            return reuse;
        }

        try
        {
            for (int i = 0; i < nColumns; ++i)
            {
                reuse.set(i, dataStore[i].get(index));
            }
            return reuse;
        }
        catch (ExecException e)
        {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    public Tuple newTuple()
    {
        return TupleFactory.getInstance().newTuple(schema.getNumColumns());
    }

    @Override
    public int getNumTuples()
    {
        return size;
    }

    @Override
    public BlockSchema getSchema()
    {
        return schema;
    }

    @Override
    public int[] getOffsets()
    {
        final int[] indexes = new int[size];
        for (int i = 0; i < indexes.length; i++)
        {
            indexes[i] = i;
        }
        return indexes;
    }

    @Override
    public void sort(SortAlgo<Tuple> algo)
    {
        throw new UnsupportedOperationException("Not Implemented yet.");
    }

    /**
     * Splits the store into sub-blocks on the given column: a new sub-block starts whenever the value of
     * that column changes between adjacent rows, so the store is expected to be grouped on this column.
     * Each sub-block is a view over this store; no data is copied.
     */
    public Map<Object, Block> getSubBlocksForPartitionCol(String colName) throws ExecException
    {
        final Map<Object, Block> subBlockMap = new HashMap<Object, Block>();
        final SegmentedArrayList list = dataStore[schema.getIndex(colName)];

        for (int start = 0, i = 0; i < size; ++i)
        {
            if (i + 1 == size || list.compareIndices(i, i + 1) != 0)
            {
                Block block =
                        new TupleStoreBlock(new ColumnarTupleStore(this, start, i - start + 1),
                                            new BlockProperties("block" + start, schema, (BlockProperties[]) null));
                subBlockMap.put(list.get(i), block);
                start = i + 1;
            }
        }
        return subBlockMap;
    }
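
    /**
     * A lightweight, read-only view of a single row in the columnar store. The tuple holds only a row id;
     * {@link #get(int)} and the typed accessors read values directly out of the per-column arrays.
     * <p>
     * Illustrative sketch (not from the original source) of typed access, assuming {@code store} is a
     * ColumnarTupleStore whose column 0 is declared LONG in its schema:
     * <pre>{@code
     * Iterator<Tuple> it = store.iterator();
     * while (it.hasNext())
     * {
     *     ColumnarTupleStore.ColumnStoreTuple row = (ColumnarTupleStore.ColumnStoreTuple) it.next();
     *     long memberId = row.getLong(0); // no boxing; valid only because column 0 is a LONG column
     * }
     * }</pre>
     */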
    public class ColumnStoreTuple extends AbstractTuple implements PrimitiveTuple
    {
        private int rowid;

        public ColumnStoreTuple(int rowid)
        {
            this.rowid = rowid;
        }

        @Override
        public int size()
        {
            return nColumns;
        }

        @Override
        public Object get(int fieldNum) throws ExecException
        {
            return dataStore[fieldNum].get(rowid);
        }

        public long getLong(int fieldNum)
        {
            return ((LongArrayList) dataStore[fieldNum]).getLong(rowid);
        }

        public int getInt(int fieldNum)
        {
            return ((IntArrayList) dataStore[fieldNum]).getInt(rowid);
        }

        public double getDouble(int fieldNum)
        {
            return ((DoubleArrayList) dataStore[fieldNum]).getDouble(rowid);
        }

        @Override
        public List<Object> getAll()
        {
            List<Object> fields = new ArrayList<Object>(nColumns);
            try
            {
                for (int i = 0; i < nColumns; ++i)
                {
                    fields.add(get(i));
                }
                return fields;
            }
            catch (ExecException e)
            {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }

        @Override
        public void set(int fieldNum, Object val) throws ExecException
        {
            throw new UnsupportedOperationException("Not Implemented");
        }

        @Override
        public void append(Object val)
        {
            throw new UnsupportedOperationException("Not Implemented");
        }

        @Override
        public long getMemorySize()
        {
            throw new UnsupportedOperationException("Not Implemented");
        }

        /**
         * Implementation is copied from the DefaultTuple implementation in Pig.
         *
         * @param obj the other object to be compared
         * @return a negative value if this tuple is smaller than the other, 0 if they are equal, or a
         *         positive value if this tuple is greater
         */
        @Override
        public int compareTo(Object obj)
        {
            if (obj instanceof Tuple)
            {
                Tuple other = (Tuple) obj;
                int otherSize = other.size();
                if (otherSize < nColumns)
                    return 1;
                else if (otherSize > nColumns)
                    return -1;
                else
                {
                    for (int i = 0; i < nColumns; i++)
                    {
                        try
                        {
                            int c = DataType.compare(get(i), other.get(i));
                            if (c != 0)
                                return c;
                        }
                        catch (ExecException e)
                        {
                            throw new RuntimeException("Unable to compare tuples", e);
                        }
                    }
                    return 0;
                }
            }
            else
            {
                return DataType.compare(this, obj);
            }
        }

        @Override
        public void write(DataOutput out) throws IOException
        {
            throw new UnsupportedOperationException("Not Implemented");
        }

        @Override
        public void readFields(DataInput in) throws IOException
        {
            throw new UnsupportedOperationException("Not Implemented");
        }
    }
}