/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.operator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.operator.cube.DimensionKey;
import com.linkedin.cubert.utils.JsonUtils;

/**
 * Tuple operator that takes two input blocks, a data block ({@code input[0]}) and a
 * position block ({@code input[1]}), and emits one tuple per position recorded in
 * the position block.
 *
 * <p>
 * Each position tuple contributes a dimension key (all of its columns except
 * {@code pkIndex} and {@code positionIndex}) and a zero-based position. While the
 * data block is streamed, every data tuple is expanded into all of its "ancestor"
 * keys (each dimension either kept or replaced by {@code -1}) and the pending
 * positions of every matching key are decremented. When a position reaches
 * {@code -1}, the key's dimensions plus the value of the current data tuple are
 * emitted. Judging by the operator name, the recorded position is presumably the
 * position of the median; the code itself works for any position.
 *
 * <p>
 * Integer arguments read from the JSON {@code "args"} object: {@code pkIndex},
 * {@code positionIndex} (columns of the position block) and {@code valueIndex}
 * (column of the data block).
 *
 * <p>
 * The same {@link Tuple} instance is reused for every emitted row.
 */
public class ExtractMedianOperator implements TupleOperator {
    /** Sentinel stored in an ancestor key for a dimension that is not selected. */
    private static final int ROLLED_UP = -1;

    /** Value a pending position holds once the tuple it refers to has just been read. */
    private static final long POSITION_REACHED = -1L;

    private Block dataBlock;
    private BlockSchema schema;
    // Assigned in setInput; not read anywhere in this class.
    private long blockId;

    /** Dimension key -> positions that have not been emitted yet for that key. */
    private final Map<DimensionKey, List<Long>> positionMap =
            new HashMap<DimensionKey, List<Long>>();

    /** Reused output row: dimension columns first, the value in the last field. */
    private Tuple outputTuple;

    /** Integer operator arguments parsed from the JSON "args" object. */
    private final Map<String, Integer> argsMap = new HashMap<String, Integer>();

    /**
     * Reads the operator configuration and loads the whole position block into
     * {@link #positionMap}.
     *
     * @param input map from block name to block; must contain both names listed
     *              under the JSON {@code "input"} property
     * @param json  operator JSON; provides {@code "input"} and {@code "args"}
     * @param props block properties (unused here)
     * @throws IllegalArgumentException if {@code "args"} is missing or malformed, or
     *                                  a required argument is absent
     */
    @Override
    public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
            throws IOException, InterruptedException {
        String[] inputBlockNames = JsonUtils.asArray(json, "input");
        assert (inputBlockNames.length == 2);

        // First input: the data block that next() streams through.
        dataBlock = input.get(inputBlockNames[0]);
        blockId = dataBlock.getProperties().getBlockId();
        schema = getOutputSchema(dataBlock.getProperties().getSchema());

        // Second input: the block holding (dimensions..., position, pk) tuples.
        Block positionBlock = input.get(inputBlockNames[1]);

        setArgs(json.get("args"));
        int pkIndex = requiredArg("pkIndex");
        int positionIndex = requiredArg("positionIndex");

        Tuple t;
        while ((t = positionBlock.next()) != null) {
            DimensionKey key = extractDimensions(t, pkIndex, positionIndex);

            List<Long> positions = positionMap.get(key);
            if (positions == null) {
                positions = new ArrayList<Long>();
                positionMap.put(key, positions);
            }
            // Read the position from the configured column rather than assuming it
            // is the second-to-last one.
            positions.add((Long) t.get(positionIndex));
        }

        outputTuple = TupleFactory.getInstance().newTuple(schema.getColumnNames().length);
    }

    /**
     * Parses the flat JSON {@code "args"} object ({@code {"name": int, ...}}) into
     * {@link #argsMap}.
     *
     * @param args the JSON node stored under {@code "args"}
     * @throws IllegalArgumentException if {@code args} is null or an entry is not of
     *                                  the form {@code name:value}
     * @throws NumberFormatException    if a value is not an integer
     */
    private void setArgs(JsonNode args) {
        if (args == null) {
            throw new IllegalArgumentException("ExtractMedianOperator requires an \"args\" object");
        }

        // The object is flat, so stripping braces and quotes leaves "k:v,k:v,...".
        String flattened = args.toString().replace("{", "").replace("}", "").replace("\"", "");
        for (String pair : flattened.split(",")) {
            String[] parts = pair.split(":");
            if (parts.length < 2) {
                throw new IllegalArgumentException("Malformed operator argument: '" + pair + "'");
            }
            argsMap.put(parts[0].trim(), Integer.parseInt(parts[1].trim()));
        }
    }

    /**
     * Looks up a mandatory integer argument.
     *
     * @param name argument name
     * @return the argument value
     * @throws IllegalArgumentException if the argument was not supplied
     */
    private int requiredArg(String name) {
        Integer value = argsMap.get(name);
        if (value == null) {
            throw new IllegalArgumentException("Missing required operator argument: " + name);
        }
        return value;
    }

    /**
     * Builds a dimension key from every column of {@code tuple} except the two
     * excluded ones, preserving column order. Values are written to consecutive key
     * slots, so the excluded columns may sit anywhere in the tuple.
     *
     * @param tuple       source tuple; all non-excluded columns are cast to Integer
     * @param skipFirst   column index to exclude, or a negative number for none
     * @param skipSecond  second column index to exclude, or a negative number for none
     * @return key holding the remaining columns
     * @throws ExecException if a tuple field cannot be read
     */
    private static DimensionKey extractDimensions(Tuple tuple, int skipFirst, int skipSecond)
            throws ExecException {
        int numColumns = tuple.size();

        int numDimensions = 0;
        for (int column = 0; column < numColumns; column++) {
            if (column != skipFirst && column != skipSecond) {
                numDimensions++;
            }
        }

        DimensionKey key = new DimensionKey(new int[numDimensions]);
        int slot = 0;
        for (int column = 0; column < numColumns; column++) {
            if (column == skipFirst || column == skipSecond) {
                continue;
            }
            Integer dimValue = (Integer) tuple.get(column);
            key.set(slot, dimValue);
            slot++;
        }
        return key;
    }

    /**
     * Returns the next output row, or {@code null} when no pending position is left
     * or the data block is exhausted.
     *
     * <p>
     * Rows that became ready on the previously read data tuple are drained first
     * (one per call, so the stored value still belongs to that tuple). Only then are
     * further data tuples consumed.
     *
     * @return the reused output tuple, or {@code null} at end of output
     */
    @Override
    public Tuple next() throws IOException, InterruptedException {
        if (positionMap.isEmpty()) {
            return null;
        }

        // Several keys can become ready on the same data tuple; emit those before
        // the value field is overwritten by the next tuple.
        Tuple ready = updateHashMapAndRemoveTuple();
        if (ready != null) {
            return ready;
        }

        int valueIndex = requiredArg("valueIndex");
        int valueSlot = outputTuple.size() - 1;

        Tuple t;
        while ((t = dataBlock.next()) != null) {
            // Every column except the value column is a dimension.
            DimensionKey dataKey = extractDimensions(t, valueIndex, -1);
            outputTuple.set(valueSlot, t.get(valueIndex));

            // This tuple counts towards every roll-up it belongs to.
            for (DimensionKey ancestor : ancestors(dataKey)) {
                List<Long> positions = positionMap.get(ancestor);
                if (positions == null) {
                    continue;
                }
                for (int i = 0; i < positions.size(); i++) {
                    positions.set(i, positions.get(i) - 1);
                }
            }

            ready = updateHashMapAndRemoveTuple();
            if (ready != null) {
                return ready;
            }
        }

        // Data exhausted: the remaining positions can never be reached. Returning
        // here (instead of re-polling the drained block) avoids an endless loop.
        return null;
    }

    /**
     * Finds one pending position that has reached {@code -1}, copies its key's
     * dimensions into the output tuple, and removes that position (and the key, once
     * it has no positions left). The value field of the output tuple is left as set
     * by {@link #next()}.
     *
     * @return the output tuple if a position was ready, otherwise {@code null}
     * @throws ExecException if the output tuple cannot be written
     */
    private Tuple updateHashMapAndRemoveTuple() throws ExecException {
        Iterator<Entry<DimensionKey, List<Long>>> entries = positionMap.entrySet().iterator();
        while (entries.hasNext()) {
            Entry<DimensionKey, List<Long>> entry = entries.next();
            List<Long> positions = entry.getValue();

            Iterator<Long> pending = positions.iterator();
            while (pending.hasNext()) {
                Long position = pending.next();
                if (position.longValue() != POSITION_REACHED) {
                    continue;
                }

                int[] dimensions = entry.getKey().getArray();
                for (int i = 0; i < dimensions.length; i++) {
                    outputTuple.set(i, dimensions[i]);
                }

                pending.remove();
                if (positions.isEmpty()) {
                    entries.remove();
                }
                return outputTuple;
            }
        }
        return null;
    }

    /**
     * Enumerates all {@code 2^n} ancestors of an {@code n}-dimensional key: for each
     * subset of dimensions, the selected ones keep their value and the others are
     * set to {@code -1}. The last element equals the key itself.
     *
     * @param dataKey key of a data tuple
     * @return all ancestor keys, indexed by the bit mask of kept dimensions
     */
    private DimensionKey[] ancestors(DimensionKey dataKey) {
        int[] values = dataKey.getArray();
        int numDimensions = values.length;
        int numAncestors = 1 << numDimensions;

        DimensionKey[] result = new DimensionKey[numAncestors];
        for (int mask = 0; mask < numAncestors; mask++) {
            DimensionKey ancestor = new DimensionKey(new int[numDimensions]);
            for (int dim = 0; dim < numDimensions; dim++) {
                // Bit `dim` of the mask decides whether this dimension is kept.
                boolean keep = ((mask >> dim) & 1) == 1;
                ancestor.set(dim, keep ? values[dim] : ROLLED_UP);
            }
            result[mask] = ancestor;
        }
        return result;
    }

    /**
     * Derives the output schema from the data block schema: all columns but the last
     * are copied, and the last column is replaced by a column named {@code "value"}
     * of type LONG (if the input's last column is LONG) or DOUBLE (otherwise).
     *
     * <p>
     * Assumption carried over from the original code: the value column is the last
     * column of the input schema.
     *
     * @param inputSchema schema of the data block
     * @return schema of the tuples produced by this operator
     */
    private BlockSchema getOutputSchema(BlockSchema inputSchema) {
        int numColumns = inputSchema.getColumnNames().length;
        ColumnType[] columns = new ColumnType[numColumns];

        for (int i = 0; i < numColumns - 1; i++) {
            columns[i] = inputSchema.getColumnType(i);
        }

        ColumnType inputValueColumn = inputSchema.getColumnType(inputSchema.getNumColumns() - 1);
        DataType valueType =
                inputValueColumn.getType() == DataType.LONG ? DataType.LONG : DataType.DOUBLE;
        columns[numColumns - 1] = new ColumnType("value", valueType);

        return new BlockSchema(columns);
    }

    /**
     * Computes the post condition from the data block's condition: the schema is
     * transformed by {@link #getOutputSchema(BlockSchema)}, partition and sort keys
     * are passed through.
     *
     * <p>
     * The data block is looked up by name ({@code input[0]}, matching
     * {@link #setInput}). If it cannot be resolved that way, the second value in the
     * map's iteration order is used, which was the original behaviour.
     *
     * @param preConditions conditions of the input blocks
     * @param json          operator JSON
     * @return the post condition of this operator
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json)
            throws PreconditionException {
        PostCondition condition = null;

        if (json != null) {
            String[] inputBlockNames = JsonUtils.asArray(json, "input");
            if (inputBlockNames != null && inputBlockNames.length > 0) {
                condition = preConditions.get(inputBlockNames[0]);
            }
        }

        if (condition == null) {
            // Legacy fallback: depends on the map's iteration order.
            Iterator<PostCondition> iter = preConditions.values().iterator();
            iter.next();
            condition = iter.next();
        }

        return new PostCondition(getOutputSchema(condition.getSchema()),
                                 condition.getPartitionKeys(),
                                 condition.getSortKeys());
    }
}