package com.linkedin.cubert.operator;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.plan.physical.GenerateDictionary;
import com.linkedin.cubert.utils.CodeDictionary;
import com.linkedin.cubert.utils.FileCache;
import com.linkedin.cubert.utils.JsonUtils;

/**
 * Reduce side Refresh Dictionary Operator. Will take in shuffled data from map side
 * operator. For each new {column_name, column_value} received, assign new dictionary
 * code.
 * <p>
 * On the first call to {@link #next()} the whole input block is drained into the
 * dictionaries; every call then reports one {column name, column value, code} row until
 * all dictionaries have been reported.
 * <p>
 * Note: the same {@link Tuple} instance is reused for every row returned by
 * {@link #next()}, so callers that need to retain a row must copy it.
 *
 * @author Mani Parkhe
 */
public class DictionaryRefreshReduceSideOperator implements TupleOperator {
    /** Input block holding (column name, column value) tuples. */
    private Block block;

    /** Dictionaries keyed by column name. */
    private Map<String, CodeDictionary> dictionaryMap = null;

    /** Whether the input block has already been folded into the dictionaries. */
    private boolean dictionaryUpdated = false;

    /** Reusable three-field output tuple: column name, column value, code. */
    private Tuple output;

    /** Iterates over the dictionaries still to be reported. */
    private Iterator<Entry<String, CodeDictionary>> dictionaryIterator = null;

    /** Iterates over the keys of the dictionary currently being reported. */
    private Iterator<String> valuesAndCodesIterator = null;

    private String currentColumnName;
    private CodeDictionary currentDictionary;

    /**
     * {@inheritDoc}
     * <p>
     * If the json has a "dictionary" entry, the existing dictionaries are loaded from the
     * cached file of that name; otherwise an empty dictionary is created for each name in
     * the comma-separated "columns" entry. The first name in the "input" array selects
     * the input block.
     *
     * @see com.linkedin.cubert.operator.TupleOperator#setInput(java.util.Map,
     *      org.codehaus.jackson.JsonNode, com.linkedin.cubert.block.BlockProperties)
     */
    @Override
    public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
            throws IOException, InterruptedException {
        if (json.has("dictionary")) {
            // load the dictionary from file
            String dictionaryName = JsonUtils.getText(json, "dictionary");
            String cachedPath = FileCache.get(dictionaryName);
            dictionaryMap = GenerateDictionary.loadDictionary(cachedPath, false, null);
        } else {
            dictionaryMap = new HashMap<String, CodeDictionary>();
            String[] columns = JsonUtils.getText(json, "columns").split(",");
            for (String name : columns) {
                dictionaryMap.put(name, new CodeDictionary());
            }
        }

        output = TupleFactory.getInstance().newTuple(3);

        String inputBlockName = JsonUtils.asArray(json, "input")[0];
        block = input.get(inputBlockName);

        // Reset all iteration state so that no stale iterator from a previous input
        // survives a repeated setInput call.
        dictionaryUpdated = false;
        dictionaryIterator = null;
        valuesAndCodesIterator = null;
        currentColumnName = null;
        currentDictionary = null;
    }

    /**
     * Fetch the next dictionary to report.
     *
     * @return <code> true </code> if there are more dictionaries to report.
     *         <code> false </code> otherwise.
     */
    private boolean fetchNextDictionary() {
        if (!dictionaryIterator.hasNext()) {
            return false;
        }

        Entry<String, CodeDictionary> entry = dictionaryIterator.next();
        currentColumnName = entry.getKey();
        currentDictionary = entry.getValue();
        valuesAndCodesIterator = currentDictionary.keySet().iterator();
        return true;
    }

    /**
     * {@inheritDoc} First update dictionary with new values from input block. Output 1
     * row per dictionary column for each distinct value.
     *
     * @return the (reused) output tuple holding column name, column value and code, or
     *         <code>null</code> once every dictionary has been fully reported.
     * @see com.linkedin.cubert.operator.TupleOperator#next()
     */
    @Override
    public Tuple next() throws IOException, InterruptedException {
        // Update dictionary with new values from input block
        if (!dictionaryUpdated) {
            updateDictionaries();
            dictionaryUpdated = true;
            dictionaryIterator = dictionaryMap.entrySet().iterator();
        }

        // Advance to the first/next dictionary that still has values to report. A loop
        // (rather than recursion) keeps the stack flat when dictionaries are empty.
        while (valuesAndCodesIterator == null || !valuesAndCodesIterator.hasNext()) {
            if (!fetchNextDictionary()) {
                return null;
            }
        }

        // For current (dictionary) column, report next distinct column value and
        // dictionary code.
        String columnValue = valuesAndCodesIterator.next();
        output.set(0, currentColumnName);
        output.set(1, columnValue);
        output.set(2, currentDictionary.getCodeForKey(columnValue));
        return output;
    }

    /**
     * Iterate through all tuples of input block and update dictionary for missing column
     * values.
     *
     * @throws IOException
     *             if the input block cannot be read, or if a tuple names a column for
     *             which no dictionary exists
     * @throws InterruptedException
     * @throws ExecException
     */
    private void updateDictionaries() throws IOException, InterruptedException,
            ExecException {
        Tuple tuple;
        while ((tuple = block.next()) != null) {
            String colName = (String) tuple.get(0);
            String colValue = (String) tuple.get(1);

            CodeDictionary dictionary = dictionaryMap.get(colName);
            if (dictionary == null) {
                // Fail with a descriptive message instead of a bare
                // NullPointerException.
                throw new IOException("No dictionary found for column '" + colName
                        + "' while refreshing dictionaries");
            }

            // A code of -1 is treated as "value not yet in the dictionary".
            if (dictionary.getCodeForKey(colValue) == -1) {
                dictionary.addKey(colValue);
            }
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * The output schema is always (STRING colname, STRING colvalue, INT code).
     *
     * @see com.linkedin.cubert.operator.TupleOperator#getPostCondition(java.util.Map,
     *      org.codehaus.jackson.JsonNode)
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
            JsonNode json) throws PreconditionException {
        return new PostCondition(
                new BlockSchema("STRING colname, STRING colvalue, INT code"), null, null);
    }
}