package com.linkedin.cubert.operator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.plan.physical.GenerateDictionary;
import com.linkedin.cubert.utils.CodeDictionary;
import com.linkedin.cubert.utils.FileCache;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.Pair;
/**
* Map side operator for Refresh dictionary MR style job. This will load existing
* dictionary in memory. For every column that needs encoding, emit column values that are
* not already in dictionary.
*
* @author Mani Parkhe
*/
public class DictionaryRefreshMapSideOperator implements TupleOperator
{
    /** Number of columns in the input block's schema. */
    private int numColumns;

    /** Per-column dictionary; entry is null for columns that are not encoded. */
    private CodeDictionary[] dictionaries;

    /** Input block this operator pulls tuples from. */
    private Block dataBlock;

    /** isDictionaryField[i] is true iff column i participates in dictionary encoding. */
    private boolean[] isDictionaryField;

    /** Reusable two-field output tuple: (column name, column value). */
    private Tuple output;

    /** Per-column set of values already queued, so each new value is emitted once. */
    private List<Set<String>> emitCache;

    /** Pending (column name, value) pairs awaiting emission from next(). */
    private Queue<Pair<String, String>> emitKeys = null;

    /** Column names of the input schema, indexed by column position. */
    private String[] columnNames;

    /**
     * Replacement string for null column values. Configurable via the optional
     * "replaceNull" JSON property; when absent it stays null, in which case null
     * column values are looked up and emitted as null keys (previous behavior).
     */
    private String replaceNull;

    /**
     * {@inheritDoc}
     *
     * Loads (or creates empty) per-column dictionaries and prepares the emit
     * queue and duplicate-suppression cache.
     *
     * @see com.linkedin.cubert.operator.TupleOperator#setInput(java.util.Map,
     *      org.codehaus.jackson.JsonNode, com.linkedin.cubert.block.BlockProperties)
     */
    @Override
    public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException,
            InterruptedException
    {
        Map<String, CodeDictionary> dictionaryMap = null;
        if (json.has("dictionary"))
        {
            // Load the existing dictionary from the distributed file cache.
            String dictionaryName = JsonUtils.getText(json, "dictionary");
            String cachedPath = FileCache.get(dictionaryName);
            dictionaryMap = GenerateDictionary.loadDictionary(cachedPath, false, null);
        }
        else
        {
            // No existing dictionary: start with an empty dictionary for each
            // column listed in the comma-separated "columns" property.
            dictionaryMap = new HashMap<String, CodeDictionary>();
            String[] columns = JsonUtils.getText(json, "columns").split(",");
            for (String name : columns)
                dictionaryMap.put(name, new CodeDictionary());
        }

        // Optional replacement for null column values. Without this, the field
        // was never assigned and null values flowed through as null keys.
        if (json.has("replaceNull"))
            replaceNull = JsonUtils.getText(json, "replaceNull");

        String inputBlockName = JsonUtils.asArray(json, "input")[0];
        dataBlock = input.get(inputBlockName);

        BlockSchema inputSchema = dataBlock.getProperties().getSchema();
        columnNames = inputSchema.getColumnNames();
        numColumns = inputSchema.getNumColumns();

        output = TupleFactory.getInstance().newTuple(2);
        emitKeys = new LinkedList<Pair<String, String>>();

        // One dictionary slot and one duplicate-suppression set per column.
        dictionaries = new CodeDictionary[numColumns];
        emitCache = new ArrayList<Set<String>>();
        for (int i = 0; i < numColumns; i++)
            emitCache.add(new HashSet<String>());

        isDictionaryField = new boolean[numColumns];
        for (int i = 0; i < numColumns; i++)
        {
            String colName = inputSchema.getName(i);
            boolean dict = isDictionaryField[i] = dictionaryMap.containsKey(colName);
            if (dict)
                dictionaries[i] = dictionaryMap.get(colName);
            else
                dictionaries[i] = null;
        }
    }

    /**
     * {@inheritDoc}
     *
     * Pulls tuples from the input block until at least one new (column, value)
     * pair is queued, then emits one pair per call. Returns null when the
     * input block is exhausted and the queue is drained.
     *
     * @see com.linkedin.cubert.operator.TupleOperator#next()
     */
    @Override
    public Tuple next() throws IOException,
            InterruptedException
    {
        while (emitKeys.isEmpty())
        {
            Tuple tuple = dataBlock.next();
            if (tuple == null)
                return null;

            processTuple(tuple);
        }

        Pair<String, String> elem = emitKeys.poll();
        // The output tuple is reused across calls; callers must consume it
        // before invoking next() again.
        output.set(0, elem.getFirst());
        output.set(1, elem.getSecond());
        return output;
    }

    /**
     * Scans the dictionary-encoded columns of {@code tuple} and queues every
     * value that is neither already in the dictionary nor already queued.
     *
     * @param tuple the input tuple to scan
     * @throws ExecException if a field cannot be read from the tuple
     */
    private void processTuple(Tuple tuple) throws ExecException
    {
        for (int i = 0; i < isDictionaryField.length; i++)
        {
            if (!isDictionaryField[i])
                continue;

            Object val = tuple.get(i);
            // Null values are substituted with replaceNull (may itself be null
            // when the "replaceNull" property was not configured).
            String colValue = (val == null) ? replaceNull : val.toString();

            // Skip values already known to the dictionary or already queued.
            if (dictionaries[i].getCodeForKey(colValue) != -1
                    || emitCache.get(i).contains(colValue))
                continue;

            emitCache.get(i).add(colValue);
            emitKeys.add(new Pair<String, String>(columnNames[i], colValue));
        }
    }

    /**
     * {@inheritDoc}
     *
     * The output schema is fixed: one STRING column name and one STRING value.
     *
     * @see com.linkedin.cubert.operator.TupleOperator#getPostCondition(java.util.Map,
     *      org.codehaus.jackson.JsonNode)
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
                                          JsonNode json) throws PreconditionException
    {
        return new PostCondition(new BlockSchema("STRING colname, STRING colvalue"),
                                 null,
                                 null);
    }
}