package edu.isi.karma.mapreduce.function;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Hive UDF that turns the textual form of a flat JSON array into a list of
 * distinct string values.
 *
 * <p>The single argument is converted to a writable string. An enclosing
 * {@code [ ... ]} pair is removed if present, the remainder is split on
 * commas, whitespace around each element is trimmed, every double-quote
 * character is removed, and duplicates are dropped.
 *
 * <p>Limitation: the input is split on every comma, so a comma inside a
 * quoted string value also separates elements. The order of the returned
 * values is that of a {@link HashSet} and does not follow the input order.
 */
public class SplitAndCleanJSONArray extends GenericUDF {

    private static final Logger LOG = LoggerFactory.getLogger(SplitAndCleanJSONArray.class);

    /** One converter per argument, set up in {@link #initialize(ObjectInspector[])}. */
    private ObjectInspectorConverters.Converter[] converters;

    /**
     * Splits and cleans the array text held in the first argument.
     *
     * @param arguments the deferred UDF arguments; exactly one is expected
     * @return a list of distinct {@link Text} values; empty when the converted
     *         argument is {@code null}, when the array has no elements, or
     *         when processing fails (the failure is logged)
     * @throws HiveException if the deferred argument cannot be retrieved
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        assert (arguments.length == 1);
        Text arrayToCleanText = (Text) converters[0].convert(arguments[0].get());
        List<Text> cleanedValues = new LinkedList<>();
        if (arrayToCleanText == null) {
            return cleanedValues;
        }
        try {
            String arrayToClean = arrayToCleanText.toString().trim();
            if (arrayToClean.startsWith("[") && arrayToClean.endsWith("]")) {
                arrayToClean = arrayToClean.substring(1, arrayToClean.length() - 1);
            }
            Set<Text> cleanedValuesSet = new HashSet<>();
            for (String value : arrayToClean.split(",")) {
                // Trim before removing quotes so that only whitespace between
                // elements is dropped; whitespace inside quotes is preserved.
                String token = value.trim();
                if (token.isEmpty()) {
                    // "[]" or "" splits into a single empty token; it is not a value.
                    continue;
                }
                cleanedValuesSet.add(new Text(token.replace("\"", "")));
            }
            cleanedValues.addAll(cleanedValuesSet);
        } catch (Exception e) {
            // Best effort: log with the full stack trace and return what we have.
            LOG.error("Unable to split and clean array", e);
        }
        return cleanedValues;
    }

    /**
     * Builds the display form of this UDF call.
     *
     * @param arguments the display strings of the call's arguments
     * @return {@code SplitAndCleanJSONArray(<argument>)}
     */
    @Override
    public String getDisplayString(String[] arguments) {
        assert (arguments.length == 1);
        return "SplitAndCleanJSONArray(" + arguments[0] + ")";
    }

    /**
     * Validates the argument count and prepares the string converters.
     *
     * @param arguments the object inspectors of the call's arguments
     * @return a standard list inspector over writable strings
     * @throws UDFArgumentException if the number of arguments is not exactly one
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments)
            throws UDFArgumentException {
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("The SplitAndCleanJSONArray takes only one argument");
        }
        converters = new ObjectInspectorConverters.Converter[arguments.length];
        for (int i = 0; i < arguments.length; i++) {
            converters[i] = ObjectInspectorConverters.getConverter(
                    arguments[i], PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        }
        return ObjectInspectorFactory.getStandardListObjectInspector(
                PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }
}