/*
* chombo: Hadoop Map Reduce utility
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.chombo.transformer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.chombo.util.AttributePredicate;
import org.chombo.util.BasicUtils;
import org.chombo.util.ProcessorAttribute;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigValue;
/**
* @author pranab
*
*/
public class StringTransformer {
/**
* @author pranab
*
*/
public static class LowerCaseTransformer extends AttributeTransformer {
public LowerCaseTransformer(ProcessorAttribute prAttr) {
super(prAttr.getTargetFieldOrdinals().length);
}
public LowerCaseTransformer() {
super(1);
}
@Override
public String[] tranform(String value) {
transformed[0] = value.toLowerCase();
return transformed;
}
}
/**
* @author pranab
*
*/
public static class UpperCaseTransformer extends AttributeTransformer {
public UpperCaseTransformer(ProcessorAttribute prAttr) {
super(prAttr.getTargetFieldOrdinals().length);
}
public UpperCaseTransformer() {
super(1);
}
@Override
public String[] tranform(String value) {
transformed[0] = value.toUpperCase();
return transformed;
}
}
/**
* @author pranab
*
*/
public static class PatternBasedTransformer extends AttributeTransformer {
private Pattern pattern;
private Matcher matcher;
private boolean failOnMissingGroup;
private boolean retainOriginalField;
public PatternBasedTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
pattern = Pattern.compile(config.getString("regEx"));
failOnMissingGroup = config.getBoolean("failOnMissingGroup");
retainOriginalField = config.getBoolean("retainOriginalField");
}
public PatternBasedTransformer(int numTransAttributes, String regEx, boolean failOnMissingGroup) {
super(numTransAttributes);
pattern = Pattern.compile(regEx);
this.failOnMissingGroup = failOnMissingGroup;
}
@Override
public String[] tranform(String value) {
matcher = pattern.matcher(value);
if (matcher.matches()) {
int grIndx = 1;
for (int i = 0; i < transformed.length; ++i) {
if (retainOriginalField && i == 0) {
transformed[i] = value;
continue;
}
String extracted = matcher.group(grIndx);
if(extracted != null) {
transformed[i] = extracted;
} else {
if (failOnMissingGroup) {
throw new IllegalArgumentException("mtaching failed for a group in pattern based transformer");
} else {
transformed[i] = "";
}
}
++grIndx;
}
} else {
throw new IllegalArgumentException("mtaching failed for pattern based transformer");
}
return transformed;
}
}
/**
* @author pranab
*
*/
public static class SearchReplaceTransformer extends AttributeTransformer {
private String regEx;
private String replacement;
private boolean replaceAll;
public SearchReplaceTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
regEx = config.getString("regEx");
replacement = config.getString("replacement");
replaceAll = config.getBoolean("replaceAll");
}
public SearchReplaceTransformer(int numTransAttributes, String regEx, String replacement, boolean replaceAll) {
super(numTransAttributes);
this.regEx = regEx;
this.replacement = replacement;
this.replaceAll = replaceAll;
}
@Override
public String[] tranform(String value) {
if (replaceAll) {
transformed[0] = value.replaceAll(regEx, replacement);
} else {
transformed[0] = value.replaceFirst(regEx, replacement);
}
return transformed;
}
}
/**
* @author pranab
*
*/
public static class KeyValueTransformer extends AttributeTransformer {
private Config keyValConfig;
private Map<String, String> kayValues;
public KeyValueTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
keyValConfig = config.getConfig("keyValues");
}
public KeyValueTransformer(Map<String, String> kayValues) {
super(1);
this.kayValues = kayValues;
}
public KeyValueTransformer(ProcessorAttribute prAttr, Config config, InputStream inStrm) throws IOException {
super(1);
int fieldOrd = prAttr.getOrdinal();
String delim = config.getString("fieldDelim");
try {
kayValues = new HashMap<String, String>();
BufferedReader reader = new BufferedReader(new InputStreamReader(inStrm));
String line = null;
while((line = reader.readLine()) != null) {
String[] items = line.split(delim);
if (Integer.parseInt(items[0]) == fieldOrd) {
kayValues.put(items[1], items[2]);
}
}
} catch (IOException ex) {
throw ex;
} finally {
inStrm.close();
}
}
@Override
public String[] tranform(String value) {
String newValue = null;
if (null != keyValConfig) {
if (keyValConfig.hasPath(value)) {
newValue = keyValConfig.getString(value);
} else {
newValue = null;
}
} else {
newValue = kayValues.get(value);
}
transformed[0] = null != newValue ? newValue : value;
return transformed;
}
}
/**
* @author pranab
*
*/
public static class DefaultValueTransformer extends AttributeTransformer {
private String defaultValue;
public DefaultValueTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
defaultValue = config.getString("defaultValue");
}
public DefaultValueTransformer( String defaultValue) {
super(1);
this.defaultValue = defaultValue;
}
@Override
public String[] tranform(String value) {
if (value.isEmpty()) {
transformed[0] = defaultValue;
} else {
transformed[0] = value;
}
return transformed;
}
}
/**
* @author pranab
*
*/
public static class ForcedReplaceTransformer extends AttributeTransformer {
private String newValue;
public ForcedReplaceTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
newValue = config.getString("newValue");
}
public ForcedReplaceTransformer( String newValue) {
super(1);
this.newValue = newValue;
}
@Override
public String[] tranform(String value) {
transformed[0] = newValue;
return transformed;
}
}
/**
* @author pranab
*
*/
public static class AnoynmizerTransformer extends AttributeTransformer {
private String mask;
public AnoynmizerTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
mask = config.getString("mask");
}
public AnoynmizerTransformer( String mask) {
super(1);
this.mask = mask;
}
@Override
public String[] tranform(String value) {
transformed[0] = StringUtils.repeat(mask, value.length());
return transformed;
}
}
/**
* @author pranab
*
*/
public static class UniqueKeyGenerator extends AttributeTransformer {
private String algorithm;
public UniqueKeyGenerator(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
algorithm = config.getString("algorithm");
}
public UniqueKeyGenerator( String algorithm) {
super(1);
this.algorithm = algorithm;
}
@Override
public String[] tranform(String value) {
if (algorithm.equals("uuid")) {
transformed[0] = UUID.randomUUID().toString().replaceAll("-", "");
} else {
throw new IllegalArgumentException("invalid key generation algorithm");
}
return transformed;
}
}
/**
* @author pranab
*
*/
public static class TrimTransformer extends AttributeTransformer {
public TrimTransformer(ProcessorAttribute prAttr) {
super(prAttr.getTargetFieldOrdinals().length);
}
public TrimTransformer( ) {
super(1);
}
@Override
public String[] tranform(String value) {
transformed[0] = value.trim();
return transformed;
}
}
/**
* @author pranab
*
*/
public static class ConstantGenerator extends AttributeTransformer {
private String constValue;
public ConstantGenerator(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
constValue = config.getString("constValue");
}
public ConstantGenerator( String constValue) {
super(1);
this.constValue = constValue;
}
@Override
public String[] tranform(String value) {
transformed[0] = constValue;
return transformed;
}
}
/**
* @author pranab
*
*/
public static class GroupTransformer extends AttributeTransformer {
private Map<String, List<String>> groupValues = new HashMap<String, List<String>>();
public GroupTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
Set<Entry<String, ConfigValue>> entries = config.entrySet();
for (Entry<String, ConfigValue> entry : entries) {
String[] values = entry.getValue().unwrapped().toString().split(",");
groupValues.put(entry.getKey(), Arrays.asList(values));
}
}
public GroupTransformer( Map<String, List<String>> groupValues ) {
super(2);
this.groupValues = groupValues;
}
@Override
public String[] tranform(String value) {
transformed[0] = value;
String group = null;
for (String key : groupValues.keySet()) {
if (groupValues.get(key).contains(value)) {
group = key;
break;
}
}
if (null == group) {
throw new IllegalArgumentException("no group found");
}
transformed[1] = group;
return transformed;
}
}
/**
* @author pranab
*
*/
public static class StringCustomTransformer extends CustomTransformer {
public StringCustomTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr, config);
}
public StringCustomTransformer(String script, Map<String, Object> params) {
super(script, params);
}
protected Object getFieldValue(String value) {
return value;
}
protected String getOutput(Object out) {
String ret = null;
if (out instanceof String ) {
ret = "" + (String)out;
} else {
throw new IllegalArgumentException("string output expected");
}
return ret;
}
}
/**
* @author pranab
* Removes a field by returning null for transformed value
*/
public static class DeleteTransformer extends AttributeTransformer {
public DeleteTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
}
@Override
public String[] tranform(String value) {
return null;
}
}
/**
* Does string append or prepend with provided string
* @author pranab
*
*/
public static class ConcatenatorTransformer extends AttributeTransformer {
private String operation;
private String stringToAdd;
private String delimiter;
public ConcatenatorTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
operation = config.getString("operation");
stringToAdd = config.getString("stringToAdd");
delimiter = config.getString("delimiter");
}
public ConcatenatorTransformer(int numTransAttributes, String operation, String stringToAdd, String delimiter) {
super(numTransAttributes);
this.operation = operation;
this.stringToAdd = stringToAdd;
this.delimiter = delimiter;
}
@Override
public String[] tranform(String value) {
if (operation.equals("prepend")) {
transformed[0] = stringToAdd + delimiter + value;
} else if (operation.equals("append")){
transformed[0] = value + delimiter + stringToAdd;
} else {
throw new IllegalArgumentException("invalid string concatenation operator");
}
return transformed;
}
}
/**
* Merges multiple fields into one
* @author pranab
*
*/
public static class FieldMergeTransformer extends AttributeTransformer implements ContextAwareTransformer {
private List<Integer> mergeFieldOrdinals;
private String delimiter;
private String[] fields;
public FieldMergeTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
config = getFieldSpecificConfig(prAttr.getOrdinal(), config);
mergeFieldOrdinals = config.getIntList("mergeFieldOrdinals");
delimiter = config.getString("delimiter");
}
public FieldMergeTransformer(int numTransAttributes, List<Integer> mergeFieldOrdinals, String delimiter) {
super(numTransAttributes);
this.mergeFieldOrdinals = mergeFieldOrdinals;
this.delimiter = delimiter;
}
@Override
public String[] tranform(String value) {
StringBuilder stBld = new StringBuilder(value);
for (int otherOrd : mergeFieldOrdinals) {
stBld.append(delimiter).append(fields[otherOrd]);
}
transformed[0] = stBld.toString();
return transformed;
}
@Override
public void setContext(Map<String, Object> context) {
fields = (String[])context.get("record");
}
}
/**
* Collapses multiple fields into one to solve filed delimiter embedded in field problem.
* @author pranab
*
*/
public static class WithinFieldDelimiterTransformer extends AttributeTransformer implements ContextAwareTransformer {
private int numFieldsToCollapse = -1;
private String replacementDelimiter = " ";
private String[] fields;
private int expectedNumFields;
private int curFieldOrdinal;
private String outputDelimiter = ",";
private static int lastCollapsedFieldsDefined;
private static int lastCollapsedFieldsNotDefined;
private static boolean checkedForValidity;
private static int collapsedFieldsNotDefinedCount;
private static int totalNumFieldsToCollapse;
public WithinFieldDelimiterTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
curFieldOrdinal = prAttr.getOrdinal();
config = getFieldSpecificConfig(prAttr.getOrdinal(), config);
//if not specified, then there could be only one field in the record with
//embedded delimiter problem
if (config.hasPath("numFieldsToCollapse")) {
numFieldsToCollapse = config.getInt("numFieldsToCollapse");
if (curFieldOrdinal > lastCollapsedFieldsDefined) {
lastCollapsedFieldsDefined = curFieldOrdinal;
}
++totalNumFieldsToCollapse;
} else {
++collapsedFieldsNotDefinedCount;
if (curFieldOrdinal > lastCollapsedFieldsNotDefined) {
lastCollapsedFieldsNotDefined = curFieldOrdinal;
}
}
if (config.hasPath("replacementDelimiter")) {
replacementDelimiter = config.getString("replacementDelimiter");
if (curFieldOrdinal > lastCollapsedFieldsDefined) {
lastCollapsedFieldsDefined = curFieldOrdinal;
}
}
if (config.hasPath("outputDelimiter")) {
outputDelimiter = config.getString("outputDelimiter");
}
expectedNumFields = config.getInt("expectedNumFields");
}
public WithinFieldDelimiterTransformer(int numTransAttributes, int curFieldOrdinal, int numFieldsToCollapse,
String replacementDelimiter, int expectedNumFields, String outputDelimiter) {
super(numTransAttributes);
this.curFieldOrdinal = curFieldOrdinal;
this.numFieldsToCollapse = numFieldsToCollapse;
this.replacementDelimiter = replacementDelimiter;
this.expectedNumFields = expectedNumFields;
this.outputDelimiter = outputDelimiter;
}
@Override
public String[] tranform(String value) {
if (expectedNumFields != fields.length) {
checkForValidity();
//get number of fields to collapse from the total field count if not specified
int actualNumFieldsToCollapse = numFieldsToCollapse < 0 ?
fields.length - expectedNumFields - totalNumFieldsToCollapse: numFieldsToCollapse;
int afterLastCollapsedFieldOrdinal = curFieldOrdinal + actualNumFieldsToCollapse + 1;
String collapsedFields = BasicUtils.join(fields, curFieldOrdinal, afterLastCollapsedFieldOrdinal,
replacementDelimiter);
if (numFieldsToCollapse < 0) {
//not specified implying num of embedded delimiters could vary across rows so collapse remaining
collapsedFields = collapsedFields + outputDelimiter + BasicUtils.join(fields,
afterLastCollapsedFieldOrdinal, fields.length, outputDelimiter);
}
transformed[0] = collapsedFields;
} else {
//record does not embedded delimiter issue
if (numFieldsToCollapse < 0) {
//collapse remaining fields
String collapsedFields = value + outputDelimiter + BasicUtils.join(fields,
curFieldOrdinal + 1, fields.length, outputDelimiter);
transformed[0] = collapsedFields;
} else {
transformed[0] = value;
}
}
return transformed;
}
private static void checkForValidity() {
if (!checkedForValidity) {
if (collapsedFieldsNotDefinedCount > 1) {
//multiple fields with undefined number of fields to collapse not allowed
throw new IllegalStateException(
"mulitiple fields found where number of fields to copplapse not specified");
} else if (collapsedFieldsNotDefinedCount == 1) {
//multiple fields with undefined number of fields to collapse not allowed
if (lastCollapsedFieldsNotDefined < lastCollapsedFieldsDefined) {
throw new IllegalStateException(
"if there is any field with undefined number of fields to collapse it should be last one");
}
}
checkedForValidity = true;
}
}
@Override
public void setContext(Map<String, Object> context) {
fields = (String[])context.get("record");
}
}
/**
* Splits string different ways
* @author pranab
*
*/
public static class SplitterTransformer extends AttributeTransformer {
private String operation;
private String delimiter;
private boolean failOnDelimNotFound;
private String retainPolicy;
public SplitterTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
intialize(config.getString("operation"), config.getString("delimiter"),
config.getBoolean("failOnDelimNotFound"),config.getString("retainPolicy"));
}
public SplitterTransformer(int numTransAttributes, String operation, String delimiter, boolean failOnDelimNotFound,
String retainPolicy) {
super(numTransAttributes);
intialize(operation, delimiter, failOnDelimNotFound,retainPolicy);
}
public void intialize(String operation, String delimiter, boolean failOnDelimNotFound,
String retainPolicy) {
this.operation = operation;
this.delimiter = delimiter;
this.failOnDelimNotFound = failOnDelimNotFound;
this.retainPolicy = retainPolicy;
}
@Override
public String[] tranform(String value) {
String[] items = null;
if (operation.equals("spltOnFirst")) {
items = BasicUtils.splitOnFirstOccurence(value, delimiter, failOnDelimNotFound);
retainOutputFields(items);
} else if (operation.equals("spltOnLast")){
items = BasicUtils.splitOnLastOccurence(value, delimiter, failOnDelimNotFound);
retainOutputFields(items);
} else if (operation.equals("spltOnAll")){
items = value.split(delimiter, -1);
if (items.length == transformed.length) {
transformed = items;
} else {
throw new IllegalArgumentException("did not get expected number of items after splitting");
}
} else {
throw new IllegalArgumentException("invalid string splitting operator");
}
return transformed;
}
/**
* @param items
*/
private void retainOutputFields(String[] items) {
if (retainPolicy.equals("first")) {
transformed[0] = items[0];
} else if (retainPolicy.equals("second")) {
transformed[0] = items[1];
} else if (retainPolicy.equals("both")) {
transformed[0] = items[0];
transformed[1] = items[1];
}
}
}
/**
* @author pranab
*
*/
public static class BinaryValueTransformer extends AttributeTransformer {
private AttributePredicate predicate;
private String trueValue;
private String falseValue;
public BinaryValueTransformer(ProcessorAttribute prAttr, Config config) {
super(prAttr.getTargetFieldOrdinals().length);
initialize(config.getString("predicateExpr"), config.getString("trueValue"),
config.getString("falseValue"));
}
public BinaryValueTransformer(String predicateExpr, String trueValue, String falseValue) {
super(1);
initialize(predicateExpr, trueValue, falseValue);
}
public void initialize(String predicateExpr, String trueValue, String falseValue) {
this.predicate = AttributePredicate.create(predicateExpr);
this.trueValue = trueValue;
this.falseValue = falseValue;
}
@Override
public String[] tranform(String value) {
transformed[0] = predicate.evaluate(value) ? trueValue : falseValue;
return transformed;
}
}
}