/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.example.plugin;
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.StageConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
 * Transform that converts the values of configured string fields to lowercase or uppercase.
 *
 * <p>Fields that are not listed in either configuration property are copied to the output record unchanged.
 * If a field is listed in both properties, the uppercase rule takes precedence. Null field values are
 * passed through as null.
 */
@Plugin(type = Transform.PLUGIN_TYPE)
@Name(StringCaseTransform.NAME)
@Description("Transforms configured fields to lowercase or uppercase.")
public class StringCaseTransform extends Transform<StructuredRecord, StructuredRecord> {
  public static final String NAME = "StringCase";
  private final Conf config;
  // Parsed once in initialize() so that transform() does not re-parse the config for every record.
  private Set<String> upperFields;
  private Set<String> lowerFields;

  /**
   * Config properties for the plugin.
   */
  public static class Conf extends PluginConfig {
    public static final String UPPER_FIELDS = "upperFields";
    public static final String LOWER_FIELDS = "lowerFields";
    // Splits on commas, swallowing any whitespace directly around each comma.
    private static final Pattern SPLIT_ON = Pattern.compile("\\s*,\\s*");

    @Nullable
    @Name(UPPER_FIELDS)
    @Description("A comma separated list of fields to uppercase. Each field must be of type String.")
    private String upperFields;

    @Nullable
    @Name(LOWER_FIELDS)
    @Description("A comma separated list of fields to lowercase. Each field must be of type String.")
    private String lowerFields;

    /**
     * @return the names of the fields to uppercase; empty if the property is not set
     */
    private Set<String> getUpperFields() {
      return parseToSet(upperFields);
    }

    /**
     * @return the names of the fields to lowercase; empty if the property is not set
     */
    private Set<String> getLowerFields() {
      return parseToSet(lowerFields);
    }

    /**
     * Parses a comma separated list into a set of field names.
     *
     * <p>Whitespace around the whole list and around each comma is ignored, and empty elements (such as those
     * produced by a leading, trailing or doubled comma) are skipped so they never reach schema validation
     * as a bogus empty field name.
     *
     * @param str the raw comma separated list, may be null
     * @return a new mutable set of the non-empty names; never null
     */
    private static Set<String> parseToSet(@Nullable String str) {
      Set<String> set = new HashSet<>();
      if (str == null) {
        return set;
      }
      // SPLIT_ON only strips whitespace adjacent to commas, so trim the outer edges first.
      for (String element : SPLIT_ON.split(str.trim())) {
        if (!element.isEmpty()) {
          set.add(element);
        }
      }
      return set;
    }
  }

  public StringCaseTransform(Conf config) {
    this.config = config;
  }

  /**
   * Validates the configured fields against the input schema, when one is available, and sets the output
   * schema to be the same as the input schema.
   *
   * @param pipelineConfigurer provides access to the stage's input schema and output schema setter
   * @throws IllegalArgumentException if a configured field is missing from the input schema or is not a string
   */
  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    // the output schema is always the same as the input schema
    Schema inputSchema = stageConfigurer.getInputSchema();
    // if schema is null, that means it is either not known until runtime, or it is variable
    if (inputSchema != null) {
      // if the input schema is constant and known at configure time, check that all configured fields are strings
      for (String fieldName : config.getUpperFields()) {
        validateFieldIsString(inputSchema, fieldName);
      }
      for (String fieldName : config.getLowerFields()) {
        validateFieldIsString(inputSchema, fieldName);
      }
    }
    stageConfigurer.setOutputSchema(inputSchema);
  }

  /**
   * Parses the configured field lists into sets that {@link #transform} consults for each record.
   *
   * @param context the transform context; not used by this implementation
   */
  @Override
  public void initialize(TransformContext context) throws Exception {
    upperFields = config.getUpperFields();
    lowerFields = config.getLowerFields();
  }

  /**
   * Emits a copy of the given record in which the configured fields have been uppercased or lowercased.
   *
   * <p>Case conversion uses {@link Locale#ROOT} so that the output does not depend on the default locale of
   * the JVM running the pipeline. Null values are copied as-is, since fields accepted by validation may be
   * nullable.
   *
   * @param record the input record
   * @param emitter receives exactly one output record with the same schema as the input
   */
  @Override
  public void transform(StructuredRecord record, Emitter<StructuredRecord> emitter) throws Exception {
    Schema schema = record.getSchema();
    StructuredRecord.Builder builder = StructuredRecord.builder(schema);
    for (Schema.Field field : schema.getFields()) {
      String fieldName = field.getName();
      Object value = record.get(fieldName);
      if (value != null && upperFields.contains(fieldName)) {
        builder.set(fieldName, value.toString().toUpperCase(Locale.ROOT));
      } else if (value != null && lowerFields.contains(fieldName)) {
        builder.set(fieldName, value.toString().toLowerCase(Locale.ROOT));
      } else {
        // untouched field, or a null value that has nothing to convert
        builder.set(fieldName, value);
      }
    }
    emitter.emit(builder.build());
  }

  /**
   * Checks that the named field exists in the schema and is a string or a nullable string.
   *
   * @param schema the schema to look the field up in
   * @param fieldName the name of the field to check
   * @throws IllegalArgumentException if the field does not exist or its (non-nullable) type is not STRING
   */
  private void validateFieldIsString(Schema schema, String fieldName) {
    Schema.Field inputField = schema.getField(fieldName);
    if (inputField == null) {
      throw new IllegalArgumentException(
        String.format("Field '%s' does not exist in input schema %s.", fieldName, schema));
    }
    Schema fieldSchema = inputField.getSchema();
    // unwrap nullable schemas so that a nullable string is accepted as a string
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    if (fieldType != Schema.Type.STRING) {
      throw new IllegalArgumentException(
        String.format("Field '%s' is of illegal type %s. Must be of type %s.",
                      fieldName, fieldType, Schema.Type.STRING));
    }
  }
}