/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.format;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.format.UnexpectedFormatException;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.spi.stream.AbstractStreamEventRecordFormat;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* Stream record format that interprets the body as string of delimited fields.
*
* <p>
* The delimiter can be explicitly set through the "delimiter" setting, and the character set can also be set through
* the "charset" setting. By default, the format will use a schema of one field, where the field is an array of strings.
* The schema can be set to a schema of fields, with the i'th field corresponding to the i'th value in the delimited
* text. Fields can also be parsed as scalar types - boolean, integer, long, double, float, bytes, and string.
* In addition, the very last field can be an array of strings.
* </p>
*
* <p>
* If the "mapping" setting is provided, then we will use the mapping to parse the stream events rather than
* the order of the schema fields. "mapping" is in the format "index0:field0,index1:field1,..".
* For example, if "mapping" is "1:name,2:age", then a stream event like "sdf,bob,32,sdf,lkj" would be transformed into
* a record {@code {"name":"bob", "age":32}}.
* </p>
*/
public class DelimitedStringsRecordFormat extends AbstractStreamEventRecordFormat<StructuredRecord> {
  /** Settings key for the character set used to decode the event body. Defaults to UTF-8. */
  public static final String CHARSET = "charset";
  /** Settings key for the field delimiter. Defaults to ",". */
  public static final String DELIMITER = "delimiter";
  /** Settings key for an optional "index:fieldName" mapping (e.g. "1:name,2:age"). */
  public static final String MAPPING = "mapping";

  private Charset charset = Charsets.UTF_8;
  private String delimiter = ",";
  private RecordMaker recordMaker = new DefaultRecordMaker();

  @Override
  public StructuredRecord read(StreamEvent event) throws UnexpectedFormatException {
    String bodyAsStr = Bytes.toString(event.getBody(), charset);
    Iterator<String> bodyFields = Splitter.on(delimiter).split(bodyAsStr).iterator();
    return recordMaker.make(schema, bodyFields);
  }

  @Override
  protected Schema getDefaultSchema() {
    // default is a String[]
    return Schema.recordOf("streamEvent", Schema.Field.of("body", Schema.arrayOf(Schema.of(Schema.Type.STRING))));
  }

  @Override
  protected void validateSchema(Schema desiredSchema) throws UnsupportedTypeException {
    // a valid schema is a record of simple types. In other words, no maps, arrays, records, unions, or enums allowed.
    // if mapping is null, the exception is the very last field, which is allowed to be an array of simple types.
    // These types may be nullable, which is a union of a null and non-null type.
    Iterator<Schema.Field> fields = desiredSchema.getFields().iterator();
    // check that each field is a simple field, except for the very last field, which can be an array of simple types.
    while (fields.hasNext()) {
      Schema.Field field = fields.next();
      Schema schema = field.getSchema();
      // if we're not on the very last field, the field must be a simple type or a nullable simple type.
      boolean isSimple = schema.getType().isSimpleType();
      boolean isNullableSimple = schema.isNullableSimple();
      if (!isSimple && !isNullableSimple) {
        // if this is the very last field and a string array, it is valid. otherwise it is not.
        if (fields.hasNext() || !isStringArray(schema)) {
          throw new UnsupportedTypeException("Field " + field.getName() + " is of invalid type.");
        }
      }
    }
  }

  @Override
  protected void configure(Map<String, String> settings) {
    String charsetStr = settings.get(CHARSET);
    if (charsetStr != null) {
      this.charset = Charset.forName(charsetStr);
    }
    String delimiter = settings.get(DELIMITER);
    if (delimiter != null) {
      this.delimiter = delimiter;
    }
    // with the default schema the body is always a single string array; a mapping is only
    // meaningful when an explicit schema of named fields was supplied.
    if (!getDefaultSchema().equals(schema)) {
      String mapping = settings.get(MAPPING);
      if (mapping != null) {
        // with a mapping there is no trailing-string-array exception: every field must be
        // a simple (or nullable simple) type. Validate before installing the record maker.
        for (Schema.Field field : schema.getFields()) {
          if (!field.getSchema().isSimpleOrNullableSimple()) {
            throw new IllegalArgumentException(
              String.format("only simple types allowed (field '%s') when the '%s' setting is present",
                            field.getName(), MAPPING));
          }
        }
        this.recordMaker = new MappedSchemaRecordMaker(parseMapping(mapping, schema));
      } else {
        this.recordMaker = new SchemaRecordMaker();
      }
    } else {
      this.recordMaker = new DefaultRecordMaker();
    }
  }

  /**
   * Checks whether the given schema is an array of strings or an array of nullable strings.
   * The array itself can also be nullable.
   */
  private static boolean isStringArray(Schema schema) {
    Schema arrSchema = schema.isNullable() ? schema.getNonNullable() : schema;
    if (arrSchema.getType() == Schema.Type.ARRAY) {
      Schema componentSchema = arrSchema.getComponentSchema();
      if (componentSchema.isNullable()) {
        return componentSchema.getNonNullable().getType() == Schema.Type.STRING;
      } else {
        return componentSchema.getType() == Schema.Type.STRING;
      }
    }
    return false;
  }

  /**
   * Interprets an empty body field as a null when the target column is not a plain string type.
   *
   * NOTE(review): a nullable string field is a union, so its top-level type is UNION, not STRING;
   * an empty value for a nullable string therefore also becomes null rather than "". This matches
   * the original behavior and is preserved here — confirm it is intended before changing.
   */
  private static String parseBodyValue(String val, Schema fieldSchema) {
    // if the body field is an empty string and the column is not a string type, interpret it as a null.
    if (val != null && val.isEmpty() && (fieldSchema.getType() != Schema.Type.STRING)) {
      return null;
    }
    return val;
  }

  /**
   * Parses the "mapping" setting ("index0:field0,index1:field1,..") into a field-name to
   * body-index map, validating it against the schema.
   *
   * @throws IllegalArgumentException if the mapping is empty, larger than the schema, references
   *         a field missing from the schema, or contains a key that is not a non-negative integer
   */
  private Map<String, Integer> parseMapping(String mappingString, Schema schema) {
    Splitter.MapSplitter splitter = Splitter.on(",").trimResults().withKeyValueSeparator(":");
    Map<String, String> stringMapping = splitter.split(mappingString);
    Preconditions.checkArgument(stringMapping.size() >= 1, "mapping cannot be empty");
    Preconditions.checkArgument(stringMapping.size() <= schema.getFields().size(),
                                "mapping cannot contain more entries than schema fields");
    Map<String, Integer> mapping = Maps.newHashMap();
    for (Map.Entry<String, String> entry : stringMapping.entrySet()) {
      String fieldIndexString = entry.getKey();
      String fieldName = entry.getValue();
      Preconditions.checkArgument(schema.getField(fieldName) != null,
                                  "schema is missing the mapped field " + fieldName);
      try {
        int fieldIndex = Integer.parseInt(fieldIndexString);
        // fail fast here instead of an IndexOutOfBoundsException on every event at read time
        Preconditions.checkArgument(fieldIndex >= 0,
                                    "mapping indices must be non-negative, got " + fieldIndex);
        mapping.put(fieldName, fieldIndex);
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("mapping keys must be integer indices");
      }
    }
    return mapping;
  }

  /**
   * Makes a {@link StructuredRecord} in {@link DelimitedStringsRecordFormat#read(StreamEvent)}.
   */
  private interface RecordMaker {
    StructuredRecord make(Schema schema, Iterator<String> bodyFields);
  }

  /**
   * {@link RecordMaker} that uses the default schema: the whole body becomes the "body"
   * field as a String[].
   */
  private static class DefaultRecordMaker implements RecordMaker {
    @Override
    public StructuredRecord make(Schema schema, Iterator<String> bodyFields) {
      StructuredRecord.Builder builder = StructuredRecord.builder(schema);
      List<String> fields = Lists.newArrayList(bodyFields);
      builder.set("body", fields.toArray(new String[fields.size()]));
      return builder.build();
    }
  }

  /**
   * {@link RecordMaker} that maps the i'th delimited value to the i'th schema field, with an
   * optional trailing string-array field that consumes all remaining values.
   */
  private static class SchemaRecordMaker implements RecordMaker {
    @Override
    public StructuredRecord make(Schema schema, Iterator<String> bodyFields) {
      StructuredRecord.Builder builder = StructuredRecord.builder(schema);
      Iterator<Schema.Field> fieldsIterator = schema.getFields().iterator();
      while (fieldsIterator.hasNext()) {
        Schema.Field field = fieldsIterator.next();
        Schema fieldSchema = field.getSchema();
        String fieldName = field.getName();
        if (isStringArray(fieldSchema)) {
          if (!fieldsIterator.hasNext()) {
            // only do varargs-style string array parsing on bodyField if it's the last field
            List<String> fields = Lists.newArrayList(bodyFields);
            builder.set(fieldName, fields.toArray(new String[fields.size()]));
          } else {
            throw new UnexpectedFormatException(
              String.format("string array type field '%s' must be the last schema field", fieldName));
          }
        } else {
          // simple type (not string array); missing trailing values are treated as null
          String bodyField = bodyFields.hasNext() ? bodyFields.next() : null;
          String val = parseBodyValue(bodyField, fieldSchema);
          builder.convertAndSet(fieldName, val);
        }
      }
      return builder.build();
    }
  }

  /**
   * {@link RecordMaker} that uses the "mapping" setting and a schema: each mapped schema field is
   * read from the body value at its mapped index; unmapped fields are left unset.
   */
  private static class MappedSchemaRecordMaker implements RecordMaker {
    private final Map<String, Integer> mapping;
    // highest body index referenced by the mapping; values past it never need to be split
    private final int lastMappingIndex;

    private MappedSchemaRecordMaker(Map<String, Integer> mapping) {
      this.mapping = mapping;
      this.lastMappingIndex = Collections.max(mapping.values());
    }

    @Override
    public StructuredRecord make(Schema schema, Iterator<String> bodyFields) {
      StructuredRecord.Builder builder = StructuredRecord.builder(schema);
      // TODO: only read what's necessary from event.getBody() (e.g. if mapping is "0:f0", then only read first entry)
      List<String> fields = Lists.newArrayList(Iterators.limit(bodyFields, lastMappingIndex + 1));
      for (Schema.Field field : schema.getFields()) {
        Schema fieldSchema = field.getSchema();
        String fieldName = field.getName();
        // the mapping may legitimately cover only a subset of the schema fields; using Integer
        // (not int) avoids an unboxing NullPointerException for unmapped fields
        Integer index = mapping.get(fieldName);
        if (index != null && index < fields.size()) {
          String val = parseBodyValue(fields.get(index), fieldSchema);
          builder.convertAndSet(fieldName, val);
        }
      }
      return builder.build();
    }
  }
}