/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.dq.testclasses;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.data.stream.StreamBatchReadable;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.api.stream.GenericStreamEventData;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
// ETLUtils.parseDuration is used below but was not imported; the ETL common package is assumed here.
import co.cask.cdap.etl.common.ETLUtils;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.io.LongWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * A {@link BatchSource} that reads from a {@link Stream}, allowing a stream to be used as a batch source.
 */
@SuppressWarnings("unused")
@Plugin(type = "batchsource")
@Name("Stream")
@Description("Batch source for a stream.")
public class StreamBatchSource extends BatchSource<LongWritable, Object, StructuredRecord> {

  private static final Logger LOG = LoggerFactory.getLogger(StreamBatchSource.class);
  private static final String FORMAT_SETTING_PREFIX = "format.setting.";
  private static final Schema DEFAULT_SCHEMA = Schema.recordOf(
    "event",
    Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))),
    Schema.Field.of("body", Schema.of(Schema.Type.BYTES))
  );
  private static final String NAME_DESCRIPTION = "Name of the stream. Must be a valid stream name. " +
    "If it doesn't exist, it will be created.";
  private static final String DURATION_DESCRIPTION = "Size of the time window to read with each run of the pipeline. " +
    "The format is expected to be a number followed by an 's', 'm', 'h', or 'd' specifying the time unit, with 's' " +
    "for seconds, 'm' for minutes, 'h' for hours, and 'd' for days. For example, a value of '5m' means each run of " +
    "the pipeline will read 5 minutes of events from the stream.";
  private static final String DELAY_DESCRIPTION = "Optional delay for reading stream events. The value must be " +
    "of the same format as the duration value. For example, a duration of '5m' and a delay of '10m' means each run " +
    "of the pipeline will read events from 15 minutes before its logical start time to 10 minutes before its " +
    "logical start time. " +
    "The default value is 0.";
  private static final String FORMAT_DESCRIPTION = "Optional format of the stream. Any format supported by CDAP " +
    "is supported. For example, a value of 'csv' will attempt to parse stream events as comma-separated values. " +
    "If no format is given, event bodies will be treated as bytes, resulting in a three-field schema: " +
    "'ts' of type long, 'headers' of type map of string to string, and 'body' of type bytes.";
  private static final String SCHEMA_DESCRIPTION = "Optional schema for the body of stream events. Schema is used " +
    "in conjunction with format to parse stream events. Some formats, like the avro format, require a schema, " +
    "while others do not. The schema given is for the body of the stream, so the final schema of records output " +
    "by the source will contain an additional field named 'ts' for the timestamp and a field named 'headers' " +
    "for the headers as the first and second fields of the schema.";

  private StreamBatchConfig streamBatchConfig;

  // it's possible the input records could have different schemas, though that isn't the case today.
  private Map<Schema, Schema> schemaCache = Maps.newHashMap();

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    streamBatchConfig.validate();
    pipelineConfigurer.addStream(new Stream(streamBatchConfig.name));
  }

  @Override
  public void prepareRun(BatchSourceContext context) {
    long duration = ETLUtils.parseDuration(streamBatchConfig.duration);
    long delay = Strings.isNullOrEmpty(streamBatchConfig.delay) ? 0 : ETLUtils.parseDuration(streamBatchConfig.delay);
    // the read window ends 'delay' ms before the run's logical start time and spans 'duration' ms
    long endTime = context.getLogicalStartTime() - delay;
    long startTime = endTime - duration;

    LOG.info("Setting input to Stream: {}", streamBatchConfig.name);

    FormatSpecification formatSpec = streamBatchConfig.getFormatSpec();
    StreamBatchReadable stream;
    if (formatSpec == null) {
      stream = new StreamBatchReadable(streamBatchConfig.name, startTime, endTime);
    } else {
      stream = new StreamBatchReadable(streamBatchConfig.name, startTime, endTime, formatSpec);
    }
    context.setInput(stream);
  }

  @Override
  public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    // if no format spec was given, the value is a StreamEvent
    if (Strings.isNullOrEmpty(streamBatchConfig.format)) {
      StreamEvent event = (StreamEvent) input.getValue();
      Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
      StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA)
        .set("ts", input.getKey().get())
        .set("headers", headers)
        .set("body", event.getBody())
        .build();
      emitter.emit(output);
    } else {
      // otherwise, it will be a GenericStreamEventData
      @SuppressWarnings("unchecked")
      GenericStreamEventData<StructuredRecord> event = (GenericStreamEventData<StructuredRecord>) input.getValue();
      StructuredRecord record = event.getBody();
      Schema inputSchema = record.getSchema();

      Schema outputSchema = schemaCache.get(inputSchema);
      // if we haven't seen this schema before, generate the output schema (add ts and header fields)
      if (outputSchema == null) {
        List<Schema.Field> fields = Lists.newArrayList();
        fields.add(DEFAULT_SCHEMA.getField("ts"));
        fields.add(DEFAULT_SCHEMA.getField("headers"));
        fields.addAll(inputSchema.getFields());
        outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
        schemaCache.put(inputSchema, outputSchema);
      }

      // easier to just deal with an empty map than deal with nullables, so the headers field is non-nullable.
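      // build the output record: ts and headers first, then every field from the parsed event body.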
      Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
      StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
      builder.set("ts", input.getKey().get());
      builder.set("headers", headers);

      for (Schema.Field field : inputSchema.getFields()) {
        String fieldName = field.getName();
        builder.set(fieldName, record.get(fieldName));
      }
      emitter.emit(builder.build());
    }
  }

  /**
   * {@link PluginConfig} class for {@link StreamBatchSource}.
   */
  public static class StreamBatchConfig extends PluginConfig {

    @Description(NAME_DESCRIPTION)
    private String name;

    @Description(DURATION_DESCRIPTION)
    private String duration;

    @Description(DELAY_DESCRIPTION)
    @Nullable
    private String delay;

    @Description(FORMAT_DESCRIPTION)
    @Nullable
    private String format;

    @Description(SCHEMA_DESCRIPTION)
    @Nullable
    private String schema;

    private void validate() {
      // check the schema if there is one
      if (!Strings.isNullOrEmpty(schema)) {
        parseSchema();
      }
      // check duration and delay
      long durationInMs = ETLUtils.parseDuration(duration);
      Preconditions.checkArgument(durationInMs > 0, "Duration must be greater than 0");
      if (!Strings.isNullOrEmpty(delay)) {
        ETLUtils.parseDuration(delay);
      }
    }

    private FormatSpecification getFormatSpec() {
      FormatSpecification formatSpec = null;
      if (!Strings.isNullOrEmpty(format)) {
        // try to parse the schema if there is one
        Schema schemaObj = parseSchema();
        // strip the 'format.setting.' prefix from any properties and pass them as settings in the format spec
        ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
        for (Map.Entry<String, String> entry : getProperties().getProperties().entrySet()) {
          if (entry.getKey().startsWith(FORMAT_SETTING_PREFIX)) {
            String key = entry.getKey();
            builder.put(key.substring(FORMAT_SETTING_PREFIX.length()), entry.getValue());
          }
        }
        formatSpec = new FormatSpecification(format, schemaObj, builder.build());
      }
      return formatSpec;
    }

    private Schema parseSchema() {
      try {
        return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
      } catch (IOException e) {
        throw new IllegalArgumentException("Invalid schema: " + e.getMessage());
      }
    }
  }
}
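
// Example plugin properties for this source, sketched from the @Description texts above.
// The stream name 'purchases' and the 'charset' format setting are hypothetical values for
// illustration, not settings confirmed by this file:
//
//   name = purchases                 // stream to read; created if it does not already exist
//   duration = 5m                    // each run reads a 5-minute window of events
//   delay = 10m                      // window ends 10 minutes before the run's logical start time
//   format = csv                     // parse event bodies as comma-separated values
//   format.setting.charset = utf-8   // prefix is stripped; 'charset' is passed to the format spec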