/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.writer.partitioner; import java.util.Collections; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; import org.apache.avro.SchemaBuilder.FieldAssembler; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.commons.lang3.StringUtils; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import com.google.common.base.Enums; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import gobblin.configuration.ConfigurationKeys; import gobblin.configuration.State; import gobblin.util.DatePartitionType; import gobblin.util.ForkOperatorUtils; /** * A {@link WriterPartitioner} that partitions a record based on a timestamp. * * There are two ways to partition a timestamp: (1) specify a {@link DateTimeFormat} using * {@link #WRITER_PARTITION_PATTERN}, e.g., 'yyyy/MM/dd/HH'; (2) specify a * {@link DatePartitionType} using {@link #WRITER_PARTITION_GRANULARITY}. * * A prefix and a suffix can be added to the partition, e.g., the partition path can be * 'prefix/2015/11/05/suffix'. * * @author Ziyang Liu */ public abstract class TimeBasedWriterPartitioner<D> implements WriterPartitioner<D> { public static final String WRITER_PARTITION_PREFIX = ConfigurationKeys.WRITER_PREFIX + ".partition.prefix"; public static final String WRITER_PARTITION_SUFFIX = ConfigurationKeys.WRITER_PREFIX + ".partition.suffix"; public static final String WRITER_PARTITION_PATTERN = ConfigurationKeys.WRITER_PREFIX + ".partition.pattern"; public static final String WRITER_PARTITION_TIMEZONE = ConfigurationKeys.WRITER_PREFIX + ".partition.timezone"; public static final String DEFAULT_WRITER_PARTITION_TIMEZONE = ConfigurationKeys.PST_TIMEZONE_NAME; public static final String WRITER_PARTITION_GRANULARITY = ConfigurationKeys.WRITER_PREFIX + ".partition.granularity"; public static final DatePartitionType DEFAULT_WRITER_PARTITION_GRANULARITY = DatePartitionType.HOUR; public static final String PARTITIONED_PATH = "partitionedPath"; public static final String PREFIX = "prefix"; public static final String SUFFIX = "suffix"; private final String writerPartitionPrefix; private final String writerPartitionSuffix; private final DatePartitionType granularity; private final DateTimeZone timeZone; private final Optional<DateTimeFormatter> timestampToPathFormatter; private final Schema schema; public TimeBasedWriterPartitioner(State state, int numBranches, int branchId) { this.writerPartitionPrefix = getWriterPartitionPrefix(state, numBranches, branchId); this.writerPartitionSuffix = getWriterPartitionSuffix(state, numBranches, branchId); this.granularity = getGranularity(state, numBranches, branchId); this.timeZone = getTimeZone(state, numBranches, branchId); this.timestampToPathFormatter = getTimestampToPathFormatter(state, numBranches, branchId); this.schema = getSchema(); } private static String getWriterPartitionPrefix(State state, int numBranches, int branchId) { String propName = ForkOperatorUtils.getPropertyNameForBranch(WRITER_PARTITION_PREFIX, numBranches, branchId); return state.getProp(propName, StringUtils.EMPTY); } private static String getWriterPartitionSuffix(State state, int numBranches, int branchId) { String propName = ForkOperatorUtils.getPropertyNameForBranch(WRITER_PARTITION_SUFFIX, numBranches, branchId); return state.getProp(propName, StringUtils.EMPTY); } private static DatePartitionType getGranularity(State state, int numBranches, int branchId) { String propName = ForkOperatorUtils.getPropertyNameForBranch(WRITER_PARTITION_GRANULARITY, numBranches, branchId); String granularityValue = state.getProp(propName, DEFAULT_WRITER_PARTITION_GRANULARITY.toString()); Optional<DatePartitionType> granularity = Enums.getIfPresent(DatePartitionType.class, granularityValue.toUpperCase()); Preconditions.checkState(granularity.isPresent(), granularityValue + " is not a valid writer partition granularity"); return granularity.get(); } private Optional<DateTimeFormatter> getTimestampToPathFormatter(State state, int numBranches, int branchId) { String propName = ForkOperatorUtils.getPropertyNameForBranch(WRITER_PARTITION_PATTERN, numBranches, branchId); if (state.contains(propName)) { return Optional.of(DateTimeFormat.forPattern(state.getProp(propName)).withZone(this.timeZone)); } return Optional.absent(); } private static DateTimeZone getTimeZone(State state, int numBranches, int branchId) { String propName = ForkOperatorUtils.getPropertyNameForBranch(WRITER_PARTITION_TIMEZONE, numBranches, branchId); return DateTimeZone.forID(state.getProp(propName, DEFAULT_WRITER_PARTITION_TIMEZONE)); } private Schema getSchema() { if (this.timestampToPathFormatter.isPresent()) { return getDateTimeFormatBasedSchema(); } return getGranularityBasedSchema(); } @Override public Schema partitionSchema() { return this.schema; } @SuppressWarnings("fallthrough") @Override public GenericRecord partitionForRecord(D record) { long timestamp = getRecordTimestamp(record); GenericRecord partition = new GenericData.Record(this.schema); if (!Strings.isNullOrEmpty(this.writerPartitionPrefix)) { partition.put(PREFIX, this.writerPartitionPrefix); } if (!Strings.isNullOrEmpty(this.writerPartitionSuffix)) { partition.put(SUFFIX, this.writerPartitionSuffix); } if (this.timestampToPathFormatter.isPresent()) { String partitionedPath = getPartitionedPath(timestamp); partition.put(PARTITIONED_PATH, partitionedPath); } else { DateTime dateTime = new DateTime(timestamp, this.timeZone); partition.put(this.granularity.toString(), this.granularity.getField(dateTime)); } return partition; } private Schema getDateTimeFormatBasedSchema() { FieldAssembler<Schema> assembler = SchemaBuilder.record("GenericRecordTimePartition").namespace("gobblin.writer.partitioner").fields(); if (!Strings.isNullOrEmpty(this.writerPartitionPrefix)) { assembler = assembler.name(PREFIX).type(Schema.create(Schema.Type.STRING)).noDefault(); } assembler = assembler.name(PARTITIONED_PATH).type(Schema.create(Schema.Type.STRING)).noDefault(); if (!Strings.isNullOrEmpty(this.writerPartitionSuffix)) { assembler = assembler.name(SUFFIX).type(Schema.create(Schema.Type.STRING)).noDefault(); } return assembler.endRecord(); } @SuppressWarnings("fallthrough") private Schema getGranularityBasedSchema() { FieldAssembler<Schema> assembler = SchemaBuilder.record("GenericRecordTimePartition").namespace("gobblin.writer.partitioner").fields(); // Construct the fields in reverse order if (!Strings.isNullOrEmpty(this.writerPartitionSuffix)) { assembler = assembler.name(SUFFIX).type(Schema.create(Schema.Type.STRING)).noDefault(); } assembler = assembler.name(this.granularity.toString()).type(Schema.create(Schema.Type.STRING)).noDefault(); if (!Strings.isNullOrEmpty(this.writerPartitionPrefix)) { assembler = assembler.name(PREFIX).type(Schema.create(Schema.Type.STRING)).noDefault(); } Schema schema = assembler.endRecord(); Collections.reverse(schema.getFields()); return schema; } private String getPartitionedPath(long timestamp) { return this.timestampToPathFormatter.get().print(timestamp); } public abstract long getRecordTimestamp(D record); }