/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.version.finder; import java.util.Properties; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import gobblin.configuration.ConfigurationKeys; import gobblin.data.management.version.FileStatusTimestampedDatasetVersion; import gobblin.data.management.version.FileSystemDatasetVersion; import gobblin.data.management.version.TimestampedDatasetVersion; /** * {@link gobblin.data.management.version.finder.DatasetVersionFinder} for datasets based on path timestamps. * Uses a datetime pattern to find dataset versions from the dataset path * and parse the {@link org.joda.time.DateTime} representing the version. */ public class DateTimeDatasetVersionFinder extends AbstractDatasetVersionFinder<TimestampedDatasetVersion> { private static final Logger LOGGER = LoggerFactory.getLogger(DateTimeDatasetVersionFinder.class); /** * Date pattern of the partition. E.g. yyyy/MM/dd/hh/mm or yyyy/MM/dd */ public static final String DATE_TIME_PATTERN_KEY = "version.datetime.pattern"; /** * Time zone to be used E.g. UTC */ public static final String DATE_TIME_PATTERN_TIMEZONE_KEY = "version.datetime.timezone"; /** * By default the globPattern is bbtained by replacing all non-slash characters in datetime pattern by *. * E.g. yyyy/MM/dd/hh/mm -> *\/*\/*\/*\/*. * If this key is set, we use this globPatter to search for version */ public static final String OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY = "version.globPattern"; public static final String DEFAULT_DATE_TIME_PATTERN_TIMEZONE = ConfigurationKeys.PST_TIMEZONE_NAME; private final Path globPattern; protected final DateTimeFormatter formatter; private final String datePartitionPattern; public DateTimeDatasetVersionFinder(FileSystem fs, Config config) { super(fs); Preconditions.checkArgument(config.hasPath(DATE_TIME_PATTERN_KEY) , "Missing required property " + DATE_TIME_PATTERN_KEY); String pattern = config.getString(DATE_TIME_PATTERN_KEY); if (config.hasPath(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY)) { this.globPattern = new Path(config.getString(OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY)); } else { this.globPattern = new Path(pattern.replaceAll("[^/]+", "*")); } LOGGER.debug(String.format("Setting timezone for patthern: %s. By default it is %s", pattern, DEFAULT_DATE_TIME_PATTERN_TIMEZONE)); if (config.hasPath(DATE_TIME_PATTERN_TIMEZONE_KEY)) { this.formatter = DateTimeFormat.forPattern(pattern).withZone( DateTimeZone.forID(config.getString(DATE_TIME_PATTERN_TIMEZONE_KEY))); } else { this.formatter = DateTimeFormat.forPattern(pattern).withZone(DateTimeZone.forID(DEFAULT_DATE_TIME_PATTERN_TIMEZONE)); } this.datePartitionPattern = pattern; } public DateTimeDatasetVersionFinder(FileSystem fs, Properties props) { this(fs, ConfigFactory.parseProperties(props)); } @Override public Class<? extends FileSystemDatasetVersion> versionClass() { return TimestampedDatasetVersion.class; } /** * Obtained by replacing all non-slash characters in datetime pattern by *. * E.g. yyyy/MM/dd/hh/mm -> *\/*\/*\/*\/* * Or glob pattern at {@value #OPTIONAL_GLOB_PATTERN_TIMEZONE_KEY} if set. */ @Override public Path globVersionPattern() { return this.globPattern; } /** * Parse {@link org.joda.time.DateTime} from {@link org.apache.hadoop.fs.Path} using datetime pattern. */ @Override public TimestampedDatasetVersion getDatasetVersion(Path pathRelativeToDatasetRoot, FileStatus versionFileStatus) { String dateTimeString = null; try { // pathRelativeToDatasetRoot can be daily/2016/03/02 or 2016/03/02. In either case we need to pick 2016/03/02 as version dateTimeString = StringUtils.substring(pathRelativeToDatasetRoot.toString(), pathRelativeToDatasetRoot.toString().length() - this.datePartitionPattern.length()); return new FileStatusTimestampedDatasetVersion(this.formatter.parseDateTime(dateTimeString), versionFileStatus); } catch (IllegalArgumentException exception) { LOGGER.warn(String.format( "Candidate dataset version with pathRelativeToDatasetRoot: %s has inferred dataTimeString:%s. " + "It does not match expected datetime pattern %s. Ignoring.", pathRelativeToDatasetRoot, dateTimeString, this.datePartitionPattern)); return null; } } }