/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.source;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.DurationFieldType;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.source.extractor.filebased.FileBasedHelperException;
import gobblin.source.extractor.hadoop.HadoopFsHelper;
import gobblin.util.DatePartitionType;

import static gobblin.source.PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN;


/**
 * A {@link PartitionAwareFileRetriever} that is optimized for nested directory structures where data is dumped on a
 * regular basis and most data has likely already been processed by Gobblin.
 *
 * For example, if {@link ConfigurationKeys#SOURCE_FILEBASED_DATA_DIRECTORY} is set to /my/data/, the class assumes
 * folders following the pattern /my/data/daily/[year]/[month]/[day] are present. It iterates through all the data
 * under these folders starting from the date specified by
 * {@link PartitionedFileSourceBase#DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE} until either
 * {@link PartitionedFileSourceBase#DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB} files have been processed or there is
 * no more data to process. For example, if {@link PartitionedFileSourceBase#DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE}
 * is set to 2015/01/01, the job will read from the folders /my/data/daily/2015/01/01/, /my/data/daily/2015/01/02/,
 * /my/data/daily/2015/01/03/, etc.
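 *
 * <p>A minimal usage sketch, assuming the default daily granularity. This retriever is normally driven by a
 * {@link PartitionedFileSourceBase} subclass rather than called directly, and the contents of {@code state} below
 * (source directory, prefix/suffix, time zone) are illustrative:
 *
 * <pre>{@code
 *   PartitionAwareFileRetriever retriever = new DatePartitionedNestedRetriever(".avro");
 *   retriever.init(state); // state is a SourceState carrying the source.filebased.* properties
 *   long minWatermark = retriever.getWatermarkFromString("2015/01/01");
 *   List<FileInfo> files = retriever.getFilesToProcess(minWatermark, 1000);
 * }</pre>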
 */
public class DatePartitionedNestedRetriever implements PartitionAwareFileRetriever {

  private static final Logger LOG = LoggerFactory.getLogger(DatePartitionedNestedRetriever.class);

  private DateTimeFormatter partitionPatternFormatter;
  private DurationFieldType incrementalUnit;
  private String sourcePartitionPrefix;
  private String sourcePartitionSuffix;
  private Path sourceDir;
  private FileSystem fs;
  private HadoopFsHelper helper;
  private final String expectedExtension;

  public DatePartitionedNestedRetriever(String expectedExtension) {
    this.expectedExtension = expectedExtension;
  }

  @Override
  public void init(SourceState state) {
    // Note: this sets the JVM-wide default time zone for Joda-Time, not just for this instance.
    DateTimeZone.setDefault(DateTimeZone
        .forID(state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE)));

    initDatePartition(state);
    this.sourcePartitionPrefix =
        state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PREFIX, StringUtils.EMPTY);
    this.sourcePartitionSuffix =
        state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_SUFFIX, StringUtils.EMPTY);
    this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
    this.helper = new HadoopFsHelper(state);
  }
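  /**
   * Walks the date partitions from {@code minWatermark} up to the current time, one partition unit at a time,
   * collecting the matching files in each partition folder. Note that the {@code maxFilesToReturn} limit is checked
   * once per partition, so the returned list may exceed it by up to one partition's worth of files.
   */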
  @Override
  public List<FileInfo> getFilesToProcess(long minWatermark, int maxFilesToReturn)
      throws IOException {
    DateTime currentDay = new DateTime();
    DateTime lowWaterMarkDate = new DateTime(minWatermark);
    List<FileInfo> filesToProcess = new ArrayList<>();

    try {
      helper.connect();
      this.fs = helper.getFileSystem();
    } catch (FileBasedHelperException e) {
      throw new IOException("Error initializing FileSystem", e);
    }

    for (DateTime date = lowWaterMarkDate; !date.isAfter(currentDay)
        && filesToProcess.size() < maxFilesToReturn; date = date.withFieldAdded(incrementalUnit, 1)) {

      // Constructs the partition folder path - e.g. /my/data/prefix/2015/01/01/suffix
      Path sourcePath = constructSourcePath(date);

      if (this.fs.exists(sourcePath)) {
        for (FileStatus fileStatus : this.fs.listStatus(sourcePath, getFileFilter())) {
          LOG.info("Will process file " + fileStatus.getPath());
          filesToProcess.add(new FileInfo(fileStatus.getPath().toString(), fileStatus.getLen(), date.getMillis()));
        }
      }
    }

    return filesToProcess;
  }

  @Override
  public long getWatermarkFromString(String lowWaterMark) {
    return this.partitionPatternFormatter.parseMillis(lowWaterMark);
  }

  @Override
  public long getWatermarkIncrementMs() {
    // One partition unit (e.g. one day), expressed as milliseconds from the epoch.
    return new DateTime(0).withFieldAdded(this.incrementalUnit, 1).getMillis();
  }

  private void initDatePartition(State state) {
    // Prefer an explicit partition pattern; fall back to the configured (or default) granularity.
    initDatePartitionFromPattern(state);
    if (this.partitionPatternFormatter == null) {
      initDatePartitionFromGranularity(state);
    }
  }

  private void initDatePartitionFromPattern(State state) {
    String partitionPattern = null;
    try {
      partitionPattern = state.getProp(DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
      if (partitionPattern != null) {
        this.partitionPatternFormatter =
            DateTimeFormat.forPattern(partitionPattern).withZone(DateTimeZone.getDefault());
        this.incrementalUnit = DatePartitionType.getLowestIntervalUnit(partitionPattern).getDurationType();
      }
    } catch (Exception e) {
      throw new IllegalArgumentException("Invalid source partition pattern: " + partitionPattern, e);
    }
  }

  private void initDatePartitionFromGranularity(State state) {
    String granularityProp = state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_GRANULARITY);
    DatePartitionType partitionType = null;
    if (granularityProp == null) {
      partitionType = PartitionedFileSourceBase.DEFAULT_DATE_PARTITIONED_SOURCE_PARTITION_GRANULARITY;
    } else {
      Optional<DatePartitionType> partitionTypeOpt =
          Enums.getIfPresent(DatePartitionType.class, granularityProp.toUpperCase());
      Preconditions.checkState(partitionTypeOpt.isPresent(),
          "Invalid source partition granularity: " + granularityProp);
      partitionType = partitionTypeOpt.get();
    }
    this.partitionPatternFormatter = DateTimeFormat.forPattern(partitionType.getDateTimePattern());
    this.incrementalUnit = partitionType.getDateTimeFieldType().getDurationType();
  }

  private Path constructSourcePath(DateTime date) {
    StringBuilder pathBuilder = new StringBuilder();

    if (!this.sourcePartitionPrefix.isEmpty()) {
      pathBuilder.append(this.sourcePartitionPrefix);
      pathBuilder.append(Path.SEPARATOR);
    }

    pathBuilder.append(this.partitionPatternFormatter.print(date));

    if (!this.sourcePartitionSuffix.isEmpty()) {
      pathBuilder.append(Path.SEPARATOR);
      pathBuilder.append(this.sourcePartitionSuffix);
    }

    return new Path(this.sourceDir, pathBuilder.toString());
  }

  /**
   * Builds a {@link PathFilter} that accepts only files whose names end with the expected extension.
   * @return the PathFilter
   */
  private PathFilter getFileFilter() {
    final String extension =
        this.expectedExtension.startsWith(".") ? this.expectedExtension : "." + this.expectedExtension;

    return new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().endsWith(extension);
      }
    };
  }
}