/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.conversion.hive.provider; import java.io.IOException; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import lombok.AllArgsConstructor; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import gobblin.hive.HivePartition; import gobblin.hive.HiveTable; /** * Uses the file modification time of the data location of a {@link HiveTable} or {@link HivePartition} on HDFS */ @AllArgsConstructor public class HdfsBasedUpdateProvider implements HiveUnitUpdateProvider { private final FileSystem fs; // Cache modification times of data location to reduce the number of HDFS calls private static final Cache<Path, Long> PATH_TO_MOD_TIME_CACHE = CacheBuilder.newBuilder().maximumSize(2000).build(); /** * Get the update time of a {@link Partition} * * @return the update time if available, 0 otherwise * * {@inheritDoc} * @see HiveUnitUpdateProvider#getUpdateTime(org.apache.hadoop.hive.ql.metadata.Partition) */ @Override public long getUpdateTime(Partition partition) throws UpdateNotFoundException { try { return getUpdateTime(partition.getDataLocation()); } catch (IOException e) { throw new UpdateNotFoundException(String.format("Failed to get update time for %s", partition.getCompleteName()), e); } } /** * Get the update time of a {@link Table} * @return the update time if available, 0 otherwise * * {@inheritDoc} * @see HiveUnitUpdateProvider#getUpdateTime(org.apache.hadoop.hive.ql.metadata.Table) */ @Override public long getUpdateTime(Table table) throws UpdateNotFoundException { try { return getUpdateTime(table.getDataLocation()); } catch (IOException e) { throw new UpdateNotFoundException(String.format("Failed to get update time for %s.", table.getCompleteName()), e); } } private long getUpdateTime(final Path path) throws IOException, UpdateNotFoundException { try { return PATH_TO_MOD_TIME_CACHE.get(path, new Callable<Long>() { @Override public Long call() throws Exception { if (HdfsBasedUpdateProvider.this.fs.exists(path)) { return HdfsBasedUpdateProvider.this.fs.getFileStatus(path).getModificationTime(); } throw new UpdateNotFoundException(String.format("Data file does not exist at path %s", path)); } }); } catch (Exception e) { throw new IOException(e); } } }