/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.copy.recovery; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.concurrent.TimeUnit; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.UserGroupInformation; import com.google.common.base.Optional; import com.google.common.base.Predicate; import gobblin.configuration.State; import gobblin.data.management.copy.CopySource; import gobblin.data.management.copy.CopyEntity; import gobblin.data.management.copy.CopyableFile; import gobblin.util.guid.Guid; /** * Helper class for distcp work unit recovery. */ @Slf4j public class RecoveryHelper { public static final String PERSIST_DIR_KEY = "distcp.persist.dir"; public static final String PERSIST_RETENTION_KEY = "distcp.persist.retention.hours"; public static final int DEFAULT_PERSIST_RETENTION = 24; private final FileSystem fs; private final Optional<Path> persistDir; private final int retentionHours; public RecoveryHelper(FileSystem fs, State state) throws IOException { this.fs = fs; this.persistDir = getPersistDir(state); this.retentionHours = state.getPropAsInt(PERSIST_RETENTION_KEY, DEFAULT_PERSIST_RETENTION); } /** * Get the persist directory for this job. * @param state {@link State} containing job information. * @return A {@link Path} used as persist directory for this job. Note this path is user-specific for security reasons. * @throws java.io.IOException */ public static Optional<Path> getPersistDir(State state) throws IOException { if (state.contains(PERSIST_DIR_KEY)) { return Optional .of(new Path(state.getProp(PERSIST_DIR_KEY), UserGroupInformation.getCurrentUser().getShortUserName())); } return Optional.absent(); } /** * Moves a copied path into a persistent location managed by gobblin-distcp. This method is used when an already * copied file cannot be successfully published. In future runs, instead of re-copying the file, distcp will use the * persisted file. * * @param state {@link State} containing job information. * @param file {@link gobblin.data.management.copy.CopyEntity} from which input {@link Path} originated. * @param path {@link Path} to persist. * @return true if persist was successful. * @throws IOException */ public boolean persistFile(State state, CopyableFile file, Path path) throws IOException { if (!this.persistDir.isPresent()) { return false; } String guid = computeGuid(state, file); StringBuilder nameBuilder = new StringBuilder(guid); nameBuilder.append("_"); nameBuilder.append(shortenPathName(file.getOrigin().getPath(), 250 - nameBuilder.length())); if (!this.fs.exists(this.persistDir.get())) { this.fs.mkdirs(this.persistDir.get(), new FsPermission(FsAction.ALL, FsAction.READ, FsAction.NONE)); } Path targetPath = new Path(this.persistDir.get(), nameBuilder.toString()); log.info(String.format("Persisting file %s with guid %s to location %s.", path, guid, targetPath)); if (this.fs.rename(path, targetPath)) { this.fs.setTimes(targetPath, System.currentTimeMillis(), -1); return true; } return false; } /** * Searches the persist directory to find {@link Path}s matching the input {@link gobblin.data.management.copy.CopyEntity}. * @param state {@link State} containing job information. * @param file {@link gobblin.data.management.copy.CopyEntity} for which persisted {@link Path}s should be found. * @param filter {@link com.google.common.base.Predicate} used to filter found paths. * @return Optionally, a {@link Path} in the {@link FileSystem} that is the desired copy of the {@link gobblin.data.management.copy.CopyEntity}. * @throws IOException */ public Optional<FileStatus> findPersistedFile(State state, CopyEntity file, Predicate<FileStatus> filter) throws IOException { if (!this.persistDir.isPresent() || !this.fs.exists(this.persistDir.get())) { return Optional.absent(); } Path glob = new Path(this.persistDir.get(), computeGuid(state, file) + "_*"); for (FileStatus fileStatus : this.fs.globStatus(glob)) { if (filter.apply(fileStatus)) { return Optional.of(fileStatus); } } return Optional.absent(); } /** * Delete all persisted files older than the number of hours set by {@link #PERSIST_RETENTION_KEY}. * @throws IOException */ public void purgeOldPersistedFile() throws IOException { if (!this.persistDir.isPresent() || !this.fs.exists(this.persistDir.get())) { log.info("No persist directory to clean."); return; } long retentionMillis = TimeUnit.HOURS.toMillis(this.retentionHours); long now = System.currentTimeMillis(); for (FileStatus fileStatus : this.fs.listStatus(this.persistDir.get())) { if (now - fileStatus.getModificationTime() > retentionMillis) { if (!this.fs.delete(fileStatus.getPath(), true)) { log.warn("Failed to delete path " + fileStatus.getPath()); } } } } /** * Shorten an absolute path into a sanitized String of length at most bytes. This is useful for including a summary * of an absolute path in a file name. * * <p> * For example: shortenPathName("/user/gobblin/foo/bar/myFile.txt", 25) will be shortened to "_user_gobbl..._myFile.txt". * </p> * * @param path absolute {@link Path} to shorten. * @param bytes max number of UTF8 bytes that output string can use (note that, * for now, it is assumed that each character uses exactly one byte). * @return a shortened, sanitized String of length at most bytes. */ static String shortenPathName(Path path, int bytes) { String pathString = path.toUri().getPath(); String replaced = pathString.replace("/", "_"); if (replaced.length() <= bytes) { return replaced; } int bytesPerHalf = (bytes - 3) / 2; return replaced.substring(0, bytesPerHalf) + "..." + replaced.substring(replaced.length() - bytesPerHalf); } private static String computeGuid(State state, CopyEntity file) throws IOException { Optional<Guid> stateGuid = CopySource.getWorkUnitGuid(state); if (stateGuid.isPresent()) { return Guid.combine(file.guid(), stateGuid.get()).toString(); } throw new IOException("State does not contain a guid."); } }