/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util.logs; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Function; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Splitter; import com.google.common.base.Stopwatch; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; import com.google.common.io.Closer; import com.google.common.io.Files; import com.google.common.util.concurrent.AbstractScheduledService; import gobblin.configuration.ConfigurationKeys; import gobblin.util.concurrent.ScheduledTask; import gobblin.util.concurrent.TaskScheduler; import gobblin.util.concurrent.TaskSchedulerFactory; import gobblin.util.DatasetFilterUtils; import gobblin.util.FileListUtils; import gobblin.util.HadoopUtils; /** * A utility class that periodically reads log files in a source log file directory for changes * since the last reads and appends the changes to destination log files with the same names as * the source log files in a destination log directory. The source and destination log files * can be on different {@link FileSystem}s. * * <p> * This class extends the {@link AbstractScheduledService} so it can be used with a * {@link com.google.common.util.concurrent.ServiceManager} that manages the lifecycle of * a {@link LogCopier}. * </p> * * <p> * This class is intended to be used in the following pattern: * * <pre> * {@code * LogCopier.Builder logCopierBuilder = LogCopier.newBuilder(); * LogCopier logCopier = logCopierBuilder * .useSrcFileSystem(FileSystem.getLocal(new Configuration())) * .useDestFileSystem(FileSystem.get(URI.create(destFsUri), new Configuration())) * .readFrom(new Path(srcLogDir)) * .writeTo(new Path(destLogDir)) * .useInitialDelay(60) * .useCopyInterval(60) * .useMaxMinutesPerLogFile(60) * .useMaxBytesPerLogFile(1024 * 1024) * .useTimeUnit(TimeUnit.SECONDS) * .build(); * * ServiceManager serviceManager = new ServiceManager(Lists.newArrayList(logCopier)); * serviceManager.startAsync(); * * // ... * serviceManager.stopAsync().awaitStopped(60, TimeUnit.SECONDS); * } * </pre> * * Checkout the Javadoc of {@link LogCopier.Builder} to see the available options for customization. * </p> * * @author Yinan Li */ public class LogCopier extends AbstractScheduledService { private static final Logger LOGGER = LoggerFactory.getLogger(LogCopier.class); private static final long DEFAULT_SOURCE_LOG_FILE_MONITOR_INTERVAL = 120; private static final long DEFAULT_LOG_COPY_INTERVAL_SECONDS = 60; private static final long DEFAULT_MAX_MINUTES_PER_LOG_FILE = Long.MAX_VALUE; private static final long DEFAULT_MAX_BYTES_PER_LOG_FILE = Long.MAX_VALUE; private static final TimeUnit DEFAULT_TIME_UNIT = TimeUnit.SECONDS; private static final int DEFAULT_LINES_WRITTEN_BEFORE_FLUSH = 100; private final FileSystem srcFs; private final FileSystem destFs; private final Path srcLogDir; private final Path destLogDir; private final long sourceLogFileMonitorInterval; private final long copyInterval; private final long maxMinutesPerLogFile; private final long maxBytesPerLogFile; private final TimeUnit timeUnit; private final Set<String> logFileExtensions; private final Optional<List<Pattern>> includingRegexPatterns; private final Optional<List<Pattern>> excludingRegexPatterns; private final Optional<String> logFileNamePrefix; private final int linesWrittenBeforeFlush; private final TaskScheduler<Path, LogCopyTask> scheduler; private LogCopier(Builder builder) { this.srcFs = builder.srcFs; this.destFs = builder.destFs; this.srcLogDir = this.srcFs.makeQualified(builder.srcLogDir); this.destLogDir = this.destFs.makeQualified(builder.destLogDir); this.sourceLogFileMonitorInterval = builder.sourceLogFileMonitorInterval; this.copyInterval = builder.copyInterval; this.maxMinutesPerLogFile = builder.maxMinutesPerLogFile; this.maxBytesPerLogFile = builder.maxBytesPerLogFile; this.timeUnit = builder.timeUnit; this.logFileExtensions = builder.logFileExtensions; this.includingRegexPatterns = Optional.fromNullable(builder.includingRegexPatterns); this.excludingRegexPatterns = Optional.fromNullable(builder.excludingRegexPatterns); this.logFileNamePrefix = Optional.fromNullable(builder.logFileNamePrefix); this.linesWrittenBeforeFlush = builder.linesWrittenBeforeFlush; this.scheduler = TaskSchedulerFactory.get(builder.schedulerName, Optional.<String> absent()); } @Override protected void shutDown() throws Exception { this.scheduler.close(); } @Override protected void runOneIteration() throws IOException { checkSrcLogFiles(); } @Override protected Scheduler scheduler() { return Scheduler.newFixedRateSchedule(0, this.sourceLogFileMonitorInterval, this.timeUnit); } /** * Perform a check on new source log files and submit copy tasks for new log files. */ private void checkSrcLogFiles() throws IOException { List<FileStatus> srcLogFiles = FileListUtils.listFilesRecursively(this.srcFs, this.srcLogDir, new PathFilter() { @Override public boolean accept(Path path) { return LogCopier.this.logFileExtensions.contains(Files.getFileExtension(path.getName())); } }); if (srcLogFiles.isEmpty()) { LOGGER.warn("No log file found under directory " + this.srcLogDir); return; } Set<Path> newLogFiles = Sets.newHashSet(); for (FileStatus srcLogFile : srcLogFiles) { newLogFiles.add(srcLogFile.getPath()); } HashSet<Path> deletedLogFiles = Sets.newHashSet(getSourceFiles()); // Compute the set of deleted log files since the last check deletedLogFiles.removeAll(newLogFiles); // Compute the set of new log files since the last check newLogFiles.removeAll(getSourceFiles()); // Schedule a copy task for each new log file for (final Path srcLogFile : newLogFiles) { String destLogFileName = this.logFileNamePrefix.isPresent() ? this.logFileNamePrefix.get() + "." + srcLogFile.getName() : srcLogFile.getName(); final Path destLogFile = new Path(this.destLogDir, destLogFileName); this.scheduler.schedule(new LogCopyTask(srcLogFile, destLogFile), this.copyInterval, this.timeUnit); } // Cancel the copy task for each deleted log file for (Path deletedLogFile : deletedLogFiles) { Optional<LogCopyTask> logCopyTask = this.scheduler.getScheduledTask(deletedLogFile); if (logCopyTask.isPresent()) { this.scheduler.cancel(logCopyTask.get()); } } } /** * Get a new {@link LogCopier.Builder} instance for building a {@link LogCopier}. * * @return a new {@link LogCopier.Builder} instance */ public static Builder newBuilder() { return new Builder(); } /** * A builder class for {@link LogCopier}. */ public static class Builder { private static final Splitter COMMA_SPLITTER = Splitter.on(',').omitEmptyStrings().trimResults(); private FileSystem srcFs; private Path srcLogDir; private FileSystem destFs; private Path destLogDir; private long sourceLogFileMonitorInterval = DEFAULT_SOURCE_LOG_FILE_MONITOR_INTERVAL; private long copyInterval = DEFAULT_LOG_COPY_INTERVAL_SECONDS; private long maxMinutesPerLogFile = DEFAULT_MAX_MINUTES_PER_LOG_FILE; private long maxBytesPerLogFile = DEFAULT_MAX_BYTES_PER_LOG_FILE; private TimeUnit timeUnit = DEFAULT_TIME_UNIT; private Set<String> logFileExtensions; private List<Pattern> includingRegexPatterns; private List<Pattern> excludingRegexPatterns; private String logFileNamePrefix; private int linesWrittenBeforeFlush = DEFAULT_LINES_WRITTEN_BEFORE_FLUSH; private String schedulerName = null; /** * Set the interval between two checks for the source log file monitor. * * @param sourceLogFileMonitorInterval the interval between two checks for the source log file monitor * @return this {@link LogCopier.Builder} instance */ public Builder useSourceLogFileMonitorInterval(long sourceLogFileMonitorInterval) { Preconditions.checkArgument(sourceLogFileMonitorInterval > 0, "Source log file monitor interval must be positive"); this.sourceLogFileMonitorInterval = sourceLogFileMonitorInterval; return this; } /** * Set the copy interval between two iterations of copies. * * @param copyInterval the copy interval between two iterations of copies * @return this {@link LogCopier.Builder} instance */ public Builder useCopyInterval(long copyInterval) { Preconditions.checkArgument(copyInterval > 0, "Copy interval must be positive"); this.copyInterval = copyInterval; return this; } /** * Set the max minutes per log file. * * @param maxMinutesPerLogFile the maximum minutes of logs a log file contains * @return this {@link LogCopier.Builder} instance */ public Builder useMaxMinutesPerLogFile(long maxMinutesPerLogFile) { Preconditions.checkArgument(maxMinutesPerLogFile > 0, "Max minutes per log file must be positive"); this.maxMinutesPerLogFile = maxMinutesPerLogFile; return this; } /** * Set the max bytes per log file. * * @param maxBytesPerLogFile the maximum bytes of a log file * @return this {@link LogCopier.Builder} instance */ public Builder useMaxBytesPerLogFile(long maxBytesPerLogFile) { Preconditions.checkArgument(maxBytesPerLogFile > 0, "Max bytes per log file must be positive"); this.maxBytesPerLogFile = maxBytesPerLogFile; return this; } /** * Set the {@link TimeUnit} used for the source log file monitor interval, initial delay, copy interval. * * @param timeUnit the {@link TimeUnit} used for the initial delay and copy interval * @return this {@link LogCopier.Builder} instance */ public Builder useTimeUnit(TimeUnit timeUnit) { Preconditions.checkNotNull(timeUnit); this.timeUnit = timeUnit; return this; } /** * Set the set of acceptable log file extensions. * * @param logFileExtensions the set of acceptable log file extensions * @return this {@link LogCopier.Builder} instance */ public Builder acceptsLogFileExtensions(Set<String> logFileExtensions) { Preconditions.checkNotNull(logFileExtensions); this.logFileExtensions = ImmutableSet.copyOf(logFileExtensions); return this; } /** * Set the regex patterns used to filter logs that should be copied. * * @param regexList a comma-separated list of regex patterns * @return this {@link LogCopier.Builder} instance */ public Builder useIncludingRegexPatterns(String regexList) { Preconditions.checkNotNull(regexList); this.includingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(COMMA_SPLITTER.splitToList(regexList)); return this; } /** * Set the regex patterns used to filter logs that should not be copied. * * @param regexList a comma-separated list of regex patterns * @return this {@link LogCopier.Builder} instance */ public Builder useExcludingRegexPatterns(String regexList) { Preconditions.checkNotNull(regexList); this.excludingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(COMMA_SPLITTER.splitToList(regexList)); return this; } /** * Set the source {@link FileSystem} for reading the source log file. * * @param srcFs the source {@link FileSystem} for reading the source log file * @return this {@link LogCopier.Builder} instance */ public Builder useSrcFileSystem(FileSystem srcFs) { Preconditions.checkNotNull(srcFs); this.srcFs = srcFs; return this; } /** * Set the destination {@link FileSystem} for writing the destination log file. * * @param destFs the destination {@link FileSystem} for writing the destination log file * @return this {@link LogCopier.Builder} instance */ public Builder useDestFileSystem(FileSystem destFs) { Preconditions.checkNotNull(destFs); this.destFs = destFs; return this; } /** * Set the path of the source log file directory to read from. * * @param srcLogDir the path of the source log file directory to read from * @return this {@link LogCopier.Builder} instance */ public Builder readFrom(Path srcLogDir) { Preconditions.checkNotNull(srcLogDir); this.srcLogDir = srcLogDir; return this; } /** * Set the path of the destination log file directory to write to. * * @param destLogDir the path of the destination log file directory to write to * @return this {@link LogCopier.Builder} instance */ public Builder writeTo(Path destLogDir) { Preconditions.checkNotNull(destLogDir); this.destLogDir = destLogDir; return this; } /** * Set the log file name prefix at the destination. * * @param logFileNamePrefix the log file name prefix at the destination * @return this {@link LogCopier.Builder} instance */ public Builder useLogFileNamePrefix(String logFileNamePrefix) { Preconditions.checkArgument(!Strings.isNullOrEmpty(logFileNamePrefix), "Invalid log file name prefix: " + logFileNamePrefix); this.logFileNamePrefix = logFileNamePrefix; return this; } /** * Set the number of lines written before they are flushed to disk. * * @param linesWrittenBeforeFlush the number of lines written before they are flushed to disk * @return this {@link LogCopier.Builder} instance */ public Builder useLinesWrittenBeforeFlush(int linesWrittenBeforeFlush) { Preconditions.checkArgument(linesWrittenBeforeFlush > 0, "The value specifying the lines to write before flush must be positive"); this.linesWrittenBeforeFlush = linesWrittenBeforeFlush; return this; } /** * Set the scheduler to use for scheduling copy tasks. * * @param schedulerName the name of the scheduler * @return this {@link LogCopier.Builder} instance */ public Builder useScheduler(String schedulerName) { Preconditions.checkArgument(!Strings.isNullOrEmpty(schedulerName), "Invalid scheduler name: " + schedulerName); this.schedulerName = schedulerName; return this; } /** * Build a new {@link LogCopier} instance. * * @return a new {@link LogCopier} instance */ public LogCopier build() { return new LogCopier(this); } } private ImmutableList<Path> getSourceFiles() { return ImmutableList .copyOf(Iterables.transform(this.scheduler.getScheduledTasks(), new Function<LogCopyTask, Path>() { @Override public Path apply(LogCopyTask input) { return input.getKey(); } })); } private class LogCopyTask implements ScheduledTask<Path> { private final Path srcLogFile; private final Path destLogFile; private final Stopwatch watch; // The task maintains the current source log file position itself private long currentPos = 0; public LogCopyTask(Path srcLogFile, Path destLogFile) { this.srcLogFile = srcLogFile; this.destLogFile = destLogFile; this.watch = Stopwatch.createStarted(); } @Override public Path getKey() { return this.srcLogFile; } @Override public void runOneIteration() { try { createNewLogFileIfNeeded(); LOGGER.debug(String.format("Copying changes from %s to %s", this.srcLogFile, this.destLogFile)); copyChangesOfLogFile(LogCopier.this.srcFs.makeQualified(this.srcLogFile), LogCopier.this.destFs.makeQualified(this.destLogFile)); } catch (IOException ioe) { LOGGER.error(String.format("Failed while copying logs from %s to %s", this.srcLogFile, this.destLogFile), ioe); } } private void createNewLogFileIfNeeded() throws IOException { if (LogCopier.this.destFs.exists(this.destLogFile) && (this.watch.elapsed(TimeUnit.MINUTES) > LogCopier.this.maxMinutesPerLogFile || LogCopier.this.destFs.getFileStatus(this.destLogFile).getLen() > LogCopier.this.maxBytesPerLogFile)) { HadoopUtils.renamePath(LogCopier.this.destFs, this.destLogFile, new Path(this.destLogFile.toString() + "." + System.currentTimeMillis())); this.watch.reset(); this.watch.start(); } } /** * Copy changes for a single log file. */ private void copyChangesOfLogFile(Path srcFile, Path destFile) throws IOException { if (!LogCopier.this.srcFs.exists(srcFile)) { LOGGER.warn("Source log file not found: " + srcFile); return; } // We need to use fsDataInputStream in the finally clause so it has to be defined outside try-catch-finally FSDataInputStream fsDataInputStream = null; try (Closer closer = Closer.create()) { fsDataInputStream = closer.register(LogCopier.this.srcFs.open(srcFile)); // Seek to the the most recent position if it is available LOGGER.debug(String.format("Reading log file %s from position %d", srcFile, this.currentPos)); fsDataInputStream.seek(this.currentPos); BufferedReader srcLogFileReader = closer.register( new BufferedReader(new InputStreamReader(fsDataInputStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING))); FSDataOutputStream outputStream = LogCopier.this.destFs.exists(destFile) ? LogCopier.this.destFs.append(destFile) : LogCopier.this.destFs.create(destFile); BufferedWriter destLogFileWriter = closer.register( new BufferedWriter(new OutputStreamWriter(outputStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING))); String line; int linesProcessed = 0; while (!Thread.currentThread().isInterrupted() && (line = srcLogFileReader.readLine()) != null) { if (!shouldCopyLine(line)) { continue; } destLogFileWriter.write(line); destLogFileWriter.newLine(); linesProcessed++; if (linesProcessed % LogCopier.this.linesWrittenBeforeFlush == 0) { destLogFileWriter.flush(); } } } finally { if (fsDataInputStream != null) { this.currentPos = fsDataInputStream.getPos(); } } } /** * Check if a log line should be copied. * * <p> * A line should be copied if and only if all of the following conditions satisfy: * * <ul> * <li> * It doesn't match any of the excluding regex patterns. If there's no excluding regex patterns, * this condition is considered satisfied. * </li> * <li> * It matches at least one of the including regex patterns. If there's no including regex patterns, * this condition is considered satisfied. * </li> * </ul> * </p> */ private boolean shouldCopyLine(String line) { boolean including = !LogCopier.this.includingRegexPatterns.isPresent() || DatasetFilterUtils.stringInPatterns(line, LogCopier.this.includingRegexPatterns.get()); boolean excluding = LogCopier.this.excludingRegexPatterns.isPresent() && DatasetFilterUtils.stringInPatterns(line, LogCopier.this.excludingRegexPatterns.get()); return !excluding && including; } } }