/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.task.hoover;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.text.SimpleDateFormat;
import com.addthis.basis.util.Backoff;
import com.addthis.basis.util.LessBytes;
import com.addthis.basis.util.LessFiles;
import com.addthis.basis.util.LessStrings;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.hydra.common.hash.PluggableHashFunction;
import com.addthis.hydra.data.util.DateUtil;
import com.addthis.hydra.task.run.TaskRunConfig;
import com.addthis.hydra.task.run.TaskRunnable;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This Hydra job is <span class="hydra-summary">a bulk file loader for Hydra clusters</span>.
* <p/>
* <p>The hoover job queries the servers specified by the
* {@link #hosts} parameter. The files on each remote server are listed using
* the file path specified by the {@link #path} parameter.
* Each file is tested to determine if it should be transferred onto the local server.
* The files that meet all the criteria are transferred to the local server into
* the path specified by the {@link #outDir} and {@link #pathOut} parameters.</p>
* Any files on the local server in output directory that are older
* than {@link #purgeAfterDays} days will be deleted. The file purge will use the
* UNIX file modification time to determine which files are purged. The file modification
* time may be different from the date and time that is extracted from the file name or path.
* <p/>
* <p>When a file is detected on a remote server then the date of the file is extracted from
* the file name or the file path. If the date lies outside of a specified date and time interval,
* then the file is not transferred. The date is extracted using either the file name (by default) or
* the file path (if the {@link #pathBasedDateMatching pathBasedDateMatching} parameter is true).
* The regular expression specified in the {@link #dateMatcher dateMatcher} parameter is used to retrieve
* the date from the file name or path. The date string is interpreted as a date and time using the
* DateTimeFormat format specified in the {@link #dateExtractor dateExtractor} parameter.
* <p>Each file is tested to determine whether it should be transferred from the remote server.
* The following criteria must be met:
* <ul>
* <li>the file name matches the {@link #match} regular expression (if 'match' is non-null),
* <li>the file lies within the start and end date time interval,
* <li>the hash of the file name is within the shard list,
* <li>the file was not already fetched in a previous run.
* </ul>
* <p>Example:</p>
* <pre>
* {hoover {
* user : "app",
* hosts : {
* "app1" : "web1.local",
* "app2" : "app2.local",
* },
* path : "/home/app/dat/out/history/*.gz",
* pathOut : ["{{YY}}","{{M}}","{{D}}","{{HOST}}-{{FILE}}"],
* dateMatcher : "([0-9]+)-[0-9]+.cnk.gz",
* dateExtractor : "yyMMdd",
* purgeAfterDays : 450,
* }</pre>
*
* @user-reference
*/
public class Hoover implements Runnable, TaskRunnable {
/** Class-wide SLF4J logger. */
private static final Logger log = LoggerFactory.getLogger(Hoover.class);
/**
 * Renders an extracted file date as "yyyyMMddHH". MarkFile slices the year, month,
 * day and hour substrings out of this rendering.
 * NOTE(review): SimpleDateFormat is not thread-safe and this instance is static --
 * confirm that only one thread formats with it at a time.
 */
private static final SimpleDateFormat dateOut = new SimpleDateFormat("yyyyMMddHH");
/**
* Mapping from server aliases to server hostnames.
* The server alias will be substituted in place of the variable {{HOST}}.
*/
@FieldConfig(codable = true)
private HashMap<String, String> hosts;
/**
* Mark file directory on the local machine. Default is "hoover.mark".
*/
@FieldConfig(codable = true)
private String markDir = "hoover.mark";
/**
* Output directory on the local machine. Default is "hoover.out".
*/
@FieldConfig(codable = true)
private String outDir = "hoover.out";
/**
* User name for accessing the remote servers.
*/
@FieldConfig(codable = true)
private String user;
/**
* Path on the remote servers for retrieving files.
* This is a Unix path, so glob-matching (wildcard-matching) is allowed.
*/
@FieldConfig(codable = true)
private String path;
/**
* If non-null then only retrieve the files
* whose file path matches this regular expression.
* Default is null.
*/
@FieldConfig(codable = true)
private String match;
/**
* Regular expression for extracting the date from the file name or path. Default is "(.*)".
*/
@FieldConfig(codable = true)
private String dateMatcher = "(.*)";
/**
* The <a href="http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html">DateTimeFormat</a>
* for each date that is extracted from either the file name or the file path. Default is "yyyy-MM-dd".
*/
@FieldConfig(codable = true)
private String dateExtractor = "yyyy-MM-dd";
/**
* Start date and time for filtering.
*/
@FieldConfig(codable = true)
private String startDate;
/**
* End date and time for filtering.
*/
@FieldConfig(codable = true)
private String endDate;
/**
* The <a href="http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html">DateTimeFormat</a>
* for the parameters 'startDate' and 'endDate'. Default is "yyyy-MM-dd-HH".
*/
@FieldConfig(codable = true)
private String startEndDateFormat = "yyyy-MM-dd-HH";
/**
* Execute this command to fetch the list of files on the remote machine.
* Default is ["ssh", "{{USER}}@{{HOST}}", "ls", "{{PATH}}" ].
*/
@FieldConfig(codable = true)
private String[] listCommand = new String[]{"ssh", "{{USER}}@{{HOST}}", "ls", "{{PATH}}"};
/**
* Execute this command to copy a file from the remote machine to the local machine.
* <br>Default is ["rsync", "-av", "{{USER}}@{{HOST}}:{{REMOTEPATH}}", "{{LOCALPATH}}" ].
*/
@FieldConfig(codable = true)
private String[] copyCommand = new String[]{"rsync", "-av", "{{USER}}@{{HOST}}:{{REMOTEPATH}}", "{{LOCALPATH}}"};
/**
* Optionally run this command at the completion of the file transfer. Default is null.
*/
@FieldConfig(codable = true)
private String[] postCommand;
/**
* If this flag is set, then fail this job when the post command
* returns a non-zero value and the number of files in the output directory is zero.
* Default is false.
*/
@FieldConfig(codable = true)
private boolean failOnPostIfOutEmpty = false;
/**
* Output path and filename for each retrieved file.
* The final path is constructed by concatenating the
* elements of the string array with the directory separator ("/").
* It is a bad idea to exclude "{{FILE}}" from this parameter.
* Default is ["{{HOST}}-{{FILE}}"]
*/
@FieldConfig(codable = true)
private String[] pathOut = new String[]{"{{HOST}}-{{FILE}}"};
/**
* If true then {{FILE}} is replaced with the file name.
* Otherwise {{FILE}} is replaced with the whole path. Default is true.
*/
@FieldConfig(codable = true)
private boolean useShortPath = true;
/**
* If true then show output of remote commands in the logger. Default is true.
*/
@FieldConfig(codable = true)
private boolean verbose = true;
/**
* If true then perform gzip compression on the retrieved files. Default is false.
*/
@FieldConfig(codable = true)
private boolean compress;
/**
* If true then emit additional logging information when
* testing if a file is to be transferred. Default is false.
*/
@FieldConfig(codable = true)
private boolean verboseCheck = false;
/**
* If true, log the stderr instead of stdout of the process
*/
@FieldConfig(codable = true)
private boolean traceError = false;
/**
* Files that are older than this number of days will be deleted.
*/
@FieldConfig(codable = true)
private int purgeAfterDays = 30;
/**
* Command to be executed to delete files. Default is
* <br>["find", "{{DIR}}", "-type", "f", "-mtime", "+{{DAYS}}", "-print", "-exec", "rm", "{}", ";"]
*/
@FieldConfig(codable = true)
private String[] purgeCommand = new String[]{"find", "{{DIR}}", "-type", "f", "-mtime", "+{{DAYS}}", "-print", "-exec", "rm", "{}", ";"};
/**
* If true then purge mark files that are older than purgeAfterDays days. Default is true.
*/
@FieldConfig(codable = true)
private boolean purgeMarks = true;
/**
* If true then extract the date from the file path.
* Otherwise extract the date from the file name.
* Default is false.
*/
@FieldConfig(codable = true)
private boolean pathBasedDateMatching = false;
/**
* Optional file creator with (key,value) pairs of the format
* &lt;fileName or fileName;mode&gt; : &lt;"file string contents"&gt;
*/
@FieldConfig(codable = true)
private HashMap<String, String> staticFiles = new HashMap<>();
/**
* Maximum number of attempts to retrieve files from remote hosts. Default is 5.
*/
@FieldConfig(codable = true)
private int maxFindAttempts = 5;
@FieldConfig
private TaskRunConfig config;
/** Set to true by close(); checked by the worker loops to stop retrying. */
private AtomicBoolean terminated = new AtomicBoolean(false);
/** Lazily created in parseDate() from 'dateExtractor' when it is not one of the epoch keywords. */
private SimpleDateFormat dateFormat;
/** Compiled form of 'dateMatcher'; assigned in start(). */
private Pattern datePattern;
/** Worker thread running this Runnable; created and started in start(). */
private Thread thread;
/** Shard list returned by config.calcShardList(); a file is accepted when its hash modulus is in this list. */
private Integer[] mods;
/** Root directory for mark files, initialized from 'markDir' in start(). */
private File markRoot;
/** Parsed 'startDate', or null when no start bound is configured. */
private DateTime jodaStartDate = null;
/** Parsed 'endDate', or null when no end bound is configured. */
private DateTime jodaEndDate = null;
/**
 * Supplies the sleep duration between retries.
 * NOTE(review): the arguments look like minimum/maximum delays in milliseconds -- confirm against Backoff.
 */
private final Backoff backoff = new Backoff(1000, 10000);
/**
 * Converts the text captured by the date matcher into a {@link Date}.
 * <p>The interpretation is selected by the {@code dateExtractor} setting:
 * "seconds" treats the input as epoch seconds, "millis"/"milliseconds" as epoch
 * milliseconds, and anything else as a SimpleDateFormat pattern (the formatter is
 * created on first use and then reused).
 *
 * @param input the captured date text
 * @return the parsed date
 * @throws Exception if the input is not a valid number or does not match the pattern
 */
public Date parseDate(String input) throws Exception {
    switch (dateExtractor) {
        case "seconds":
            return new Date(Long.parseLong(input) * 1000);
        case "millis":
        case "milliseconds":
            return new Date(Long.parseLong(input));
        default:
            if (dateFormat == null) {
                dateFormat = new SimpleDateFormat(dateExtractor);
            }
            return dateFormat.parse(input);
    }
}
/**
 * Prepares runtime state from the configuration, writes any configured static files
 * and launches the worker thread ("Hoover Rsync") that performs the transfer.
 *
 * @throws RuntimeException if a static file cannot be written or its mode cannot be set
 */
@Override
public void start() {
    this.mods = config.calcShardList(config.nodeCount);
    this.markRoot = LessFiles.initDirectory(new File(markDir));
    this.datePattern = Pattern.compile(dateMatcher);
    if (startDate != null) {
        this.jodaStartDate = DateUtil.getDateTime(DateUtil.getFormatter(startEndDateFormat), startDate);
    }
    if (endDate != null) {
        this.jodaEndDate = DateUtil.getDateTime(DateUtil.getFormatter(startEndDateFormat), endDate);
    }
    log.info("init config={} mods={}", config, LessStrings.join(mods, ","));
    /* create job files from map if not already existing or changed */
    for (Map.Entry<String, String> file : staticFiles.entrySet()) {
        // key format is "fileName" or "fileName;mode"; the mode defaults to 755
        String[] fileName = LessStrings.splitArray(file.getKey(), ";");
        String mode = fileName.length > 1 ? fileName[1] : "755";
        File out = new File(fileName[0]);
        byte[] raw = LessBytes.toBytes(file.getValue());
        // "changed" is detected by length only: same-size content is assumed to be unchanged
        if (out.exists() && out.isFile() && out.length() == raw.length) {
            continue;
        }
        try {
            // a bare file name has no parent directory to create
            File parent = out.getParentFile();
            if (parent != null) {
                LessFiles.initDirectory(parent);
            }
            LessFiles.write(out, raw, false);
            // argument array instead of a single command string, so that paths
            // containing whitespace are passed to chmod as one argument
            int chmodExit = Runtime.getRuntime().exec(new String[]{"chmod", mode, out.getPath()}).waitFor();
            if (chmodExit != 0) {
                log.warn("chmod {} {} exited with {}", mode, out, chmodExit);
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
    thread = new Thread(this);
    thread.setName("Hoover Rsync");
    thread.start();
    log.info("exec " + config.jobId);
}
/**
 * Requests termination of the worker thread and waits for it to finish.
 * <p>The first call flips the terminated flag and interrupts the worker; every call
 * then joins the worker. If the worker was never started there is nothing to wait for.
 * If the calling thread is interrupted while waiting, its interrupt status is restored.
 */
@Override
public void close() {
    final Thread worker = thread;
    if (terminated.compareAndSet(false, true)) {
        if (worker != null) {
            worker.interrupt();
        }
        log.info("terminate " + config.jobId);
    }
    if (worker == null) {
        // start() was never called, so there is no worker to join
        return;
    }
    try {
        worker.join();
        log.info("exit " + config.jobId);
    } catch (InterruptedException ex) {
        log.warn("interrupted while waiting for hoover thread to exit", ex);
        // keep the interrupt visible to the caller instead of silently dropping it
        Thread.currentThread().interrupt();
    }
}
/**
 * Worker-thread body. For every configured host it lists the candidate files and
 * fetches each accepted file (both with retries), then purges old output and mark
 * files and finally runs the post command.
 * <p>If the job is terminated while a file listing is being retried, the remaining
 * hosts are skipped; purging and the post command still run. When the post command
 * fails, the job is not terminating, {@code failOnPostIfOutEmpty} is set and the output
 * directory has no entries (or cannot be listed), the JVM is exited with the post
 * command's return code.
 *
 * @throws RuntimeException if a host's files cannot be listed after the allowed retries
 */
@Override
public void run() {
    File outDirFile = new File(outDir);
    if (!outDirFile.exists()) {
        log.info("out dir does not yet exist, creating: " + outDir);
        outDirFile.mkdirs();
    }
    for (Map.Entry<String, String> host : hosts.entrySet()) {
        log.info("fetching from " + host.getValue() + " ...");
        Collection<MarkFile> files = findFilesWithRetry(host.getKey(), host.getValue());
        if (files == null) {
            // only happens on termination; iterating a null list would throw
            log.warn("terminated while listing files on " + host.getValue() + ", skipping remaining hosts");
            break;
        }
        for (MarkFile mark : files) {
            fetchFileWithRetry(mark, host.getValue());
        }
    }
    purgeDir(outDir, purgeAfterDays);
    if (purgeMarks) {
        purgeDir(markDir, purgeAfterDays);
    }
    final int postRet = postCommand();
    if (postRet != 0) {
        boolean failJob = false;
        if (!terminated.get() && failOnPostIfOutEmpty) {
            // File.list() returns null when the directory cannot be listed; treat that as empty
            String[] outFiles = outDirFile.list();
            failJob = outFiles == null || outFiles.length == 0;
        }
        if (failJob) {
            log.error("error returned by postCommand, forcing system.exit:" + postRet);
            // We need to exit in a separate thread, because
            // another thread is joining on this one, and
            // inexplicably System.exit does not interrupt blocked
            // threads. An alternative to System.exit would be
            // ideal, but that would require some more invasive
            // changes.
            new Thread(new Runnable() {
                public void run() {
                    System.exit(postRet);
                }
            }, "HooverSystemExiter").start();
            thread.interrupt();
        } else {
            log.warn("error returned by postCommand; taking no action since exit was already in process:" + postRet);
        }
    }
}

/**
 * Lists the candidate files on one host, retrying with backoff while the list command fails.
 *
 * @return the accepted files, or null if the job was terminated before a listing succeeded
 * @throws RuntimeException if the listing still fails after the allowed retries
 */
private Collection<MarkFile> findFilesWithRetry(String hostNickname, String host) {
    int attempts = 0;
    while (true) {
        Collection<MarkFile> files = findFiles(hostNickname, host);
        if (files != null) {
            return files;
        }
        if (terminated.get()) {
            // shutting down: do not keep retrying the remote listing
            return null;
        }
        if (attempts++ > maxFindAttempts) {
            log.error("Unable to find files to hoover after " + attempts + " attempts");
            throw new RuntimeException("Unable to find files to hoover after " + attempts + " attempts");
        }
        log.warn("error running findFiles command, backing off before retry. This is attempt: " + attempts);
        try {
            Thread.sleep(backoff.get());
        } catch (InterruptedException e) {
            if (terminated.get()) {
                return null;
            }
        }
    }
}

/**
 * Fetches one file, retrying with backoff on failure, and writes its mark file on success.
 * Gives up silently (apart from logging) once the retries are exhausted or the job is terminated.
 */
private void fetchFileWithRetry(MarkFile mark, String host) {
    int attempts = 0;
    while (!terminated.get()) {
        if (fetchFile(mark, host) && !terminated.get()) {
            // successfully retrieved: mark the file so later runs skip it
            mark.write();
            return;
        }
        log.warn("error fetching " + mark.fileName + " from host: " + mark.host + " on attempt: " + attempts);
        if (attempts++ < maxFindAttempts && !terminated.get()) {
            try {
                Thread.sleep(backoff.get());
            } catch (InterruptedException e) {
                if (terminated.get()) {
                    return;
                }
            }
        } else {
            log.error("max retry attempts: " + attempts + " breaking retry loop");
            return;
        }
    }
}
/**
 * Builds the mark-file descriptor for one remote file.
 *
 * @param host     host alias the file was found on
 * @param fileName remote path of the file as printed by the list command
 * @return a new descriptor for that host/path pair
 */
private MarkFile getMarkFile(String host, String fileName) {
    final MarkFile descriptor = new MarkFile(host, fileName);
    return descriptor;
}
/**
 * Runs the optional post command, retrying with backoff when launching or waiting
 * on it throws.
 * <p>The command's stdout is logged line by line when verbose (or debug) logging is on.
 * A non-zero exit is not retried: its stderr is logged and the exit code is returned.
 *
 * @return 0 if no post command is configured or it exited with 0; the command's exit
 *         code if it exited non-zero; 1 if it kept throwing until the retries ran out
 *         or the job was terminated
 */
private int postCommand() {
    if (postCommand == null || postCommand.length == 0) {
        return 0;
    }
    int attempts = 0;
    // post command retry loop
    while (true) {
        Process proc = null;
        try {
            proc = Runtime.getRuntime().exec(postCommand);
            // both readers are closed on every exit path, including exceptions
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()));
                 BufferedReader ereader = new BufferedReader(new InputStreamReader(proc.getErrorStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (verbose || log.isDebugEnabled()) {
                        log.warn("post --> " + line);
                    }
                }
                int exit = proc.waitFor();
                if (exit != 0) {
                    log.error("post exited with " + exit);
                    String eline;
                    while ((eline = ereader.readLine()) != null) {
                        log.error("post stderr --> " + eline);
                    }
                    return exit;
                }
            }
            // if we get here, success
            return 0;
        } catch (Exception ex) {
            log.error("", ex);
            if (proc != null) {
                log.error("Problem during postCommand, destroying process " + ex);
                proc.destroy();
            }
            if (attempts++ < maxFindAttempts && !terminated.get()) {
                try {
                    Thread.sleep(backoff.get());
                } catch (InterruptedException ignored) {
                    // deliberately ignored: the retry below re-checks 'terminated' if it fails again
                }
            } else {
                log.error("Max retry attempts: " + attempts + " reached. Post command failed");
                return 1;
            }
        }
    }
}
// TODO: change time for files that are not append only
/**
 * Decides whether a listed remote file should be fetched by this node.
 * <p>A file is rejected if:
 * <ul>
 * <li>its path does not match the {@code match} regex (if set),
 * <li>its extracted date is before the start date or after the end date (if set),
 * <li>its mark file already exists (it was fetched in a previous run),
 * <li>the hash of host alias + file name, modulo the node count, is not in the shard list.
 * </ul>
 *
 * @param markFile descriptor of the remote file
 * @return true if the file should be transferred
 */
private boolean checkFile(MarkFile markFile) {
    final boolean trace = verboseCheck || log.isDebugEnabled();
    if (match != null && !markFile.path.matches(match)) {
        if (trace) {
            log.info("match skip for host=" + markFile.host + " path=" + markFile.path + " match=" + match);
        }
        return false;
    }
    // files without an extractable date are never filtered by the date interval
    if (jodaStartDate != null && markFile.dateTime != null && markFile.dateTime.isBefore(jodaStartDate)) {
        if (trace) {
            log.info("skipping host=" + markFile.host + " path=" + markFile.path + " because " + markFile.dateTime + " is before startime" + jodaStartDate);
        }
        return false;
    }
    if (jodaEndDate != null && markFile.dateTime != null && markFile.dateTime.isAfter(jodaEndDate)) {
        if (trace) {
            log.info("skipping host=" + markFile.host + " path=" + markFile.path + " because " + markFile.dateTime + " is after end" + jodaEndDate);
        }
        return false;
    }
    if (markFile.markFile.exists()) {
        if (trace) {
            log.info("mark skip for host=" + markFile.host + " file=" + markFile.name());
        }
        return false;
    }
    // Take the remainder first and the absolute value second. Math.abs(Integer.MIN_VALUE)
    // is still negative, so abs-then-mod could yield a negative modulus that matches no
    // shard on any node. For every other hash value both orders give the same result.
    int hashMod = Math.abs(PluggableHashFunction.hash(markFile.host.concat(markFile.name())) % config.nodeCount);
    for (Integer mod : mods) {
        if (log.isDebugEnabled()) {
            log.debug("mod=" + mod + " hashMod=" + hashMod);
        }
        if (hashMod == mod) {
            return true;
        }
    }
    if (trace) {
        log.info("hash skip [" + hashMod + "] host=" + markFile.host + " file=" + markFile.name());
    }
    return false;
}
/**
 * Runs the list command against one host and collects the files that pass
 * {@link #checkFile(MarkFile)}.
 * <p>Each stdout line of the command is treated as one remote file path. A non-zero
 * exit status of the command is logged but does not by itself count as a failure.
 *
 * @param hostNickname host alias, used for mark files and the {{HOST}} output variable
 * @param host         host name substituted into the list command
 * @return the accepted files (possibly empty), or null if running the command or
 *         processing its output threw
 */
private Collection<MarkFile> findFiles(String hostNickname, String host) {
    LinkedList<MarkFile> files = new LinkedList<>();
    String[] newCmd = new String[listCommand.length];
    for (int i = 0; i < newCmd.length; i++) {
        newCmd[i] = listCommand[i].replace("{{USER}}", user).replace("{{HOST}}", host).replace("{{PATH}}", path);
    }
    if (verboseCheck || log.isDebugEnabled()) {
        log.info("find cmd=" + LessStrings.join(newCmd, " "));
    }
    Process proc = null;
    try {
        proc = Runtime.getRuntime().exec(newCmd);
        // the reader is closed even when checking a line throws
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
            String fileName;
            while ((fileName = reader.readLine()) != null) {
                MarkFile markFile = getMarkFile(hostNickname, fileName.trim());
                if (verboseCheck || log.isDebugEnabled()) {
                    log.info("found host=" + host + " file=" + fileName);
                }
                if (checkFile(markFile)) {
                    files.add(markFile);
                }
            }
        }
        int exit = proc.waitFor();
        if (exit != 0) {
            // surfaced for diagnosis only; the (possibly empty) listing is still returned
            log.warn("list command exited with " + exit + " on host: " + host);
        }
    } catch (Exception ex) {
        log.warn("error finding files from host: " + host, ex);
        if (proc != null) {
            // do not leave the remote listing process running after a failure
            proc.destroy();
        }
        return null;
    }
    return files;
}
/**
 * Copies one remote file into the output directory using the copy command and
 * optionally gzips it.
 * <p>The local path is {@code outDir} plus the {@code pathOut} elements joined by "/",
 * with {{HOST}} and {{FILE}} substituted, and {{YY}}, {{Y}}, {{M}}, {{D}}, {{H}}
 * substituted only when a date was extracted for the file. A failed gzip is logged
 * but does not fail the fetch.
 *
 * @param markFile descriptor of the remote file
 * @param host     host name substituted into the copy command
 * @return true if the copy command exited with 0; false if it exited non-zero or anything threw
 */
private boolean fetchFile(MarkFile markFile, String host) {
    String[] segments = new String[pathOut.length];
    for (int i = 0; i < segments.length; i++) {
        String segment = pathOut[i].replace("{{HOST}}", markFile.host).replace("{{FILE}}", markFile.name());
        if (markFile.dateYear != null) {
            // {{YY}} is the 4-digit year, {{Y}} its last two digits
            segment = segment.replace("{{YY}}", markFile.dateYear)
                    .replace("{{Y}}", markFile.dateYear.substring(2))
                    .replace("{{M}}", markFile.dateMonth)
                    .replace("{{D}}", markFile.dateDay)
                    .replace("{{H}}", markFile.dateHour);
        }
        segments[i] = segment;
    }
    String pathString = outDir + "/" + LessStrings.join(segments, "/");
    File fileOut = new File(pathString);
    File fileDir = LessFiles.initDirectory(fileOut.getParentFile());
    if (verboseCheck || log.isDebugEnabled()) {
        log.info("fileDir=" + fileDir + " fileOut=" + fileOut+" outDir="+fileDir+", "+fileDir.exists());
    }
    String[] newCmd = new String[copyCommand.length];
    for (int i = 0; i < newCmd.length; i++) {
        newCmd[i] = copyCommand[i].replace("{{USER}}", user).replace("{{HOST}}", host).replace("{{LOCALPATH}}", pathString).replace("{{REMOTEPATH}}", markFile.path);
    }
    if (log.isDebugEnabled()) {
        log.debug("copy cmd = " + LessStrings.join(newCmd, " "));
    }
    Process proc = null;
    try {
        proc = Runtime.getRuntime().exec(newCmd);
        // traceError selects which of the process streams is echoed to the log
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(traceError ? proc.getErrorStream() : proc.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (verboseCheck || log.isDebugEnabled()) {
                    log.info(" --> " + line);
                }
            }
        }
        int ret = proc.waitFor();
        if (ret != 0) {
            log.warn("non-zero return code ("+ret+") while executing: " + LessStrings.join(newCmd, " "));
            return false;
        }
        if (compress && !pathString.endsWith(".gz")) {
            File compressTo = new File(pathString + ".gz");
            if (compressTo.exists()) {
                if (log.isDebugEnabled()) {
                    log.debug("" + compressTo + " already exists, deleting and recreating");
                }
                compressTo.delete();
            }
            if (log.isDebugEnabled()) {
                log.debug("compressing file: " + pathString);
            }
            Process gzipProc = Runtime.getRuntime().exec(new String[]{"gzip", pathString});
            if (gzipProc.waitFor() != 0) {
                log.warn("non-zero return code while gzipping: " + pathString);
            }
        }
    } catch (Exception ex) {
        log.warn("error fetching " + markFile.path + " from host: " + host, ex);
        if (proc != null) {
            // do not leave a copy running in the background after a failure or interrupt
            proc.destroy();
        }
        return false;
    }
    if (verbose || log.isDebugEnabled()) {
        log.info("fetched " + markFile.host + " --> " + markFile.name() + " marked by " + markFile.markFile);
    }
    return true;
}
/**
 * Runs the purge command against a directory to delete files older than the given
 * number of days. Does nothing when {@code days} is zero or negative.
 *
 * @param dir  directory substituted for {{DIR}} in the purge command
 * @param days age threshold substituted for {{DAYS}}
 * @return true if purging was skipped or the command exited with 0; false if it exited
 *         non-zero or running it threw
 */
private boolean purgeDir(String dir, int days) {
    if (days <= 0) {
        return true;
    }
    String[] newCmd = new String[purgeCommand.length];
    for (int i = 0; i < newCmd.length; i++) {
        newCmd[i] = purgeCommand[i].replace("{{DIR}}", dir).replace("{{DAYS}}", Integer.toString(days));
    }
    if (log.isDebugEnabled()) {
        log.debug("purge cmd = " + LessStrings.join(newCmd, " "));
    }
    if (verbose || log.isDebugEnabled()) {
        log.info("purging older than days=" + days + " from dir=" + dir);
    }
    try {
        Process proc = Runtime.getRuntime().exec(newCmd);
        // the reader is closed even if reading the output throws
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (verbose || log.isDebugEnabled()) {
                    log.info(" --> " + line);
                }
            }
        }
        return proc.waitFor() == 0;
    } catch (Exception ex) {
        log.warn("error purging dir=" + dir, ex);
        return false;
    }
}
/**
 * Descriptor of one remote file: where it came from, the local mark file that records
 * it as fetched, and the date components extracted from its name or path.
 * <p>The mark file lives under the mark root in a per-host directory and is named by
 * the zero-padded hex MD5 of the remote path. The date components are null when the
 * date pattern does not match.
 */
private class MarkFile {
    public final String host;
    public final String path;
    public final String fileName;
    public final File markFile;
    public final String dateYear;
    public final String dateMonth;
    public final String dateDay;
    public final String dateHour;
    public final DateTime dateTime;

    /**
     * @param host host alias the file was found on
     * @param path remote path of the file
     * @throws RuntimeException wrapping any failure while preparing the mark directory,
     *                          hashing the path or parsing the date
     */
    MarkFile(String host, String path) {
        this.host = host;
        this.path = path;
        this.fileName = new File(path).getName();
        try {
            File hostRoot = LessFiles.initDirectory(new File(markRoot, host));
            byte[] digest = MessageDigest.getInstance("MD5").digest(LessBytes.toBytes(path));
            String hashName = LessStrings.padleft(new BigInteger(1, digest).toString(16), 32, LessStrings.pad0);
            markFile = new File(hostRoot, hashName);
            // the date is searched in the whole path or only in the file name, per configuration
            String dateSource = pathBasedDateMatching ? path : fileName;
            Matcher dateMatch = datePattern.matcher(dateSource);
            if (dateMatch.find(0)) {
                String group1 = dateMatch.group(1);
                Date date = parseDate(group1);
                // rendered as yyyyMMddHH and sliced into its components below
                String datePrint = dateOut.format(date);
                if (verboseCheck || log.isDebugEnabled()) {
                    log.info("extract group1=" + group1 + " date=" + date + " datePrint=" + datePrint);
                }
                dateYear = datePrint.substring(0, 4);
                dateMonth = datePrint.substring(4, 6);
                dateDay = datePrint.substring(6, 8);
                dateHour = datePrint.substring(8, 10);
                dateTime = new DateTime(date);
            } else {
                dateYear = null;
                dateMonth = null;
                dateDay = null;
                dateHour = null;
                dateTime = null;
            }
            if (verboseCheck || log.isDebugEnabled()) {
                log.info("extract "+toString());
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
        if (verboseCheck || log.isDebugEnabled()) {
            log.info("mark " + host + " -> " + path + " = " + markFile);
        }
    }

    /** Summarizes the date pattern, file name and extracted date components. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("dp=").append(datePattern);
        text.append(" fn=").append(fileName);
        text.append(" dy=").append(dateYear);
        text.append(" dm=").append(dateMonth);
        text.append(" dd=").append(dateDay);
        text.append(" dh=").append(dateHour);
        return text.toString();
    }

    /** Returns the bare file name when short paths are enabled, otherwise the full remote path. */
    public String name() {
        if (useShortPath) {
            return fileName;
        }
        return path;
    }

    /**
     * Writes the mark file, recording this file as fetched.
     *
     * @throws RuntimeException wrapping any failure to write the mark file
     */
    public void write() {
        byte[] contents = LessBytes.toBytes(name());
        try {
            LessFiles.write(markFile, contents, false);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
}
}