/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.source;

import javax.annotation.Nullable;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;

import java.time.temporal.ChronoUnit;

import com.addthis.basis.util.LessBytes;
import com.addthis.basis.util.LessStrings;
import com.addthis.basis.util.Parameter;

import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.data.filter.lambda.StringWithValueFilter;
import com.addthis.hydra.task.stream.PersistentStreamFileSource;
import com.addthis.hydra.task.stream.StreamFileUtil;
import com.addthis.maljson.JSONObject;

import com.google.common.base.Function;
import com.google.common.collect.Lists;

import com.fasterxml.jackson.annotation.JsonProperty;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An abstract implementation of {@link PersistentStreamFileSource} that provides much of the
 * base functionality required to implement a streaming file source.
 *
 * The main purpose of this class is to parse an input configuration in order to provide
 * common necessary inputs that concrete implementations require to identify the files
 * the data source should provide to clients.
 */
public abstract class AbstractPersistentStreamSource implements PersistentStreamFileSource {

    private static final Logger log = LoggerFactory.getLogger(AbstractPersistentStreamSource.class);

    // note that for historical reason these parameters use 'mesh' in their descriptions
    private static final String DEFAULT_DATE_FORMAT = Parameter.value("source.mesh.date.format", "YYMMdd");
    private static final int DEFAULT_SORT_TOKEN_OFFSET = Parameter.intValue("source.mesh.sort.token.offset", 5);
    private static final int DEFAULT_PATH_TOKEN_OFFSET = Parameter.intValue("source.mesh.path.token.offset", 0);
    private static final String DEFAULT_PATH_TOKEN = Parameter.value("source.mesh.path.token", "/");

    public static final long ONE_HOUR_IN_MILLIS = 60 * 60 * 1000;
    public static final long ONE_DAY_IN_MILLIS = 24 * ONE_HOUR_IN_MILLIS;

    // template markers substituted into configured date strings and file paths
    private static final String NOW_PREFIX = "{{now";
    private static final String NOW_POSTFIX = "}}";
    public static final String TIME_NOW = "{{now}}";

    // all patterns are LITERAL: the braces are matched verbatim, not as regex syntax
    private static final Pattern MOD_PATTERN = Pattern.compile("{{mod}}", Pattern.LITERAL);
    private static final Pattern YY_PATTERN = Pattern.compile("{YY}", Pattern.LITERAL);
    private static final Pattern Y_PATTERN = Pattern.compile("{Y}", Pattern.LITERAL);
    private static final Pattern M_PATTERN = Pattern.compile("{M}", Pattern.LITERAL);
    private static final Pattern D_PATTERN = Pattern.compile("{D}", Pattern.LITERAL);
    private static final Pattern H_PATTERN = Pattern.compile("{H}", Pattern.LITERAL);

    /**
     * The format of startDate and endDate values using the
     * <a href="http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html">DateTimeFormat</a>.
     * Default is either "source.mesh.date.format" configuration value or "YYMMdd".
     * Use the string literal "constant" to ignore the start date and the end date.
     */
    @JsonProperty private String dateFormat = DEFAULT_DATE_FORMAT;

    /** files that have been created before this date will not be processed. Default is {{last}}. */
    @JsonProperty private StringWithValueFilter startDate = new StringWithValueFilter(TIME_NOW, null);

    /** files that have been created after this date will not be processed. Default is {{now}}. */
    @JsonProperty private StringWithValueFilter endDate = new StringWithValueFilter(TIME_NOW, null);

    /** If true then process the dates from the most recent date to the earliest date. Default is false. */
    @JsonProperty protected boolean reverse;

    /** list of file paths to process. This field is required. */
    @JsonProperty(required = true) private List<String> files;

    /**
     * When selecting a substring of the input files for either sorting the file names
     * or fetching the file paths then use this token as the path separator.
     * Default is "source.mesh.path.token" configuration value or "/".
     */
    @JsonProperty private String sortToken = DEFAULT_PATH_TOKEN;

    /** shift the sorting suffix by this many characters. Default is 0. */
    @JsonProperty private int sortOffset;

    /**
     * skip this number of sortToken characters for the sorting suffix.
     * Default is "source.mesh.sort.token.offset" configuration value or 5.
     */
    @JsonProperty private int sortTokenOffset = DEFAULT_SORT_TOKEN_OFFSET;

    /** shift the generated file path by this many characters. Default is 0. */
    @JsonProperty private int pathOffset;

    /**
     * skip this number of sortToken characters for generating file paths.
     * Default is "source.mesh.path.token.offset" configuration value or 0.
     */
    @JsonProperty private int pathTokenOffset = DEFAULT_PATH_TOKEN_OFFSET;

    @JsonProperty private int jitterDays = 1;

    @JsonProperty private String startDateBaseDir;

    /** Legal values are "HOURS", "DAYS", and "MONTHS". */
    @JsonProperty private String dateIncrements;

    /*
     * note: this is based on which files have been opened. If there is a large preOpen
     * queue or many worker threads then multiple days may be open at once, but this
     * setting will assume that the latest day is the one to resume from.
     */
    @JsonProperty private boolean autoResume;

    /** dates remaining to be processed, ordered according to {@link #reverse} */
    protected final LinkedList<DateTime> dates = new LinkedList<>();
    /** null when dateFormat is "constant"; otherwise parses/prints startDate and endDate */
    protected DateTimeFormatter formatter;
    protected volatile boolean moreData;
    private File stateDir;
    /** state file ("job.source") holding the last processed date for auto-resume */
    protected File autoResumeFile;
    private final AtomicBoolean running = new AtomicBoolean(true);
    /** time step between generated dates; null when dates are not incremented */
    @Nullable protected ChronoUnit intervalUnit;

    /**
     * perform any initialization steps specific to the implementing class
     *
     * @return true if initialization was successful
     */
    protected abstract boolean doInit() throws IOException;

    /** perform any shutdown steps specific to the implementing class */
    public abstract void doShutdown() throws IOException;

    /**
     * @return true if the configuration for this source includes a template 'mod' element
     *         that can be used to segment the input stream between n consumers
     */
    @Override
    public boolean hasMod() {
        for (String file : files) {
            if (file.contains("{{mod")) {
                return true;
            }
        }
        return false;
    }

    /**
     * called by data source wrapper and performs common initialization steps:
     * expands {{mod}} markers in the file list, resolves the start/end dates
     * (honoring an auto-resume state file when present), validates the date
     * substitution markers, and populates the date list.
     */
    @Override
    public boolean init(File stateDir, Integer[] shards) throws Exception {
        if (log.isDebugEnabled()) {
            log.debug("SSM: {}", CodecJSON.encodeString(this));
        }
        this.stateDir = stateDir;
        autoResumeFile = new File(this.stateDir, "job.source");
        if (log.isTraceEnabled()) {
            log.trace("shards :: {}", LessStrings.join(shards, " :: "));
        }
        /* expand files list */
        Set<String> matches = new HashSet<>();
        log.trace("files.1 :: {}", files);
        /* expand mods: one file entry per (file, shard) pair, shard zero-padded to 3 digits */
        for (String file : files) {
            for (Integer shard : shards) {
                matches.add(MOD_PATTERN.matcher(file).replaceAll(
                        LessStrings.padleft(shard.toString(), 3, LessStrings.pad0)));
            }
        }
        matches = expandPaths(matches);
        log.trace("files.2 :: {}", matches);
        files = new ArrayList<>(matches);
        log.trace("files.3 :: {}", files);
        /* calculate start/end dates if required */
        if ("constant".equalsIgnoreCase(dateFormat)) {
            formatter = null;
        } else {
            formatter = DateTimeFormat.forPattern(dateFormat);
        }
        if (autoResume && autoResumeFile.exists() && autoResumeFile.canRead() && autoResumeFile.length() > 0) {
            // try-with-resources: the previous implementation leaked this stream
            try (FileInputStream in = new FileInputStream(autoResumeFile)) {
                JSONObject jo = new JSONObject(LessBytes.toString(LessBytes.readFully(in)));
                String resumeDate = jo.optString("lastDate");
                if (resumeDate != null) {
                    log.warn("auto resume from {}", jo);
                    startDate = new StringWithValueFilter(resumeDate, null);
                }
            } catch (Exception ex) {
                // best-effort resume: a corrupted state file falls back to the configured startDate
                log.warn("corrupted autoResume file: {}", autoResumeFile, ex);
            }
        }
        if ((formatter != null) && (startDate == null)) {
            log.warn("No startDate provided.");
            return false;
        }
        DateTime start = parseDateTime(startDate.get());
        if ((formatter != null) && (endDate == null)) {
            endDate = new StringWithValueFilter(NOW_PREFIX + NOW_POSTFIX, null);
            log.warn("End Date not provided, using current time: {} as end date for job", endDate);
        }
        DateTime end = parseDateTime(endDate.get());
        intervalUnit = timeIncrement(dateIncrements, dateFormat);
        if (!testFileDateExpansion()) {
            return false;
        }
        /* populate date list from start/end */
        fillDateList(start, end);
        log.info("[init] {} to {} = {} time units", start, end, dates.size());
        return doInit();
    }

    /**
     * Return false if one or more file names are missing the expected date substitution
     * expressions. Otherwise return true.
     */
    private boolean testFileDateExpansion() {
        if (intervalUnit == null) {
            return true;
        }
        for (String filename : files) {
            switch (intervalUnit) {
                case HOURS:
                    if (!H_PATTERN.matcher(filename).find()) {
                        log.error("Hourly interval is specified and {H} is missing from filename " + filename);
                        return false;
                    }
                    break;
                case DAYS:
                    if (!D_PATTERN.matcher(filename).find()) {
                        log.error("Daily interval is specified and {D} is missing from filename " + filename);
                        return false;
                    }
                    if (H_PATTERN.matcher(filename).find()) {
                        log.error("Daily interval is specified and {H} is present in filename " + filename);
                        return false;
                    }
                    break;
                case MONTHS:
                    if (!M_PATTERN.matcher(filename).find()) {
                        log.error("Monthly interval is specified and {M} is missing from filename " + filename);
                        return false;
                    }
                    if (D_PATTERN.matcher(filename).find()) {
                        log.error("Monthly interval is specified and {D} is present in filename " + filename);
                        return false;
                    }
                    if (H_PATTERN.matcher(filename).find()) {
                        log.error("Monthly interval is specified and {H} is present in filename " + filename);
                        return false;
                    }
                    break;
            }
        }
        return true;
    }

    /** hook for subclasses to expand the configured path set; identity by default */
    protected Set<String> expandPaths(Set<String> paths) {
        return paths;
    }

    /** override the configured start date with the given epoch-millis time (no-op in "constant" mode) */
    public void setStartTime(long time) {
        if (formatter != null) {
            startDate = new StringWithValueFilter(formatter.print(time), null);
            log.warn("override start date with {}", startDate);
        }
    }

    @Override
    public void shutdown() throws IOException {
        running.set(false);
        doShutdown();
    }

    /**
     * Derive the time step from the explicit dateIncrements setting, falling back to the
     * dateFormat length (6 chars => daily e.g. "YYMMdd", 8 chars => hourly e.g. "YYMMddHH").
     * Returns null for "constant" mode or an unrecognized explicit increment.
     */
    private static ChronoUnit timeIncrement(String dateIncrements, String dateFormat) {
        if ("constant".equalsIgnoreCase(dateFormat)) {
            return null;
        }
        if ("DAYS".equals(dateIncrements) || (dateFormat.length() == 6)) {
            return ChronoUnit.DAYS;
        } else if ("HOURS".equals(dateIncrements) || (dateFormat.length() == 8)) {
            return ChronoUnit.HOURS;
        } else if ("MONTHS".equals(dateIncrements)) {
            return ChronoUnit.MONTHS;
        } else if (dateIncrements == null) {
            log.warn("Non-Standard dateFormat: {} defaulting to daily time increments\nThis can be modified to " +
                     "hourly time increments by setting dateIncrements to 'HOURS'", dateFormat);
            return ChronoUnit.DAYS;
        } else {
            return null;
        }
    }

    /** list of dates given the start/end range from the config */
    private void fillDateList(DateTime start, DateTime end) {
        if ((start == null) || (end == null)) {
            dates.add(DateTime.now());
            return;
        }
        DateTime mark = start;
        // inclusive range [start, end]
        while (mark.isBefore(end) || mark.isEqual(end)) {
            if (reverse) {
                dates.addFirst(mark);
            } else {
                dates.addLast(mark);
            }
            if (intervalUnit == null) {
                return;
            }
            switch (intervalUnit) {
                case HOURS:
                    mark = mark.plusHours(1);
                    break;
                case DAYS:
                    mark = mark.plusDays(1);
                    break;
                case MONTHS:
                    mark = mark.plusMonths(1);
                    break;
                default:
                    throw new IllegalStateException("Unexpected duration unit " + intervalUnit);
            }
        }
    }

    /**
     * Parse a configured date string: null in "constant" mode, the current time plus an
     * optional day offset for "{{now...}}" expressions, otherwise a formatter parse.
     */
    private DateTime parseDateTime(String dateString) {
        DateTime time;
        if (formatter == null) {
            time = null;
        } else if (dateString.contains(NOW_PREFIX)) {
            // TODO: be better to get this time from a service
            time = new DateTime();
            time = time.plusDays(findDaysOffset(dateString));
        } else {
            time = formatter.parseDateTime(dateString);
        }
        return time;
    }

    /**
     * Extract the day offset from a "{{now[+-]N}}" expression; 0 for plain "{{now}}".
     * Parses the complete text between the markers, so unsigned ("{{now12}}") and
     * multi-digit offsets are handled correctly (the previous implementation skipped
     * the first character after "{{now", silently mis-parsing unsigned offsets).
     */
    private static int findDaysOffset(String time) {
        int prefixIndex = time.indexOf(NOW_PREFIX);
        if (prefixIndex < 0) {
            return 0;
        }
        int startIndex = prefixIndex + NOW_PREFIX.length();
        int endIndex = time.indexOf(NOW_POSTFIX, startIndex);
        if (endIndex <= startIndex) {
            return 0;
        }
        // Integer.parseInt accepts a leading '+' or '-'
        return Integer.parseInt(time.substring(startIndex, endIndex));
    }

    /** substitute {YY}/{Y}/{M}/{D}/{H} markers in the template with fields of the given time */
    private static String replaceDateElements(DateTime time, String template) {
        String result = YY_PATTERN.matcher(template).replaceAll(time.year().getAsString());
        result = Y_PATTERN.matcher(result).replaceAll(getTwoDigit(time.year().get()));
        result = M_PATTERN.matcher(result).replaceAll(getTwoDigit(time.monthOfYear().get()));
        result = D_PATTERN.matcher(result).replaceAll(getTwoDigit(time.dayOfMonth().get()));
        result = H_PATTERN.matcher(result).replaceAll(getTwoDigit(time.hourOfDay().get()));
        log.debug("template={}, result={}", template, result);
        return result;
    }

    /** zero-padded two-digit rendering; values over 99 keep only the last two digits */
    private static String getTwoDigit(int value) {
        if (value < 10) {
            return "0".concat(Integer.toString(value));
        }
        if (value > 99) {
            return getTwoDigit(value % 100);
        }
        return Integer.toString(value);
    }

    /** expand the date markers in every configured file path for the given time */
    public String[] getDateTemplatedFileList(final DateTime timeToLoad) {
        List<String> fileList = Lists.transform(files, new Function<String, String>() {
            @Override
            public String apply(String input) {
                return replaceDateElements(timeToLoad, input);
            }
        });
        return fileList.toArray(new String[fileList.size()]);
    }

    /** return substring getSortOffset into file name */
    public String getSortOffset(String name) {
        int sortOff = sortOffset;
        if ((sortToken != null) && (sortTokenOffset > 0)) {
            int pos = 0;
            int off = sortTokenOffset;
            // skip sortTokenOffset occurrences of sortToken before applying the offset
            while (off-- > 0 && (pos = name.indexOf(sortToken, pos)) >= 0) {
                pos++;
            }
            if (pos > 0) {
                sortOff += pos;
            }
        }
        return name.substring(sortOff);
    }

    /** return substring getSortOffset into file name */
    public String getPathOffset(String name) {
        return StreamFileUtil.getCanonicalFileReferenceCacheKey(name, pathOffset, sortToken, pathTokenOffset);
    }
}