/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.task.output.tree; import javax.annotation.Nullable; import java.util.List; import java.util.concurrent.TimeUnit; import com.addthis.basis.util.ClosableIterator; import com.addthis.basis.util.JitterClock; import com.addthis.codec.annotations.Time; import com.addthis.hydra.data.tree.DataTreeNode; import com.addthis.hydra.data.tree.prop.DataTime; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.util.concurrent.Runnables; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This {@link PathElement PathElement} <span class="hydra-summary">eliminates child nodes based on a time stamp</span>. * <p/> * <p>The time stamp is extracted from a property of the node with the property key specified by * {@link #timePropKey timePropKey}. Nodes with time stamps that are older than {@link #ttl ttl} * milliseconds are removed.</p> * <p/> * <p>Example:</p> * <pre> * {prune {ttl:2592000000, relativeDown:1}} * </pre> * * @user-reference */ public class PathPrune extends PathElement { private static final Logger logger = LoggerFactory.getLogger(PathPrune.class); /** Maximum age in milliseconds. */ private final long ttl; /** Property key name for extracting the time stamp of a node. Default is "time". */ private final String timePropKey; private final boolean ignoreMissingTimeProp; /** * If true then delete all nodes at the leaf level of matching. * Otherwise perform the default date matching behavior. * Default is false. */ private final boolean allLeaves; /** * When traversing the tree in search of the nodes to prune, if this parameter is a positive integer then begin * the traversal this many levels lower than the current location. Default is zero. */ private final int relativeDown; /** * Optionally specify a path for traversal before pruning is initiated. * This parameter is incompatible the relativeDown parameter. The recognized * path types are "*" for matching all values, "{{date}}" for date matching, * and "foo" for matching a specific value. */ @Nullable private final ImmutableList<String> treePath; /** * If true then terminate the pruning process when the job is shutting down. * Default is false. Note that specifying a prune in the "post" section and * setting preempt to true can prevent job pruning from happening if the * job always terminates when its maximum runtime is reached. * Consider specifying a prune in the "pre" section and setting preempt to true. */ private final boolean preempt; /** * If non-null then parse the name of each node using the provided * <a href="http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat * .html">DateTimeFormat</a>. Default is null. By default the * parser will use the default time zone. To change the time zone * use the "timezone" field. */ @Nullable private final DateTimeFormatter nameFormat; /** * Optional argument. It is a hint to the prune implementation that * the nameFormat is also the lexicographic order of the nodes. * Can be used to improve prune performance. Default is false. */ private final boolean sortedNames; @Nullable private final ImmutableSet<String> excludes; private Splitter SLASH_SPLITTER = Splitter.on('/').omitEmptyStrings(); @JsonCreator public PathPrune(@Time(TimeUnit.MILLISECONDS) @JsonProperty("ttl") long ttl, @Nullable @JsonProperty("nameFormat") String nameFormat, @Nullable @JsonProperty("timezone") String timezone, @Nullable @JsonProperty("treePath") String treePath, @JsonProperty("timePropKey") String timePropKey, @JsonProperty("ignoreMissingTimeProp") boolean ignoreMissingTimeProp, @JsonProperty("allLeaves") boolean allLeaves, @JsonProperty("relativeDown") int relativeDown, @JsonProperty("preempt") boolean preempt, @JsonProperty("excludes") ImmutableSet<String> excludes, @JsonProperty("sortedNames") boolean sortedNames) { this.ttl = ttl; this.timePropKey = timePropKey; this.ignoreMissingTimeProp = ignoreMissingTimeProp; this.allLeaves = allLeaves; this.relativeDown = relativeDown; this.preempt = preempt; this.excludes = excludes; this.sortedNames = sortedNames; if (nameFormat != null && timezone != null) { this.nameFormat = DateTimeFormat.forPattern(nameFormat).withZone(DateTimeZone.forID(timezone)); } else if (nameFormat != null) { this.nameFormat = DateTimeFormat.forPattern(nameFormat); } else { this.nameFormat = null; } if ((treePath != null) && (relativeDown != 0)) { throw new IllegalStateException("cannot use both treePath and relativeDown parameters"); } if (treePath != null) { this.treePath = ImmutableList.copyOf(SLASH_SPLITTER.splitToList(treePath)); } else { this.treePath = null; } } // Is it better to try to do the pruning in this method or // whatever is getting the TreeNodeList back? @Override public List<DataTreeNode> getNextNodeList(final TreeMapState state) { List<DataTreeNode> result = TreeMapState.empty(); long now = JitterClock.globalTime(); DataTreeNode root = state.current(); if (preempt && (state.processorClosing() || expensiveShutdownTest())) { log.info("Path pruning is not executing due to JVM shutdown."); return result; } findAndPruneChildren(state, root, now, relativeDown, treePath); return result; } public void findAndPruneChildren(final TreeMapState state, final DataTreeNode root, long now, int depth, List<String> treePaths) { if ((depth == 0) && ((treePaths == null) || (treePaths.size() == 0))) { pruneChildren(state, root, now); } else if (treePaths != null) { String current = treePaths.get(0); List<String> next = treePaths.subList(1, treePaths.size()); if ("*".equals(current)) { ClosableIterator<DataTreeNode> keyNodeItr = root.getIterator(); try { while (keyNodeItr.hasNext() && !(preempt && state.processorClosing())) { findAndPruneChildren(state, keyNodeItr.next(), now, 0, next); } } finally { keyNodeItr.close(); } } else if ("{{date}}".equals(current)) { ClosableIterator<DataTreeNode> keyNodeItr = root.getIterator(); try { while (keyNodeItr.hasNext() && !(preempt && state.processorClosing())) { DataTreeNode treeNode = keyNodeItr.next(); long nodeTime = getNodeTime(treeNode); if ((nodeTime > 0) && ((now - nodeTime) > ttl)) { findAndPruneChildren(state, treeNode, now, 0, next); } } } finally { keyNodeItr.close(); } } else { DataTreeNode nextNode = root.getNode(current); if (nextNode != null) { findAndPruneChildren(state, nextNode, now, 0, next); } } } else { ClosableIterator<DataTreeNode> keyNodeItr = root.getIterator(); try { while (keyNodeItr.hasNext() && !(preempt && state.processorClosing())) { findAndPruneChildren(state, keyNodeItr.next(), now, depth - 1, null); } } finally { keyNodeItr.close(); } } } public void pruneChildren(final TreeMapState state, final DataTreeNode root, long now) { ClosableIterator<DataTreeNode> keyNodeItr = root.getIterator(); int deleted = 0; int kept = 0; int total = 0; boolean skip = false; try { while (keyNodeItr.hasNext() && !(preempt && state.processorClosing())) { long nodeTime = 0; DataTreeNode treeNode = keyNodeItr.next(); boolean delete; if (allLeaves) { delete = true; } else { nodeTime = getNodeTime(treeNode); delete = ((nodeTime > 0) && ((now - nodeTime) > ttl)); } if (delete) { root.deleteNode(treeNode.getName()); deleted++; } else if (sortedNames && (nodeTime > 0) && (nameFormat != null)) { skip = true; break; } else { kept++; } total++; if ((total % 100000) == 0) { logger.info("Iterating through children of {}, deleted: {} kept: {}", root.getName(), deleted, kept); } } } finally { if (skip) { logger.info("Iterated through children of {}, deleted: {} and skipped remaining nodes", root.getName(), deleted); } else { logger.info("Iterated through children of {}, deleted: {} kept: {}", root.getName(), deleted, kept); } keyNodeItr.close(); } } /** * It is bad practice use the shutdown hook mechanism as a way * to test whether the JVM is shutting down. However if we use * this method exactly once then it is useful when writing tests * to ensure that zero prune operations occur during shutdown. * * @return true iff the jvm is shutting down */ private static boolean expensiveShutdownTest() { try { Thread shutdownHook = new Thread(Runnables.doNothing(), "Path prune shutdown hook"); Runtime.getRuntime().addShutdownHook(shutdownHook); Runtime.getRuntime().removeShutdownHook(shutdownHook); } catch (IllegalStateException ignored) { return true; } return false; } /** * Return the time associated with a node or -1 for an invalid time. */ @VisibleForTesting long getNodeTime(DataTreeNode treeNode) { if ((excludes != null) && (excludes.contains(treeNode.getName()))) { return -1; } if (nameFormat != null) { return nameFormat.parseMillis(treeNode.getName()); } else { DataTime dt = (DataTime) treeNode.getData(timePropKey); if (dt == null) { if (ignoreMissingTimeProp) { return -1; } throw new RuntimeException("missing time attachment with key " + timePropKey); } else { return dt.last(); } } } }