/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.task.output.tree;
import javax.annotation.Nonnull;
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import com.addthis.basis.jvm.Shutdown;
import com.addthis.basis.util.Bench;
import com.addthis.basis.util.LessBytes;
import com.addthis.basis.util.LessFiles;
import com.addthis.basis.util.JitterClock;
import com.addthis.basis.util.Parameter;
import com.addthis.bundle.core.Bundle;
import com.addthis.bundle.core.BundleField;
import com.addthis.bundle.core.list.ListBundle;
import com.addthis.bundle.value.ValueObject;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.codables.Codable;
import com.addthis.hydra.data.query.engine.QueryEngine;
import com.addthis.hydra.data.query.source.LiveMeshyServer;
import com.addthis.hydra.data.query.source.LiveQueryReference;
import com.addthis.hydra.data.tree.DataTree;
import com.addthis.hydra.data.tree.TreeConfig;
import com.addthis.hydra.data.tree.concurrent.ConcurrentTree;
import com.addthis.hydra.data.tree.nonconcurrent.NonConcurrentTree;
import com.addthis.hydra.data.tree.TreeCommonParameters;
import com.addthis.hydra.data.util.TimeField;
import com.addthis.hydra.store.db.CloseOperation;
import com.addthis.hydra.task.output.DataOutputTypeList;
import com.addthis.hydra.task.run.TaskRunConfig;
import com.addthis.meshy.MeshyServer;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This output <span class="hydra-summary">transforms bundle streams into trees for statistical analysis and data queries</span>
* <p/>
* <p>A tree is defined by one or more paths. A path is a reusable set of connected tree nodes. One of these
* paths is designated as the root path, from which the rest of the tree is constructed.</p>
* <p/>
 * <p>A tree may optionally specify a feature set: a set of character strings that controls
 * which path elements are included in the tree.
 * If a path element specifies the {@link PathElement#feature feature} parameter then the path
 * element is excluded when the feature it names is absent from the tree's feature set.
 * If a path element specifies the {@link PathElement#featureOff featureOff} parameter then the path
 * element is excluded when the feature it names is present in the tree's feature set.</p>
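 * <p>For example, in the following hypothetical sketch the first path element below is
 * retained and the second is excluded, because "daily" is in the tree's feature set:</p>
 * <pre>
 * output.tree {
 *   features: ["daily"]
 *   root: [
 *     {const:"date", feature:"daily"}
 *     {field:"DATE_HH", featureOff:"daily"}
 *   ]
 * }
 * </pre>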
* <p/>
* <p>Example:</p>
* <pre>
* output.tree {
* root:[
* {const:"date"}
* {field:"DATE_YMD"}
* {field:"DATE_HH"}
* ]
* }
* </pre>
*
* @user-reference
*/
public final class TreeMapper extends DataOutputTypeList implements Codable {
private static final Logger log = LoggerFactory.getLogger(TreeMapper.class);
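    // SimpleDateFormat is not thread-safe; this instance is only used inside
    // checkBench(), which synchronizes on bench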
private static final SimpleDateFormat date = new SimpleDateFormat("yyMMdd-HHmmss");
private enum BENCH {
TIME, UNITS, RULES, STREAM, LOCAL
}
    private enum ValidateMode {
ALL, POST, NONE
}
/**
     * Interval (in milliseconds) at which benchmark statistics are reported.
     * Default is either the "mapper.printinterval" system property or 1000.
*/
@FieldConfig private long printinterval = Parameter.longValue("mapper.printinterval", 1000L);
/**
* Definition of the tree structure.
* Consists of a mapping from a name to one or more path elements.
     * One of these paths serves as the root of the tree: if no {@link #root root}
     * path is specified then a path from this mapping is used as the root.
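     * <p>A hypothetical sketch defining a single named path:</p>
     * <pre>
     * paths: {
     *   "ROOT": [
     *     {const:"date"}
     *     {field:"DATE_YMD"}
     *   ]
     * }
     * </pre>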
*/
@FieldConfig private Map<String, PathElement[]> paths;
/**
* Optional path that is processed once at the beginning of execution. The input to this path is an empty bundle.
*/
@FieldConfig private PathElement[] pre;
/** Path that will serve as the root of the output tree. */
@FieldConfig private PathElement[] root;
/** Optional path that is processed once at the end of execution. The input to this path is an empty bundle. */
@FieldConfig private PathElement[] post;
/**
     * Optionally specifies whether to perform
* validation on the tree pages stored
* in persistent storage. This is a slow operation.
* "NONE" never validates.
* "POST" validates only if "post" is fired.
* "ALL" always validates. Default is "NONE".
* If the tree pages are not valid then the task will error.
* In the event of invalid tree pages the correct action
* is to revert the task.
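     * <p>A hypothetical sketch that validates only when the "post" paths run:</p>
     * <pre>validateTree:"POST"</pre>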
*/
@FieldConfig private ValidateMode validateTree = ValidateMode.NONE;
/**
     * If tree validation has been enabled (see {@link #validateTree validateTree})
     * then this parameter determines whether
* repairs will be made when an error is detected.
* Default is false.
*/
@FieldConfig private boolean repairTree = false;
/**
     * Boolean to determine whether a concurrent or a non-concurrent tree
     * should be used by this mapper. Note that a non-concurrent tree
     * cannot be used when more than one task processing
     * thread is in use.
*
* Default is true.
*/
@FieldConfig private boolean concurrentTree = true;
/**
* Optional sample rate for applying
* the {@link #pre pre} paths. If greater
     * than one then apply once every N runs.
* Default is one.
*/
@FieldConfig private int preRate = 1;
/**
* Optional sample rate for applying
* the {@link #post post} paths. If greater
     * than one then apply once every N runs.
* Default is one.
*/
@FieldConfig private int postRate = 1;
/**
* One or more queries that are executed
* after the tree has been constructed.
*/
@FieldConfig private PathOutput[] outputs;
    /** If true then serve live queries against this tree over the mesh network while the task runs. */
    @FieldConfig private boolean live;
    /** Host of the mesh peer that the live query server connects to. */
    @FieldConfig private String liveHost;
    /** Port of the mesh peer that the live query server connects to. Live queries require a value greater than -1. */
    @FieldConfig private int livePort;
    /** Optional override of the default clean queue size for the tree node cache. */
    @FieldConfig private Integer nodeCache;
    /** Optional override of the default tree trash interval. */
    @FieldConfig private Integer trashInterval;
    /** Optional override of the default tree trash time limit. */
    @FieldConfig private Integer trashTimeLimit;
    /** Optional field from which each bundle's timestamp is read. The current time is used when unset. */
    @FieldConfig private TimeField timeField;
    /** If true then periodically log mapper statistics. Default is true. */
    @FieldConfig private boolean stats = true;
/**
* Set of strings that enumerate the features to process.
*/
@FieldConfig private HashSet<String> features;
@FieldConfig private StoreConfig storage;
    /**
     * Maximum number of bundles with an unreadable time field to skip
     * before failing the task. Default is zero (fail on the first error).
     */
    @FieldConfig private int maxErrors = 0;
    /** If true then record per-path-element processing times. Default is false. */
    @FieldConfig private boolean profiling = false;
@FieldConfig private TaskRunConfig config;
@FieldConfig private String directory;
@FieldConfig private TreeConfig advanced;
private final ConcurrentMap<String, BundleField> fields = new ConcurrentHashMap<>();
    private final IndexHash<PathElement[]> pathIndex = new IndexHash<>();
/**
     * If true then the JVM shutdown process has begun.
*/
private final AtomicBoolean closing = new AtomicBoolean(false);
private DataTree tree;
private Bench bench;
private long startTime;
private MeshyServer liveQueryServer;
private TreeMapperStats mapstats;
private final AtomicLong lastHeaderTime = new AtomicLong(JitterClock.globalTime());
private final AtomicLong benchCalls = new AtomicLong(0);
    private final AtomicLong streamWaitTime = new AtomicLong(0);
private final AtomicLong streamReadCount = new AtomicLong(0);
private final AtomicLong streamReadTotal = new AtomicLong(0);
private final AtomicLong mapWriteTime = new AtomicLong(0);
private final AtomicLong processed = new AtomicLong(0);
private final AtomicLong processNodes = new AtomicLong(0);
private int bundleErrors = 0;
private final AtomicLong lastBundleTime = new AtomicLong(0);
private void resolve() throws Exception {
fields.clear();
if (features != null) {
PathElement.featureSet.addAll(features);
}
// index paths and intern path element keys
if (paths != null) {
for (Map.Entry<String, PathElement[]> me : paths.entrySet()) {
PathElement[] pe = me.getValue();
for (PathElement p : pe) {
p.resolve(this);
}
pathIndex.add(me.getKey(), pe);
}
}
if (root != null) {
for (PathElement p : root) {
p.resolve(this);
}
} else if ((paths != null) && !paths.isEmpty()) {
root = paths.values().iterator().next();
}
if (pre != null) {
for (PathElement p : pre) {
p.resolve(this);
}
}
if (post != null) {
for (PathElement p : post) {
p.resolve(this);
}
}
if (outputs != null) {
for (PathOutput out : outputs) {
out.resolve(this);
}
}
}
public PathElement[] getPath(String path) {
return paths.get(path);
}
public Integer getPathIndex(String path) {
return pathIndex.getIndex(path);
}
@Override
public void open() {
try {
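            // mark closing when the JVM begins shutting down so long-running operations
            // (e.g. foreground node deletion below) can abort promptly; if the hook cannot
            // be added, shutdown is presumably already in progress, so mark closing now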
boolean success = Shutdown.tryAddShutdownHook(
new Thread(() -> closing.set(true), "TreeMapper shutdown hook"));
if (!success) {
closing.set(true);
}
mapstats = new TreeMapperStats();
resolve();
if (nodeCache != null) TreeCommonParameters.setDefaultCleanQueueSize(nodeCache);
if (trashInterval != null) TreeCommonParameters.setDefaultTrashInterval(trashInterval);
if (trashTimeLimit != null) TreeCommonParameters.setDefaultTrashTimeLimit(trashTimeLimit);
if (storage != null) storage.setStaticFieldsFromMembers();
log.info("[init] live={}, target={} job={}", live, root, this.config.jobId);
Path treePath = Paths.get(config.dir, directory);
if (concurrentTree) {
tree = new ConcurrentTree(LessFiles.initDirectory(treePath.toFile()));
} else {
tree = new NonConcurrentTree(LessFiles.initDirectory(treePath.toFile()));
}
bench = new Bench(EnumSet.allOf(BENCH.class), 1000);
TreeConfig.writeConfigToDataDirectory(treePath, advanced);
if ((this.config.jobId != null) && live && (livePort > -1)) {
QueryEngine liveQueryEngine = new QueryEngine(tree);
connectToMesh(treePath.toFile(), config.jobId, config.node, liveQueryEngine);
}
startTime = System.currentTimeMillis();
tree.foregroundNodeDeletion(closing::get);
if (pre != null) {
sampleOperation(pre, preRate, "pre.sample", "pre");
}
} catch (Exception ex) {
            throw Throwables.propagate(ex);
}
}
private void connectToMesh(File root, String jobId, int taskId, QueryEngine engine) throws IOException {
LiveQueryReference queryReference = new LiveQueryReference(root, jobId, taskId, engine);
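        // start the local mesh server on port 0 (presumably letting the OS pick an
        // ephemeral port) and peer it with the configured live query endpoint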
liveQueryServer = new LiveMeshyServer(0, queryReference);
liveQueryServer.connectPeer(new InetSocketAddress(liveHost, livePort));
}
public BundleField bindField(String key) {
return getFormat().getField(key);
}
// ------------------------- PROCESSING ENGINE -------------------------
public boolean isProfiling() {
return profiling;
}
public static void updateProfile(PathElement pathElement, long duration) {
pathElement.updateProfile(duration);
}
public void processBundle(Bundle bundle, TreeMapperPathReference target) {
Integer unit = target.getTargetUnit();
if (unit == null) {
log.warn("[deliver] target missing unit: {}", target);
return;
}
processBundle(bundle, pathIndex.getValueByIndex(unit));
}
    /**
     * Run a bundle through an array of path elements and update benchmark counters.
     * The bundle time is read from the configured {@link #timeField timeField},
     * falling back to the current time when no field is configured or the value is
     * missing. Up to {@link #maxErrors maxErrors} bundles with an unparsable time
     * field are skipped; one more than that fails the task.
     */
public void processBundle(Bundle bundle, PathElement[] path) {
try {
long bundleTime;
try {
bundleTime = getBundleTime(bundle);
} catch (NumberFormatException nfe) {
log.warn("error reading TimeField, : {}\nbundle: {}", timeField.getField(), bundle);
// in case of junk data, if the source is flexible we'll continue processing bundles
// until maxErrors is reached
if (bundleErrors++ < maxErrors) {
log.warn("bundleErrors:{} is less than max errors: {}, skipping this bundle", bundleErrors,
maxErrors);
return;
} else {
throw new RuntimeException("Invalid bundle: " + bundle + " unable to read TimeField due to NumberFormatException");
}
}
bench.addEvents(BENCH.UNITS, 1);
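            // bundle times are accumulated right-shifted by 8, presumably to keep the
            // running sum of millisecond timestamps within a long; checkBench() undoes
            // the shift when computing the average timestamp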
bench.addEvents(BENCH.TIME, bundleTime >> 8);
processPath(bundle, path);
} catch (RuntimeException ex) {
throw ex;
} catch (Exception ex) {
log.warn("", ex);
}
processed.incrementAndGet();
bench.addEvents(BENCH.LOCAL, 1);
checkBench();
}
private long getBundleTime(Bundle bundle) {
long bundleTime = JitterClock.globalTime();
if (timeField != null) {
ValueObject vo = timeField.getField().getValue(bundle);
if (vo == null) {
log.debug("missing time {} in [{}] --> {}", timeField.getField(), bundle.getCount(), bundle);
} else {
bundleTime = timeField.toUnix(vo);
}
}
return bundleTime;
}
/**
     * This is where bundles and path elements are finally processed
     * locally.
*/
private void processPath(Bundle bundle, PathElement[] path) {
try {
TreeMapState ps = new TreeMapState(this, tree, path, bundle);
ps.process();
processNodes.addAndGet(ps.touched());
} catch (RuntimeException ex) {
throw ex;
} catch (Exception ex) {
log.warn("", ex);
}
bench.addEvents(BENCH.RULES, 1);
}
/**
     * Print benchmark data to the log and refresh the stats snapshot.
*/
protected void checkBench() {
synchronized (bench) {
if (bench.hasElapsed(printinterval)) {
long time = System.currentTimeMillis() - startTime;
long proc = processed.get();
                // prevent multiple Hydra threads from competing to update the
                // stream counters and lastHeaderTime
if (((benchCalls.getAndIncrement() % 20) == 0) && stats) {
long streamCounts = streamReadCount.getAndSet(0);
long streamTotals = streamReadTotal.addAndGet(streamCounts) / (1024 * 1024);
long mark = JitterClock.globalTime();
long streamRate = (streamCounts * 1000L) / (mark - lastHeaderTime.getAndSet(mark));
log.info(
"tread tmap input proc rules nodes bundles cache...hit% dbs mem bundleTime [{}," +
"{}/s,{}MM]",
streamCounts, streamRate, streamTotals);
}
long benchtime = bench.getEventCount(BENCH.TIME);
long benchlocal = bench.getEventCount(BENCH.UNITS);
long streamRate = bench.getEventRate(BENCH.STREAM);
bench.mark();
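                // undo the >> 8 shift applied when accumulating BENCH.TIME to recover
                // an average bundle timestamp in milliseconds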
                long avg_t = (benchtime / Math.max(1, benchlocal)) << 8;
long time_write_map = mapWriteTime.getAndSet(0);
                long time_read_wait = streamWaitTime.getAndSet(0);
TreeMapperStats.Snapshot snap = new TreeMapperStats.Snapshot();
snap.streamRate = streamRate;
snap.mapWriteTime = benchlocal > 0 ? time_write_map / benchlocal : time_write_map;
snap.streamWaitTime = (benchlocal > 0 ? time_read_wait / benchlocal : time_read_wait);
snap.localPacketRate = bench.getEventRate(BENCH.LOCAL);
snap.ruleProcessRate = bench.getEventRate(BENCH.RULES);
snap.nodesUpdated = processNodes.getAndSet(0);
snap.totalPackets = proc;
snap.treeCacheSize = tree.getCacheSize();
snap.treeCacheHitRate = tree.getCacheHitRate();
snap.treeDbCount = tree.getDBCount();
snap.freeMemory = Runtime.getRuntime().freeMemory() / 1024L / 1024L;
snap.averageTimestamp = date.format(avg_t);
snap.runningTime = time;
mapstats.setSnapshot(snap);
if (!stats) {
return;
}
log.info(snap.toFormattedString());
}
}
}
@Override
public void send(Bundle bundle) {
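        // time spent waiting for input (since the previous bundle completed) is tracked
        // separately from time spent writing the bundle into the tree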
long markBefore = System.nanoTime();
        streamWaitTime.addAndGet(markBefore - lastBundleTime.getAndSet(markBefore));
processBundle(bundle, root);
long markAfter = System.nanoTime();
mapWriteTime.addAndGet(markAfter - markBefore);
streamReadCount.incrementAndGet();
bench.addEvents(BENCH.STREAM, 1);
lastBundleTime.set(markAfter);
}
@Override
public void send(List<Bundle> bundles) {
if (bundles != null && !bundles.isEmpty()) {
for (Bundle bundle : bundles) {
send(bundle);
}
}
}
@Override
public void sendComplete() {
try {
boolean doPost = false;
if (post != null) {
doPost = sampleOperation(post, postRate, "post.sample", "post");
}
tree.foregroundNodeDeletion(closing::get);
if (outputs != null) {
for (PathOutput output : outputs) {
log.info("output: {}", output);
output.exec(tree);
}
}
// turn off live queries
if (liveQueryServer != null) {
liveQueryServer.close();
}
boolean doValidate;
switch (validateTree) {
case ALL:
doValidate = true;
break;
case POST:
doValidate = doPost;
break;
case NONE:
default:
doValidate = false;
}
// close storage
log.info("[close] closing tree storage");
CloseOperation closeOperation = CloseOperation.NONE;
if (doValidate) {
closeOperation = repairTree ? CloseOperation.REPAIR : CloseOperation.TEST;
}
tree.close(false, closeOperation);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
/**
* Conditionally perform the array of path element operations. If {@code rate} is
* greater than 1 then test the contents of {@code filename} to determine whether
* to run the path element operations. Supply an empty bundle as input to the path
* operations.
*
* @param op array of path operations that may be run
* @param rate if greater than 1 then test the sample file
* @param filename name of the sample file
* @param message output prefix for logging
* @return true iff path elements were executed
     * @throws IOException if the sample file cannot be read or written
*/
private boolean sampleOperation(PathElement[] op, int rate, String filename, String message) throws IOException {
boolean perform;
int sample = 0;
if (rate > 1) {
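            // e.g. with rate == 3 the persisted counter cycles 0 -> 1 -> 2 -> 0, ...;
            // the path operations run only on invocations where it wraps back to 0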
File sampleFile = new File(filename);
if (sampleFile.exists() && sampleFile.isFile() && sampleFile.length() > 0) {
try {
sample = Integer.parseInt(LessBytes.toString(LessFiles.read(sampleFile)));
sample = (sample + 1) % rate;
                } catch (NumberFormatException ignored) {
                    // an unparsable counter restarts the sampling cycle at zero
                }
}
perform = (sample == 0);
LessFiles.write(sampleFile, LessBytes.toBytes(Integer.toString(sample)), false);
} else {
perform = true;
}
if (perform) {
log.info("{}-chain: {}", message, op);
processBundle(new ListBundle(), op);
} else {
log.info("skipping {}-chain: {}. Sample rate is {} out of {}", message, op, sample, rate);
}
return perform;
}
public boolean isClosing() {
return closing.get();
}
@Override
public void sourceError(Throwable err) {
// TODO
}
@Nonnull @Override
public ImmutableList<Path> writableRootPaths() {
return ImmutableList.of(Paths.get(config.dir, directory));
}
}