/* * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kitesdk.morphline.stdlib; import java.security.SecureRandom; import java.util.Collection; import java.util.Collections; import java.util.Random; import org.kitesdk.morphline.api.Command; import org.kitesdk.morphline.api.CommandBuilder; import org.kitesdk.morphline.api.MorphlineCompilationException; import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.Record; import org.kitesdk.morphline.base.AbstractCommand; import org.kitesdk.morphline.shaded.org.apache.commons.math3.random.RandomGenerator; import org.kitesdk.morphline.shaded.org.apache.commons.math3.random.Well19937c; import com.typesafe.config.Config; /** * Command that forwards each input record with a given probability to its child command, and * silently ignores all other input records. Sampling is based on a random number generator. This * can be helpful to easily test a morphline with a random subset of records from a large dataset. */ public final class SampleBuilder implements CommandBuilder { @Override public Collection<String> getNames() { return Collections.singletonList("sample"); } @Override public Command build(Config config, Command parent, Command child, MorphlineContext context) { return new Sample(this, config, parent, child, context); } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// private static final class Sample extends AbstractCommand { private final double probability; private final RandomGenerator prng; private long count = 0; public Sample(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) { super(builder, config, parent, child, context); this.probability = getConfigs().getDouble(config, "probability", 1.0); if (probability < 0.0) { throw new MorphlineCompilationException("Probability must not be negative: " + probability, config); } if (probability >= 1.0) { this.prng = null; } else { if (config.hasPath("seed")) { long seed = getConfigs().getLong(config, "seed"); this.prng = new Well19937c(seed); // non-secure & fast } else { Random rand = new SecureRandom(); int[] seed = new int[624]; for (int i = 0; i < seed.length; i++) { seed[i] = rand.nextInt(); } this.prng = new Well19937c(seed); // non-secure & fast } } validateArguments(); } @Override protected boolean doProcess(Record record) { if (prng != null && prng.nextDouble() > probability) { return true; // silently ignore this record } if (LOG.isDebugEnabled()) { LOG.debug("sampleCount: {}", count); } count++; // pass record to next command in chain: return super.doProcess(record); } } }