// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.duplicating; import java.util.ArrayList; import java.util.List; import java.util.Random; import org.apache.commons.math3.distribution.AbstractIntegerDistribution; import org.apache.log4j.Logger; public abstract class AbstractDuplicator<TIn, TOut> { private static final Logger LOG = Logger.getLogger(AbstractDuplicator.class); private RandomWrapper rnd; protected double uniquePercentageOfOriginal; protected double expectation; protected AbstractIntegerDistribution distribution; private static final double EPSILON = 1e-6; public AbstractDuplicator(double expectation, double duplicatesPercentage, String distributionName) { this.expectation = expectation; if (Math.abs(duplicatesPercentage - 1) < EPSILON) { uniquePercentageOfOriginal = 0; } else { uniquePercentageOfOriginal = expectation / (expectation - 1 + 1 / (1 - duplicatesPercentage)); } distribution = DistributionFactory.createDistribution(distributionName, expectation); } public AbstractDuplicator(double expectation, double duplicatesPercentage, String distributionName, long distributionSeed) { this(expectation, duplicatesPercentage, distributionName); rnd = new RandomWrapper(distributionSeed); distribution.reseedRandomGenerator(distributionSeed); if (LOG.isInfoEnabled()) { LOG.info("Seed for random generator has been set to: " + rnd.getSeed()); //$NON-NLS-1$ } } protected Random getRandom() { if (rnd == null) { rnd = new RandomWrapper(); distribution.reseedRandomGenerator(rnd.getSeed()); if (LOG.isInfoEnabled()) { LOG.info( "A seed is generated for Random generator. If a fixed seed is needed, set it in the advanced parameters of the tDuplicateRow component"); //$NON-NLS-1$ } } return rnd; } public List<TOut> process(TIn v) { List<TOut> result = new ArrayList<TOut>(); int grpSize = getRandomGroupSize(); for (int i = 0; i < grpSize; i++) { result.add(generateOutput(v, i == 0)); } return result; } protected abstract TOut generateOutput(TIn v, boolean isOriginal); private int getRandomGroupSize() { if (uniquePercentageOfOriginal > getRandom().nextDouble()) { return 1; } else { return distribution.sample() + 2; } } public void setSeed(long seed) { getRandom().setSeed(seed); distribution.reseedRandomGenerator(seed); } }