// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.sampling;

import java.util.ArrayList;
import java.util.List;

/**
 * created by zhao<br>
 * Bridge from data source extraction to the data sample API.
 * <p>
 * The following call sequence must be respected:<br>
 * 1. {@link #prepareData(long)} (or {@link #prepareData(String[])})<br>
 * 2. {@link #hasNext()}<br>
 * 3. {@link #getRecord()}<br>
 * 4. {@link #finalizeDataSampling()}<br>
 * Steps 2 and 3 can be executed several times.
 * <p>
 * Behavior per {@link SamplingOption}, as implemented in this class:
 * <ul>
 * <li><b>TopN</b> (default): records are read from the data source one by one in {@link #getRecord()}, and
 * {@link #hasNext()} stops answering true once {@code sampleSize} records were handed out.</li>
 * <li><b>Reservoir</b>: {@link #prepareData(long)} reads the data source until it is exhausted (or a stop is
 * requested) and keeps the sample produced by a {@link ReservoirSampler}; {@link #getRecord()} then serves that
 * sample.</li>
 * <li><b>Percentage</b>: not implemented here; {@link #getRecord()} returns {@code null} for this option.</li>
 * </ul>
 */
public class DataSamplingBridge {

    /** A fixed seed value. It is not referenced inside this class. */
    public static final long RANDOM_SEED = 12345678;

    private SamplingOption samplingOption = SamplingOption.TopN;

    private ReservoirSampler<Object[]> reservoirSampler;

    // Sample kept for the Reservoir option; set to null by finalizeDataSampling().
    private List<Object[]> reservoirSamplingData = new ArrayList<Object[]>();

    private int sampleSize = 1000;

    private SamplingDataSource<?> dataSource;

    // Seed used by prepareData(String[]); defaults to the construction time.
    private long currentRandomSeed = System.currentTimeMillis();

    // Number of records already handed out by getRecord() (also the index into the reservoir sample).
    private int recordCursor = 0;

    // Number of records read from the data source while filling the reservoir.
    private long dataSourceCursor = 0;

    // Polled by the blocking reservoir fill loop. Declared volatile so that a stop request written by another
    // thread becomes visible to that loop.
    private volatile boolean stopRequested = false;

    /**
     * Creates a bridge reading from the given data source.
     *
     * @param ds the data source the records are read from
     */
    public DataSamplingBridge(SamplingDataSource<?> ds) {
        this.dataSource = ds;
    }

    /**
     * Sets the maximum number of records handed out by this bridge.
     *
     * @param size the sample size
     */
    public void setSampleSize(int size) {
        sampleSize = size;
    }

    /**
     * Sets the sampling option used by the subsequent calls.
     *
     * @param option the sampling option
     */
    public void setSamplingOption(SamplingOption option) {
        samplingOption = option;
    }

    /**
     * @return the sampling option currently in use
     */
    public SamplingOption getSamplingOption() {
        return samplingOption;
    }

    /**
     * Tells whether another sampling record is available.
     *
     * @return false once {@code sampleSize} records were handed out; otherwise, for the Reservoir option, true while
     * the reservoir sample still holds unread records (false when no sample is held, e.g. after
     * {@link #finalizeDataSampling()}), and for the other options whatever the data source answers.
     * @throws Exception propagated from the data source
     */
    public boolean hasNext() throws Exception {
        if (recordCursor >= sampleSize) {
            // Sample is complete: stop getting records from the data source.
            return false;
        }
        if (SamplingOption.Reservoir == samplingOption) {
            // The sample reference is dropped by finalizeDataSampling(); answer false instead of failing with a
            // NullPointerException in that case.
            return reservoirSamplingData != null && recordCursor < reservoirSamplingData.size();
        }
        return dataSource.hasNext();
    }

    /**
     * Prepares a sampling run. The record cursor is reset so that the bridge can be prepared again for a new run.
     * For the Reservoir option the data source is read here until it is exhausted or a stop is requested, and the
     * resulting sample is kept for {@link #getRecord()}. Nothing else is done for the other options.
     *
     * @param randomSeed the seed handed to the reservoir sampler
     * @return always {@code false} in the current implementation; the value carries no success information, failures
     * are reported through exceptions
     * @throws Exception When unexpected exception occurs
     */
    public boolean prepareData(long randomSeed) throws Exception {
        // Reset record cursor
        recordCursor = 0;
        switch (samplingOption) {
        case Reservoir:
            fillReservoir(randomSeed);
            break;
        case TopN:
        case Percentage:
        default:
            // Nothing to prepare.
            break;
        }
        return false;
    }

    /**
     * Prepares a sampling run with the seed set through {@link #setRandomSeed(long)} (the construction time when no
     * seed was set).
     *
     * @param columnHeaders not used by this implementation
     * @throws Exception When unexpected exception occurs
     */
    public void prepareData(String[] columnHeaders) throws Exception {
        prepareData(currentRandomSeed);
    }

    /**
     * Sets the randomSeed.
     *
     * @param randomSeed the randomSeed to set
     */
    public void setRandomSeed(long randomSeed) {
        this.currentRandomSeed = randomSeed;
    }

    /**
     * Gets the next record of the sample. Before calling this method, {@link #hasNext()} should be called to check
     * that a record is available.
     * <p>
     * For the TopN option the record is read from the data source, so the way it is obtained depends on the data
     * source type. For the Reservoir option the record is taken from the sample built by
     * {@link #prepareData(long)}. For the Percentage option {@code null} is returned.
     *
     * @return the next record, or {@code null} when the sampling option provides none
     * @throws Exception when the Reservoir option is used without a prepared sample, or when the data source fails
     */
    public Object[] getRecord() throws Exception {
        Object[] records = null;
        switch (samplingOption) {
        case TopN:
            records = dataSource.getRecord();
            break;
        case Percentage:
            break;
        case Reservoir:
            // Either prepareData was never called, or the sample was released by finalizeDataSampling().
            if (reservoirSampler == null || reservoirSamplingData == null) {
                throw new Exception("DataSamplingBridge is not initialized");
            }
            records = reservoirSamplingData.get(recordCursor);
            break;
        default:
            break;
        }
        recordCursor++;
        return records;
    }

    /**
     * Finalizes the data sampling: drops the reservoir sample held by this bridge and lets the data source finalize
     * itself (the place where a data source can release what it holds).
     *
     * @return always {@code false} in the current implementation; the value carries no success information, failures
     * are reported through exceptions
     * @throws Exception When unexpected exception occurs
     */
    public boolean finalizeDataSampling() throws Exception {
        reservoirSamplingData = null;
        dataSource.finalizeDataSampling();
        return false;
    }

    /**
     * @return true if a stop of the reservoir filling was requested
     */
    public boolean isStopRequested() {
        return stopRequested;
    }

    /**
     * Requests (or cancels the request) to stop reading the data source while the reservoir is being filled.
     *
     * @param stopRequested true to make the fill loop stop before reading the next record
     */
    public void setStopRequested(boolean stopRequested) {
        this.stopRequested = stopRequested;
    }

    /**
     * @return the number of records read from the data source by the last reservoir filling
     */
    public long getDataSourceCursor() {
        return dataSourceCursor;
    }

    /**
     * Reads the data source into a fresh reservoir sampler and keeps the resulting sample.
     */
    private void fillReservoir(long randomSeed) throws Exception {
        dataSourceCursor = 0;
        // Start from an empty sample so that no stale data is served if reading fails midway.
        reservoirSamplingData = new ArrayList<Object[]>();
        reservoirSampler = new ReservoirSampler<Object[]>(sampleSize, randomSeed);
        reservoirSampler.clear();
        while (!stopRequested && dataSource.hasNext()) {
            reservoirSampler.onNext(dataSource.getRecord());
            dataSourceCursor++;
        }
        reservoirSampler.onCompleted(true);
        reservoirSamplingData = reservoirSampler.sample();
    }
}