package org.embulk.exec; import java.text.NumberFormat; import java.util.List; import com.google.inject.Inject; import com.google.common.base.Preconditions; import org.embulk.config.Config; import org.embulk.config.ConfigDefault; import org.embulk.config.Task; import org.embulk.config.TaskSource; import org.embulk.config.ConfigSource; import org.embulk.config.TaskReport; import org.embulk.spi.Schema; import org.embulk.spi.Exec; import org.embulk.spi.Page; import org.embulk.spi.Buffer; import org.embulk.spi.InputPlugin; import org.embulk.spi.ParserPlugin; import org.embulk.spi.FileInput; import org.embulk.spi.FileInputRunner; import org.embulk.spi.PageOutput; import org.slf4j.Logger; import static java.util.Locale.ENGLISH; import static org.embulk.spi.util.Inputs.each; /* * Used by FileInputRunner.guess */ public class SamplingParserPlugin implements ParserPlugin { public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig) { return runFileInputSampling(runner, inputConfig, Exec.newConfigSource()); } public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig) { final SampleBufferTask sampleBufferTask = sampleBufferConfig.loadConfig(SampleBufferTask.class); // override in.parser.type so that FileInputRunner creates SamplingParserPlugin ConfigSource samplingInputConfig = inputConfig.deepCopy(); samplingInputConfig.getNestedOrSetEmpty("parser") .set("type", "system_sampling") .set("sample_buffer_bytes", sampleBufferTask.getSampleBufferBytes()); samplingInputConfig.set("decoders", null); try { runner.transaction(samplingInputConfig, new InputPlugin.Control() { public List<TaskReport> run(TaskSource taskSource, Schema schema, int taskCount) { if (taskCount == 0) { throw new NoSampleException("No input files to read sample data"); } int maxSize = -1; int maxSizeTaskIndex = -1; for (int taskIndex=0; taskIndex < taskCount; taskIndex++) { try { runner.run(taskSource, schema, taskIndex, new PageOutput() { @Override public void add(Page page) { throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration"); // TODO exception class } public void finish() { } public void close() { } }); } catch (NotEnoughSampleError ex) { if (maxSize < ex.getSize()) { maxSize = ex.getSize(); maxSizeTaskIndex = taskIndex; } continue; } } if (maxSize <= 0) { throw new NoSampleException("All input files are empty"); } taskSource.getNested("ParserTaskSource").set("force", true); try { runner.run(taskSource, schema, maxSizeTaskIndex, new PageOutput() { @Override public void add(Page page) { throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration"); // TODO exception class } public void finish() { } public void close() { } }); } catch (NotEnoughSampleError ex) { throw new NoSampleException("All input files are smaller than minimum sampling size"); } throw new NoSampleException("All input files are smaller than minimum sampling size"); } }); throw new AssertionError("SamplingParserPlugin must throw SampledNoticeError"); } catch (SampledNoticeError error) { return error.getSample(); } } public static class SampledNoticeError extends Error { private final Buffer sample; public SampledNoticeError(Buffer sample) { this.sample = sample; } public Buffer getSample() { return sample; } } public static class NotEnoughSampleError extends Error { private final int size; public NotEnoughSampleError(int size) { this.size = size; } public int getSize() { return size; } } private final NumberFormat numberFormat = NumberFormat.getNumberInstance(ENGLISH); private final Logger log = Exec.getLogger(this.getClass()); private final int minSampleBufferBytes; public interface PluginTask extends Task, SampleBufferTask { } public interface SampleBufferTask extends Task { @Config("sample_buffer_bytes") @ConfigDefault("32768") // 32 * 1024 public int getSampleBufferBytes(); } @Inject public SamplingParserPlugin(@ForSystemConfig ConfigSource systemConfig) { this.minSampleBufferBytes = 40; // empty gzip file is 33 bytes. // TODO get sample size from system config } @Override public void transaction(ConfigSource config, ParserPlugin.Control control) { PluginTask task = config.loadConfig(PluginTask.class); Preconditions.checkArgument(minSampleBufferBytes < task.getSampleBufferBytes(), "minSampleBufferBytes must be smaller than sample_buffer_bytes"); log.info("Try to read {} bytes from input source", numberFormat.format(task.getSampleBufferBytes())); control.run(task.dump(), null); } @Override public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) { PluginTask task = taskSource.loadTask(PluginTask.class); Buffer buffer = readSample(input, task.getSampleBufferBytes()); if (!taskSource.get(boolean.class, "force", false)) { if (buffer.limit() < minSampleBufferBytes) { throw new NotEnoughSampleError(buffer.limit()); } } throw new SampledNoticeError(buffer); } public static Buffer readSample(FileInput fileInput, int sampleBufferBytes) { return readSample(fileInput, Buffer.allocate(sampleBufferBytes), 0, sampleBufferBytes); } public static Buffer readSample(FileInput fileInput, Buffer sample, int offset, int sampleBufferBytes) { if (!fileInput.nextFile()) { // no input files return sample; } try { for (Buffer buffer : each(fileInput)) { int size = Math.min(buffer.limit(), sample.capacity() - offset); sample.setBytes(offset, buffer, 0, size); offset += size; buffer.release(); if (offset >= sampleBufferBytes) { break; } } } finally { sample.limit(offset); } return sample; } }