package org.embulk.exec; import java.util.List; import java.util.concurrent.ExecutionException; import com.google.common.base.Optional; import com.google.common.collect.ImmutableList; import com.google.inject.Inject; import com.google.inject.Injector; import com.google.common.base.Throwables; import org.embulk.config.Task; import org.embulk.config.Config; import org.embulk.config.ConfigDefault; import org.embulk.config.ConfigSource; import org.embulk.config.ConfigException; import org.embulk.config.TaskSource; import org.embulk.config.ConfigDiff; import org.embulk.config.TaskReport; import org.embulk.plugin.PluginType; import org.embulk.spi.Schema; import org.embulk.spi.Exec; import org.embulk.spi.ExecSession; import org.embulk.spi.ExecAction; import org.embulk.spi.ExecutorPlugin; import org.embulk.spi.ProcessTask; import org.embulk.spi.ProcessState; import org.embulk.spi.TaskState; import org.embulk.spi.InputPlugin; import org.embulk.spi.FilterPlugin; import org.embulk.spi.OutputPlugin; import org.embulk.spi.util.Filters; import org.slf4j.Logger; public class BulkLoader { private final Injector injector; public interface BulkLoaderTask extends Task { @Config("exec") @ConfigDefault("{}") public ConfigSource getExecConfig(); @Config("in") public ConfigSource getInputConfig(); @Config("filters") @ConfigDefault("[]") public List<ConfigSource> getFilterConfigs(); @Config("out") public ConfigSource getOutputConfig(); public TaskSource getOutputTask(); public void setOutputTask(TaskSource taskSource); } @Inject public BulkLoader(Injector injector, @ForSystemConfig ConfigSource systemConfig) { this.injector = injector; } protected static class LoaderState implements ProcessState { private final Logger logger; private final ProcessPluginSet plugins; private volatile TaskSource inputTaskSource; private volatile TaskSource outputTaskSource; private volatile List<TaskSource> filterTaskSources; private volatile List<Schema> schemas; private volatile Schema executorSchema; private volatile TransactionStage transactionStage; private volatile ConfigDiff inputConfigDiff; private volatile ConfigDiff outputConfigDiff; private volatile List<TaskState> inputTaskStates; private volatile List<TaskState> outputTaskStates; public LoaderState(Logger logger, ProcessPluginSet plugins) { this.logger = logger; this.plugins = plugins; } public Logger getLogger() { return logger; } public void setSchemas(List<Schema> schemas) { this.schemas = schemas; } public void setExecutorSchema(Schema executorSchema) { this.executorSchema = executorSchema; } public void setTransactionStage(TransactionStage transactionStage) { this.transactionStage = transactionStage; } public void setInputTaskSource(TaskSource inputTaskSource) { this.inputTaskSource = inputTaskSource; } public void setOutputTaskSource(TaskSource outputTaskSource) { this.outputTaskSource = outputTaskSource; } public void setFilterTaskSources(List<TaskSource> filterTaskSources) { this.filterTaskSources = filterTaskSources; } public ProcessTask buildProcessTask() { return new ProcessTask( plugins.getInputPluginType(), plugins.getOutputPluginType(), plugins.getFilterPluginTypes(), inputTaskSource, outputTaskSource, filterTaskSources, schemas, executorSchema, Exec.newTaskSource()); } @Override public void initialize(int inputTaskCount, int outputTaskCount) { if (inputTaskStates != null || outputTaskStates != null) { // initialize is called twice if resume (by restoreResumedTaskReports and ExecutorPlugin.execute) if (inputTaskStates.size() != inputTaskCount || outputTaskStates.size() != outputTaskCount) { throw new ConfigException(String.format( "input task count and output task (%d and %d) must be same with the first execution (%d and %d) whenre resumed", inputTaskCount, outputTaskCount, inputTaskStates.size(), outputTaskStates.size())); } } else { ImmutableList.Builder<TaskState> inputTaskStates = ImmutableList.builder(); ImmutableList.Builder<TaskState> outputTaskStates = ImmutableList.builder(); for (int i=0; i < inputTaskCount; i++) { inputTaskStates.add(new TaskState()); } for (int i=0; i < outputTaskCount; i++) { outputTaskStates.add(new TaskState()); } this.inputTaskStates = inputTaskStates.build(); this.outputTaskStates = outputTaskStates.build(); } } @Override public TaskState getInputTaskState(int inputTaskIndex) { return inputTaskStates.get(inputTaskIndex); } @Override public TaskState getOutputTaskState(int outputTaskIndex) { return outputTaskStates.get(outputTaskIndex); } public boolean isAllTasksCommitted() { // here can't assume that input tasks are committed when output tasks are // committed because that's controlled by executor plugins. some executor // plugins (especially mapreduce executor) may commit output tasks even // when some input tasks failed. This is asemantically allowed behavior for // executor plugins (as long as output plugin is atomic and idempotent). if (inputTaskStates == null || outputTaskStates == null) { // not initialized return false; } for (TaskState inputTaskState : inputTaskStates) { if (!inputTaskState.isCommitted()) { return false; } } for (TaskState outputTaskState : outputTaskStates) { if (!outputTaskState.isCommitted()) { return false; } } return true; } public int countUncommittedInputTasks() { if (inputTaskStates == null) { // not initialized return 0; } int count = 0; for (TaskState inputTaskState : inputTaskStates) { if (!inputTaskState.isCommitted()) { count++; } } return count; } public int countUncommittedOutputTasks() { if (outputTaskStates == null) { // not initialized return 0; } int count = 0; for (TaskState outputTaskState : outputTaskStates) { if (!outputTaskState.isCommitted()) { count++; } } return count; } public boolean isAllTransactionsCommitted() { return inputConfigDiff != null && outputConfigDiff != null; } public void setOutputConfigDiff(ConfigDiff outputConfigDiff) { if (outputConfigDiff == null) { outputConfigDiff = Exec.newConfigDiff(); } this.outputConfigDiff = outputConfigDiff; } public void setInputConfigDiff(ConfigDiff inputConfigDiff) { if (inputConfigDiff == null) { inputConfigDiff = Exec.newConfigDiff(); } this.inputConfigDiff = inputConfigDiff; } private List<Optional<TaskReport>> getInputTaskReports() { ImmutableList.Builder<Optional<TaskReport>> builder = ImmutableList.builder(); for (TaskState inputTaskState : inputTaskStates) { builder.add(inputTaskState.getTaskReport()); } return builder.build(); } private List<Optional<TaskReport>> getOutputTaskReports() { ImmutableList.Builder<Optional<TaskReport>> builder = ImmutableList.builder(); for (TaskState outputTaskState : outputTaskStates) { builder.add(outputTaskState.getTaskReport()); } return builder.build(); } public List<TaskReport> getAllInputTaskReports() { ImmutableList.Builder<TaskReport> builder = ImmutableList.builder(); for (TaskState inputTaskState : inputTaskStates) { builder.add(inputTaskState.getTaskReport().get()); } return builder.build(); } public List<TaskReport> getAllOutputTaskReports() { ImmutableList.Builder<TaskReport> builder = ImmutableList.builder(); for (TaskState outputTaskState : outputTaskStates) { builder.add(outputTaskState.getTaskReport().get()); } return builder.build(); } public List<Throwable> getExceptions() { ImmutableList.Builder<Throwable> builder = ImmutableList.builder(); if (inputTaskStates != null) { // null if not initialized yet for (TaskState inputTaskState : inputTaskStates) { Optional<Throwable> exception = inputTaskState.getException(); if (exception.isPresent()) { builder.add(exception.get()); } } } if (outputTaskStates != null) { // null if not initialized yet for (TaskState outputTaskState : outputTaskStates) { Optional<Throwable> exception = outputTaskState.getException(); if (exception.isPresent()) { builder.add(exception.get()); } } } return builder.build(); } public RuntimeException getRepresentativeException() { RuntimeException top = null; for (Throwable ex : getExceptions()) { if (top != null) { top.addSuppressed(ex); } else { if (ex instanceof RuntimeException) { top = (RuntimeException) ex; } else { top = new RuntimeException(ex); } } } if (top == null) { top = new RuntimeException("Some transactions are not committed"); } return top; } public ExecutionResult buildExecuteResult() { return buildExecuteResultWithWarningException(null); } public ExecutionResult buildExecuteResultWithWarningException(Throwable ex) { ConfigDiff configDiff = Exec.newConfigDiff(); if (inputConfigDiff != null) { configDiff.getNestedOrSetEmpty("in").merge(inputConfigDiff); } if (outputConfigDiff != null) { configDiff.getNestedOrSetEmpty("out").merge(outputConfigDiff); } ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder(); for (Throwable e : getExceptions()) { ignoredExceptions.add(e); } if (ex != null) { ignoredExceptions.add(ex); } return new ExecutionResult(configDiff, false, ignoredExceptions.build()); } public ExecutionResult buildExecuteResultOfSkippedExecution(ConfigDiff configDiff) { ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder(); for (Throwable e : getExceptions()) { ignoredExceptions.add(e); } return new ExecutionResult(configDiff, true, ignoredExceptions.build()); } public ResumeState buildResumeState(ExecSession exec) { Schema inputSchema = (schemas == null) ? null : schemas.get(0); List<Optional<TaskReport>> inputTaskReports = (inputTaskStates == null) ? null : getInputTaskReports(); List<Optional<TaskReport>> outputTaskReports = (outputTaskStates == null) ? null : getOutputTaskReports(); return new ResumeState( exec.getSessionExecConfig(), inputTaskSource, outputTaskSource, inputSchema, executorSchema, inputTaskReports, outputTaskReports); } public PartialExecutionException buildPartialExecuteException(Throwable cause, ExecSession exec) { return new PartialExecutionException(cause, buildResumeState(exec), transactionStage); } } protected LoaderState newLoaderState(Logger logger, ProcessPluginSet plugins) { return new LoaderState(logger, plugins); } public ExecutionResult run(ExecSession exec, final ConfigSource config) { try { return Exec.doWith(exec, new ExecAction<ExecutionResult>() { public ExecutionResult run() { try (SetCurrentThreadName dontCare = new SetCurrentThreadName("transaction")) { return doRun(config); } } }); } catch (ExecutionException ex) { throw Throwables.propagate(ex.getCause()); } } public ExecutionResult resume(final ConfigSource config, final ResumeState resume) { try { ExecSession exec = ExecSession.builder(injector).fromExecConfig(resume.getExecSessionConfigSource()).build(); ExecutionResult result = Exec.doWith(exec, new ExecAction<ExecutionResult>() { public ExecutionResult run() { try (SetCurrentThreadName dontCare = new SetCurrentThreadName("resume")) { return doResume(config, resume); } } }); exec.cleanup(); return result; } catch (ExecutionException ex) { throw Throwables.propagate(ex.getCause()); } } public void cleanup(final ConfigSource config, final ResumeState resume) { try { ExecSession exec = ExecSession.builder(injector).fromExecConfig(resume.getExecSessionConfigSource()).build(); Exec.doWith(exec, new ExecAction<Void>() { public Void run() { try (SetCurrentThreadName dontCare = new SetCurrentThreadName("cleanup")) { doCleanup(config, resume); return null; } } }); exec.cleanup(); } catch (ExecutionException ex) { throw Throwables.propagate(ex.getCause()); } } protected static class ProcessPluginSet { private final PluginType inputPluginType; private final PluginType outputPluginType; private final List<PluginType> filterPluginTypes; private final InputPlugin inputPlugin; private final OutputPlugin outputPlugin; private final List<FilterPlugin> filterPlugins; public ProcessPluginSet(BulkLoaderTask task) { this.inputPluginType = task.getInputConfig().get(PluginType.class, "type"); this.outputPluginType = task.getOutputConfig().get(PluginType.class, "type"); this.filterPluginTypes = Filters.getPluginTypes(task.getFilterConfigs()); this.inputPlugin = Exec.newPlugin(InputPlugin.class, inputPluginType); this.outputPlugin = Exec.newPlugin(OutputPlugin.class, outputPluginType); this.filterPlugins = Filters.newFilterPlugins(Exec.session(), filterPluginTypes); } public PluginType getInputPluginType() { return inputPluginType; } public PluginType getOutputPluginType() { return outputPluginType; } public List<PluginType> getFilterPluginTypes() { return filterPluginTypes; } public InputPlugin getInputPlugin() { return inputPlugin; } public OutputPlugin getOutputPlugin() { return outputPlugin; } public List<FilterPlugin> getFilterPlugins() { return filterPlugins; } } public void doCleanup(ConfigSource config, ResumeState resume) { BulkLoaderTask task = config.loadConfig(BulkLoaderTask.class); ProcessPluginSet plugins = new ProcessPluginSet(task); // TODO don't create filter plugins ImmutableList.Builder<TaskReport> successfulInputTaskReports = ImmutableList.builder(); ImmutableList.Builder<TaskReport> successfulOutputTaskReports = ImmutableList.builder(); for (Optional<TaskReport> inputTaskReport : resume.getInputTaskReports()) { if (inputTaskReport.isPresent()) { successfulInputTaskReports.add(inputTaskReport.get()); } } for (Optional<TaskReport> outputTaskReport : resume.getOutputTaskReports()) { if (outputTaskReport.isPresent()) { successfulOutputTaskReports.add(outputTaskReport.get()); } } plugins.getInputPlugin().cleanup(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), successfulInputTaskReports.build()); plugins.getOutputPlugin().cleanup(resume.getOutputTaskSource(), resume.getOutputSchema(), resume.getOutputTaskReports().size(), successfulOutputTaskReports.build()); } private ExecutorPlugin newExecutorPlugin(BulkLoaderTask task) { return Exec.newPlugin(ExecutorPlugin.class, task.getExecConfig().get(PluginType.class, "type", new PluginType("local"))); } private ExecutionResult doRun(ConfigSource config) { final BulkLoaderTask task = config.loadConfig(BulkLoaderTask.class); final ExecutorPlugin exec = newExecutorPlugin(task); final ProcessPluginSet plugins = new ProcessPluginSet(task); final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins); state.setTransactionStage(TransactionStage.INPUT_BEGIN); try { ConfigDiff inputConfigDiff = plugins.getInputPlugin().transaction(task.getInputConfig(), new InputPlugin.Control() { public List<TaskReport> run(final TaskSource inputTask, final Schema inputSchema, final int inputTaskCount) { state.setInputTaskSource(inputTask); state.setTransactionStage(TransactionStage.FILTER_BEGIN); Filters.transaction(plugins.getFilterPlugins(), task.getFilterConfigs(), inputSchema, new Filters.Control() { public void run(final List<TaskSource> filterTasks, final List<Schema> schemas) { state.setSchemas(schemas); state.setFilterTaskSources(filterTasks); state.setTransactionStage(TransactionStage.EXECUTOR_BEGIN); exec.transaction(task.getExecConfig(), last(schemas), inputTaskCount, new ExecutorPlugin.Control() { public void transaction(final Schema executorSchema, final int outputTaskCount, final ExecutorPlugin.Executor executor) { state.setExecutorSchema(executorSchema); state.setTransactionStage(TransactionStage.OUTPUT_BEGIN); ConfigDiff outputConfigDiff = plugins.getOutputPlugin().transaction(task.getOutputConfig(), executorSchema, outputTaskCount, new OutputPlugin.Control() { public List<TaskReport> run(final TaskSource outputTask) { state.setOutputTaskSource(outputTask); state.initialize(inputTaskCount, outputTaskCount); state.setTransactionStage(TransactionStage.RUN); if (!state.isAllTasksCommitted()) { // inputTaskCount == 0 execute(task, executor, state); } if (!state.isAllTasksCommitted()) { throw new RuntimeException(String.format("%d input tasks and %d output tasks failed", state.countUncommittedInputTasks(), state.countUncommittedOutputTasks())); } state.setTransactionStage(TransactionStage.OUTPUT_COMMIT); return state.getAllOutputTaskReports(); } }); state.setOutputConfigDiff(outputConfigDiff); state.setTransactionStage(TransactionStage.EXECUTOR_COMMIT); } }); state.setTransactionStage(TransactionStage.FILTER_COMMIT); } }); state.setTransactionStage(TransactionStage.INPUT_COMMIT); return state.getAllInputTaskReports(); } }); state.setInputConfigDiff(inputConfigDiff); state.setTransactionStage(TransactionStage.CLEANUP); cleanupCommittedTransaction(config, state); return state.buildExecuteResult(); } catch (Throwable ex) { if (isSkippedTransaction(ex)) { ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff(); return state.buildExecuteResultOfSkippedExecution(configDiff); } else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) { // ignore the exception return state.buildExecuteResultWithWarningException(ex); } throw state.buildPartialExecuteException(ex, Exec.session()); } } private ExecutionResult doResume(ConfigSource config, final ResumeState resume) { final BulkLoaderTask task = config.loadConfig(BulkLoaderTask.class); final ExecutorPlugin exec = newExecutorPlugin(task); final ProcessPluginSet plugins = new ProcessPluginSet(task); final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins); state.setTransactionStage(TransactionStage.INPUT_BEGIN); try { ConfigDiff inputConfigDiff = plugins.getInputPlugin().resume(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), new InputPlugin.Control() { public List<TaskReport> run(final TaskSource inputTask, final Schema inputSchema, final int inputTaskCount) { // TODO validate inputTask? // TODO validate inputSchema state.setInputTaskSource(inputTask); state.setTransactionStage(TransactionStage.FILTER_BEGIN); Filters.transaction(plugins.getFilterPlugins(), task.getFilterConfigs(), inputSchema, new Filters.Control() { public void run(final List<TaskSource> filterTasks, final List<Schema> schemas) { state.setSchemas(schemas); state.setFilterTaskSources(filterTasks); state.setTransactionStage(TransactionStage.EXECUTOR_BEGIN); exec.transaction(task.getExecConfig(), last(schemas), inputTaskCount, new ExecutorPlugin.Control() { public void transaction(final Schema executorSchema, final int outputTaskCount, final ExecutorPlugin.Executor executor) { // TODO validate executorSchema state.setExecutorSchema(executorSchema); state.setTransactionStage(TransactionStage.OUTPUT_BEGIN); ConfigDiff outputConfigDiff = plugins.getOutputPlugin().resume(resume.getOutputTaskSource(), executorSchema, outputTaskCount, new OutputPlugin.Control() { public List<TaskReport> run(final TaskSource outputTask) { // TODO validate outputTask? state.setOutputTaskSource(outputTask); restoreResumedTaskReports(resume, state); state.setTransactionStage(TransactionStage.RUN); if (!state.isAllTasksCommitted()) { execute(task, executor, state); } if (!state.isAllTasksCommitted()) { throw new RuntimeException(String.format("%d input tasks and %d output tasks failed", state.countUncommittedInputTasks(), state.countUncommittedOutputTasks())); } state.setTransactionStage(TransactionStage.OUTPUT_COMMIT); return state.getAllOutputTaskReports(); } }); state.setOutputConfigDiff(outputConfigDiff); state.setTransactionStage(TransactionStage.EXECUTOR_COMMIT); } }); state.setTransactionStage(TransactionStage.FILTER_COMMIT); } }); state.setTransactionStage(TransactionStage.INPUT_COMMIT); return state.getAllInputTaskReports(); } }); state.setInputConfigDiff(inputConfigDiff); state.setTransactionStage(TransactionStage.CLEANUP); cleanupCommittedTransaction(config, state); return state.buildExecuteResult(); } catch (Throwable ex) { if (isSkippedTransaction(ex)) { ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff(); return state.buildExecuteResultOfSkippedExecution(configDiff); } else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) { // ignore the exception return state.buildExecuteResultWithWarningException(ex); } throw state.buildPartialExecuteException(ex, Exec.session()); } } private static boolean isSkippedTransaction(Throwable ex) { return ex instanceof SkipTransactionException; } private static void restoreResumedTaskReports(ResumeState resume, LoaderState state) { int inputTaskCount = resume.getInputTaskReports().size(); int outputTaskCount = resume.getOutputTaskReports().size(); state.initialize(inputTaskCount, outputTaskCount); for (int i=0; i < inputTaskCount; i++) { Optional<TaskReport> report = resume.getInputTaskReports().get(i); if (report.isPresent()) { TaskState task = state.getInputTaskState(i); task.start(); task.setTaskReport(report.get()); task.finish(); } } for (int i=0; i < outputTaskCount; i++) { Optional<TaskReport> report = resume.getOutputTaskReports().get(i); if (report.isPresent()) { TaskState task = state.getOutputTaskState(i); task.start(); task.setTaskReport(report.get()); task.finish(); } } } private void execute(BulkLoaderTask task, ExecutorPlugin.Executor executor, LoaderState state) { ProcessTask procTask = state.buildProcessTask(); executor.execute(procTask, state); if (!state.isAllTasksCommitted()) { throw state.getRepresentativeException(); } } private void cleanupCommittedTransaction(ConfigSource config, LoaderState state) { try { doCleanup(config, state.buildResumeState(Exec.session())); } catch (Exception ex) { state.getLogger().warn("Commit succeeded but cleanup failed. Ignoring this exception.", ex); // TODO } } private static Schema first(List<Schema> schemas) { return schemas.get(0); } private static Schema last(List<Schema> schemas) { return schemas.get(schemas.size() - 1); } }