package com.thinkbiganalytics.spark.service;
/*-
* #%L
* thinkbig-spark-shell-client-app
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.dataprofiler.Profiler;
import com.thinkbiganalytics.spark.metadata.TransformJob;
import com.thinkbiganalytics.spark.metadata.TransformScript;
import com.thinkbiganalytics.spark.repl.SparkScriptEngine;
import com.thinkbiganalytics.spark.rest.model.TransformRequest;
import com.thinkbiganalytics.spark.rest.model.TransformResponse;
import com.thinkbiganalytics.spark.shell.DatasourceProvider;
import com.thinkbiganalytics.spark.shell.DatasourceProviderFactory;
import org.apache.commons.lang3.StringEscapeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.script.ScriptException;
import scala.Option;
import scala.tools.nsc.interpreter.NamedParam;
import scala.tools.nsc.interpreter.NamedParamClass;
/**
 * A scheduled service that manages cached results of Spark jobs.
 *
 * <p>Transformation requests are compiled into Scala scripts (see {@link #toScript(TransformRequest)}),
 * evaluated by the {@link SparkScriptEngine}, and registered with a {@link TransformJobTracker} so that
 * callers can poll for results that do not complete immediately.</p>
 */
@Component
public class TransformService {

    private static final Logger log = LoggerFactory.getLogger(TransformService.class);

    /**
     * Maximum time to wait for a job to finish before returning a {@code PENDING} response.
     */
    private static final long POLL_TIMEOUT_MILLIS = 500;

    /**
     * Data source provider factory. Optional; when absent, requests that reference data sources fail.
     */
    @Nullable
    private DatasourceProviderFactory datasourceProviderFactory;

    /**
     * Script execution engine used to evaluate the generated Scala scripts.
     */
    @Nonnull
    private final SparkScriptEngine engine;

    /**
     * Profiler for column statistics. Optional; bound into the script even when {@code null}.
     */
    @Nullable
    private Profiler profiler;

    /**
     * Provides access to the Spark context.
     */
    @Nonnull
    private final SparkContextService sparkContextService;

    /**
     * Job tracker for transformations.
     */
    @Nonnull
    private final TransformJobTracker tracker;

    /**
     * Class for the transform script. Used as the parent class of every generated Scala script.
     */
    @Nonnull
    private final Class<? extends TransformScript> transformScriptClass;

    /**
     * Constructs a {@code TransformService} using the specified engine to execute scripts.
     *
     * @param transformScriptClass the parent class for Scala transform scripts
     * @param engine               the script engine
     * @param sparkContextService  the Spark context service
     * @param tracker              job tracker for transformations
     */
    public TransformService(@Nonnull final Class<? extends TransformScript> transformScriptClass, @Nonnull final SparkScriptEngine engine,
                            @Nonnull final SparkContextService sparkContextService, @Nonnull final TransformJobTracker tracker) {
        this.transformScriptClass = transformScriptClass;
        this.engine = engine;
        this.sparkContextService = sparkContextService;
        this.tracker = tracker;
    }

    /**
     * Gets the data source provider factory.
     *
     * @return the data source provider factory, or {@code null} if none has been set
     */
    @Nullable
    @SuppressWarnings("unused")
    public DatasourceProviderFactory getDatasourceProviderFactory() {
        return datasourceProviderFactory;
    }

    /**
     * Sets the data source provider factory.
     *
     * @param datasourceProviderFactory the data source provider factory
     */
    public void setDatasourceProviderFactory(@Nullable final DatasourceProviderFactory datasourceProviderFactory) {
        this.datasourceProviderFactory = datasourceProviderFactory;
    }

    /**
     * Executes the specified transformation and returns the name of the Hive table containing the results.
     *
     * <p>The job is given {@value #POLL_TIMEOUT_MILLIS} ms to complete. If it finishes in time the final
     * response is returned and the job is removed from the tracker; otherwise a {@code PENDING} response
     * referencing the destination table is returned and the caller is expected to poll via the tracker.</p>
     *
     * @param request the transformation request
     * @return the Hive table containing the results
     * @throws IllegalStateException if the script does not produce a {@link Callable}
     * @throws ScriptException       if the script cannot be executed
     */
    @Nonnull
    public TransformResponse execute(@Nonnull final TransformRequest request) throws ScriptException {
        log.trace("entry params({})", request);

        // Generate a unique destination table name for the results
        final String table = newTableName();

        // Build the variable bindings that the generated Scala script expects to find in scope
        final List<NamedParam> bindings = new ArrayList<>();
        bindings.add(new NamedParamClass("profiler", Profiler.class.getName(), profiler));
        bindings.add(new NamedParamClass("sparkContextService", SparkContextService.class.getName(), sparkContextService));
        bindings.add(new NamedParamClass("tableName", "String", table));

        if (request.getDatasources() != null && !request.getDatasources().isEmpty()) {
            if (datasourceProviderFactory != null) {
                final DatasourceProvider datasourceProvider = datasourceProviderFactory.getDatasourceProvider(request.getDatasources());
                bindings.add(new NamedParamClass("datasourceProvider", DatasourceProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", datasourceProvider));
            } else {
                final ScriptException e = new ScriptException("Script cannot be executed because no data source provider factory is available.");
                // Pass the exception as the last argument so SLF4J logs the stack trace
                log.error("Throwing {}", e.toString(), e);
                throw e;
            }
        }

        // Evaluate the generated script; it is expected to return a Callable that produces the response
        final Object result = this.engine.eval(toScript(request), bindings);

        final TransformJob job;
        if (result instanceof Callable) {
            @SuppressWarnings("unchecked") final Callable<TransformResponse> callable = (Callable<TransformResponse>) result;
            job = new TransformJob(table, callable, engine.getSparkContext());
            tracker.submitJob(job);
        } else {
            final IllegalStateException e = new IllegalStateException("Unexpected script result type: " + (result != null ? result.getClass() : null));
            log.error("Throwing {}", e.toString(), e);
            throw e;
        }

        // Wait briefly for the job; on timeout report PENDING so the caller can poll for the result later
        TransformResponse response;
        try {
            response = job.get(POLL_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
            tracker.removeJob(table);
        } catch (final ExecutionException cause) {
            final ScriptException e = new ScriptException(cause);
            log.error("Throwing {}", e.toString(), e);
            throw e;
        } catch (final InterruptedException | TimeoutException e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers up the stack can observe the interruption
                Thread.currentThread().interrupt();
            }
            log.trace("Timeout waiting for script result", e);
            response = new TransformResponse();
            response.setProgress(0.0);
            response.setStatus(TransformResponse.Status.PENDING);
            response.setTable(table);
        }

        log.trace("exit with({})", response);
        return response;
    }

    /**
     * Gets the transformation job with the specified id.
     *
     * <p>Completed jobs are removed from the tracker when fetched, so a finished job can only be
     * retrieved once.</p>
     *
     * @param id the table with the results
     * @return the transformation job
     * @throws IllegalArgumentException if a job with the id does not exist
     */
    @Nonnull
    public TransformJob getJob(@Nonnull final String id) {
        final Option<TransformJob> job = tracker.getJob(id);
        if (!job.isDefined()) {
            // Include the id so callers can tell which lookup failed
            throw new IllegalArgumentException("A transformation job with the specified id does not exist: " + id);
        }
        if (job.get().isDone()) {
            tracker.removeJob(id);
        }
        return job.get();
    }

    /**
     * Gets the profiler for column statistics.
     *
     * @return the profiler, or {@code null} if none has been set
     */
    @Nullable
    @SuppressWarnings("unused")
    public Profiler getProfiler() {
        return profiler;
    }

    /**
     * Sets the profiler for column statistics.
     *
     * @param profiler the profiler
     */
    public void setProfiler(@Nullable final Profiler profiler) {
        this.profiler = profiler;
    }

    /**
     * Converts the specified transformation request to a Scala script that can be executed by the script engine.
     *
     * <p>The generated script declares an anonymous subclass of {@link #transformScriptClass}, overriding
     * {@code dataFrame} with the request's script and, when a parent request is present, {@code parentDataFrame}
     * and {@code parentTable}, then instantiates it and calls {@code run()}.</p>
     *
     * @param request the transformation request
     * @return the Scala script
     */
    @Nonnull
    String toScript(@Nonnull final TransformRequest request) {
        final StringBuilder script = new StringBuilder();
        script.append(
            "class Transform (destination: String, profiler: com.thinkbiganalytics.spark.dataprofiler.Profiler, sqlContext: org.apache.spark.sql.SQLContext, sparkContextService: com.thinkbiganalytics.spark.SparkContextService) extends ");
        script.append(transformScriptClass.getName());
        script.append("(destination, profiler, sqlContext, sparkContextService) {\n");
        script.append("override def dataFrame: org.apache.spark.sql.DataFrame = {");
        script.append(request.getScript());
        script.append("}\n");
        if (request.getParent() != null) {
            script.append("override def parentDataFrame: org.apache.spark.sql.DataFrame = {");
            script.append(request.getParent().getScript());
            script.append("}\n");
            script.append("override def parentTable: String = {\"");
            // Escape so the table name can be embedded safely inside a Scala string literal
            script.append(StringEscapeUtils.escapeJava(request.getParent().getTable()));
            script.append("\"}\n");
        }
        script.append("}\n");
        script.append("new Transform(tableName, profiler, sqlContext, sparkContextService).run()\n");
        return script.toString();
    }

    /**
     * Generates a new, unique table name.
     *
     * <p>Retries until the random UUID begins with a letter — presumably so the name is a valid
     * identifier for the target table store (must not start with a digit) — TODO confirm.</p>
     *
     * @return the table name
     * @throws IllegalStateException if a table name cannot be generated
     */
    private String newTableName() {
        for (int i = 0; i < 100; ++i) {
            final String name = UUID.randomUUID().toString();
            if (name.matches("^[a-fA-F].*")) {
                return name.replace("-", "");
            }
        }
        throw new IllegalStateException("Unable to generate a new table name");
    }
}