SparkRunner.java example

Explorer
DataCleaner-master
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.UUID;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.launcher.SparkLauncher;
import org.datacleaner.spark.utils.HadoopUtils;
import org.datacleaner.util.FileFilters;

/**
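 * Prepares a job for launching on Spark: locates (or uploads) the job files
 * on the Hadoop file system and then launches the Spark process.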
 */
public class SparkRunner {
    /**
     * Callback interface used by {@link SparkRunner#runJob(ProgressListener)}
     * to report coarse-grained progress.
     */
    public interface ProgressListener {
        /**
         * Called when files are ready. This either means that they have
         * been uploaded or that they were already found.
         */
        void onJobFilesReady();

        /**
         * Called when the job has been submitted and now awaits completion.
         * This is actually a bit of a lie, as it only tells us that the
         * Spark process has been created, but there is no simple way to
         * verify progress beyond that.
         */
        void onJobSubmitted();
    }

    public static final String DATACLEANER_DIR = "/datacleaner";
    public static final String DATACLEANER_TEMP_DIR = DATACLEANER_DIR + "/temp";
    public static final String DATACLEANER_LIB_DIR = DATACLEANER_DIR + "/lib";
    public static final String DEFAULT_RESULT_PATH = "/datacleaner/results";
    public static final String RESULT_FILE_EXTENSION = FileFilters.ANALYSIS_RESULT_SER.getExtension();


    private final FileSystem _hadoopDefaultFS;
    private final ApplicationDriver _applicationDriver;
    private final File _hadoopConfigurationDirectory;
    private final String _configurationFilePath;
    private final String _jobFilePath;
    private final String _resultFilePath;

    /**
     * Creates a runner for a single job.
     *
     * @param configurationFilePath path of the configuration file; looked up
     *            on the Hadoop file system and uploaded from the local disk
     *            if it is not found there
     * @param jobFilePath path of the job file; handled like the
     *            configuration file
     * @param resultFilePath path of the result file, or {@code null} to pass
     *            no result file to the launcher. This path is only resolved,
     *            never uploaded.
     * @throws IllegalStateException if the Hadoop configuration directory or
     *             file system cannot be obtained
     * @throws IOException declared by the signature; note that I/O failures
     *             during the Hadoop setup are rethrown as
     *             {@link IllegalStateException} instead
     */
    public SparkRunner(final String configurationFilePath, final String jobFilePath, final String resultFilePath)
            throws IOException {
        _configurationFilePath = configurationFilePath;
        _jobFilePath = jobFilePath;
        _resultFilePath = resultFilePath;
        try {
            _hadoopConfigurationDirectory = HadoopUtils.getHadoopConfigurationDirectoryToUse();
            _hadoopDefaultFS = HadoopUtils.getFileSystem();
        } catch (final IOException e) {
            throw new IllegalStateException("Could not create Hadoop filesystem", e);
        }
        _applicationDriver = new ApplicationDriver(_hadoopDefaultFS.getUri(), DATACLEANER_LIB_DIR);
    }

    /**
     * Tries to find a file on HDFS, and optionally uploads it if not found.
     *
     * @param filePath Path of the file to find
     * @param upload true if file upload should be attempted.
     * @return Either the resolved URI, or the URI of the uploaded file. When
     *         {@code upload} is false the resolved URI is returned without
     *         checking that it exists, and it is {@code null} if the path
     *         could not be resolved against the file system URI.
     * @throws IllegalArgumentException if an upload is needed but
     *             {@code filePath} is not an existing local file, or if
     *             accessing the file system fails with an I/O error
     */
    private URI findFile(final String filePath, final boolean upload) {
        try {
            URI fileURI;
            try {
                fileURI = _hadoopDefaultFS.getUri().resolve(filePath);
            } catch (final Exception ignored) {
                // The path cannot be expressed as a URI relative to the file
                // system (for instance a native local path). Treat it as
                // "not found on HDFS" and fall through to the upload branch.
                fileURI = null;
            }

            final boolean foundOnHdfs = fileURI != null && _hadoopDefaultFS.exists(new Path(fileURI));
            if (foundOnHdfs || !upload) {
                return fileURI;
            }

            final File file = new File(filePath);
            if (!file.isFile()) {
                throw new IllegalArgumentException("'" + filePath + "' does not exist, or is not a file");
            }

            // The uploaded copy gets a random UUID prefix in front of the
            // original file name inside the temp directory.
            final String fileName = file.toPath().getFileName().toString();
            return _applicationDriver
                    .copyFileToHdfs(file, DATACLEANER_TEMP_DIR + "/" + UUID.randomUUID() + fileName);
        } catch (final IOException e) {
            // Keep the original exception as the cause so the underlying
            // file system failure is not lost.
            throw new IllegalArgumentException("Path '" + filePath + "' is not a proper file path", e);
        }
    }

    /**
     * Runs the job without progress notifications.
     *
     * @return the value returned by {@link #runJob(ProgressListener)}
     * @throws Exception if preparing the files, launching the process or
     *             waiting for it fails
     */
    public int runJob() throws Exception {
        return runJob(null);
    }

    /**
     * Locates (uploading if necessary) the configuration and job files,
     * launches the Spark process and blocks until it terminates.
     *
     * @param listener optional listener notified when the files are ready
     *            and when the process has been created; may be {@code null}
     * @return the result of waiting for the launched process (its exit value)
     * @throws Exception if preparing the files, launching the process or
     *             waiting for it fails
     */
    public int runJob(final ProgressListener listener) throws Exception {
        final URI configurationFile = findFile(_configurationFilePath, true);
        final URI jobFile = findFile(_jobFilePath, true);
        // The result file is produced by the job, so it is only resolved,
        // never uploaded.
        final URI resultFile = _resultFilePath == null ? null : findFile(_resultFilePath, false);

        if (listener != null) {
            listener.onJobFilesReady();
        }

        final SparkLauncher sparkLauncher = _applicationDriver
                .createSparkLauncher(_hadoopConfigurationDirectory, configurationFile, jobFile, resultFile);
        final Process launchProcess = _applicationDriver.launchProcess(sparkLauncher);

        if (listener != null) {
            listener.onJobSubmitted();
        }

        return launchProcess.waitFor();
    }
}