// ExampleLaunch.java example (DataCleaner-master)
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark;

import java.io.File;
import java.net.URI;

import org.apache.spark.launcher.SparkLauncher;

/**
 * Example program that copies a small test data set, configuration and job
 * definition to HDFS and then launches the job on Spark through
 * {@code ApplicationDriver}, printing the resulting exit code.
 *
 * <p>
 * Prerequisites:
 * </p>
 *
 * <ul>
 * <li>Make sure to set SPARK_HOME as either an environment variable or system
 * property. If neither is set, the hard-coded default in this class is
 * installed as a system property.</li>
 * <li>Create a /datacleaner/lib directory on HDFS and upload the following JAR
 * files to it:
 * <ul>
 * <li>DataCleaner-env-spark-[version].jar</li>
 * <li>DataCleaner-basic-analyzers-[version].jar</li>
 * <li>DataCleaner-value-distribution-[version].jar</li>
 * </ul>
 * or simply:
 * <ul>
 * <li>DataCleaner-env-spark-[version]-jar-with-dependencies.jar</li>
 * </ul>
 * This can be done with commands such as:
 *
 * <pre>
 * hadoop fs -mkdir -p /datacleaner/lib
 * hadoop fs -put /path/to/DataCleaner-env-spark-4.5.1-SNAPSHOT-jar-with-dependencies.jar /datacleaner/lib
 * </pre>
 *
 * </li>
 * </ul>
 */
public class ExampleLaunch {

    /** Name of both the environment variable and the system property consulted for the Spark home. */
    private static final String SPARK_HOME_KEY = "SPARK_HOME";

    private static final URI HDFS_NAMENODE = URI.create("hdfs://bigdatavm:9000/");
    private static final String HDFS_JAR_LOCATION = "/datacleaner/lib";
    private static final String CONFIGURATION_LOCATION = "/datacleaner/test/conf.xml";
    /** Fallback Spark installation directory, used only when SPARK_HOME is not configured at all. */
    private static final String SPARK_HOME = "C:\\dev\\spark-1.5.1-bin-hadoop2.6";
    private static final String JOB_LOCATION = "/datacleaner/test/vanilla-job.analysis.xml";
    private static final String DATA_LOCATION = "/datacleaner/test/person_names.txt";

    /**
     * Uploads the test resources to HDFS, launches the example job on Spark and
     * prints the exit code to standard output.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             if any of the HDFS or Spark launcher operations fails
     */
    public static void main(final String[] args) throws Exception {
        ensureSparkHomeConfigured();

        final ApplicationDriver launcher = new ApplicationDriver(HDFS_NAMENODE, HDFS_JAR_LOCATION);

        // copy test files to the desired location
        // NOTE(review): the trailing 'false' is presumably an overwrite flag -
        // confirm against ApplicationDriver.copyFileToHdfs
        launcher.copyFileToHdfs(new File("src/test/resources/person_names.txt"), DATA_LOCATION, false);
        launcher.copyFileToHdfs(new File("src/test/resources/conf_hdfs.xml"), CONFIGURATION_LOCATION, false);
        launcher.copyFileToHdfs(new File("src/test/resources/vanilla-job.analysis.xml"), JOB_LOCATION, false);

        final File hadoopConfDir = launcher.createTemporaryHadoopConfDir();
        // NOTE(review): the last argument is left null here; its meaning is
        // defined by ApplicationDriver.createSparkLauncher - confirm there
        final SparkLauncher sparkLauncher =
                launcher.createSparkLauncher(hadoopConfDir, CONFIGURATION_LOCATION, JOB_LOCATION, null);
        final int exitCode = launcher.launch(sparkLauncher);

        // The exit code is only reported, not propagated as this JVM's exit status.
        System.out.println("Exit code: " + exitCode);
    }

    /**
     * Installs the hard-coded {@link #SPARK_HOME} default as a system property,
     * but only when SPARK_HOME has been provided neither as an environment
     * variable nor as a system property, so that a user-supplied
     * {@code -DSPARK_HOME=...} is never overwritten.
     */
    private static void ensureSparkHomeConfigured() {
        final boolean definedInEnvironment = System.getenv(SPARK_HOME_KEY) != null;
        final boolean definedAsProperty = System.getProperty(SPARK_HOME_KEY) != null;
        if (!definedInEnvironment && !definedAsProperty) {
            System.setProperty(SPARK_HOME_KEY, SPARK_HOME);
        }
    }
}