/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.spark;
import java.io.File;
import java.net.URI;
import org.apache.spark.launcher.SparkLauncher;
/**
* Prerequisites:
*
* <ul>
* <li>Make sure to set SPARK_HOME as either an environment variable or system
* property.</li>
* <li>Create a /datacleaner/lib directory on HDFS and upload the following JAR
* files to it:
* <ul>
* <li>DataCleaner-env-spark-[version].jar</li>
* <li>DataCleaner-basic-analyzers-[version].jar</li>
* <li>DataCleaner-value-distribution-[version].jar</li>
* </ul>
* or simply:
* <ul>
* <li>DataCleaner-env-spark-[version]-jar-with-dependencies.jar</li>
* </ul>
 * This can be done with commands such as:
*
* <pre>
* hadoop fs -mkdir /datacleaner/lib
* hadoop fs -put /path/to/DataCleaner-env-spark-4.5.1-SNAPSHOT-jar-with-dependencies.jar /datacleaner/lib
* </pre>
*
* </li>
* </ul>
*/
/**
 * Example entry point that uploads test resources to HDFS and submits a
 * DataCleaner job via the Spark launcher infrastructure.
 */
public class ExampleLaunch {
    private static final URI HDFS_NAMENODE = URI.create("hdfs://bigdatavm:9000/");
    private static final String HDFS_JAR_LOCATION = "/datacleaner/lib";
    private static final String CONFIGURATION_LOCATION = "/datacleaner/test/conf.xml";
    private static final String SPARK_HOME = "C:\\dev\\spark-1.5.1-bin-hadoop2.6";
    private static final String JOB_LOCATION = "/datacleaner/test/vanilla-job.analysis.xml";
    private static final String DATA_LOCATION = "/datacleaner/test/person_names.txt";

    public static void main(final String[] args) throws Exception {
        ensureSparkHome();

        final ApplicationDriver driver = new ApplicationDriver(HDFS_NAMENODE, HDFS_JAR_LOCATION);
        uploadTestResources(driver);

        // Build a temporary Hadoop configuration directory and submit the job.
        final File hadoopConfDir = driver.createTemporaryHadoopConfDir();
        final SparkLauncher sparkLauncher =
                driver.createSparkLauncher(hadoopConfDir, CONFIGURATION_LOCATION, JOB_LOCATION, null);
        final int exitCode = driver.launch(sparkLauncher);
        System.out.println("Exit code: " + exitCode);
    }

    /**
     * Falls back to a hard-coded SPARK_HOME system property when the
     * environment variable is not set (see the class javadoc).
     */
    private static void ensureSparkHome() {
        if (System.getenv("SPARK_HOME") == null) {
            System.setProperty("SPARK_HOME", SPARK_HOME);
        }
    }

    /**
     * Copies the sample data, configuration and job files from the local test
     * resources onto HDFS at their expected locations.
     */
    private static void uploadTestResources(final ApplicationDriver driver) {
        driver.copyFileToHdfs(new File("src/test/resources/person_names.txt"), DATA_LOCATION, false);
        driver.copyFileToHdfs(new File("src/test/resources/conf_hdfs.xml"), CONFIGURATION_LOCATION, false);
        driver.copyFileToHdfs(new File("src/test/resources/vanilla-job.analysis.xml"), JOB_LOCATION, false);
    }
}