package org.datavec.transform.analysis; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.datavec.api.records.reader.RecordReader; import org.datavec.api.records.reader.impl.csv.CSVRecordReader; import org.datavec.api.transform.analysis.DataAnalysis; import org.datavec.api.transform.analysis.columns.DoubleAnalysis; import org.datavec.api.transform.schema.Schema; import org.datavec.api.transform.ui.HtmlAnalysis; import org.datavec.api.util.ClassPathResource; import org.datavec.api.writable.Writable; import org.datavec.spark.transform.AnalyzeSpark; import org.datavec.spark.transform.misc.StringToWritablesFunction; import org.datavec.spark.transform.utils.SparkUtils; import java.io.File; import java.util.List; /** * Conduct and export some basic analysis on the Iris dataset, as a stand-alone .html file. * * This functionality is still fairly basic, but can still be useful for analysis and debugging. * * @author Alex Black */ public class IrisAnalysis { public static void main(String[] args) throws Exception { Schema schema = new Schema.Builder() .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width") .addColumnInteger("Species") .build(); SparkConf conf = new SparkConf(); conf.setMaster("local[*]"); conf.setAppName("DataVec Example"); JavaSparkContext sc = new JavaSparkContext(conf); String directory = new ClassPathResource("IrisData/iris.txt").getFile().getParent(); //Normally just define your directory like "file:/..." or "hdfs:/..." JavaRDD<String> stringData = sc.textFile(directory); //We first need to parse this comma-delimited (CSV) format; we can do this using CSVRecordReader: RecordReader rr = new CSVRecordReader(); JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr)); int maxHistogramBuckets = 10; DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets); System.out.println(dataAnalysis); //We can get statistics on a per-column basis: DoubleAnalysis da = (DoubleAnalysis)dataAnalysis.getColumnAnalysis("Sepal length"); double minValue = da.getMin(); double maxValue = da.getMax(); double mean = da.getMean(); HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html")); //To write to HDFS instead: // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis); // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc); } }