package org.datavec.transform.logdata;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.regex.RegexLineRecordReader;
import org.datavec.api.transform.ReduceOp;
import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.analysis.DataAnalysis;
import org.datavec.api.transform.condition.ConditionOp;
import org.datavec.api.transform.condition.column.LongColumnCondition;
import org.datavec.api.transform.condition.string.StringRegexColumnCondition;
import org.datavec.api.transform.filter.ConditionFilter;
import org.datavec.api.transform.quality.DataQualityAnalysis;
import org.datavec.api.transform.reduce.Reducer;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.AnalyzeSpark;
import org.datavec.spark.transform.SparkTransformExecutor;
import org.datavec.spark.transform.misc.StringToWritablesFunction;
import org.joda.time.DateTimeZone;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
* A simple example that performs some preprocessing/aggregation operations on web log data using DataVec.
* Specifically, it will:
* - Load the data
* - Perform a data quality analysis
* - Perform basic data cleaning and preprocessing
* - Group records by host, and calculate aggregate values for each host (such as the number of requests and the total number of bytes)
* - Analyze the resulting data, and print some results
*
* Data is automatically downloaded from: http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html
*
* Example log lines:
* 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
* unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985
*
* @author Alex Black
*/
public class LogDataExample {
/** Data URL for downloading */
public static final String DATA_URL = "ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz";
/** Location to save and extract the training/testing data */
public static final String DATA_PATH = FilenameUtils.concat(System.getProperty("java.io.tmpdir"), "datavec_log_example/");
public static final String EXTRACTED_PATH = FilenameUtils.concat(DATA_PATH, "data");
public static void main(String[] args) throws Exception {
//Setup
downloadData();
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Log Data Example");
JavaSparkContext sc = new JavaSparkContext(conf);
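//Note: "local[*]" runs Spark locally, with as many worker threads as the machine has cores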
//=====================================================================
// Step 1: Define the input data schema
//=====================================================================
//First: let's specify a schema for the data. This is based on the information from: http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html
Schema schema = new Schema.Builder()
.addColumnString("host")
.addColumnString("timestamp")
.addColumnString("request")
.addColumnInteger("httpReplyCode")
.addColumnInteger("replyBytes")
.build();
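//Optional sanity check: printing the Schema gives a readable summary of the column names and types defined above
System.out.println("----- Input Schema -----");
System.out.println(schema);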
//=====================================================================
// Step 2: Clean Invalid Lines
//=====================================================================
//Second: let's load the data. Initially as Strings
JavaRDD<String> logLines = sc.textFile(EXTRACTED_PATH);
//This data unfortunately contains a small number of invalid lines. We'll remove them using standard Spark functionality
//Define the regex for the expected log format once; it is reused for parsing in Step 3
final String regex = "(\\S+) - - \\[(\\S+ -\\d{4})\\] \"(.+)\" (\\d+) (\\d+|-)";
logLines = logLines.filter(new Function<String, Boolean>() {
@Override
public Boolean call(String s) throws Exception {
return s.matches(regex);    //Keep only lines that match the expected format
}
});
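//Optional sanity check: count the surviving lines. Left commented out, as it triggers a full Spark job over the data set
//long validLines = logLines.count();
//System.out.println("Number of valid log lines: " + validLines);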
//=====================================================================
// Step 3: Parse Raw Data and Perform Initial Analysis
//=====================================================================
//To parse the lines, we'll use a RegexLineRecordReader. This requires a regex for the format; we reuse the one defined in Step 2
RecordReader rr = new RegexLineRecordReader(regex, 0);
JavaRDD<List<Writable>> parsed = logLines.map(new StringToWritablesFunction(rr));
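//Each record is now a List<Writable> with one value per column. For the first example line in the javadoc above, this gives:
//[199.72.81.55, 01/Jul/1995:00:00:01 -0400, GET /history/apollo/ HTTP/1.0, 200, 6245]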
//Now, let's check the quality, so we know if there's anything we need to clean up first...
DataQualityAnalysis dqa = AnalyzeSpark.analyzeQuality(schema, parsed);
System.out.println("----- Data Quality -----");
System.out.println(dqa); //One issue: non-integer values in "replyBytes" column
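//Note: the regex above deliberately accepts "-" (no reply bytes) for the last field, so some non-integer values are expected in this column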
//=====================================================================
// Step 4: Perform Cleaning, Parsing and Aggregation
//=====================================================================
//Let's specify the transforms we want to do
TransformProcess tp = new TransformProcess.Builder(schema)
//First: clean up the "replyBytes" column by replacing any non-integer entries with the value 0
.conditionalReplaceValueTransform("replyBytes",new IntWritable(0), new StringRegexColumnCondition("replyBytes","\\D+"))
//Second: let's parse the date/time string:
.stringToTimeTransform("timestamp","dd/MMM/YYYY:HH:mm:ss Z", DateTimeZone.forOffsetHours(-4))
//Group by host and work out summary metrics
.reduce(new Reducer.Builder(ReduceOp.CountUnique)
.keyColumns("host") //keyColumns == columns to group by
.countColumns("timestamp") //Count the number of values
.countUniqueColumns("request", "httpReplyCode") //Count the number of unique requests and http reply codes
.sumColumns("replyBytes") //Sum the values in the replyBytes column
.build())
.renameColumn("count", "numRequests")
//Finally, let's filter out all hosts that requested less than 1 million bytes in total
.filter(new ConditionFilter(new LongColumnCondition("sum(replyBytes)", ConditionOp.LessThan, 1000000)))
.build();
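//Side note (a minimal sketch): a TransformProcess can also be serialized to JSON and recreated later, for example:
//String asJson = tp.toJson();
//TransformProcess fromJson = TransformProcess.fromJson(asJson);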
JavaRDD<List<Writable>> processed = SparkTransformExecutor.execute(parsed, tp);
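//Cache the processed data: it is used several times below (count, take, analyze), so caching avoids re-running the whole pipeline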
processed.cache();
//=====================================================================
// Step 5: Perform Analysis on Final Data; Display Results
//=====================================================================
Schema finalDataSchema = tp.getFinalSchema();
long finalDataCount = processed.count();
List<List<Writable>> sample = processed.take(10);
DataAnalysis analysis = AnalyzeSpark.analyze(finalDataSchema, processed);
sc.stop();
Thread.sleep(4000); //Give spark some time to shut down (and stop spamming console)
System.out.println("----- Final Data Schema -----");
System.out.println(finalDataSchema);
System.out.println("\n\nFinal data count: " + finalDataCount);
System.out.println("\n\n----- Samples of final data -----");
for(List<Writable> l : sample){
System.out.println(l);
}
System.out.println("\n\n----- Analysis -----");
System.out.println(analysis);
}
private static void downloadData() throws Exception {
//Create directory if required
File directory = new File(DATA_PATH);
if(!directory.exists()) directory.mkdirs();
//Download file:
String archivePath = DATA_PATH + "NASA_access_log_Jul95.gz";
File archiveFile = new File(archivePath);
File extractedFile = new File(EXTRACTED_PATH,"access_log_July95.txt");
new File(extractedFile.getParent()).mkdirs();
if( !archiveFile.exists() ){
System.out.println("Starting data download (20MB)...");
FileUtils.copyURLToFile(new URL(DATA_URL), archiveFile);
System.out.println("Data (.tar.gz file) downloaded to " + archiveFile.getAbsolutePath());
//Extract tar.gz file to output directory
extractGzip(archivePath, extractedFile.getAbsolutePath());
} else {
//Archive (.gz) already exists; check whether the data has also been extracted
System.out.println("Data (.gz file) already exists at " + archiveFile.getAbsolutePath());
if( !extractedFile.exists()){
//Extract .gz file to output directory
extractGzip(archivePath, extractedFile.getAbsolutePath());
} else {
System.out.println("Data (extracted) already exists at " + extractedFile.getAbsolutePath());
}
}
}
private static final int BUFFER_SIZE = 4096;
private static void extractGzip(String filePath, String outputPath) throws IOException {
System.out.println("Extracting files...");
byte[] buffer = new byte[BUFFER_SIZE];
//Use try-with-resources so both streams are closed even if an exception is thrown,
//and let any IOException propagate to the caller instead of being swallowed
try (GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(filePath));
FileOutputStream out = new FileOutputStream(outputPath)) {
int len;
while ((len = gzis.read(buffer)) > 0) {
out.write(buffer, 0, len);
}
}
System.out.println("Done");
}
}