package com.hadooparchitecturebook.clickstream;
import JavaSessionize.avro.LogLine;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import org.apache.avro.io.*;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.specific.SpecificRecord;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import parquet.avro.AvroParquetOutputFormat;
import parquet.avro.AvroWriteSupport;
import parquet.hadoop.ParquetOutputFormat;
import scala.Tuple2;

import java.io.File;
import java.io.IOException;
import java.io.ObjectStreamException;
import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by gshapira on 5/11/14.
*/
public final class JavaSessionize {
public static final List<String> testLines = Lists.newArrayList(
"233.19.62.103 - 16261 [15/Sep/2013:23:55:57] \"GET /code.js " +
"HTTP/1.0\" 200 3667 " +
"\"http://www.loudacre.com\" \"Loudacre Mobile Browser " +
"Sorrento F10L\"",
"16.180.70.237 - 128 [15/Sep/2013:23:59:53] \"GET /KBDOC-00031" +
".html HTTP/1.0\" 200 1388 " +
"\"http://www.loudacre.com\" \"Loudacre CSR Browser\"",
"116.180.70.237 - 128 [15/Sep/2013:23:59:53] \"GET /theme.css " +
"HTTP/1.0\" 200 5531 " +
"\"http://www.loudacre.com\" \"Loudacre CSR Browser\"",
"116.180.70.237 - 128 [15/Sep/2013:23:59:53] \"GET /theme.css " +
"HTTP/1.0\" 200 5531 "
+ "\"http://www.loudacre.com\" \"Loudacre CSR Browser\""
);
public static final Pattern apacheLogRegex = Pattern.compile(
"(\\d+.\\d+.\\d+.\\d+).*\\[(.*)\\].*GET (\\S+) \\S+ (\\d+) (\\d+)" +
" (\\S+) (.*)");
public static File temp = Files.createTempDir();
public static String outputPath = new File(temp,
"output").getAbsolutePath();
public static class SerializableLogLine extends LogLine implements
Serializable {
private void setValues(LogLine line) {
setIp(line.getIp());
setTimestamp(line.getTimestamp());
setUrl(line.getUrl());
setReferrer(line.getReferrer());
setUseragent(line.getUseragent());
setSessionid(line.getSessionid());
}
public SerializableLogLine(LogLine line) {
setValues(line);
}
private void writeObject(java.io.ObjectOutputStream out)
throws IOException {
DatumWriter<LogLine> writer = new SpecificDatumWriter<LogLine>
(LogLine.class);
Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
writer.write(this, encoder);
encoder.flush();
}
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
DatumReader<LogLine> reader =
new SpecificDatumReader<LogLine>(LogLine.class);
Decoder decoder = DecoderFactory.get().binaryDecoder(in, null);
setValues(reader.read(null, decoder));
}
private void readObjectNoData()
throws ObjectStreamException {
}
@Override
public int compareTo(SpecificRecord o) {
if (this == o) return 0;
if (o instanceof SerializableLogLine) {
SerializableLogLine that = (SerializableLogLine) o;
if (this.getTimestamp() < that.getTimestamp()) return -1;
if (this.getTimestamp() > that.getTimestamp()) return 1;
return 0;
} else {
throw new IllegalArgumentException("Can only compare two " +
"LogLines");
}
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("{").
append("ip: ").append(getIp()).
append("timeStamp: ").append(new Date(getTimestamp())).
append("url: ").append(getUrl()).
append("referrer: ").append(getReferrer()).
append("userAgent: ").append(getUseragent()).
append("session id: ").append(getSessionid()).
append("}");
return sb.toString();
}
}
// get the IP of the click event, to use as a user identified
public static String getIP(String line) {
System.out.println("Line:" + line);
System.out.println("regex:" + apacheLogRegex);
Matcher m = apacheLogRegex.matcher(line);
if (m.find())
return m.group(1);
else {
System.out.println("no match");
return "0";
}
}
// get all the relevant fields of the event
public static SerializableLogLine getFields(String line) throws
ParseException {
Matcher m = apacheLogRegex.matcher(line);
if (m.find()) {
String ip = m.group(1);
Date timeStamp = new SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss")
.parse(m.group(2));
String url = m.group(3);
String referrer = m.group(6);
String userAgent = m.group(7);
return new SerializableLogLine(new LogLine(ip,
timeStamp.getTime(), url, referrer, userAgent, 0));
} else {
System.out.println("no match");
return new SerializableLogLine(new LogLine("0",
new Date().getTime(), "", "", "", 0));
}
}
public static List<SerializableLogLine> sessionize
(List<SerializableLogLine> lines) {
List<SerializableLogLine> sessionizedLines = new
ArrayList<SerializableLogLine>(lines);
Collections.sort(sessionizedLines);
int sessionId = 0;
sessionizedLines.get(0).setSessionid(sessionId);
for (int i = 1; i < sessionizedLines.size(); i++) {
SerializableLogLine thisLine = sessionizedLines.get(i);
SerializableLogLine prevLine = sessionizedLines.get(i - 1);
if (thisLine.getTimestamp() - prevLine.getTimestamp() > 30 * 60 *
1000) {
sessionId++;
}
thisLine.setSessionid(sessionId);
}
return sessionizedLines;
}
    public static void main(String[] args) throws Exception {
        // First argument (Spark master URL) is required; the input file is
        // optional — without it the built-in testLines are processed.
        if (args.length == 0) {
            System.err.println("Usage: JavaSessionize <master> [input file]");
            System.exit(1);
        }
        System.out.println("Output:" + outputPath);

        // Pre-1.0 Spark Java API constructor: master URL, application name,
        // Spark home, and the jar containing this class (shipped to workers).
        JavaSparkContext jsc = new JavaSparkContext(args[0],
                "JavaSessionize",
                System.getenv("SPARK_HOME"),
                JavaSparkContext.jarOfClass(JavaSessionize.class));

        JavaRDD<String> dataSet = (args.length == 2) ? jsc.textFile(args[1])
                : jsc.parallelize(testLines);

        // Parse each raw line into (client IP, parsed record) pairs; the IP
        // is used as the user identifier for grouping.
        // (In this old API, map with a PairFunction yields a JavaPairRDD.)
        // @formatter:off
        JavaPairRDD<String, SerializableLogLine> parsed = dataSet.map(
                new PairFunction<String, String, SerializableLogLine>() {
        // @formatter:on
                    @Override
                    public Tuple2<String, SerializableLogLine> call(String s)
                            throws
                            Exception {
                        return new Tuple2<String, SerializableLogLine>(getIP(s),
                                getFields(s));
                    }
                });

        // This groups clicks by IP address
        JavaPairRDD<String, List<SerializableLogLine>> grouped = parsed
                .groupByKey();

        // Assign session ids within each user's list of clicks.
        JavaPairRDD<String, List<SerializableLogLine>> sessionized = grouped
                .mapValues(new Function<List<SerializableLogLine>,
                        List<SerializableLogLine>>() {
                    @Override
                    public List<SerializableLogLine> call
                            (List<SerializableLogLine> logLines)
                            throws
                            Exception {
                        return sessionize(logLines);
                    }
                });

        // Debug output: print every user's sessionized clicks.
        // NOTE(review): foreach runs on the executors, so on a real cluster
        // these lines go to the worker logs, not the driver console.
        sessionized.foreach(new VoidFunction<Tuple2<String,
                List<SerializableLogLine>>>() {
            @Override
            public void call(Tuple2<String, List<SerializableLogLine>>
                    stringListTuple2) throws Exception {
                System.out.println("IP: " + stringListTuple2._1());
                for (SerializableLogLine line : stringListTuple2._2()) {
                    System.out.println(line);
                }
            }
        });

        // right now sessionize is an RDD of pairs: <String,List<LogLine>>.
        // We want to output an RDD of <String,LogLine>
        // First, grab the Lists, then flatten them,
        // then pair them with something empty to make Hadoop happy
        // @formatter:off
        JavaRDD<List<SerializableLogLine>> nokeys = sessionized.map(
                new Function<Tuple2<String, List<SerializableLogLine>>,
                        List<SerializableLogLine>>() {
        // @formatter:on
                    @Override
                    public List<SerializableLogLine> call(Tuple2<String,
                            List<SerializableLogLine>> stringListTuple2) throws
                            Exception {
                        return stringListTuple2._2();
                    }
                });

        // Flatten List<LogLine> elements into individual LogLine elements.
        // @formatter:off
        JavaRDD<SerializableLogLine> flatLines = nokeys.flatMap(
                new FlatMapFunction<List<SerializableLogLine>,
                        SerializableLogLine>() {
        // @formatter:on
                    @Override
                    public Iterable<SerializableLogLine> call
                            (List<SerializableLogLine> serializableLogLines) throws
                            Exception {
                        return serializableLogLines;
                    }
                });

        // The Parquet Hadoop output format takes (Void, record) pairs — the
        // key is unused, so every key is null.
        JavaPairRDD<Void, SerializableLogLine> outputPairs = flatLines.map
                (new PairFunction<SerializableLogLine, Void,
                        SerializableLogLine>() {
                    @Override
                    public Tuple2<Void, SerializableLogLine> call
                            (SerializableLogLine
                                    serializableLogLine) throws Exception {
                        return new Tuple2<Void, SerializableLogLine>(null,
                                serializableLogLine);
                    }
                });

        // Configure Parquet to write Avro records using the LogLine schema.
        Job job = new Job();
        ParquetOutputFormat.setWriteSupportClass(job, AvroWriteSupport.class);
        AvroParquetOutputFormat.setSchema(job, LogLine.SCHEMA$);

        //dummy instance, because that's the only way to get the class of a
        // parameterized type
        ParquetOutputFormat<LogLine> pOutput = new
                ParquetOutputFormat<LogLine>();

        //System.out.println("job write support - " +
        // job.getConfiguration().get("parquet.write.support.class") +
        // " job schema - " + job.getConfiguration().get("parquet
        // .avro.schema")) ;

        outputPairs.saveAsNewAPIHadoopFile(outputPath, //path
                Void.class, //key class
                LogLine.class, //value class
                pOutput.getClass(), //output format class
                job.getConfiguration()); //configuration
    }
}