package com.github.projectflink.generators;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Random;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.CollectionEnvironment;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.core.fs.FileSystem.WriteMode;
import org.apache.flink.util.Collector;
/**
 * Flink batch job that generates synthetic web-server access-log lines and writes them as text.
 *
 * <p>Usage: {@code Logdata <dop> <outPath> <finalSizeGB>}. A sequence of {@code dop} seed values
 * is created; a flat-map expands every seed value into log lines until more than
 * {@code finalSizeGB / dop} gigabytes worth of characters have been emitted for that value.
 *
 * <p>TODO: There is something wrong with the parallelization of the generator.
 */
public class Logdata {

    /** HTTP methods a generated request line may use. */
    private static final String[] REQUEST_TYPES = {"GET", "POST", "PUT", "DELETE"};

    /** Number of bytes in one gigabyte (1024^3). */
    private static final long BYTES_PER_GB = 1024L * 1024L * 1024L;

    /**
     * Entry point of the generator job.
     *
     * @param args {@code args[0]} = degree of parallelism (integer, at least 1),
     *             {@code args[1]} = output path handed to {@code writeAsText},
     *             {@code args[2]} = total target size in gigabytes (non-negative integer)
     * @throws IllegalArgumentException if fewer than three arguments are given, if the degree of
     *                                  parallelism is below 1, or if the size is negative
     * @throws NumberFormatException    if a numeric argument cannot be parsed
     * @throws Exception                if the Flink job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            throw new IllegalArgumentException("Usage: Logdata <dop> <outPath> <finalSizeGB>");
        }

        final int dop = Integer.parseInt(args[0]);
        final String outPath = args[1];
        // Parsed as int and widened to long: an int times 1024^3 always fits into a long.
        final long finalSizeGB = Integer.parseInt(args[2]);

        if (dop < 1) {
            throw new IllegalArgumentException("dop must be at least 1 but was " + dop);
        }
        if (finalSizeGB < 0) {
            throw new IllegalArgumentException("finalSizeGB must not be negative but was " + finalSizeGB);
        }

        // set up the execution environment
        // (the original author alternatively used `new CollectionEnvironment()` here)
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        final long bytesPerMapper = (finalSizeGB * BYTES_PER_GB) / dop;

        System.err.println("Generating Log data with the following properties:\n"
                + "dop="+dop+" outPath="+outPath+" finalSizeGB="+finalSizeGB+" bytesPerMapper="+bytesPerMapper);

        // One seed element per requested parallel generator.
        DataSet<Long> empty = env.generateSequence(1, dop);
        // Debug output of the seed sequence, kept from the original implementation.
        empty.print();

        DataSet<String> logLine = empty.flatMap(new FlatMapFunction<Long, String>() {
            private static final long serialVersionUID = 1L;

            /**
             * Emits log lines for one seed value until more than {@code bytesPerMapper}
             * characters have been produced; at least one line is always emitted.
             */
            @Override
            public void flatMap(Long value, Collector<String> out) throws Exception {
                System.err.println("val = "+value);

                Random rnd = new Utils.XORShiftRandom();
                // Method-local buffer, reused for every line; no synchronization needed.
                StringBuilder sb = new StringBuilder();
                long bytesGenerated = 0;

                do {
                    appendLogLine(sb, rnd);
                    final String str = sb.toString();
                    sb.setLength(0);
                    // Counts characters only (no line terminators, no multi-byte encoding),
                    // so the resulting size is an approximation of the requested size.
                    bytesGenerated += str.length();
                    out.collect(str);
                } while (bytesGenerated <= bytesPerMapper);
            }
        }).setParallelism(dop);

        logLine.writeAsText(outPath, WriteMode.OVERWRITE);
        env.setParallelism(dop);
        env.execute("Flink Distributed Log Data Generator");
    }

    /**
     * Appends one random access-log line (without a trailing newline) to {@code sb}.
     *
     * <p>Layout: {@code <ip> - - [<date>] "<method> <path> HTTP/1.1" <user agent>}.
     *
     * @param sb  buffer the line is appended to
     * @param rnd source of randomness
     */
    private static void appendLogLine(StringBuilder sb, Random rnd) {
        // write ip: the bound of nextInt is exclusive, so 256 yields octets 0..255
        sb.append(rnd.nextInt(256)).append('.')
          .append(rnd.nextInt(256)).append('.')
          .append(rnd.nextInt(256)).append('.')
          .append(rnd.nextInt(256));
        sb.append(" - - ["); // some spaces

        // Clear the sign bit instead of Math.abs(): Math.abs(Long.MIN_VALUE) stays negative.
        sb.append(new Date(rnd.nextLong() & Long.MAX_VALUE).toString());
        sb.append("] \"");

        // Exclusive bound == array length, so every method (including the last) can be picked.
        sb.append(REQUEST_TYPES[rnd.nextInt(REQUEST_TYPES.length)]);
        sb.append(' ');

        if (rnd.nextBoolean()) {
            // access to album
            sb.append("/album.php?picture=").append(rnd.nextInt());
        } else {
            // access search: zero to seven words, each followed by '+'
            sb.append("/search.php?term=");
            int terms = rnd.nextInt(8);
            for (int i = 0; i < terms; i++) {
                sb.append(Utils.getRandomRealWord(rnd)).append('+');
            }
        }

        sb.append(" HTTP/1.1\" ").append(Utils.getRandomUA(rnd));
    }
}