package com.github.projectflink.generators;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.util.Collector;
import java.util.Random;
/**
* Generates texts.
*/
public class Text {
private static String[] sentenceEnds = {".", "...", "?", "??", "!", "-- "};
public static void main(String[] args) throws Exception {
// set up the execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
int dop = Integer.valueOf(args[0]);
String outPath = args[1];
long finalSizeGB = Integer.valueOf(args[2]);
int numberOfFiles = dop;
if(args.length > 3) {
numberOfFiles = Integer.valueOf(args[3]);
}
final long bytesPerMapper = ((finalSizeGB * 1024 * 1024 * 1024) / numberOfFiles);
System.err.println("Generating Text data with the following properties:\n"
+ "dop="+dop+" outPath="+outPath+" finalSizeGB="+finalSizeGB+" bytesPerMapper="+bytesPerMapper+" number of files="+numberOfFiles);
DataSet<Long> empty = env.generateSequence(1, numberOfFiles);
DataSet<String> logLine = empty.flatMap(new FlatMapFunction<Long, String>() {
private static final long serialVersionUID = 1L;
@Override
public void flatMap(Long value, Collector<String> out) throws Exception {
System.err.println("got value="+value);
Random rnd = new Utils.XORShiftRandom();
StringBuffer sb = new StringBuffer();
long bytesGenerated = 0;
while(true) {
int sentenceLength = rnd.nextInt(25); // up to 16 words per sentence
for(int i = 0; i < sentenceLength; i++) {
sb.append(Utils.getFastZipfRandomWord());
sb.append(' ');
}
sb.append(sentenceEnds[rnd.nextInt(sentenceEnds.length-1)]);
final String str = sb.toString();
sb.delete(0, sb.length());
bytesGenerated += str.length();
out.collect(str);
// System.err.println("line ="+str);
if(bytesGenerated > bytesPerMapper) {
System.err.println("value="+value+" done with "+bytesGenerated);
break;
}
}
}
}).setParallelism(numberOfFiles);
logLine.writeAsText(outPath, FileSystem.WriteMode.OVERWRITE);
env.setParallelism(numberOfFiles);
env.execute("Flink Distributed Text Data Generator");
}
}