package com.github.projectflink.generators; import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.java.DataSet; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.core.fs.FileSystem; import org.apache.flink.util.Collector; import java.util.Random; /** * Generates texts. */ public class Text { private static String[] sentenceEnds = {".", "...", "?", "??", "!", "-- "}; public static void main(String[] args) throws Exception { // set up the execution environment final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); int dop = Integer.valueOf(args[0]); String outPath = args[1]; long finalSizeGB = Integer.valueOf(args[2]); int numberOfFiles = dop; if(args.length > 3) { numberOfFiles = Integer.valueOf(args[3]); } final long bytesPerMapper = ((finalSizeGB * 1024 * 1024 * 1024) / numberOfFiles); System.err.println("Generating Text data with the following properties:\n" + "dop="+dop+" outPath="+outPath+" finalSizeGB="+finalSizeGB+" bytesPerMapper="+bytesPerMapper+" number of files="+numberOfFiles); DataSet<Long> empty = env.generateSequence(1, numberOfFiles); DataSet<String> logLine = empty.flatMap(new FlatMapFunction<Long, String>() { private static final long serialVersionUID = 1L; @Override public void flatMap(Long value, Collector<String> out) throws Exception { System.err.println("got value="+value); Random rnd = new Utils.XORShiftRandom(); StringBuffer sb = new StringBuffer(); long bytesGenerated = 0; while(true) { int sentenceLength = rnd.nextInt(25); // up to 16 words per sentence for(int i = 0; i < sentenceLength; i++) { sb.append(Utils.getFastZipfRandomWord()); sb.append(' '); } sb.append(sentenceEnds[rnd.nextInt(sentenceEnds.length-1)]); final String str = sb.toString(); sb.delete(0, sb.length()); bytesGenerated += str.length(); out.collect(str); // System.err.println("line ="+str); if(bytesGenerated > bytesPerMapper) { System.err.println("value="+value+" done with "+bytesGenerated); break; } } } }).setParallelism(numberOfFiles); logLine.writeAsText(outPath, FileSystem.WriteMode.OVERWRITE); env.setParallelism(numberOfFiles); env.execute("Flink Distributed Text Data Generator"); } }