/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.examples.java.relational.util;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Calendar;
import java.util.Random;
import org.apache.flink.examples.java.relational.WebLogAnalysis;
/**
* Data generator for the {@link WebLogAnalysis} example program.
*
*/
public class WebLogDataGenerator {

    /**
     * Main method to generate data for the {@link WebLogAnalysis} example program.
     * <p>
     * The generator creates three files, where <code>{tmp.dir}</code> is the value
     * of the <code>java.io.tmpdir</code> system property:
     * <ul>
     * <li><code>{tmp.dir}/documents</code> for the web documents
     * <li><code>{tmp.dir}/ranks</code> for the ranks of the web documents
     * <li><code>{tmp.dir}/visits</code> for the logged visits of web documents
     * </ul>
     * <p>
     * The program prints a message and exits with status 1 if fewer than two
     * arguments are given, if an argument is not an integer, or if visits are
     * requested although there are no documents they could refer to.
     *
     * @param args
     *            <ol>
     *            <li>Int: Number of web documents
     *            <li>Int: Number of visits
     *            </ol>
     */
    public static void main(String[] args) {
        // parse parameters
        if (args.length < 2) {
            System.out.println("WebLogDataGenerator <numberOfDocuments> <numberOfVisits>");
            System.exit(1);
        }

        int noDocs = 0;
        int noVisits = 0;
        try {
            noDocs = Integer.parseInt(args[0]);
            noVisits = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println("Both arguments must be integers: " + e.getMessage());
            System.out.println("WebLogDataGenerator <numberOfDocuments> <numberOfVisits>");
            System.exit(1);
        }

        // Every visit references a random existing document, so visits cannot
        // be generated without at least one document. Fail before any file is
        // written instead of crashing half-way through.
        if (noVisits > 0 && noDocs <= 0) {
            System.err.println("Cannot generate " + noVisits
                    + " visits without documents: <numberOfDocuments> must be at least 1.");
            System.exit(1);
        }

        String[] filterKWs = { "editors", "oscillations", "convection" };

        String[] words = { "Lorem", "ipsum", "dolor", "sit", "amet",
                "consectetuer", "adipiscing", "elit", "sed", "diam", "nonummy",
                "nibh", "euismod", "tincidunt", "ut", "laoreet", "dolore",
                "magna", "aliquam", "erat", "volutpat", "Ut", "wisi", "enim",
                "ad", "minim", "veniam", "quis", "nostrud", "exerci", "tation",
                "ullamcorper", "suscipit", "lobortis", "nisl", "ut", "aliquip",
                "ex", "ea", "commodo" };

        final String outPath = System.getProperty("java.io.tmpdir");

        System.out.println("Generating documents files...");
        genDocs(noDocs, filterKWs, words, outPath + "/documents");
        System.out.println("Generating ranks files...");
        genRanks(noDocs, outPath + "/ranks");
        System.out.println("Generating visits files...");
        genVisits(noVisits, noDocs, outPath + "/visits");

        System.out.println("Done!");
    }

    /**
     * Generates the file for the documents relation. Each entry is one line
     * of the following format: <br>
     * <code>URL | Content |\n</code>
     * <p>
     * The URL of the i-th entry is <code>url_i</code>. The content consists of
     * 10 to 49 space-terminated words. An {@link IOException} while writing is
     * printed to standard error and ends the generation of this file.
     *
     * @param noDocs
     *            Number of entries for the documents relation
     * @param filterKeyWords
     *            A list of keywords that should be contained
     * @param words
     *            A list of words to fill the entries
     * @param path
     *            Output path for the documents relation
     */
    private static void genDocs(int noDocs, String[] filterKeyWords, String[] words, String path) {

        Random rand = new Random(Calendar.getInstance().getTimeInMillis());

        try (FileWriter fw = new FileWriter(path)) {
            for (int i = 0; i < noDocs; i++) {

                int wordsInDoc = rand.nextInt(40) + 10;

                // URL
                StringBuilder doc = new StringBuilder("url_").append(i).append('|');

                // Content
                for (int j = 0; j < wordsInDoc; j++) {
                    if (rand.nextDouble() > 0.9) {
                        // Approx. every 10th word is a keyword
                        doc.append(filterKeyWords[rand.nextInt(filterKeyWords.length)]);
                    } else {
                        // Fills up the docs file(s) with random words
                        doc.append(words[rand.nextInt(words.length)]);
                    }
                    doc.append(' ');
                }
                doc.append("|\n");

                fw.write(doc.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Generates the file for the ranks relation. Each entry is one line of
     * the following format: <br>
     * <code>Rank | URL | Average Duration |\n</code>
     * <p>
     * The rank is a random value in [0, 99], the average duration a random
     * value in [0, 58]. An {@link IOException} while writing is printed to
     * standard error and ends the generation of this file.
     *
     * @param noDocs
     *            Number of entries in the documents relation
     * @param path
     *            Output path for the ranks relation
     */
    private static void genRanks(int noDocs, String path) {

        Random rand = new Random(Calendar.getInstance().getTimeInMillis());

        try (FileWriter fw = new FileWriter(path)) {
            for (int i = 0; i < noDocs; i++) {

                StringBuilder rank = new StringBuilder();
                // Rank
                rank.append(rand.nextInt(100)).append('|');
                // URL
                rank.append("url_").append(i).append('|');
                // Average duration (sum of two random components)
                rank.append(rand.nextInt(10) + rand.nextInt(50)).append("|\n");

                fw.write(rank.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Generates the file for the visits relation. Each entry is one line of
     * the following format:<br>
     * <code>IP Address | URL | Date (YYYY-MM-DD) | Misc. Data (e.g. User-Agent) |\n</code>
     * <p>
     * Month and day are zero-padded to two digits so that the date really has
     * the documented <code>YYYY-MM-DD</code> shape. An {@link IOException}
     * while writing is printed to standard error and ends the generation of
     * this file.
     *
     * @param noVisits
     *            Number of entries for the visits relation
     * @param noDocs
     *            Number of entries in the documents relation; must be positive
     *            if <code>noVisits</code> is positive, because every visit
     *            refers to a random document
     * @param path
     *            Output path for the visits relation
     */
    private static void genVisits(int noVisits, int noDocs, String path) {

        Random rand = new Random(Calendar.getInstance().getTimeInMillis());

        try (FileWriter fw = new FileWriter(path)) {
            for (int i = 0; i < noVisits; i++) {

                int year = 2000 + rand.nextInt(10); // year between 2000 and 2009
                int month = rand.nextInt(12) + 1; // month between 1 and 12
                int day = rand.nextInt(28) + 1; // day between 1 and 28 (valid in every month)

                // IP address
                StringBuilder visit = new StringBuilder();
                visit.append(rand.nextInt(256)).append('.')
                        .append(rand.nextInt(256)).append('.')
                        .append(rand.nextInt(256)).append('.')
                        .append(rand.nextInt(256)).append('|');
                // URL
                visit.append("url_").append(rand.nextInt(noDocs)).append('|');
                // Date (format: YYYY-MM-DD)
                visit.append(year).append('-')
                        .append(twoDigits(month)).append('-')
                        .append(twoDigits(day)).append('|');
                // Miscellaneous data, e.g. User-Agent
                visit.append("0.12|Mozilla Firefox 3.1|de|de|Nothing special|124|\n");

                fw.write(visit.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Formats a non-negative value below 100 with exactly two digits,
     * independent of the default locale.
     */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : Integer.toString(value);
    }
}