/*
* Copyright (c) 2013 LDBC
* Linked Data Benchmark Council (http://ldbc.eu)
*
* This file is part of ldbc_socialnet_dbgen.
*
* ldbc_socialnet_dbgen is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ldbc_socialnet_dbgen is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ldbc_socialnet_dbgen. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) 2011 OpenLink Software <bdsmt@openlinksw.com>
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; only Version 2 of the License dated
* June 1991.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package ldbc.snb.datagen.generator;
import ldbc.snb.datagen.dictionary.Dictionaries;
import ldbc.snb.datagen.hadoop.*;
import ldbc.snb.datagen.objects.Person;
import ldbc.snb.datagen.util.ConfigParser;
import ldbc.snb.datagen.vocabulary.SN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.File;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
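/**
 * Entry point of the LDBC Social Network Benchmark data generator. Chains the
 * Hadoop jobs that generate persons, create the correlated knows edges, merge
 * and serialize the social network, and post-process the update streams and
 * substitution parameters.
 */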
public class LDBCDatagen {
    private static boolean initialized = false;
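    /**
     * One-time initialization: reads the generator parameters, loads the
     * dictionaries and instantiates the configured person similarity
     * function. Subsequent calls are no-ops.
     */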
public static synchronized void init (Configuration conf) {
if(!initialized) {
DatagenParams.readConf(conf);
Dictionaries.loadDictionaries(conf);
SN.initialize();
            try {
                Person.personSimilarity = (Person.PersonSimilarity) Class.forName(conf.get("ldbc.snb.datagen.generator.person.similarity")).newInstance();
            } catch (Exception e) {
                // Fail fast: continuing with a null similarity function would only
                // fail later with a less informative error.
                System.err.println("Error while loading person similarity class");
                System.err.println(e.getMessage());
                throw new RuntimeException(e);
            }
initialized = true;
}
}
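    /** Prints a banner around the given message to mark the start of a generation phase. */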
private void printProgress(String message) {
System.out.println("************************************************");
System.out.println("* " + message + " *");
System.out.println("************************************************");
}
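    /**
     * Runs the whole generation pipeline: person generation, three passes of
     * correlated knows-edge generation (university, interest, random), edge
     * merging, person and activity serialization, update stream sorting and,
     * optionally, substitution parameter generation.
     */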
public int runGenerateJob(Configuration conf) throws Exception {
String hadoopPrefix = conf.get("ldbc.snb.datagen.serializer.hadoopDir");
FileSystem fs = FileSystem.get(conf);
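        // Fraction of each person's knows edges to be produced by each of the
        // three correlation passes below: university, interest and random.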
ArrayList<Float> percentages = new ArrayList<Float>();
percentages.add(0.45f);
percentages.add(0.45f);
percentages.add(0.1f);
long start = System.currentTimeMillis();
printProgress("Starting: Person generation");
long startPerson = System.currentTimeMillis();
HadoopPersonGenerator personGenerator = new HadoopPersonGenerator( conf );
personGenerator.run(hadoopPrefix+"/persons", "ldbc.snb.datagen.hadoop.UniversityKeySetter");
long endPerson = System.currentTimeMillis();
printProgress("Creating university location correlated edges");
long startUniversity = System.currentTimeMillis();
HadoopKnowsGenerator knowsGenerator = new HadoopKnowsGenerator(conf,
"ldbc.snb.datagen.hadoop.UniversityKeySetter",
"ldbc.snb.datagen.hadoop.RandomKeySetter",
percentages,
0,
conf.get("ldbc.snb.datagen.generator.knowsGenerator"));
knowsGenerator.run(hadoopPrefix+"/persons",hadoopPrefix+"/universityEdges");
long endUniversity = System.currentTimeMillis();
printProgress("Creating main interest correlated edges");
long startInterest= System.currentTimeMillis();
knowsGenerator = new HadoopKnowsGenerator( conf,
"ldbc.snb.datagen.hadoop.InterestKeySetter",
"ldbc.snb.datagen.hadoop.RandomKeySetter",
percentages,
1,
conf.get("ldbc.snb.datagen.generator.knowsGenerator"));
knowsGenerator.run(hadoopPrefix+"/persons",hadoopPrefix+"/interestEdges");
long endInterest = System.currentTimeMillis();
printProgress("Creating random correlated edges");
long startRandom= System.currentTimeMillis();
knowsGenerator = new HadoopKnowsGenerator( conf,
"ldbc.snb.datagen.hadoop.RandomKeySetter",
"ldbc.snb.datagen.hadoop.RandomKeySetter",
percentages,
2,
"ldbc.snb.datagen.generator.RandomKnowsGenerator");
knowsGenerator.run(hadoopPrefix+"/persons",hadoopPrefix+"/randomEdges");
long endRandom= System.currentTimeMillis();
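        // The initial person file is no longer needed once the three edge files exist.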
        fs.delete(new Path(hadoopPrefix + "/persons"), true);
printProgress("Merging the different edge files");
ArrayList<String> edgeFileNames = new ArrayList<String>();
edgeFileNames.add(hadoopPrefix+"/universityEdges");
edgeFileNames.add(hadoopPrefix+"/interestEdges");
edgeFileNames.add(hadoopPrefix+"/randomEdges");
long startMerge = System.currentTimeMillis();
HadoopMergeFriendshipFiles merger = new HadoopMergeFriendshipFiles(conf,"ldbc.snb.datagen.hadoop.RandomKeySetter");
merger.run(hadoopPrefix+"/mergedPersons", edgeFileNames);
long endMerge = System.currentTimeMillis();
/*printProgress("Creating edges to fill the degree gap");
long startGap = System.currentTimeMillis();
knowsGenerator = new HadoopKnowsGenerator(conf,null, "ldbc.snb.datagen.hadoop.DegreeGapKeySetter", 1.0f);
knowsGenerator.run(personsFileName2,personsFileName1);
fs.delete(new Path(personsFileName2), true);
long endGap = System.currentTimeMillis();
*/
printProgress("Serializing persons");
long startPersonSerializing= System.currentTimeMillis();
        if (!conf.getBoolean("ldbc.snb.datagen.serializer.persons.sort", false)) {
HadoopPersonSerializer serializer = new HadoopPersonSerializer(conf);
serializer.run(hadoopPrefix + "/mergedPersons");
} else {
HadoopPersonSortAndSerializer serializer = new HadoopPersonSortAndSerializer(conf);
serializer.run(hadoopPrefix + "/mergedPersons");
}
long endPersonSerializing= System.currentTimeMillis();
long startPersonActivity= System.currentTimeMillis();
if(conf.getBoolean("ldbc.snb.datagen.generator.activity", true)) {
printProgress("Generating and serializing person activity");
HadoopPersonActivityGenerator activityGenerator = new HadoopPersonActivityGenerator(conf);
activityGenerator.run(hadoopPrefix+"/mergedPersons");
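            // Copy the per-reducer factor and friend-list files to the local working
            // directory; the parameter generation scripts below read them from there.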
int numThreads = DatagenParams.numThreads;
int blockSize = DatagenParams.blockSize;
int numBlocks = (int)Math.ceil(DatagenParams.numPersons / (double)blockSize);
for( int i = 0; i < numThreads; ++i ) {
if( i < numBlocks ) {
fs.copyToLocalFile(false, new Path(DatagenParams.hadoopDir + "/m" + i + "personFactors.txt"), new Path("./"));
fs.copyToLocalFile(false, new Path(DatagenParams.hadoopDir + "/m" + i + "activityFactors.txt"), new Path("./"));
fs.copyToLocalFile(false, new Path(DatagenParams.hadoopDir + "/m0friendList" + i + ".csv"), new Path("./"));
}
}
}
long endPersonActivity= System.currentTimeMillis();
long startSortingUpdateStreams= System.currentTimeMillis();
if(conf.getBoolean("ldbc.snb.datagen.serializer.updateStreams", false)) {
printProgress("Sorting update streams ");
List<String> personStreamsFileNames = new ArrayList<String>();
List<String> forumStreamsFileNames = new ArrayList<String>();
            int numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
            for (int i = 0; i < DatagenParams.numThreads; ++i) {
                for (int j = 0; j < numPartitions; ++j) {
personStreamsFileNames.add(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j);
                    if (conf.getBoolean("ldbc.snb.datagen.generator.activity", true)) {
forumStreamsFileNames.add(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j);
}
}
}
HadoopUpdateStreamSorterAndSerializer updateSorterAndSerializer = new HadoopUpdateStreamSorterAndSerializer(conf);
updateSorterAndSerializer.run(personStreamsFileNames, "person");
updateSorterAndSerializer.run(forumStreamsFileNames, "forum");
for(String file : personStreamsFileNames) {
fs.delete(new Path(file), true);
}
for(String file : forumStreamsFileNames) {
fs.delete(new Path(file), true);
}
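            // Aggregate the per-thread update stream statistics into a global
            // minimum/maximum event time and a total event count.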
long minDate = Long.MAX_VALUE;
long maxDate = Long.MIN_VALUE;
long count = 0;
for( int i = 0; i < DatagenParams.numThreads; ++i) {
Path propertiesFile = new Path(DatagenParams.hadoopDir+"/temp_updateStream_person_"+i+".properties");
FSDataInputStream file = fs.open(propertiesFile);
Properties properties = new Properties();
properties.load(file);
                minDate = Math.min(minDate, Long.parseLong(properties.getProperty("ldbc.snb.interactive.min_write_event_start_time")));
                maxDate = Math.max(maxDate, Long.parseLong(properties.getProperty("ldbc.snb.interactive.max_write_event_start_time")));
                count += Long.parseLong(properties.getProperty("ldbc.snb.interactive.num_events"));
file.close();
fs.delete(propertiesFile,true);
                if (conf.getBoolean("ldbc.snb.datagen.generator.activity", true)) {
propertiesFile = new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + ".properties");
file = fs.open(propertiesFile);
properties = new Properties();
properties.load(file);
                    minDate = Math.min(minDate, Long.parseLong(properties.getProperty("ldbc.snb.interactive.min_write_event_start_time")));
                    maxDate = Math.max(maxDate, Long.parseLong(properties.getProperty("ldbc.snb.interactive.max_write_event_start_time")));
                    count += Long.parseLong(properties.getProperty("ldbc.snb.interactive.num_events"));
file.close();
fs.delete(propertiesFile, true);
}
}
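            // Write the aggregated update stream statistics into a single
            // properties file consumed by the LDBC SNB interactive driver.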
            OutputStream output = fs.create(new Path(DatagenParams.socialNetworkDir + "/updateStream.properties"), true);
            output.write(("ldbc.snb.interactive.gct_delta_duration:" + DatagenParams.deltaTime + "\n").getBytes());
            output.write(("ldbc.snb.interactive.min_write_event_start_time:" + minDate + "\n").getBytes());
            output.write(("ldbc.snb.interactive.max_write_event_start_time:" + maxDate + "\n").getBytes());
            output.write(("ldbc.snb.interactive.update_interleave:" + (maxDate - minDate) / count + "\n").getBytes());
            output.write(("ldbc.snb.interactive.num_events:" + count).getBytes());
output.close();
}
long endSortingUpdateStreams= System.currentTimeMillis();
printProgress("Serializing invariant schema ");
long startInvariantSerializing= System.currentTimeMillis();
HadoopInvariantSerializer invariantSerializer = new HadoopInvariantSerializer(conf);
invariantSerializer.run();
long endInvariantSerializing= System.currentTimeMillis();
long end = System.currentTimeMillis();
System.out.println("Person generation time: "+((endPerson - startPerson) / 1000));
System.out.println("University correlated edge generation time: "+((endUniversity - startUniversity) / 1000));
System.out.println("Interest correlated edge generation time: "+((endInterest - startInterest) / 1000));
System.out.println("Random correlated edge generation time: "+((endRandom - startRandom) / 1000));
System.out.println("Edges merge time: "+((endMerge - startMerge) / 1000));
System.out.println("Person serialization time: "+((endPersonSerializing - startPersonSerializing) / 1000));
System.out.println("Person activity generation and serialization time: "+((endPersonActivity - startPersonActivity) / 1000));
System.out.println("Sorting update streams time: "+((endSortingUpdateStreams - startSortingUpdateStreams) / 1000));
System.out.println("Invariant schema serialization time: "+((endInvariantSerializing - startInvariantSerializing) / 1000));
System.out.println("Total Execution time: "+((end - start) / 1000));
if(conf.getBoolean("ldbc.snb.datagen.parametergenerator.parameters",false) && conf.getBoolean("ldbc.snb.datagen.generator.activity",false)) {
System.out.println("Running Parameter Generation");
System.out.println("Generating Interactive Parameters");
ProcessBuilder pb = new ProcessBuilder("mkdir", "-p",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/substitution_parameters");
pb.directory(new File("./"));
Process p = pb.start();
p.waitFor();
pb = new ProcessBuilder(conf.get("ldbc.snb.datagen.parametergenerator.python"), "paramgenerator/generateparams.py", "./",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/substitution_parameters");
pb.directory(new File("./"));
File logInteractive = new File("parameters_interactive.log");
pb.redirectErrorStream(true);
pb.redirectOutput(ProcessBuilder.Redirect.appendTo(logInteractive));
p = pb.start();
p.waitFor();
System.out.println("Generating BI Parameters");
pb = new ProcessBuilder(conf.get("ldbc.snb.datagen.parametergenerator.python"), "paramgenerator/generateparamsbi.py", "./",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/substitution_parameters");
pb.directory(new File("./"));
File logBi = new File("parameters_bi.log");
pb.redirectErrorStream(true);
pb.redirectOutput(ProcessBuilder.Redirect.appendTo(logBi));
p = pb.start();
p.waitFor();
System.out.println("Finished Parameter Generation");
}
return 0;
}
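    /**
     * Expects the path to a configuration file as the first argument, merges it
     * with the bundled defaults, wipes any previous output directories and runs
     * the generation pipeline.
     */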
    public static void main(String[] args) {
try {
            if (args.length < 1) {
                System.err.println("Usage: LDBCDatagen <configuration file>");
                System.exit(1);
            }
            Configuration conf = ConfigParser.initialize();
            ConfigParser.readConfig(conf, args[0]);
ConfigParser.readConfig(conf, LDBCDatagen.class.getResourceAsStream("/params.ini"));
conf.set("ldbc.snb.datagen.serializer.hadoopDir",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/hadoop");
conf.set("ldbc.snb.datagen.serializer.socialNetworkDir",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/social_network");
ConfigParser.printConfig(conf);
// conf.setBoolean("mapreduce.map.output.compress", true);
// conf.setBoolean("mapreduce.output.fileoutputformat.compress", false);
// Deleting existing files
FileSystem dfs = FileSystem.get(conf);
dfs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir")), true);
dfs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.socialNetworkDir")), true);
            // Initialize and run the generator
LDBCDatagen datagen = new LDBCDatagen();
LDBCDatagen.init(conf);
datagen.runGenerateJob(conf);
}catch(AssertionError e ) {
System.err.println("Error during execution");
System.err.println(e.getMessage());
e.printStackTrace();
System.exit(1);
}catch(Exception e ) {
System.err.println("Error during execution");
System.err.println(e.getMessage());
e.printStackTrace();
System.exit(1);
}
}
}