/*
* Copyright (c) 2013 LDBC
* Linked Data Benchmark Council (http://ldbc.eu)
*
* This file is part of ldbc_socialnet_dbgen.
*
* ldbc_socialnet_dbgen is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* ldbc_socialnet_dbgen is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with ldbc_socialnet_dbgen. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (C) 2011 OpenLink Software <bdsmt@openlinksw.com>
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; only Version 2 of the License dated
* June 1991.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package ldbc.snb.datagen.generator;
import ldbc.snb.datagen.util.ScaleFactor;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.util.TreeMap;
/**
 * Static holder for the data generator's settings.
 *
 * <p>The class groups three kinds of values:
 * <ul>
 *   <li>compile-time constants naming resource directories and dictionary files,</li>
 *   <li>tunable generator parameters, all of which are mandatory and are loaded by
 *       {@link #readConf(Configuration)} from the keys listed in {@code ParameterNames},</li>
 *   <li>run-level settings (number of persons, years, output folders, ...) which are also
 *       loaded by {@link #readConf(Configuration)}.</li>
 * </ul>
 *
 * <p>All mutable fields are plain public statics; nothing in this class synchronizes
 * access to them.
 */
public class DatagenParams {

    // Files and folders
    public static final String DICTIONARY_DIRECTORY = "/dictionaries/";
    public static final String SPARKBENCH_DIRECTORY = "/sparkbench";
    public static final String IPZONE_DIRECTORY = "/ipaddrByCountries";
    public static final String STATS_FILE = "testdata.json";
    public static final String RDF_OUTPUT_FILE = "ldbc_socialnet_dbg";
    public static final String PERSON_COUNTS_FILE = "personFactors.txt";
    public static final String ACTIVITY_FILE = "activityFactors.txt";

    // Dictionaries dataset files (all located under DICTIONARY_DIRECTORY).
    // Note: some identifiers are misspelled (browserDictonryFile, surnamDictionaryFile);
    // they are public and therefore kept as they are.
    public static final String browserDictonryFile = DICTIONARY_DIRECTORY + "browsersDic.txt";
    public static final String companiesDictionaryFile = DICTIONARY_DIRECTORY + "companiesByCountry.txt";
    public static final String countryAbbrMappingFile = DICTIONARY_DIRECTORY + "countryAbbrMapping.txt";
    public static final String popularTagByCountryFile = DICTIONARY_DIRECTORY + "popularTagByCountry.txt";
    public static final String countryDictionaryFile = DICTIONARY_DIRECTORY + "dicLocations.txt";
    public static final String tagsFile = DICTIONARY_DIRECTORY + "tags.txt";
    public static final String emailDictionaryFile = DICTIONARY_DIRECTORY + "email.txt";
    public static final String nameDictionaryFile = DICTIONARY_DIRECTORY + "givennameByCountryBirthPlace.txt.freq.full";
    public static final String universityDictionaryFile = DICTIONARY_DIRECTORY + "universities.txt";
    public static final String cityDictionaryFile = DICTIONARY_DIRECTORY + "citiesByCountry.txt";
    public static final String languageDictionaryFile = DICTIONARY_DIRECTORY + "languagesByCountry.txt";
    public static final String popularDictionaryFile = DICTIONARY_DIRECTORY + "popularPlacesByCountry.txt";
    public static final String agentFile = DICTIONARY_DIRECTORY + "smartPhonesProviders.txt";
    public static final String surnamDictionaryFile = DICTIONARY_DIRECTORY + "surnameByCountryBirthPlace.txt.freq.sort";
    public static final String tagClassFile = DICTIONARY_DIRECTORY + "tagClasses.txt";
    public static final String tagClassHierarchyFile = DICTIONARY_DIRECTORY + "tagClassHierarchy.txt";
    public static final String tagTextFile = DICTIONARY_DIRECTORY + "tagText.txt";
    public static final String tagMatrixFile = DICTIONARY_DIRECTORY + "tagMatrix.txt";
    public static final String flashmobDistFile = DICTIONARY_DIRECTORY + "flashmobDist.txt";
    public static final String fbSocialDegreeFile = DICTIONARY_DIRECTORY + "facebookBucket100.dat";

    /**
     * Configuration keys of the mandatory generator parameters.
     * {@link #toString()} returns the configuration key, not the constant's identifier.
     * Every constant listed here is checked for presence by {@code readConf}.
     */
    private enum ParameterNames {
        BASE_CORRELATED("ldbc.snb.datagen.generator.baseProbCorrelated"),
        BEST_UNIVERSTY_RATIO("ldbc.snb.datagen.generator.probTopUniv"),
        BLOCK_SIZE("ldbc.snb.datagen.generator.blockSize"),
        CELL_SIZE("ldbc.snb.datagen.generator.cellSize"),
        COMPANY_UNCORRELATED_RATIO("ldbc.snb.datagen.generator.probUnCorrelatedCompany"),
        DIFFERENT_IP_IN_TRAVEL_RATIO("ldbc.snb.datagen.generator.probDiffIPinTravelSeason"),
        DIFFERENT_IP_NOT_TRAVEL_RATIO("ldbc.snb.datagen.generator.probDiffIPnotTravelSeason"),
        ENGLISH_RATIO("ldbc.snb.datagen.generator.probEnglish"),
        FLASHMOB_TAGS_PER_MONTH("ldbc.snb.datagen.generator.flashmobTagsPerMonth"),
        FLASHMOB_TAG_DIST_EXP("ldbc.snb.datagen.generator.flashmobTagDistExp"),
        FLASHMOB_TAG_MAX_LEVEL("ldbc.snb.datagen.generator.flashmobTagMaxLevel"),
        FLASHMOB_TAG_MIN_LEVEL("ldbc.snb.datagen.generator.flashmobTagMinLevel"),
        FRIEND_REACCEPT("ldbc.snb.datagen.generator.friendReApproveRatio"),
        FRIEND_REJECT("ldbc.snb.datagen.generator.friendRejectRatio"),
        GROUP_MAX_POST_MONTH("ldbc.snb.datagen.generator.maxNumGroupPostPerMonth"),
        GROUP_MODERATOR_RATIO("ldbc.snb.datagen.generator.groupModeratorProb"),
        LARGE_COMMENT_RATIO("ldbc.snb.datagen.generator.ratioLargeComment"),
        LARGE_POST_RATIO("ldbc.snb.datagen.generator.ratioLargePost"),
        LIMIT_CORRELATED("ldbc.snb.datagen.generator.limitProCorrelated"),
        MAX_COMMENT_POST("ldbc.snb.datagen.generator.maxNumComments"),
        MAX_COMMENT_SIZE("ldbc.snb.datagen.generator.maxCommentSize"),
        MAX_COMPANIES("ldbc.snb.datagen.generator.maxCompanies"),
        MAX_EMAIL("ldbc.snb.datagen.generator.maxEmails"),
        MAX_FRIENDS("ldbc.snb.datagen.generator.maxNumFriends"),
        MAX_GROUP_MEMBERS("ldbc.snb.datagen.generator.maxNumMemberGroup"),
        MAX_LARGE_COMMENT_SIZE("ldbc.snb.datagen.generator.maxLargeCommentSize"),
        MAX_LARGE_POST_SIZE("ldbc.snb.datagen.generator.maxLargePostSize"),
        MAX_NUM_FLASHMOB_POST_PER_MONTH("ldbc.snb.datagen.generator.maxNumFlashmobPostPerMonth"),
        MAX_NUM_GROUP_FLASHMOB_POST_PER_MONTH("ldbc.snb.datagen.generator.maxNumGroupFlashmobPostPerMonth"),
        MAX_NUM_TAG_PER_FLASHMOB_POST("ldbc.snb.datagen.generator.maxNumTagPerFlashmobPost"),
        MAX_PHOTOALBUM("ldbc.snb.datagen.generator.maxNumPhotoAlbumsPerMonth"),
        MAX_PHOTO_PER_ALBUM("ldbc.snb.datagen.generator.maxNumPhotoPerAlbums"),
        MAX_POPULAR_PLACES("ldbc.snb.datagen.generator.maxNumPopularPlaces"),
        MAX_TEXT_SIZE("ldbc.snb.datagen.generator.maxTextSize"),
        MIN_COMMENT_SIZE("ldbc.snb.datagen.generator.minCommentSize"),
        MIN_FRIENDS("ldbc.snb.datagen.generator.minNumFriends"),
        MIN_LARGE_COMMENT_SIZE("ldbc.snb.datagen.generator.minLargeCommentSize"),
        MIN_LARGE_POST_SIZE("ldbc.snb.datagen.generator.minLargePostSize"),
        MIN_TEXT_SIZE("ldbc.snb.datagen.generator.minTextSize"),
        MISSING_RATIO("ldbc.snb.datagen.generator.missingRatio"),
        NUM_CELL_WINDOW("ldbc.snb.datagen.generator.numberOfCellPerWindow"),
        OTHER_BROWSER_RATIO("ldbc.snb.datagen.generator.probAnotherBrowser"),
        POPULAR_PLACE_RATIO("ldbc.snb.datagen.generator.probPopularPlaces"),
        PROB_INTEREST_FLASHMOB_TAG("ldbc.snb.datagen.generator.probInterestFlashmobTag"),
        PROB_RANDOM_PER_LEVEL("ldbc.snb.datagen.generator.probRandomPerLevel"),
        REDUCE_TEXT_RATIO("ldbc.snb.datagen.generator.ratioReduceText"),
        SECOND_LANGUAGE_RATIO("ldbc.snb.datagen.generator.probSecondLang"),
        STATUS_MISSING_RATIO("ldbc.snb.datagen.generator.missingStatusRatio"),
        STATUS_SINGLE_RATIO("ldbc.snb.datagen.generator.probSingleStatus"),
        TAG_UNCORRELATED_COUNTRY("ldbc.snb.datagen.generator.tagCountryCorrProb"),
        UNIVERSITY_UNCORRELATED_RATIO("ldbc.snb.datagen.generator.probUnCorrelatedOrganization"),
        MAX_NUM_LIKE("ldbc.snb.datagen.generator.maxNumLike"),
        UPDATE_PORTION("ldbc.snb.datagen.serializer.updatePortion"),
        USER_MAX_GROUP("ldbc.snb.datagen.generator.maxNumGroupCreatedPerUser"),
        USER_MAX_POST_MONTH("ldbc.snb.datagen.generator.maxNumPostPerMonth"),
        USER_MAX_TAGS("ldbc.snb.datagen.generator.maxNumTagsPerUser"),
        USER_MIN_TAGS("ldbc.snb.datagen.generator.minNumTagsPerUser");

        /** The configuration key this constant stands for. */
        private final String name;

        private ParameterNames(String name) {
            this.name = name;
        }

        /** Returns the configuration key (not the enum identifier). */
        @Override
        public String toString() {
            return name;
        }
    }

    // Mandatory generator parameters. The zero defaults are placeholders that
    // readConf overwrites with the configured values.
    public static double baseProbCorrelated = 0.0; // the base probability to create a correlated edge between two persons
    public static double flashmobTagDistExp = 0.0; // the flashmob tag distribution exponent
    public static double flashmobTagMaxLevel = 0.0; // the flashmob tag max activity volume level
    public static double flashmobTagMinLevel = 0.0; // the flashmob tag min activity volume level
    public static double friendReApproveRatio = 0.0;
    public static double friendRejectRatio = 0.0;
    public static double groupModeratorProb = 0.0;
    public static double limitProCorrelated = 0.0;
    public static double missingRatio = 0.0;
    public static double missingStatusRatio = 0.0;
    public static double probAnotherBrowser = 0.0;
    public static double probDiffIPinTravelSeason = 0.0; // in travel season
    public static double probDiffIPnotTravelSeason = 0.0; // not in travel season
    public static double probEnglish = 0.0;
    public static double probInterestFlashmobTag = 0.0;
    public static double probPopularPlaces = 0.0; // probability of taking a photo at popular place
    public static double probRandomPerLevel = 0.0;
    public static double probSecondLang = 0.0;
    public static double probSingleStatus = 0.0; // Status "Single" has more probability than others
    public static double probTopUniv = 0.0; // 90% users go to top university
    public static double probUnCorrelatedCompany = 0.0;
    public static double probUnCorrelatedOrganization = 0.0;
    public static double ratioLargeComment = 0.0;
    public static double ratioLargePost = 0.0;
    public static double ratioReduceText = 0.0; // 80% text has size less than 1/2 max size
    public static double tagCountryCorrProb = 0.0;
    public static double updatePortion = 0.0;
    public static int blockSize = 0;
    public static int cellSize = 0; // Number of user in one cell
    public static int flashmobTagsPerMonth = 0;
    public static int maxCommentSize = 0;
    public static int maxCompanies = 0;
    public static int maxEmails = 0;
    public static int maxLargeCommentSize = 0;
    public static int maxLargePostSize = 0;
    public static int maxNumComments = 0;
    public static int maxNumFlashmobPostPerMonth = 0;
    public static int maxNumFriends = 0;
    public static int maxNumGroupCreatedPerUser = 0;
    public static int maxNumGroupFlashmobPostPerMonth = 0;
    public static int maxNumGroupPostPerMonth = 0;
    public static int maxNumMemberGroup = 0;
    public static int maxNumLike = 0;
    public static int maxNumPhotoAlbumsPerMonth = 0;
    public static int maxNumPhotoPerAlbums = 0;
    public static int maxNumPopularPlaces = 0;
    public static int maxNumPostPerMonth = 0;
    public static int maxNumTagPerFlashmobPost = 0;
    public static int maxNumTagsPerUser = 0;
    public static int maxTextSize = 0;
    public static int minCommentSize = 0;
    public static int minLargeCommentSize = 0;
    public static int minLargePostSize = 0;
    public static int minNumFriends = 0;
    public static int minNumTagsPerUser = 0;
    public static int minTextSize = 0;
    public static int numberOfCellPerWindow = 0;

    // Fixed (non-configurable) values.
    public static final int startMonth = 1;
    public static final int startDate = 1;
    public static final int endMonth = 1;
    public static final int endDate = 1;
    public static final double alpha = 0.4;

    // Run-level settings; the initial values below stay in effect until readConf is called.
    public static String outputDir = "./";
    public static String hadoopDir = "./";
    public static String socialNetworkDir = "./";
    public static int numThreads = 1;
    public static int deltaTime = 10000;
    public static int numPersons = 10000;
    public static int startYear = 2010;
    public static int endYear = 2013;
    public static int numYears = 3;
    public static boolean updateStreams = false;
    public static boolean exportText = true;
    public static boolean compressed = false;
    public static int numPartitions = 1;
    public static int numUpdatePartitions = 1;

    /**
     * Loads every setting of this class from the given configuration.
     *
     * <p>First, all keys listed in {@code ParameterNames} are checked for presence and then
     * parsed into the corresponding static fields. Second, the run-level settings are read:
     * {@code numPersons}, {@code startYear} and {@code numYears} are parsed from their string
     * values, the remaining ones fall back to defaults when absent, and the folder fields are
     * derived from the configured output directory.
     *
     * <p>Any exception raised in either phase (missing key, unparsable number, ...) is
     * reported on the console and terminates the JVM with exit status {@code -1}; the method
     * does not propagate exceptions to the caller.
     *
     * @param conf configuration to read the settings from
     */
    public static void readConf(Configuration conf) {
        try {
            // Fail fast with a descriptive message if any mandatory key is absent.
            for (ParameterNames parameter : ParameterNames.values()) {
                if (conf.get(parameter.toString()) == null) {
                    throw new IllegalStateException("Missing " + parameter.toString() + " parameter");
                }
            }
            // Parsed as a short although the field is an int, so values above
            // Short.MAX_VALUE are rejected.
            // NOTE(review): confirm whether this narrower range is intentional.
            cellSize = Short.parseShort(conf.get(ParameterNames.CELL_SIZE.toString()));
            numberOfCellPerWindow = readInt(conf, ParameterNames.NUM_CELL_WINDOW);
            minNumFriends = readInt(conf, ParameterNames.MIN_FRIENDS);
            maxNumFriends = readInt(conf, ParameterNames.MAX_FRIENDS);
            friendRejectRatio = readDouble(conf, ParameterNames.FRIEND_REJECT);
            friendReApproveRatio = readDouble(conf, ParameterNames.FRIEND_REACCEPT);
            minNumTagsPerUser = readInt(conf, ParameterNames.USER_MIN_TAGS);
            maxNumTagsPerUser = readInt(conf, ParameterNames.USER_MAX_TAGS);
            maxNumPostPerMonth = readInt(conf, ParameterNames.USER_MAX_POST_MONTH);
            maxNumComments = readInt(conf, ParameterNames.MAX_COMMENT_POST);
            limitProCorrelated = readDouble(conf, ParameterNames.LIMIT_CORRELATED);
            baseProbCorrelated = readDouble(conf, ParameterNames.BASE_CORRELATED);
            maxEmails = readInt(conf, ParameterNames.MAX_EMAIL);
            // The next three were previously all read from the MAX_EMAIL key by mistake.
            maxCompanies = readInt(conf, ParameterNames.MAX_COMPANIES);
            probEnglish = readDouble(conf, ParameterNames.ENGLISH_RATIO);
            probSecondLang = readDouble(conf, ParameterNames.SECOND_LANGUAGE_RATIO);
            probAnotherBrowser = readDouble(conf, ParameterNames.OTHER_BROWSER_RATIO);
            minTextSize = readInt(conf, ParameterNames.MIN_TEXT_SIZE);
            maxTextSize = readInt(conf, ParameterNames.MAX_TEXT_SIZE);
            minCommentSize = readInt(conf, ParameterNames.MIN_COMMENT_SIZE);
            maxCommentSize = readInt(conf, ParameterNames.MAX_COMMENT_SIZE);
            ratioReduceText = readDouble(conf, ParameterNames.REDUCE_TEXT_RATIO);
            minLargePostSize = readInt(conf, ParameterNames.MIN_LARGE_POST_SIZE);
            maxLargePostSize = readInt(conf, ParameterNames.MAX_LARGE_POST_SIZE);
            minLargeCommentSize = readInt(conf, ParameterNames.MIN_LARGE_COMMENT_SIZE);
            maxLargeCommentSize = readInt(conf, ParameterNames.MAX_LARGE_COMMENT_SIZE);
            ratioLargePost = readDouble(conf, ParameterNames.LARGE_POST_RATIO);
            ratioLargeComment = readDouble(conf, ParameterNames.LARGE_COMMENT_RATIO);
            maxNumLike = readInt(conf, ParameterNames.MAX_NUM_LIKE);
            maxNumPhotoAlbumsPerMonth = readInt(conf, ParameterNames.MAX_PHOTOALBUM);
            maxNumPhotoPerAlbums = readInt(conf, ParameterNames.MAX_PHOTO_PER_ALBUM);
            maxNumGroupCreatedPerUser = readInt(conf, ParameterNames.USER_MAX_GROUP);
            maxNumMemberGroup = readInt(conf, ParameterNames.MAX_GROUP_MEMBERS);
            groupModeratorProb = readDouble(conf, ParameterNames.GROUP_MODERATOR_RATIO);
            maxNumGroupPostPerMonth = readInt(conf, ParameterNames.GROUP_MAX_POST_MONTH);
            missingRatio = readDouble(conf, ParameterNames.MISSING_RATIO);
            missingStatusRatio = readDouble(conf, ParameterNames.STATUS_MISSING_RATIO);
            probSingleStatus = readDouble(conf, ParameterNames.STATUS_SINGLE_RATIO);
            probDiffIPinTravelSeason = readDouble(conf, ParameterNames.DIFFERENT_IP_IN_TRAVEL_RATIO);
            probDiffIPnotTravelSeason = readDouble(conf, ParameterNames.DIFFERENT_IP_NOT_TRAVEL_RATIO);
            probUnCorrelatedCompany = readDouble(conf, ParameterNames.COMPANY_UNCORRELATED_RATIO);
            probUnCorrelatedOrganization = readDouble(conf, ParameterNames.UNIVERSITY_UNCORRELATED_RATIO);
            probTopUniv = readDouble(conf, ParameterNames.BEST_UNIVERSTY_RATIO);
            maxNumPopularPlaces = readInt(conf, ParameterNames.MAX_POPULAR_PLACES);
            probPopularPlaces = readDouble(conf, ParameterNames.POPULAR_PLACE_RATIO);
            tagCountryCorrProb = readDouble(conf, ParameterNames.TAG_UNCORRELATED_COUNTRY);
            flashmobTagsPerMonth = readInt(conf, ParameterNames.FLASHMOB_TAGS_PER_MONTH);
            probInterestFlashmobTag = readDouble(conf, ParameterNames.PROB_INTEREST_FLASHMOB_TAG);
            probRandomPerLevel = readDouble(conf, ParameterNames.PROB_RANDOM_PER_LEVEL);
            maxNumFlashmobPostPerMonth = readInt(conf, ParameterNames.MAX_NUM_FLASHMOB_POST_PER_MONTH);
            maxNumGroupFlashmobPostPerMonth = readInt(conf, ParameterNames.MAX_NUM_GROUP_FLASHMOB_POST_PER_MONTH);
            maxNumTagPerFlashmobPost = readInt(conf, ParameterNames.MAX_NUM_TAG_PER_FLASHMOB_POST);
            flashmobTagMinLevel = readDouble(conf, ParameterNames.FLASHMOB_TAG_MIN_LEVEL);
            flashmobTagMaxLevel = readDouble(conf, ParameterNames.FLASHMOB_TAG_MAX_LEVEL);
            flashmobTagDistExp = readDouble(conf, ParameterNames.FLASHMOB_TAG_DIST_EXP);
            updatePortion = readDouble(conf, ParameterNames.UPDATE_PORTION);
            blockSize = readInt(conf, ParameterNames.BLOCK_SIZE);
        } catch (Exception e) {
            System.out.println("Error reading scale factors");
            System.err.println(e.getMessage());
            System.exit(-1);
        }

        try {
            numPersons = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numPersons"));
            startYear = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.startYear"));
            numYears = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numYears"));
            endYear = startYear + numYears;
            compressed = conf.getBoolean("ldbc.snb.datagen.serializer.compressed", false);
            numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);
            updateStreams = conf.getBoolean("ldbc.snb.datagen.serializer.updateStreams", false);
            numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numPartitions", 1);
            numUpdatePartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
            deltaTime = conf.getInt("ldbc.snb.datagen.generator.deltaTime", 10000);
            outputDir = conf.get("ldbc.snb.datagen.serializer.outputDir");
            hadoopDir = outputDir + "/hadoop";
            // Insert a separator only when outputDir does not already end with one:
            // configurations with a trailing slash keep their previous result, while
            // those without no longer produce a fused name such as "outsocial_network".
            boolean endsWithSeparator = outputDir != null && outputDir.endsWith("/");
            socialNetworkDir = endsWithSeparator
                    ? outputDir + "social_network"
                    : outputDir + "/social_network";
            System.out.println(" ... Num Persons " + numPersons);
            System.out.println(" ... Start Year " + startYear);
            System.out.println(" ... Num Years " + numYears);
        } catch (Exception e) {
            System.err.println(e.getMessage());
            System.exit(-1);
        }
    }

    /** Parses the value stored under the given parameter's key as an {@code int}. */
    private static int readInt(Configuration conf, ParameterNames parameter) {
        return Integer.parseInt(conf.get(parameter.toString()));
    }

    /** Parses the value stored under the given parameter's key as a {@code double}. */
    private static double readDouble(Configuration conf, ParameterNames parameter) {
        return Double.parseDouble(conf.get(parameter.toString()));
    }
}