package ldbc.snb.datagen.generator;
import ldbc.snb.datagen.dictionary.Dictionaries;
import ldbc.snb.datagen.generator.distribution.DegreeDistribution;
import ldbc.snb.datagen.generator.distribution.utils.BucketedDistribution;
import ldbc.snb.datagen.objects.Person;
import ldbc.snb.datagen.util.RandomGeneratorFarm;
import ldbc.snb.datagen.vocabulary.SN;
import org.apache.hadoop.conf.Configuration;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.GregorianCalendar;
/**
* Created by aprat on 10/7/14.
*/
public class PersonGenerator {
private DegreeDistribution degreeDistribution_ = null;
private PowerDistGenerator randomTagPowerLaw = null;
private RandomGeneratorFarm randomFarm = null;
private int nextId = 0;
public PersonGenerator( Configuration conf, String degreeDistribution ) {
try{
degreeDistribution_ = (DegreeDistribution)Class.forName(degreeDistribution).newInstance();
degreeDistribution_.initialize(conf);
} catch(ClassNotFoundException e) {
System.out.print(e.getMessage());
} catch(IllegalAccessException e) {
System.out.print(e.getMessage());
} catch(InstantiationException e) {
System.out.print(e.getMessage());
}
randomTagPowerLaw = new PowerDistGenerator( DatagenParams.minNumTagsPerUser,
DatagenParams.maxNumTagsPerUser + 1,
DatagenParams.alpha);
randomFarm = new RandomGeneratorFarm();
}
private long composeUserId(long id, long date) {
long idMask = ~(0xFFFFFFFFFFFFFFFFL << 41);
long bucket = (long) (256 * (date - Dictionaries.dates.getStartDateTime()) / (double) Dictionaries.dates.getEndDateTime());
return (bucket << 41) | ((id & idMask));
}
/** Tells if a person is a large poster or not.
*
* @param user The person to check.
* @return True if the person is a large poster. False otherwise.
*/
private boolean isUserALargePoster(Person user) {
if (Dictionaries.dates.getBirthMonth(user.birthDay()) == GregorianCalendar.JANUARY) {
return true;
}
return false;
}
private Person generateUser() {
long creationDate = Dictionaries.dates.randomPersonCreationDate(randomFarm.get(RandomGeneratorFarm.Aspect.DATE));
int countryId = Dictionaries.places.getCountryForUser(randomFarm.get(RandomGeneratorFarm.Aspect.COUNTRY));
Person person = new Person();
person.creationDate(creationDate);
person.gender((randomFarm.get(RandomGeneratorFarm.Aspect.GENDER).nextDouble() > 0.5) ? (byte) 1 : (byte) 0);
person.birthDay(Dictionaries.dates.getBirthDay(randomFarm.get(RandomGeneratorFarm.Aspect.BIRTH_DAY), creationDate));
person.browserId(Dictionaries.browsers.getRandomBrowserId(randomFarm.get(RandomGeneratorFarm.Aspect.BROWSER)));
person.countryId(countryId);
person.cityId(Dictionaries.places.getRandomCity(randomFarm.get(RandomGeneratorFarm.Aspect.CITY), countryId));
person.ipAddress(Dictionaries.ips.getRandomIPFromLocation(randomFarm.get(RandomGeneratorFarm.Aspect.IP), countryId));
person.maxNumKnows(Math.min(degreeDistribution_.nextDegree(),DatagenParams.numPersons));
person.accountId(composeUserId(nextId++, creationDate));
person.mainInterest(Dictionaries.tags.getaTagByCountry(randomFarm.get(RandomGeneratorFarm.Aspect.TAG_OTHER_COUNTRY), randomFarm.get(RandomGeneratorFarm.Aspect.TAG), person.countryId()));
short numTags = ((short) randomTagPowerLaw.getValue(randomFarm.get(RandomGeneratorFarm.Aspect.NUM_TAG)));
person.interests(Dictionaries.tagMatrix.getSetofTags(randomFarm.get(RandomGeneratorFarm.Aspect.TOPIC), randomFarm.get(RandomGeneratorFarm.Aspect.TAG_OTHER_COUNTRY), person.mainInterest(), numTags));
person.universityLocationId(Dictionaries.universities.getRandomUniversity(randomFarm, person.countryId()));
person.randomId(randomFarm.get(RandomGeneratorFarm.Aspect.RANDOM).nextInt(Integer.MAX_VALUE) % 100);
person.firstName(Dictionaries.names.getRandomGivenName(randomFarm.get(RandomGeneratorFarm.Aspect.NAME),
person.countryId(),
person.gender() == 1,
Dictionaries.dates.getBirthYear(person.birthDay())));
person.lastName(Dictionaries.names.getRandomSurname(randomFarm.get(RandomGeneratorFarm.Aspect.SURNAME), person.countryId()));
int numEmails = randomFarm.get(RandomGeneratorFarm.Aspect.EXTRA_INFO).nextInt(DatagenParams.maxEmails) + 1;
double prob = randomFarm.get(RandomGeneratorFarm.Aspect.EXTRA_INFO).nextDouble();
/*if (prob >= DatagenParams.missingRatio)*/ {
String base = person.firstName();
base = Normalizer.normalize(base, Normalizer.Form.NFD);
base = base.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
base = base.replaceAll(" ", ".");
base = base.replaceAll("[.]+", ".");
for (int i = 0; i < numEmails; i++) {
String email = base + "" + person.accountId() + "@" +
Dictionaries.emails.getRandomEmail(randomFarm.get(RandomGeneratorFarm.Aspect.TOP_EMAIL),
randomFarm.get(RandomGeneratorFarm.Aspect.EMAIL));
person.emails().add(email);
}
}
// Set class year
prob = randomFarm.get(RandomGeneratorFarm.Aspect.EXTRA_INFO).nextDouble();
if ((prob < DatagenParams.missingRatio) || person.universityLocationId() == -1) {
person.classYear(-1);
} else {
person.classYear(Dictionaries.dates.getClassYear(randomFarm.get(RandomGeneratorFarm.Aspect.DATE),
person.creationDate(),
person.birthDay()));
}
// Set company and workFrom
int numCompanies = randomFarm.get(RandomGeneratorFarm.Aspect.EXTRA_INFO).nextInt(DatagenParams.maxCompanies) + 1;
prob = randomFarm.get(RandomGeneratorFarm.Aspect.EXTRA_INFO).nextDouble();
if (prob >= DatagenParams.missingRatio) {
for (int i = 0; i < numCompanies; i++) {
long workFrom;
workFrom = Dictionaries.dates.getWorkFromYear(randomFarm.get(RandomGeneratorFarm.Aspect.DATE),
person.classYear(),
person.birthDay());
long company = Dictionaries.companies.getRandomCompany(randomFarm, person.countryId());
person.companies().put(company, workFrom);
}
}
ArrayList<Integer> userLanguages = Dictionaries.languages.getLanguages(randomFarm.get(RandomGeneratorFarm.Aspect.LANGUAGE),
person.countryId());
int internationalLang = Dictionaries.languages.getInternationlLanguage(randomFarm.get(RandomGeneratorFarm.Aspect.LANGUAGE));
if (internationalLang != -1 && userLanguages.indexOf(internationalLang) == -1) {
userLanguages.add(internationalLang);
}
person.languages().addAll(userLanguages);
// Set activity characteristics
person.isLargePoster(isLargePoster(person));
return person;
}
private boolean isLargePoster(Person p ) {
if(Dictionaries.dates.getBirthMonth(p.birthDay()) == GregorianCalendar.JANUARY) {
return true;
}
return false;
}
private void resetState(int blockId){
degreeDistribution_.reset(blockId);
randomFarm.resetRandomGenerators((long) blockId);
}
/** Generates a block of persons
*
* @param seed The seed to feed the pseudo-random number generators.
* @param blockSize The size of the block of persons to generate.
* @return
*/
public Person[] generateUserBlock( int seed, int blockSize ) {
resetState(seed);
nextId=seed*blockSize;
SN.machineId = seed;
Person[] block;
block = new Person[blockSize];
for (int j =0; j < blockSize; ++j) {
block[j] = generateUser();
// System.out.println(j);
}
return block;
}
}