/* * Copyright (c) 2013 LDBC * Linked Data Benchmark Council (http://ldbc.eu) * * This file is part of ldbc_socialnet_dbgen. * * ldbc_socialnet_dbgen is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * ldbc_socialnet_dbgen is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with ldbc_socialnet_dbgen. If not, see <http://www.gnu.org/licenses/>. * * Copyright (C) 2011 OpenLink Software <bdsmt@openlinksw.com> * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; only Version 2 of the License dated * June 1991. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ package ldbc.snb.datagen.dictionary; import ldbc.snb.datagen.generator.DatagenParams; import umontreal.iro.lecuyer.probdist.GeometricDist; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Random; import java.util.Vector; public class NamesDictionary { /** * Geometric probability used */ private static final double GEOMETRIC_RATIO = 0.2; private static final int topN = 30; PlaceDictionary locationDic; HashMap<Integer, Vector<String>> surNamesByLocations; Vector<HashMap<Integer, Vector<String>>> givenNamesByLocationsMale; // Year / Location / Names Vector<HashMap<Integer, Vector<String>>> givenNamesByLocationsFemale; GeometricDist geoDist; public NamesDictionary( PlaceDictionary locationDic ) { this.locationDic = locationDic; geoDist = new GeometricDist(GEOMETRIC_RATIO); init(); } private void init() { surNamesByLocations = new HashMap<Integer, Vector<String>>(); for (Integer id : locationDic.getCountries()) { surNamesByLocations.put(id, new Vector<String>()); } //assume that there is only 2 periods of birthyears int birthYearPeriod = 2; givenNamesByLocationsMale = new Vector<HashMap<Integer, Vector<String>>>(birthYearPeriod); givenNamesByLocationsFemale = new Vector<HashMap<Integer, Vector<String>>>(birthYearPeriod); for (int i = 0; i < birthYearPeriod; i++){ givenNamesByLocationsMale.add(new HashMap<Integer, Vector<String>>()); givenNamesByLocationsFemale.add(new HashMap<Integer, Vector<String>>()); for (Integer id : locationDic.getCountries()) { givenNamesByLocationsMale.lastElement().put(id, new Vector<String>()); givenNamesByLocationsFemale.lastElement().put(id, new Vector<String>()); } } extractSurNames(); extractGivenNames(); } public void extractSurNames() { try { BufferedReader surnameDictionary = new BufferedReader( new InputStreamReader(getClass( ).getResourceAsStream(DatagenParams.surnamDictionaryFile), "UTF-8")); String line; int totalSurNames = 0; while ((line = surnameDictionary.readLine()) != null) { String infos[] = line.split(","); String locationName = infos[1]; int locationId = locationDic.getCountryId(locationName); if( locationId != locationDic.INVALID_LOCATION ) { String surName = infos[2].trim(); surNamesByLocations.get(locationId).add(surName); totalSurNames++; } } surnameDictionary.close(); System.out.println("Done ... " + totalSurNames + " surnames were extracted "); } catch (IOException e) { e.printStackTrace(); } } public void extractGivenNames() { try { BufferedReader givennameDictionary = new BufferedReader( new InputStreamReader(getClass( ).getResourceAsStream(DatagenParams.nameDictionaryFile), "UTF-8")); String line; int totalGivenNames = 0; while ((line = givennameDictionary.readLine()) != null){ String infos[] = line.split(" "); String locationName = infos[0]; int gender = Integer.parseInt(infos[2]); int birthYearPeriod = Integer.parseInt(infos[3]); int locationId = locationDic.getCountryId(locationName); if( locationId != locationDic.INVALID_LOCATION ) { String givenName = infos[1].trim(); if (gender == 0) { givenNamesByLocationsMale.get(birthYearPeriod).get(locationId).add(givenName); } else { givenNamesByLocationsFemale.get(birthYearPeriod).get(locationId).add(givenName); } totalGivenNames++; } } givennameDictionary.close(); System.out.println("Done ... " + totalGivenNames + " given names were extracted "); } catch (IOException e) { e.printStackTrace(); } } /* * If the number of names is smaller than the computed rank * uniformly get a name from all names * Else, from 0 to (limitRank - 1) will be distributed according to * geometric distribution, out of this scope will be distribution */ private int getGeoDistRandomIdx(Random random, int numNames){ int nameIdx = -1; double prob = random.nextDouble(); int rank = geoDist.inverseFInt(prob); if (rank < topN) { if (numNames > rank) { nameIdx = rank; } else { nameIdx = random.nextInt(numNames); } } else { if (numNames > rank) { nameIdx = topN + random.nextInt(numNames - topN); } else { nameIdx = random.nextInt(numNames); } } return nameIdx; } public String getRandomSurname(Random random,int locationId) { int surNameIdx = getGeoDistRandomIdx(random,surNamesByLocations.get(locationId).size()); return surNamesByLocations.get(locationId).get(surNameIdx); } public String getRandomGivenName(Random random, int locationId, boolean isMale, int birthYear){ String name = ""; int period = (birthYear < 1985) ? 0 : 1; Vector<HashMap<Integer, Vector<String>>> target = (isMale) ? givenNamesByLocationsMale : givenNamesByLocationsFemale; // Note that, only vector of names for the first period contains list of names not in topN int nameId = getGeoDistRandomIdx(random, target.get(0).get(locationId).size()); if (nameId >= topN) { name = target.get(0).get(locationId).get(nameId); } else { name = target.get(period).get(locationId).get(nameId); } return name; } /** * return a given name which is the median of topN for a given location/gender/year * we use it for parameter generation */ public String getMedianGivenName(int locationId, boolean isMale, int birthYear){ int period = 0; Vector<HashMap<Integer, Vector<String>>> target = (isMale) ? givenNamesByLocationsMale : givenNamesByLocationsFemale; int size = target.get(period).get(locationId).size(); String name = target.get(period).get(locationId).get(size/2); return name; } }