/*
 * Copyright (c) 2013 LDBC
 * Linked Data Benchmark Council (http://ldbc.eu)
 *
 * This file is part of ldbc_socialnet_dbgen.
 *
 * ldbc_socialnet_dbgen is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ldbc_socialnet_dbgen is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ldbc_socialnet_dbgen. If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright (C) 2011 OpenLink Software <bdsmt@openlinksw.com>
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; only Version 2 of the License dated
 * June 1991.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
package ldbc.snb.datagen.dictionary;

import ldbc.snb.datagen.generator.DatagenParams;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

/**
 * Dictionary of languages per country.
 *
 * <p>The dictionary is read once, at construction time, from the classpath resource named by
 * {@code DatagenParams.languageDictionaryFile}. Every distinct language name found in that
 * resource is assigned an integer identifier equal to its position in an internal list, in
 * order of first appearance. Those identifiers are what the query methods return.
 */
public class LanguageDictionary {

    /** Separator between the country name and the language entries of a dictionary line. */
    private static final String SEPARATOR = " ";

    /** Name under which English is looked up in the language list. */
    private static final String ISO_ENGLISH_CODE = "en";

    /** The array of languages; a language's identifier is its index in this list. */
    private final ArrayList<String> languages;

    /** The official languages by country (country id to language ids). */
    private final HashMap<Integer, ArrayList<Integer>> officialLanguagesByCountry;

    /** The languages by country (country id to language ids), official or not. */
    private final HashMap<Integer, ArrayList<Integer>> languagesByCountry;

    /** The location dictionary used to resolve country names to identifiers. */
    private final PlaceDictionary placeDictionary;

    /** The probability to speak english. */
    private final double probEnglish;

    /** The probability of speaking a second language. */
    private final double probSecondLang;

    /**
     * Creates the dictionary and immediately loads it from
     * {@code DatagenParams.languageDictionaryFile}.
     *
     * @param locationDic    The location dictionary.
     * @param probEnglish    The probability of speaking english.
     * @param probSecondLang The probability of speaking a second language.
     */
    public LanguageDictionary(PlaceDictionary locationDic, double probEnglish, double probSecondLang) {
        this.placeDictionary = locationDic;
        this.probEnglish = probEnglish;
        this.probSecondLang = probSecondLang;
        this.languages = new ArrayList<String>();
        this.officialLanguagesByCountry = new HashMap<Integer, ArrayList<Integer>>();
        this.languagesByCountry = new HashMap<Integer, ArrayList<Integer>>();
        load(DatagenParams.languageDictionaryFile);
    }

    /**
     * Loads a dictionary file.
     *
     * <p>Each line is split on {@link #SEPARATOR}: the first token is a country name and every
     * following token is a language entry. Lines whose country is unknown to the place
     * dictionary are skipped. A language entry is itself split on a single space; its first
     * piece is the language name, and an entry with exactly three pieces is additionally
     * recorded as an official language of the country.
     *
     * <p>An {@link IOException} while reading is reported with {@code printStackTrace()} and
     * otherwise ignored, leaving whatever was read so far in place.
     *
     * @param fileName The name of the dictionary file (a classpath resource).
     * @throws NullPointerException if the resource cannot be found on the classpath.
     */
    private void load(String fileName) {
        try {
            for (Integer id : placeDictionary.getCountries()) {
                officialLanguagesByCountry.put(id, new ArrayList<Integer>());
                languagesByCountry.put(id, new ArrayList<Integer>());
            }

            InputStream stream = getClass().getResourceAsStream(fileName);
            if (stream == null) {
                // Same exception type the reader constructor would raise on a null stream,
                // but with the offending resource name attached.
                throw new NullPointerException("Language dictionary resource not found: " + fileName);
            }

            BufferedReader dictionary = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            try {
                String line;
                while ((line = dictionary.readLine()) != null) {
                    String[] data = line.split(SEPARATOR);
                    // Resolve the country once per line instead of once per language entry.
                    int countryId = placeDictionary.getCountryId(data[0]);
                    if (countryId == PlaceDictionary.INVALID_LOCATION) {
                        continue;
                    }
                    ArrayList<Integer> official = officialLanguagesByCountry.get(countryId);
                    ArrayList<Integer> spoken = languagesByCountry.get(countryId);
                    for (int i = 1; i < data.length; i++) {
                        // NOTE(review): as written, SEPARATOR and this inner separator are both
                        // a single space, so an entry can never contain a space and the
                        // "length == 3" test below cannot succeed. Confirm both separators
                        // against the dictionary file format.
                        String[] languageData = data[i].split(" ");
                        int languageId = languages.indexOf(languageData[0]);
                        if (languageId == -1) {
                            languageId = languages.size();
                            languages.add(languageData[0]);
                        }
                        if (languageData.length == 3) {
                            official.add(languageId);
                        }
                        spoken.add(languageId);
                    }
                }
            } finally {
                // Release the reader even when readLine() fails part-way through.
                dictionary.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Gets the name of the language.
     *
     * @param languageId The language identifier.
     * @return The name of the language, or the empty string (after printing a diagnostic to
     *         standard output) when the identifier is out of range.
     */
    public String getLanguageName(int languageId) {
        if (languageId < 0 || languageId >= languages.size()) {
            System.out.println("Trying to acces the invalid language with id=" + languageId);
            return "";
        }
        return languages.get(languageId);
    }

    /**
     * Gets a set of random languages from a country.
     *
     * <p>The first language is drawn uniformly from the country's official languages, or from
     * all of its languages when it has no official ones. With probability
     * {@code probSecondLang} a second language is drawn uniformly from all of the country's
     * languages and added if it differs from the first. The result therefore holds one or two
     * language identifiers.
     *
     * @param random  Random number generator.
     * @param country The country to retrieve the languages from.
     * @return The set of randomly chosen languages.
     * @throws NullPointerException     if the country was not known when the dictionary was loaded.
     * @throws IllegalArgumentException if no language is registered for the country.
     */
    public ArrayList<Integer> getLanguages(Random random, int country) {
        // Look each list up once; the sequence of calls on `random` is unchanged.
        ArrayList<Integer> official = officialLanguagesByCountry.get(country);
        ArrayList<Integer> spoken = languagesByCountry.get(country);

        ArrayList<Integer> primaryPool = official.isEmpty() ? spoken : official;
        if (primaryPool.isEmpty()) {
            // Random.nextInt(0) would raise the same exception type with an opaque message.
            throw new IllegalArgumentException("No languages registered for country id=" + country);
        }

        ArrayList<Integer> langSet = new ArrayList<Integer>();
        langSet.add(primaryPool.get(random.nextInt(primaryPool.size())));

        if (random.nextDouble() < probSecondLang) {
            Integer candidate = spoken.get(random.nextInt(spoken.size()));
            if (!langSet.contains(candidate)) {
                langSet.add(candidate);
            }
        }
        return langSet;
    }

    /**
     * Gets a random language.
     *
     * @param random Random number generator.
     * @return With probability {@code probEnglish}, the identifier of the language named
     *         {@code "en"} (which is -1 if no such language was loaded); otherwise -1.
     */
    public int getInternationlLanguage(Random random) {
        if (random.nextDouble() < probEnglish) {
            return languages.indexOf(ISO_ENGLISH_CODE);
        }
        return -1;
    }
}