package org.wikibrain.core.lang; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.typesafe.config.Config; import gnu.trove.set.TByteSet; import gnu.trove.set.hash.TByteHashSet; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.wikibrain.conf.Configuration; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.FileMatcher; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.MetaInfoDao; import org.wikibrain.core.model.LocalPage; import java.util.*; /** * Author: bjhecht, shilad */ public class LanguageSet implements Iterable<Language> { public static final LanguageSet ALL = new LanguageSet( Language.getByLangCode("en"), Arrays.asList(Language.LANGUAGES)); private Set<Language> langs; private Language defaultLanguage; /** * Creates the empty language set */ public LanguageSet() { this(null, new ArrayList<Language>()); } /** * Initializes a new instance of a LanguageSet using a comma-separated list of * language codes (as defined by the Wikimedia Foundation). For instance, * "en,de,fr" will result in a LanguageSet with English, German, and French. * @param csv A list of language codes separated by commas. The first language * is automatically assumed to be the default language. */ public LanguageSet(String csv) { this(Arrays.asList(csv.split(","))); } public LanguageSet(List<String> langCodes) { this(getLangsFromCodes(langCodes)); } /** * Creates an instance of a language set with defaultLang as the default language and * inputLangs as the set of languages. * @param defaultLang * @param inputLangs */ public LanguageSet(Language defaultLang, Collection<Language> inputLangs) { if (defaultLang != null && !inputLangs.contains(defaultLang)) { throw new IllegalArgumentException("Attempted to initiate a LanguageSet with a default language" + " that is not in the input collection of languages"); } this.langs = Sets.newHashSet(); this.langs.addAll(inputLangs); this.defaultLanguage = defaultLang; } /** * Creates a LanguageSet instance with an undefined default language * @param inputLangs */ public LanguageSet(Collection<Language> inputLangs) { this(getDefault(inputLangs), inputLangs); } /** * Creates a LanguageSet instance with a single language * @param inputLang */ public LanguageSet(Language inputLang) { this(inputLang, Arrays.asList(inputLang)); } private static Language getDefault(Collection<Language> inputLangs) { if (inputLangs.isEmpty()) { return null; } List<Language> temp = new ArrayList<Language>(inputLangs); Collections.sort(temp); return temp.iterator().next(); } /** * Sets the default language. * @param newDefaultLanguage * @throws WikiBrainException If the input default language is not in the language set. */ public void setDefaultLanguage(Language newDefaultLanguage) throws WikiBrainException { if (!langs.contains(newDefaultLanguage)) { throw new WikiBrainException(String.format("Attempted to make %s a default language, " + "but it is not in the language set: %s", newDefaultLanguage.getLangCode(), this.toString())); } this.defaultLanguage = newDefaultLanguage; } public Language getDefaultLanguage() { return defaultLanguage; } public Set<Language> getLanguages() { return Collections.unmodifiableSet(langs); } public int size() { return langs.size(); } public boolean containsLanguage(Language language){ return langs.contains(language); } public boolean containsLanguage(String langCode){ return langs.contains(Language.getByLangCode(langCode)); } public String getLangCodeString() { List<String> output = Lists.newArrayList(); for (Language lang : langs) { if (lang.equals(defaultLanguage)) { output.add(lang.getLangCode().toUpperCase()); } else { output.add(lang.getLangCode()); } } Collections.sort(output); return StringUtils.join(output, ","); } public List<String> getLangCodes() { List<String> output = Lists.newArrayList(); for (Language lang : langs) { output.add(lang.getLangCode()); } return output; } public byte[] toByteArray() { TByteSet byteSet = new TByteHashSet(); Set<byte[]> extras = new HashSet<byte[]>(); for (Language l : langs) { short id = l.getId(); if (id < 256) { byteSet.add((byte) (id-128)); } else { byte[] temp = new byte[2]; temp[0] = (byte) -128; temp[1] = (byte) (id-255-128); extras.add(temp); } } byte[] output = byteSet.toArray(); for (byte[] b : extras) { output = ArrayUtils.addAll(output, b); } return output; } public byte[] toByteArray(int maxSize) { byte[] temp = toByteArray(); return Arrays.copyOf(temp, maxSize < temp.length ? maxSize : temp.length); } public static LanguageSet getLanguageSet(byte[] truncated) { Set<Language> languages = new HashSet<Language>(); boolean extra = false; for (byte b : truncated) { if (extra) { languages.add(Language.getById(b+128+255)); extra = false; } else if (b == -128) { extra = true; } else { languages.add(Language.getById(b + 128)); } } return new LanguageSet(languages); } /** * Returns English if English is in the set, else returns Simple. If Simple is not in the * set, will return the default language or throws an exception, depending on the value of returnDefaultLangIfEnglishNotAvailable * @return * @throws WikiBrainException */ public Language getBestAvailableEnglishLang(boolean returnDefaultLangIfEnglishNotAvailable) throws WikiBrainException { if (this.containsLanguage(Language.getByLangCode("en"))){ return Language.getByLangCode("en"); }else if (this.containsLanguage(Language.getByLangCode("simple"))){ return Language.getByLangCode("simple"); }else{ if (returnDefaultLangIfEnglishNotAvailable){ return this.getDefaultLanguage(); } throw new WikiBrainException("No English language available"); } } private static Collection<Language> getLangsFromCodes(Collection<String> langCodes) { Collection<Language> languages = new ArrayList<Language>(); for (String langCode : langCodes) { languages.add(Language.getByLangCode(langCode.trim())); } return languages; } @Override public boolean equals(Object o){ if (o instanceof LanguageSet){ String myString = this.toString(); String theirString = o.toString(); return (myString.equals(theirString)); } return false; } @Override public String toString(){ return "(" + getLangCodeString() + ")"; } @Override public Iterator<Language> iterator() { return langs.iterator(); } static class Provider extends org.wikibrain.conf.Provider<LanguageSet> { public Provider(Configurator configurator, Configuration config) throws ConfigurationException { super(configurator, config); } @Override public Class<LanguageSet> getType() { return LanguageSet.class; } @Override public String getPath() { return "languages"; // hack: languages are in the root element } @Override public LanguageSet get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException { try { String type = config.getString("type"); if (type.equals("loaded")) { MetaInfoDao miDao = getConfigurator().get(MetaInfoDao.class); return miDao.getLoadedLanguages(LocalPage.class); } else if (type.equals("downloaded")) { List<Language> languages = new ArrayList<Language>(); // TODO: set the default language reasonably for (Language lang : Language.LANGUAGES) { if (Env.getFiles(lang, FileMatcher.ARTICLES, getConfig()).size() > 0) { languages.add(lang); } } return new LanguageSet(languages); } else if (type.equals("custom")) { return new LanguageSet(config.getStringList("langCodes")); } else { throw new ConfigurationException("Unknown LanguageSet type: " + type); } } catch (DaoException e) { throw new ConfigurationException(e); } } } }