package org.wikibrain.core.lang; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Locale; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A language associated with a language edition of Wikipedia. * The set of all Languages is loaded when the system starts up from a text file. * Languages can be queried by langCode or id. */ public class Language implements Comparable<Language>, Serializable { private static final long serialVersionUID = 6331325313592646604l; public static final String LANGUAGE_TSV = "languages.tsv"; /** * Languages is immediately initialized based on the languages.tsv. */ public static Language[] LANGUAGES; static { InputStream stream = null; try { stream = Language.class.getClassLoader() .getResourceAsStream(LANGUAGE_TSV); loadAllLanguages(stream); } catch (IOException e) { throw new RuntimeException(e); // What else can we do? } finally { if (stream != null) IOUtils.closeQuietly(stream); } } private final short id; private final String langCode; private final String enLangName; private final String nativeName; private Locale locale; private Language(short id, String langCode, String enLangName, String nativeName) { this.id = id; this.langCode = langCode; this.enLangName = enLangName; this.nativeName = nativeName; } public short getId() { return id; } public String getLangCode() { return langCode; } public String getEnLangName() { return enLangName; } public String getNativeName() { return nativeName; } public Locale getLocale() { if (locale != null) { return locale; } synchronized (this) { if (locale == null){ locale = new Locale(langCode); } } return locale; } /** * @param langCode langCode, such as "en" * @return associated language * @throws IllegalArgumentException if langCode is unknown. */ public static Language getByLangCode(String langCode) { langCode = langCode.replace('_', '-').toLowerCase(); if (WIKIDATA.getLangCode().equals(langCode)) { return WIKIDATA; } for (Language lang : LANGUAGES) { if (lang.langCode.equalsIgnoreCase(langCode)) { return lang; } } throw new IllegalArgumentException("unknown langCode: '" + langCode + "'"); } public static Language getByLangCodeLenient(String langCode) { langCode = langCode.replace('_', '-').toLowerCase(); List<String> flavors = new ArrayList<String>(); flavors.add(langCode); if (langCode.contains("-")) { flavors.add(langCode.substring(0, langCode.indexOf("-"))); } for (String s : flavors) { try { return getByLangCode(s); } catch (IllegalArgumentException e) { } } throw new IllegalArgumentException("unknown langCode: '" + langCode + "'"); } public static Language getByFullLangName(String language) { for (Language lang : LANGUAGES) { if (lang.enLangName.equalsIgnoreCase(language) || lang.nativeName.equalsIgnoreCase(language)) { return lang; } } throw new IllegalArgumentException("unknown language: '" + language + "'"); } public static boolean hasLangCode(String langCode) { langCode = langCode.replace('_', '-').toLowerCase(); for (Language lang : LANGUAGES) { if (lang.langCode.equalsIgnoreCase(langCode)) { return true; } } return false; } /** * @param id numeric id associated with the language * @return associated language w * @throws IllegalArgumentException if id is unknown. */ public static Language getById(int id) { if (0 < id && id <= LANGUAGES.length) { return LANGUAGES[id-1]; } else { throw new IllegalArgumentException("unknown language id: '" + id + "'"); } } /** * Loads the languages from the text file. */ static private void loadAllLanguages(InputStream stream) throws IOException { List<String> lines = IOUtils.readLines(stream, "UTF-8"); String header = lines.get(0); lines = lines.subList(1, lines.size()); if (header.equals("id\tlangCode\tenLangName\tnativeName")) { LANGUAGES = new Language[lines.size()]; for (int i = 0; i < lines.size(); i++) { String[] cols = StringUtils.splitPreserveAllTokens(lines.get(i), "\t"); short id = Short.parseShort(cols[0]); //int id = Integer.parseInt(cols[0]); if (id != i+1) { throw new IOException("expected language id " + (i+1) + ", but got " + id); } LANGUAGES[i] = new Language(id, cols[1], cols[2], cols[3]); } } else { throw new IOException("invalid header in languages.tsv: " + header); } } public String getDomain() { return langCode + ".wikipedia.org"; } public LanguageInfo getLanguageInfo() { return LanguageInfo.getByLanguage(this); } @Override public int compareTo(Language language) { return Short.valueOf(this.id).compareTo(language.id); } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Language language = (Language) o; if (id != language.id) return false; if (!langCode.equals(language.langCode)) return false; return true; } @Override public int hashCode() { int result = (int) id; result = 31 * result + langCode.hashCode(); return result; } @Override public String toString() { return this.getEnLangName(); } /** * HACK: Not really a language, but treated as a language by Wikimedia. * Must come before other languages. */ public static Language WIKIDATA = new Language((short) -1, "wikidata", "Wikidata", "Wikidata"); /** * These can be automatically regenereated by running * mac-wikibrain/wikibrain-core/src/main/resources/make_lang_constants.py */ public static final Language EN = Language.getByLangCode("en"); public static final Language DE = Language.getByLangCode("de"); public static final Language FR = Language.getByLangCode("fr"); public static final Language NL = Language.getByLangCode("nl"); public static final Language IT = Language.getByLangCode("it"); public static final Language PL = Language.getByLangCode("pl"); public static final Language ES = Language.getByLangCode("es"); public static final Language RU = Language.getByLangCode("ru"); public static final Language JA = Language.getByLangCode("ja"); public static final Language PT = Language.getByLangCode("pt"); public static final Language ZH = Language.getByLangCode("zh"); public static final Language SV = Language.getByLangCode("sv"); public static final Language VI = Language.getByLangCode("vi"); public static final Language UK = Language.getByLangCode("uk"); public static final Language CA = Language.getByLangCode("ca"); public static final Language NO = Language.getByLangCode("no"); public static final Language FI = Language.getByLangCode("fi"); public static final Language CS = Language.getByLangCode("cs"); public static final Language HU = Language.getByLangCode("hu"); public static final Language KO = Language.getByLangCode("ko"); public static final Language FA = Language.getByLangCode("fa"); public static final Language ID = Language.getByLangCode("id"); public static final Language TR = Language.getByLangCode("tr"); public static final Language AR = Language.getByLangCode("ar"); public static final Language RO = Language.getByLangCode("ro"); public static final Language SK = Language.getByLangCode("sk"); public static final Language EO = Language.getByLangCode("eo"); public static final Language DA = Language.getByLangCode("da"); public static final Language SR = Language.getByLangCode("sr"); public static final Language LT = Language.getByLangCode("lt"); public static final Language MS = Language.getByLangCode("ms"); public static final Language HE = Language.getByLangCode("he"); public static final Language EU = Language.getByLangCode("eu"); public static final Language SL = Language.getByLangCode("sl"); public static final Language BG = Language.getByLangCode("bg"); public static final Language KK = Language.getByLangCode("kk"); public static final Language VO = Language.getByLangCode("vo"); public static final Language HR = Language.getByLangCode("hr"); public static final Language WAR = Language.getByLangCode("war"); public static final Language HI = Language.getByLangCode("hi"); public static final Language ET = Language.getByLangCode("et"); public static final Language GL = Language.getByLangCode("gl"); public static final Language AZ = Language.getByLangCode("az"); public static final Language NN = Language.getByLangCode("nn"); public static final Language SIMPLE = Language.getByLangCode("simple"); public static final Language LA = Language.getByLangCode("la"); public static final Language EL = Language.getByLangCode("el"); public static final Language TH = Language.getByLangCode("th"); public static final Language NEW = Language.getByLangCode("new"); public static final Language ROA_RUP = Language.getByLangCode("roa-rup"); public static final Language OC = Language.getByLangCode("oc"); public static final Language SH = Language.getByLangCode("sh"); public static final Language KA = Language.getByLangCode("ka"); public static final Language MK = Language.getByLangCode("mk"); public static final Language TL = Language.getByLangCode("tl"); public static final Language HT = Language.getByLangCode("ht"); public static final Language PMS = Language.getByLangCode("pms"); public static final Language TE = Language.getByLangCode("te"); public static final Language TA = Language.getByLangCode("ta"); public static final Language BE_X_OLD = Language.getByLangCode("be-x-old"); public static final Language BE = Language.getByLangCode("be"); public static final Language BR = Language.getByLangCode("br"); public static final Language CEB = Language.getByLangCode("ceb"); public static final Language LV = Language.getByLangCode("lv"); public static final Language SQ = Language.getByLangCode("sq"); public static final Language JV = Language.getByLangCode("jv"); public static final Language MG = Language.getByLangCode("mg"); public static final Language CY = Language.getByLangCode("cy"); public static final Language LB = Language.getByLangCode("lb"); public static final Language MR = Language.getByLangCode("mr"); public static final Language IS = Language.getByLangCode("is"); public static final Language BS = Language.getByLangCode("bs"); public static final Language YO = Language.getByLangCode("yo"); public static final Language AN = Language.getByLangCode("an"); public static final Language LMO = Language.getByLangCode("lmo"); public static final Language HY = Language.getByLangCode("hy"); public static final Language FY = Language.getByLangCode("fy"); public static final Language BPY = Language.getByLangCode("bpy"); public static final Language ML = Language.getByLangCode("ml"); public static final Language PNB = Language.getByLangCode("pnb"); public static final Language SW = Language.getByLangCode("sw"); public static final Language BN = Language.getByLangCode("bn"); public static final Language IO = Language.getByLangCode("io"); public static final Language AF = Language.getByLangCode("af"); public static final Language GU = Language.getByLangCode("gu"); public static final Language ZH_YUE = Language.getByLangCode("zh-yue"); public static final Language NE = Language.getByLangCode("ne"); public static final Language NDS = Language.getByLangCode("nds"); public static final Language UR = Language.getByLangCode("ur"); public static final Language KU = Language.getByLangCode("ku"); public static final Language UZ = Language.getByLangCode("uz"); public static final Language AST = Language.getByLangCode("ast"); public static final Language SCN = Language.getByLangCode("scn"); public static final Language SU = Language.getByLangCode("su"); public static final Language QU = Language.getByLangCode("qu"); public static final Language DIQ = Language.getByLangCode("diq"); public static final Language BA = Language.getByLangCode("ba"); public static final Language TT = Language.getByLangCode("tt"); public static final Language MY = Language.getByLangCode("my"); public static final Language GA = Language.getByLangCode("ga"); public static final Language CV = Language.getByLangCode("cv"); public static final Language IA = Language.getByLangCode("ia"); public static final Language NAP = Language.getByLangCode("nap"); public static final Language BAT_SMG = Language.getByLangCode("bat-smg"); public static final Language MAP_BMS = Language.getByLangCode("map-bms"); public static final Language WA = Language.getByLangCode("wa"); public static final Language ALS = Language.getByLangCode("als"); public static final Language KN = Language.getByLangCode("kn"); public static final Language AM = Language.getByLangCode("am"); public static final Language GD = Language.getByLangCode("gd"); public static final Language BUG = Language.getByLangCode("bug"); public static final Language TG = Language.getByLangCode("tg"); public static final Language ZH_MIN_NAN = Language.getByLangCode("zh-min-nan"); public static final Language YI = Language.getByLangCode("yi"); public static final Language VEC = Language.getByLangCode("vec"); public static final Language SCO = Language.getByLangCode("sco"); public static final Language HIF = Language.getByLangCode("hif"); public static final Language ROA_TARA = Language.getByLangCode("roa-tara"); public static final Language OS = Language.getByLangCode("os"); public static final Language ARZ = Language.getByLangCode("arz"); public static final Language NAH = Language.getByLangCode("nah"); public static final Language MZN = Language.getByLangCode("mzn"); public static final Language SAH = Language.getByLangCode("sah"); public static final Language KY = Language.getByLangCode("ky"); public static final Language MN = Language.getByLangCode("mn"); public static final Language SA = Language.getByLangCode("sa"); public static final Language PAM = Language.getByLangCode("pam"); public static final Language HSB = Language.getByLangCode("hsb"); public static final Language LI = Language.getByLangCode("li"); public static final Language MI = Language.getByLangCode("mi"); public static final Language SI = Language.getByLangCode("si"); public static final Language CO = Language.getByLangCode("co"); public static final Language CKB = Language.getByLangCode("ckb"); public static final Language GAN = Language.getByLangCode("gan"); public static final Language GLK = Language.getByLangCode("glk"); public static final Language BO = Language.getByLangCode("bo"); public static final Language FO = Language.getByLangCode("fo"); public static final Language BAR = Language.getByLangCode("bar"); public static final Language BCL = Language.getByLangCode("bcl"); public static final Language ILO = Language.getByLangCode("ilo"); public static final Language MRJ = Language.getByLangCode("mrj"); public static final Language SE = Language.getByLangCode("se"); public static final Language FIU_VRO = Language.getByLangCode("fiu-vro"); public static final Language NDS_NL = Language.getByLangCode("nds-nl"); public static final Language TK = Language.getByLangCode("tk"); public static final Language VLS = Language.getByLangCode("vls"); public static final Language PS = Language.getByLangCode("ps"); public static final Language GV = Language.getByLangCode("gv"); public static final Language RUE = Language.getByLangCode("rue"); public static final Language DV = Language.getByLangCode("dv"); public static final Language NRM = Language.getByLangCode("nrm"); public static final Language PAG = Language.getByLangCode("pag"); public static final Language PA = Language.getByLangCode("pa"); public static final Language KOI = Language.getByLangCode("koi"); public static final Language RM = Language.getByLangCode("rm"); public static final Language KM = Language.getByLangCode("km"); public static final Language KV = Language.getByLangCode("kv"); public static final Language UDM = Language.getByLangCode("udm"); public static final Language CSB = Language.getByLangCode("csb"); public static final Language MHR = Language.getByLangCode("mhr"); public static final Language FUR = Language.getByLangCode("fur"); public static final Language MT = Language.getByLangCode("mt"); public static final Language ZEA = Language.getByLangCode("zea"); public static final Language WUU = Language.getByLangCode("wuu"); public static final Language LIJ = Language.getByLangCode("lij"); public static final Language UG = Language.getByLangCode("ug"); public static final Language LAD = Language.getByLangCode("lad"); public static final Language PI = Language.getByLangCode("pi"); public static final Language XMF = Language.getByLangCode("xmf"); public static final Language SC = Language.getByLangCode("sc"); public static final Language BH = Language.getByLangCode("bh"); public static final Language ZH_CLASSICAL = Language.getByLangCode("zh-classical"); public static final Language OR = Language.getByLangCode("or"); public static final Language NOV = Language.getByLangCode("nov"); public static final Language KSH = Language.getByLangCode("ksh"); public static final Language ANG = Language.getByLangCode("ang"); public static final Language SO = Language.getByLangCode("so"); public static final Language KW = Language.getByLangCode("kw"); public static final Language STQ = Language.getByLangCode("stq"); public static final Language NV = Language.getByLangCode("nv"); public static final Language HAK = Language.getByLangCode("hak"); public static final Language FRR = Language.getByLangCode("frr"); public static final Language AY = Language.getByLangCode("ay"); public static final Language FRP = Language.getByLangCode("frp"); public static final Language EXT = Language.getByLangCode("ext"); public static final Language SZL = Language.getByLangCode("szl"); public static final Language PCD = Language.getByLangCode("pcd"); public static final Language IE = Language.getByLangCode("ie"); public static final Language GAG = Language.getByLangCode("gag"); public static final Language HAW = Language.getByLangCode("haw"); public static final Language XAL = Language.getByLangCode("xal"); public static final Language LN = Language.getByLangCode("ln"); public static final Language RW = Language.getByLangCode("rw"); public static final Language PDC = Language.getByLangCode("pdc"); public static final Language PFL = Language.getByLangCode("pfl"); public static final Language VEP = Language.getByLangCode("vep"); public static final Language KRC = Language.getByLangCode("krc"); public static final Language CRH = Language.getByLangCode("crh"); public static final Language EML = Language.getByLangCode("eml"); public static final Language GN = Language.getByLangCode("gn"); public static final Language ACE = Language.getByLangCode("ace"); public static final Language TO = Language.getByLangCode("to"); public static final Language CE = Language.getByLangCode("ce"); public static final Language KL = Language.getByLangCode("kl"); public static final Language ARC = Language.getByLangCode("arc"); public static final Language MYV = Language.getByLangCode("myv"); public static final Language DSB = Language.getByLangCode("dsb"); public static final Language AS = Language.getByLangCode("as"); public static final Language BJN = Language.getByLangCode("bjn"); public static final Language PAP = Language.getByLangCode("pap"); public static final Language TPI = Language.getByLangCode("tpi"); public static final Language LBE = Language.getByLangCode("lbe"); public static final Language MDF = Language.getByLangCode("mdf"); public static final Language WO = Language.getByLangCode("wo"); public static final Language JBO = Language.getByLangCode("jbo"); public static final Language KAB = Language.getByLangCode("kab"); public static final Language SN = Language.getByLangCode("sn"); public static final Language AV = Language.getByLangCode("av"); public static final Language CBK_ZAM = Language.getByLangCode("cbk-zam"); public static final Language TY = Language.getByLangCode("ty"); public static final Language SRN = Language.getByLangCode("srn"); public static final Language KBD = Language.getByLangCode("kbd"); public static final Language LO = Language.getByLangCode("lo"); public static final Language LEZ = Language.getByLangCode("lez"); public static final Language AB = Language.getByLangCode("ab"); public static final Language MWL = Language.getByLangCode("mwl"); public static final Language LTG = Language.getByLangCode("ltg"); public static final Language NA = Language.getByLangCode("na"); public static final Language IG = Language.getByLangCode("ig"); public static final Language KG = Language.getByLangCode("kg"); public static final Language TET = Language.getByLangCode("tet"); public static final Language ZA = Language.getByLangCode("za"); public static final Language KAA = Language.getByLangCode("kaa"); public static final Language NSO = Language.getByLangCode("nso"); public static final Language ZU = Language.getByLangCode("zu"); public static final Language RMY = Language.getByLangCode("rmy"); public static final Language CU = Language.getByLangCode("cu"); public static final Language TN = Language.getByLangCode("tn"); public static final Language CHR = Language.getByLangCode("chr"); public static final Language CHY = Language.getByLangCode("chy"); public static final Language GOT = Language.getByLangCode("got"); public static final Language SM = Language.getByLangCode("sm"); public static final Language BI = Language.getByLangCode("bi"); public static final Language MO = Language.getByLangCode("mo"); public static final Language BM = Language.getByLangCode("bm"); public static final Language IU = Language.getByLangCode("iu"); public static final Language PIH = Language.getByLangCode("pih"); public static final Language IK = Language.getByLangCode("ik"); public static final Language SS = Language.getByLangCode("ss"); public static final Language SD = Language.getByLangCode("sd"); public static final Language PNT = Language.getByLangCode("pnt"); public static final Language CDO = Language.getByLangCode("cdo"); public static final Language EE = Language.getByLangCode("ee"); public static final Language HA = Language.getByLangCode("ha"); public static final Language TI = Language.getByLangCode("ti"); public static final Language BXR = Language.getByLangCode("bxr"); public static final Language TS = Language.getByLangCode("ts"); public static final Language OM = Language.getByLangCode("om"); public static final Language KS = Language.getByLangCode("ks"); public static final Language KI = Language.getByLangCode("ki"); public static final Language VE = Language.getByLangCode("ve"); public static final Language SG = Language.getByLangCode("sg"); public static final Language RN = Language.getByLangCode("rn"); public static final Language CR = Language.getByLangCode("cr"); public static final Language DZ = Language.getByLangCode("dz"); public static final Language LG = Language.getByLangCode("lg"); public static final Language AK = Language.getByLangCode("ak"); public static final Language FF = Language.getByLangCode("ff"); public static final Language TUM = Language.getByLangCode("tum"); public static final Language FJ = Language.getByLangCode("fj"); public static final Language ST = Language.getByLangCode("st"); public static final Language TW = Language.getByLangCode("tw"); public static final Language XH = Language.getByLangCode("xh"); public static final Language CH = Language.getByLangCode("ch"); public static final Language NY = Language.getByLangCode("ny"); public static final Language NG = Language.getByLangCode("ng"); public static final Language II = Language.getByLangCode("ii"); public static final Language CHO = Language.getByLangCode("cho"); public static final Language MH = Language.getByLangCode("mh"); public static final Language AA = Language.getByLangCode("aa"); public static final Language KJ = Language.getByLangCode("kj"); public static final Language HO = Language.getByLangCode("ho"); public static final Language MUS = Language.getByLangCode("mus"); public static final Language KR = Language.getByLangCode("kr"); public static final Language HZ = Language.getByLangCode("hz"); }