/** * Copyright (c) 2012-2016 André Bargull * Alle Rechte vorbehalten / All Rights Reserved. Use is subject to license terms. * * <https://github.com/anba/es6draft> */ package com.github.anba.es6draft.runtime.objects.intl; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Map; import java.util.TreeMap; /** * Parser for BCP47 language tags * * @see <a href="http://tools.ietf.org/html/rfc5646">RFC-5646</a> */ final class LanguageTagParser { private static final HashSet<String> irregular = set("en-gb-oed", "i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", "i-klingon", "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", "i-tsu", "sgn-be-fr", "sgn-be-nl", "sgn-ch-de"); private static final HashSet<String> regular = set("art-lojban", "cel-gaulish", "no-bok", "no-nyn", "zh-guoyu", "zh-hakka", "zh-min", "zh-min-nan", "zh-xiang"); private static final HashSet<String> grandfathered = new HashSet<>(); static { grandfathered.addAll(irregular); grandfathered.addAll(regular); } @SafeVarargs private static <T> HashSet<T> set(T... elements) { return new HashSet<>(Arrays.asList(elements)); } private static final int NONE = 0b00, ALPHA = 0b01, DIGIT = 0b10, ALPHA_DIGIT = 0b11; private int token; private int tokenStart; private int tokenLength; private final String input; private final int length; private int pos; private LanguageTag tag; public static final class LanguageTag { private final String input; private boolean grandfathered; private String language; private String extLang1, extLang2, extLang3; private String script; private String region; private LinkedHashSet<String> variants; private TreeMap<Character, String> extensions; private String privateuse; LanguageTag(String input) { this.input = input; } /** * Canonicalization for language tags. * * @return the canonicalized language tag * @see <a href="http://tools.ietf.org/html/rfc5646#section-4.5">RFC 5646</a> */ public String canonicalize() { assert grandfathered || language != null || privateuse != null; if (grandfathered) { String r = LanguageSubtagRegistryData.grandfathered(input); assert r != null : "grandfathered with no canonicalized value"; return r; } if (language == null) { return "x-" + privateuse; } String redundant = LanguageSubtagRegistryData.redundant(input); if (redundant != null) { // replace complete tag if redundant return redundant; } StringBuilder sb = new StringBuilder(); String lang = LanguageSubtagRegistryData.language(language); sb.append(lang); if (extLang1 != null) { String ext = LanguageSubtagRegistryData.extlang(lang, extLang1); if (ext != null) { // replaces language subtag sb.setLength(0); sb.append(ext); } else { sb.append('-').append(extLang1); } } if (extLang2 != null) { sb.append('-').append(extLang2); } if (extLang3 != null) { sb.append('-').append(extLang3); } if (script != null) { // titlecase sb.append('-').append((char) (script.charAt(0) & ~0x20)).append(script, 1, script.length()); } if (region != null) { if (region.length() == 2) { // uppercase String reg = LanguageSubtagRegistryData.region(region); sb.append('-').append((char) (reg.charAt(0) & ~0x20)).append((char) (reg.charAt(1) & ~0x20)); } else { sb.append('-').append(region); } } if (variants != null) { for (String variant : variants) { String var = LanguageSubtagRegistryData.variant(sb, variant); if (var != null) { // replace complete string sb.setLength(0); sb.append(var); } else { sb.append('-').append(variant); } } } if (extensions != null) { // sorted by singleton for (Map.Entry<Character, String> entry : extensions.entrySet()) { char singleton = entry.getKey().charValue(); String value = entry.getValue(); if (singleton == 'u') { value = canonicalizeUnicodeExtension(value); } sb.append('-').append(singleton).append('-').append(value); } } if (privateuse != null) { sb.append("-x-").append(privateuse); } return sb.toString(); } /** * Additional canonicalization for unicode extension sequences. * * @param ext * the unicode extension * @return the canonicalized unicode extension * @see <a href="http://tools.ietf.org/html/rfc6067#section-2.1.1">RFC 6067</a> */ private String canonicalizeUnicodeExtension(String ext) { final int KEYWORD_LENGTH = 2; StringBuilder sb = new StringBuilder(ext.length() + 1); int start = 0, index = indexOf(ext, '-', start); if ((index - start) > KEYWORD_LENGTH) { // attributes ArrayList<String> attributes = new ArrayList<>(5); for (;;) { attributes.add(ext.substring(start, index)); start = index + 1; index = indexOf(ext, '-', start); if ((index - start) <= KEYWORD_LENGTH) { break; } } appendSorted(sb, attributes); } if ((index - start) == KEYWORD_LENGTH) { // keywords ArrayList<String> keywords = new ArrayList<>(5); for (int keystart = start;;) { start = index + 1; index = indexOf(ext, '-', start); if ((index - start) <= KEYWORD_LENGTH) { keywords.add(ext.substring(keystart, start - 1)); if ((index - start) == KEYWORD_LENGTH) { keystart = start; } else { break; } } } appendSorted(sb, keywords); } assert sb.length() == ext.length() + 1 : String.format("%s != %s", sb, ext); sb.setLength(ext.length()); return sb.toString(); } private static int indexOf(String s, int ch, int fromIndex) { int index = s.indexOf(ch, fromIndex); return (index < 0 ? s.length() : index); } private static void appendSorted(StringBuilder sb, ArrayList<String> list) { Collections.sort(list); for (String s : list) { sb.append(s).append('-'); } } } public static LanguageTag parse(String locale) { String lower = toLowerASCIIOrNull(locale); if (lower == null) { // input contains invalid characters return null; } return new LanguageTagParser(lower).parse(); } private LanguageTagParser(String input) { this.input = input; this.length = input.length(); } private LanguageTag parse() { tag = new LanguageTag(input); rollback(0); if (languageTag() && pos >= length && token == NONE) { // successfully parsed the language tag, no characters left unparsed and the last token // was consumed as well return tag; } return null; } private void rollback(int p) { pos = p; next(); } private void consume() { next(); } private void next() { int tok = NONE, start = pos, len = 0; for (; pos < length; ++len) { char c = input.charAt(pos++); if (isAlpha(c)) { tok |= ALPHA; } else if (isDigit(c)) { tok |= DIGIT; } else { assert c == '-'; break; } } token = tok; tokenStart = start; tokenLength = len; // System.out.printf("token=%d, tokenStart=%d, tokenLength=%d%n", token, tokenStart, // tokenLength); } private char tokenStartChar() { return input.charAt(tokenStart); } private String tokenString() { return input.substring(tokenStart, tokenStart + tokenLength); } private String consumeTokenString() { String s = tokenString(); consume(); return s; } private static String toLowerASCIIOrNull(String s) { int i = 0, len = s.length(); if (len == 0 || s.charAt(len - 1) == '-') { // handle '-' as last character early to reduce additional efforts in next() return null; } lower: { for (; i < len; ++i) { char c = s.charAt(i); if (isUpper(c)) { break lower; } else if (!(isAlpha(c) || isDigit(c) || c == '-')) { return null; } } return s; } char[] ca = s.toCharArray(); for (; i < len; ++i) { char c = ca[i]; if (isUpper(c)) { ca[i] = (char) (c + ('a' - 'A')); } else if (!(isAlpha(c) || isDigit(c) || c == '-')) { return null; } } return new String(ca); } private static boolean isUpper(char c) { return ('A' <= c && c <= 'Z'); } private static boolean isAlpha(char c) { return ('a' <= c && c <= 'z'); } private static boolean isDigit(char c) { return ('0' <= c && c <= '9'); } /** * <pre> * Language-Tag = langtag ; normal language tags * / privateuse ; private use tag * / grandfathered ; grandfathered tags * </pre> * * @return {@code true} if parsed successfully */ private boolean languageTag() { return grandfathered() || privateuse() || langtag(); } /** * <pre> * grandfathered = irregular ; non-redundant tags registered * / regular ; during the RFC 3066 era * * irregular = "en-GB-oed" ; irregular tags do not match * / "i-ami" ; the 'langtag' production and * / "i-bnn" ; would not otherwise be * / "i-default" ; considered 'well-formed' * / "i-enochian" ; These tags are all valid, * / "i-hak" ; but most are deprecated * / "i-klingon" ; in favor of more modern * / "i-lux" ; subtags or subtag * / "i-mingo" ; combination * / "i-navajo" * / "i-pwn" * / "i-tao" * / "i-tay" * / "i-tsu" * / "sgn-BE-FR" * / "sgn-BE-NL" * / "sgn-CH-DE" * * regular = "art-lojban" ; these tags match the 'langtag' * / "cel-gaulish" ; production, but their subtags * / "no-bok" ; are not extended language * / "no-nyn" ; or variant subtags: their meaning * / "zh-guoyu" ; is defined by their registration * / "zh-hakka" ; and all of these are deprecated * / "zh-min" ; in favor of a more modern * / "zh-min-nan" ; subtag or sequence of subtags * / "zh-xiang" * </pre> * * @return {@code true} if parsed successfully */ private boolean grandfathered() { if (grandfathered.contains(input)) { tag.grandfathered = true; pos = length; // consume complete input token = NONE; // consume complete input return true; } return false; } /** * <pre> * langtag = language * ["-" script] * ["-" region] * *("-" variant) * *("-" extension) * ["-" privateuse] * </pre> * * @return {@code true} if parsed successfully */ private boolean langtag() { if (language()) { script(); region(); if (!variants()) { return false; } if (!extensions()) { return false; } privateuse(); return true; } return false; } /** * <pre> * language = 2*3ALPHA ; shortest ISO 639 code * ["-" extlang] ; sometimes followed by * ; extended language subtags * / 4ALPHA ; or reserved for future use * / 5*8ALPHA ; or registered language subtag * </pre> * * @return {@code true} if parsed successfully */ private boolean language() { if (token == ALPHA) { if (tokenLength >= 2 && tokenLength <= 3) { tag.language = consumeTokenString(); extlang(); return true; } else if (tokenLength >= 4 && tokenLength <= 8) { tag.language = consumeTokenString(); return true; } } return false; } /** * <pre> * extlang = 3ALPHA ; selected ISO 639 codes * *2("-" 3ALPHA) ; permanently reserved * </pre> */ private void extlang() { if (token == ALPHA && tokenLength == 3) { tag.extLang1 = consumeTokenString(); if (token == ALPHA && tokenLength == 3) { tag.extLang2 = consumeTokenString(); if (token == ALPHA && tokenLength == 3) { tag.extLang3 = consumeTokenString(); } } } } /** * <pre> * script = 4ALPHA ; ISO 15924 code * </pre> */ private void script() { if (token == ALPHA && tokenLength == 4) { tag.script = consumeTokenString(); } } /** * <pre> * region = 2ALPHA ; ISO 3166-1 code * / 3DIGIT ; UN M.49 code * </pre> */ private void region() { if ((token == ALPHA && tokenLength == 2) || (token == DIGIT && tokenLength == 3)) { tag.region = consumeTokenString(); } } /** * <pre> * variant = 5*8alphanum ; registered variants * / (DIGIT 3alphanum) * </pre> * * @return {@code true} if parsed successfully */ private boolean variants() { for (;;) { String variant; if (alphanum() && tokenLength >= 5 && tokenLength <= 8) { variant = consumeTokenString(); } else if (alphanum() && tokenLength == 4 && isDigit(tokenStartChar())) { variant = consumeTokenString(); } else { return true; } if (tag.variants == null) { tag.variants = new LinkedHashSet<>(); } if (!tag.variants.add(variant)) { return false; } } } /** * <pre> * extension = singleton 1*("-" (2*8alphanum)) * * ; Single alphanumerics * ; "x" reserved for private use * singleton = DIGIT ; 0 - 9 * / %x41-57 ; A - W * / %x59-5A ; Y - Z * / %x61-77 ; a - w * / %x79-7A ; y - z * </pre> * * @return {@code true} if parsed successfully */ private boolean extensions() { for (;;) { int saved = tokenStart; if (alphanum() && tokenLength == 1 && tokenStartChar() != 'x') { char singleton = tokenStartChar(); consume(); if (alphanum() && tokenLength >= 2 && tokenLength <= 8) { int startExtension = tokenStart; int len = tokenLength; consume(); while (alphanum() && tokenLength >= 2 && tokenLength <= 8) { len += tokenLength + 1; // token + separator consume(); } String value = input.substring(startExtension, startExtension + len); if (tag.extensions == null) { tag.extensions = new TreeMap<>(); } if (tag.extensions.put(singleton, value) != null) { return false; } continue; } rollback(saved); } return true; } } /** * <pre> * privateuse = "x" 1*("-" (1*8alphanum)) * </pre> * * @return {@code true} if parsed successfully */ private boolean privateuse() { int saved = tokenStart; if (token == ALPHA && tokenLength == 1 && tokenStartChar() == 'x') { consume(); if (alphanum() && tokenLength >= 1 && tokenLength <= 8) { int startPrivateuse = tokenStart; consume(); while (alphanum() && tokenLength >= 1 && tokenLength <= 8) { consume(); } // always last subtag, so just use the remaining string tag.privateuse = input.substring(startPrivateuse); return true; } rollback(saved); } return false; } /** * <pre> * alphanum = (ALPHA / DIGIT) ; letters and numbers * </pre> * * @return {@code true} if the current token is an alphanumeric character */ private boolean alphanum() { return (token & ALPHA_DIGIT) != NONE; } }