/*
 * Copyright (c) 2006 Henri Sivonen
 * Copyright (c) 2007-2010 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package org.whattf.datatype;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Pattern;

import org.relaxng.datatype.DatatypeException;
import org.whattf.datatype.data.LanguageData;

/**
 * Datatype that validates language tags against the subtag tables supplied by
 * {@link LanguageData}.
 *
 * <p>The tag is lower-cased (ASCII only) and split on {@code '-'}; the
 * subtags are then checked in order: language, optional extlang, optional
 * script, optional region, then any number of variants. A private use
 * sequence introduced by the subtag {@code x} may appear at any of these
 * positions and ends the check.
 *
 * <p>Deprecation problems are only reported when the system property
 * {@code org.whattf.datatype.warn} is exactly {@code "true"}.
 *
 * @version $Id$
 * @author hsivonen
 */
public final class Language extends AbstractDatatype {

    /**
     * The singleton instance.
     */
    public static final Language THE_INSTANCE = new Language();

    /** Separator between subtags. */
    private static final Pattern HYPHEN = Pattern.compile("-");

    /**
     * Whether deprecation problems are reported; read once from the system
     * property {@code org.whattf.datatype.warn}.
     */
    private static final boolean WARN = "true".equals(System.getProperty(
            "org.whattf.datatype.warn", ""));

    // The arrays below are looked up with Arrays.binarySearch, so they are
    // expected to arrive sorted from LanguageData.

    private static String[] languages = null;

    private static String[] extlangs = null;

    private static String[] scripts = null;

    private static String[] regions = null;

    private static String[] variants = null;

    private static String[] grandfathered = null;

    private static String[] redundant = null;

    private static String[] deprecated = null;

    private static String[] deprecatedLang = null;

    /**
     * Indexed in parallel with {@link #languages}; each entry is an index
     * into {@link #scripts}, or a negative value when there is none.
     */
    private static int[] suppressedScriptByLanguage = null;

    private static Map<String, String> preferredValueByLanguageMap = new HashMap<String, String>();

    /**
     * Indexed in parallel with {@link #variants}; each entry is a list of
     * prefixes, each prefix being a list of subtags.
     */
    private static String[][][] prefixesByVariant = null;

    /**
     * Indexed in parallel with {@link #extlangs}; each entry is an index into
     * {@link #languages}.
     */
    private static int[] prefixByExtlang = null;

    static {
        try {
            LanguageData data = new LanguageData();
            languages = data.getLanguages();
            extlangs = data.getExtlangs();
            scripts = data.getScripts();
            regions = data.getRegions();
            variants = data.getVariants();
            grandfathered = data.getGrandfathered();
            redundant = data.getRedundant();
            deprecated = data.getDeprecated();
            deprecatedLang = data.getDeprecatedLang();
            suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
            prefixByExtlang = data.getPrefixByExtlang();
            preferredValueByLanguageMap = data.getPreferredValueByLanguageMap();
            prefixesByVariant = data.getPrefixesByVariant();
        } catch (IOException e) {
            // The class is unusable without its data; fail class loading.
            throw new RuntimeException(e);
        }
    }

    /**
     * Private constructor; use {@link #THE_INSTANCE}.
     */
    private Language() {
        super();
    }

    /**
     * Checks that {@code lit} is a valid language tag.
     *
     * @param lit the candidate language tag
     * @throws DatatypeException if the tag is empty, malformed, uses an
     *         unknown subtag, violates a prefix or script-suppression rule,
     *         or (when warnings are enabled) uses a deprecated tag or subtag
     */
    public void checkValid(CharSequence lit) throws DatatypeException {
        String literal = lit.toString();
        if (literal.length() == 0) {
            throw newDatatypeException("The empty string is not a valid language tag.");
        }
        literal = toAsciiLowerCase(literal);

        // Whole-tag matches are accepted (or reported as deprecated) as-is.
        if (isGrandfathered(literal)) {
            if (isDeprecated(literal) && WARN) {
                throw newDatatypeException(
                        "The grandfathered language tag ", literal,
                        " is deprecated." + preferredValueAdvice(literal),
                        WARN);
            }
            return;
        }
        if (isRedundant(literal)) {
            if (isDeprecated(literal) && WARN) {
                throw newDatatypeException("The language tag ",
                        lit.toString(),
                        " is deprecated." + preferredValueAdvice(literal),
                        WARN);
            }
            return;
        }

        if (literal.startsWith("-")) {
            throw newDatatypeException("Language tag must not start with HYPHEN-MINUS.");
        }
        if (literal.endsWith("-")) {
            throw newDatatypeException("Language tag must not end with HYPHEN-MINUS.");
        }

        String[] subtags = HYPHEN.split(literal);

        for (int j = 0; j < subtags.length; j++) {
            int len = subtags[j].length();
            if (len == 0) {
                throw newDatatypeException("Zero-length subtag.");
            } else if (len > 8) {
                throw newDatatypeException("Subtags must not exceed 8 characters in length.");
            }
        }

        // Language
        int i = 0;
        String subtag = subtags[i];
        int len = subtag.length();
        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
            if (!isLanguage(subtag)) {
                throw newDatatypeException("The language subtag ", subtag,
                        " is not a valid ISO language part of a language tag.");
            }
            if (isDeprecatedLang(subtag) && WARN) {
                // Look up by the deprecated subtag itself: the whole tag
                // only equals the subtag when there is a single subtag.
                throw newDatatypeException("The language subtag ", subtag,
                        " is deprecated." + preferredValueAdvice(subtag),
                        WARN);
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        } else if (len == 4 && isLowerCaseAlpha(subtag)) {
            throw newDatatypeException("Found reserved language tag: ",
                    subtag, ".");
        } else if (len >= 5 && isLowerCaseAlpha(subtag)) {
            if (!isLanguage(subtag)) {
                throw newDatatypeException("The language subtag ", subtag,
                        " is not a valid IANA language part of a language tag.");
            }
            if (isDeprecatedLang(subtag) && WARN) {
                throw newDatatypeException("The language subtag ", subtag,
                        " is deprecated." + preferredValueAdvice(subtag),
                        WARN);
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        } else {
            throw newDatatypeException("The language subtag ", subtag,
                    " is not a valid language subtag.");
        }

        // extlang
        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if (subtag.length() == 3 && isLowerCaseAlpha(subtag)) {
            if (!isExtlang(subtag)) {
                throw newDatatypeException("Bad extlang subtag ", subtag, ".");
            }
            if (!usesPrefixByExtlang(subtags[0], subtag)) {
                // IANA language tags are never correct prefixes.
                throw newDatatypeException("Extlang subtag ", subtag,
                        " has an incorrect prefix.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Script?
        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if (subtag.length() == 4 && isLowerCaseAlpha(subtag)) {
            if (!isScript(subtag)) {
                throw newDatatypeException("Bad script subtag.");
            }
            if (isDeprecated(subtag) && WARN) {
                // NOTE(review): the lookup key is the whole tag, as before;
                // it is unclear from this file whether the preferred-value
                // map holds script subtags at all - confirm in LanguageData.
                throw newDatatypeException("The script subtag ", subtag,
                        " is deprecated." + preferredValueAdvice(literal),
                        WARN);
            }
            if (shouldSuppressScript(subtags[0], subtag)) {
                throw newDatatypeException("Language tag should omit the default script for the language.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Region
        if ((len == 3 && isDigit(subtag))
                || (len == 2 && isLowerCaseAlpha(subtag))) {
            if (!isRegion(subtag)) {
                throw newDatatypeException("Bad region subtag.");
            }
            if (isDeprecated(subtag) && WARN) {
                // NOTE(review): key kept as the whole tag; see script case.
                throw newDatatypeException("The region subtag ", subtag,
                        " is deprecated." + preferredValueAdvice(literal),
                        WARN);
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Variant
        for (;;) {
            if ("x".equals(subtag)) {
                checkPrivateUse(i, subtags);
                return;
            }
            // cutting corners here a bit since there are no extensions at this
            // time
            if (len == 1 && isLowerCaseAlphaNumeric(subtag)) {
                throw newDatatypeException("Unknown extension ", subtag, ".");
            } else if ((len == 4 && isDigit(subtag.charAt(0)) && isLowerCaseAlphaNumeric(subtag))
                    || (len >= 5 && isLowerCaseAlphaNumeric(subtag))) {
                if (!isVariant(subtag)) {
                    throw newDatatypeException("Bad variant subtag ", subtag,
                            ".");
                }
                if (isDeprecated(subtag) && WARN) {
                    // NOTE(review): key kept as the whole tag; see script
                    // case.
                    throw newDatatypeException("The variant subtag ", subtag,
                            " is deprecated." + preferredValueAdvice(literal),
                            WARN);
                }
                checkForValidPrefix(subtag, subtags, i);
            } else {
                throw newDatatypeException("The subtag ", subtag,
                        " does not match the format for any permissible subtag type.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }
    }

    /**
     * Builds the sentence that recommends a replacement for a deprecated tag
     * or subtag.
     *
     * @param key the key to look up in the preferred-value map
     * @return {@code " Use “X” instead."} when the map has a value for
     *         {@code key}; the empty string otherwise, so that messages never
     *         recommend the literal text "null"
     */
    private String preferredValueAdvice(String key) {
        String preferred = preferredValueByLanguageMap.get(key);
        if (preferred == null) {
            return "";
        }
        return " Use \u201C" + preferred + "\u201D instead.";
    }

    /**
     * Checks that the variant at position {@code i} is preceded by one of the
     * prefixes registered for it.
     *
     * @param subtag the variant subtag, used in the error message
     * @param subtags all subtags of the tag being checked
     * @param i the index of the variant within {@code subtags}; only subtags
     *        before this index are considered as prefix components
     * @throws DatatypeException if the variant has registered prefixes, none
     *         of them is fully present, and at least one prefix component is
     *         missing
     */
    private void checkForValidPrefix(String subtag, String[] subtags, int i)
            throws DatatypeException {
        String variant = subtags[i];
        int index = Arrays.binarySearch(variants, variant);
        // The caller has already established isVariant(subtag).
        assert index >= 0;
        String[][] prefixes = prefixesByVariant[index];
        if (prefixes.length == 0) {
            return;
        }
        // Collects every missing component of the prefixes examined before a
        // complete match is found (if one is found at all).
        List<String> recommendedPrefixes = new ArrayList<String>();
        for (int j = 0; j < prefixes.length; j++) {
            String[] prefix = prefixes[j];
            for (int k = 0; k < prefix.length; k++) {
                String prefixComponent = prefix[k];
                if (!subtagsContainPrefixComponent(prefixComponent, subtags, i)) {
                    recommendedPrefixes.add(prefixComponent);
                }
            }
            if (prefixMatches(prefix, subtags, i)) {
                return;
            }
        }
        if (recommendedPrefixes.size() == 0) {
            return;
        }
        // count runs down from size() to 1 so that the first and the last
        // element can be told apart while building the list text.
        int count = recommendedPrefixes.size();
        StringBuilder sb = new StringBuilder();
        if (recommendedPrefixes.size() > 1) {
            sb.append(" one of ");
        }
        for (String prefix : recommendedPrefixes) {
            if (count != recommendedPrefixes.size()) {
                sb.append(", ");
                if (count == 1) {
                    sb.append(" or ");
                }
            }
            sb.append("\u201C");
            sb.append(prefix);
            sb.append('\u201D');
            count--;
        }
        throw newDatatypeException("Variant ", subtag,
                " lacks recommended prefix. Use " + sb + " instead.");
    }

    /**
     * Tells whether every component of {@code prefix} occurs among the first
     * {@code limit} subtags (in any order).
     */
    private boolean prefixMatches(String[] prefix, String[] subtags, int limit) {
        for (int i = 0; i < prefix.length; i++) {
            String prefixComponent = prefix[i];
            if (!subtagsContainPrefixComponent(prefixComponent, subtags, limit)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code prefixComponent} equals one of the first
     * {@code limit} subtags.
     */
    private boolean subtagsContainPrefixComponent(String prefixComponent,
            String[] subtags, int limit) {
        for (int i = 0; i < limit; i++) {
            String subtag = subtags[i];
            if (subtag.equals(prefixComponent)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether {@code language} is the prefix recorded for
     * {@code extlang}.
     *
     * @return {@code false} when either argument is absent from its table
     *         (for example a language accepted only through the
     *         {@code qaa}..{@code qtz} range in {@link #isLanguage(String)})
     */
    private boolean usesPrefixByExtlang(String language, String extlang) {
        int langIndex = Arrays.binarySearch(languages, language);
        if (langIndex < 0) {
            // Not a table entry, so it cannot be any extlang's prefix.
            return false;
        }
        int extlangIndex = Arrays.binarySearch(extlangs, extlang);
        if (extlangIndex < 0) {
            // Defensive: the caller checks isExtlang first.
            return false;
        }
        return prefixByExtlang[extlangIndex] == langIndex;
    }

    /**
     * Tells whether {@code script} is the script recorded as suppressed for
     * {@code language}.
     *
     * @return {@code false} when the language has no such script or is absent
     *         from the language table (for example a language accepted only
     *         through the {@code qaa}..{@code qtz} range in
     *         {@link #isLanguage(String)})
     */
    private boolean shouldSuppressScript(String language, String script) {
        int langIndex = Arrays.binarySearch(languages, language);
        if (langIndex < 0) {
            // A negative index must not be used to index the parallel array.
            return false;
        }
        int scriptIndex = suppressedScriptByLanguage[langIndex];
        if (scriptIndex < 0) {
            return false;
        }
        return scripts[scriptIndex].equals(script);
    }

    /** Tells whether {@code subtag} is in the variant table. */
    private boolean isVariant(String subtag) {
        return (Arrays.binarySearch(variants, subtag) > -1);
    }

    /**
     * Tells whether {@code subtag} is in the region table or is one of
     * {@code aa}, {@code qm}..{@code qz}, {@code xa}..{@code xz}, {@code zz}
     * (compared as strings).
     */
    private boolean isRegion(String subtag) {
        return (Arrays.binarySearch(regions, subtag) > -1)
                || "aa".equals(subtag)
                || ("qm".compareTo(subtag) <= 0 && "qz".compareTo(subtag) >= 0)
                || ("xa".compareTo(subtag) <= 0 && "xz".compareTo(subtag) >= 0)
                || "zz".equals(subtag);
    }

    /**
     * Tells whether {@code subtag} is in the script table or lies in
     * {@code qaaa}..{@code qabx} (compared as strings).
     */
    private boolean isScript(String subtag) {
        return (Arrays.binarySearch(scripts, subtag) > -1)
                || ("qaaa".compareTo(subtag) <= 0 && "qabx".compareTo(subtag) >= 0);
    }

    /** Tells whether {@code subtag} is in the extlang table. */
    private boolean isExtlang(String subtag) {
        return (Arrays.binarySearch(extlangs, subtag) > -1);
    }

    /**
     * Tells whether {@code subtag} is in the language table or lies in
     * {@code qaa}..{@code qtz} (compared as strings).
     */
    private boolean isLanguage(String subtag) {
        return (Arrays.binarySearch(languages, subtag) > -1)
                || ("qaa".compareTo(subtag) <= 0 && "qtz".compareTo(subtag) >= 0);
    }

    /**
     * Checks the private use sequence that follows the {@code x} subtag.
     *
     * @param i the index of the {@code x} subtag within {@code subtags}
     * @param subtags all subtags of the tag being checked
     * @throws DatatypeException if nothing follows {@code x}, or a following
     *         subtag is shorter than two characters or contains a character
     *         other than {@code a-z} and {@code 0-9}
     */
    private void checkPrivateUse(int i, String[] subtags)
            throws DatatypeException {
        int len = subtags.length;
        i++;
        if (i == len) {
            throw newDatatypeException("No subtags in private use sequence.");
        }
        while (i < len) {
            String subtag = subtags[i];
            if (subtag.length() < 2) {
                throw newDatatypeException("Private use subtag ", subtag,
                        " is too short.");
            }
            if (!isLowerCaseAlphaNumeric(subtag)) {
                throw newDatatypeException(
                        "Bad character in private use subtag ", subtag, ".");
            }
            i++;
        }
    }

    /**
     * @param c the character to test
     * @return whether {@code c} is in {@code a-z} or {@code 0-9}
     */
    private boolean isLowerCaseAlphaNumeric(char c) {
        return isLowerCaseAlpha(c) || isDigit(c);
    }

    /**
     * @param str the string to test
     * @return whether every character of {@code str} is in {@code a-z} or
     *         {@code 0-9}; {@code true} for the empty string
     */
    private boolean isLowerCaseAlphaNumeric(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param c the character to test
     * @return whether {@code c} is in {@code 0-9}
     */
    private boolean isDigit(char c) {
        return (c >= '0' && c <= '9');
    }

    /**
     * @param str the string to test
     * @return whether every character of {@code str} is in {@code 0-9};
     *         {@code true} for the empty string
     */
    private boolean isDigit(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isDigit(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param c the character to test
     * @return whether {@code c} is in {@code a-z}
     */
    private boolean isLowerCaseAlpha(char c) {
        return (c >= 'a' && c <= 'z');
    }

    /**
     * @param str the string to test
     * @return whether every character of {@code str} is in {@code a-z};
     *         {@code true} for the empty string
     */
    private boolean isLowerCaseAlpha(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlpha(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the whole tag is in the grandfathered table. */
    private boolean isGrandfathered(String literal) {
        return Arrays.binarySearch(grandfathered, literal) > -1;
    }

    /** Tells whether the whole tag is in the redundant table. */
    private boolean isRedundant(String literal) {
        return Arrays.binarySearch(redundant, literal) > -1;
    }

    /** Tells whether the tag or subtag is in the deprecated table. */
    private boolean isDeprecated(String subtag) {
        return Arrays.binarySearch(deprecated, subtag) > -1;
    }

    /** Tells whether the subtag is in the deprecated-language table. */
    private boolean isDeprecatedLang(String subtag) {
        return Arrays.binarySearch(deprecatedLang, subtag) > -1;
    }

    /**
     * @return the human-readable name of this datatype
     */
    @Override
    public String getName() {
        return "language tag";
    }
}