/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.riot.web ;

import java.util.Locale ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;

import org.apache.jena.atlas.lib.Chars ;
import org.apache.jena.riot.system.RiotChars ;

/**
 * Language tags: support for parsing and canonicalization of case.
 * Grandfathered forms ("i-") are left untouched. Unsupported or syntactically
 * illegal forms are handled in canonicalization by doing nothing.
 * <ul>
 * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC
 * 4646</a></li>
 * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC
 * 4647</a></li>
 * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC
 * 5646</a></li>
 * </ul>
 */
public class LangTag {
    // Valid language tag, not irregular nor grandfathered.

    /** Index of the language part */
    public static final int idxLanguage  = 0 ;
    /** Index of the script part */
    public static final int idxScript    = 1 ;
    /** Index of the region part */
    public static final int idxRegion    = 2 ;
    /** Index of the variant part */
    public static final int idxVariant   = 3 ;
    /** Index of all extensions */
    public static final int idxExtension = 4 ;

    /** Length of the array returned by {@link #parse(String)}. */
    private static final int partsLength = 5 ;

    /** Static utility class: not instantiable. */
    private LangTag() {}

    // Defined by BCP 47 which is currently RFC5646 which obsoletes RFC4646.

    // Canonical forms:

    /*
     * RFC 4646 In this format, all non-initial two-letter subtags are
     * uppercase, all non-initial four-letter subtags are titlecase, and all
     * other subtags are lowercase.
     */

    /*
     * RFC 5646 An implementation can reproduce this format without accessing
     * the registry as follows. All subtags, including extension and private use
     * subtags, use lowercase letters with two exceptions: two-letter and
     * four-letter subtags that neither appear at the start of the tag nor occur
     * after singletons. Such two-letter subtags are all uppercase (as in the
     * tags "en-CA-x-ca" or "sgn-BE-FR") and four-letter subtags are titlecase
     * (as in the tag "az-Latn-x-latn").
     */

    // ABNF definition (ABNF itself: http://www.ietf.org/rfc/rfc4234.txt, RFC 4234)
    //
    // Language-Tag  = langtag
    //               / privateuse             ; private use tag
    //               / grandfathered          ; grandfathered registrations
    //
    // langtag       = (language
    //                  ["-" script]
    //                  ["-" region]
    //                  *("-" variant)
    //                  *("-" extension)
    //                  ["-" privateuse])
    //
    // language      = (2*3ALPHA [ extlang ]) ; shortest ISO 639 code
    //               / 4ALPHA                 ; reserved for future use
    //               / 5*8ALPHA               ; registered language subtag
    //
    // extlang       = *3("-" 3ALPHA)         ; reserved for future use
    //
    // script        = 4ALPHA                 ; ISO 15924 code
    //
    // region        = 2ALPHA                 ; ISO 3166 code
    //               / 3DIGIT                 ; UN M.49 code
    //
    // variant       = 5*8alphanum            ; registered variants
    //               / (DIGIT 3alphanum)
    //
    // extension     = singleton 1*("-" (2*8alphanum))
    //
    // singleton     = %x41-57 / %x59-5A / %x61-77 / %x79-7A / DIGIT
    //               ; "a"-"w" / "y"-"z" / "A"-"W" / "Y"-"Z" / "0"-"9"
    //               ; Single letters: x/X is reserved for private use
    //
    // privateuse    = ("x"/"X") 1*("-" (1*8alphanum))
    //
    // grandfathered = 1*3ALPHA 1*2("-" (2*8alphanum))
    //               ; grandfathered registration
    //               ; Note: i is the only singleton
    //               ; that starts a grandfathered tag
    //
    // alphanum      = (ALPHA / DIGIT)        ; letters and numbers

    private static final String languageRE_1 = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})" ;
    private static final String languageRE_2 = "[a-zA-Z]{4}" ;
    private static final String languageRE_3 = "[a-zA-Z]{5,8}" ;
    private static final String language     = languageRE_1 + "|" + languageRE_2 + "|" + languageRE_3 ;

    private static final String script       = "[a-zA-Z]{4}" ;
    private static final String region       = "[a-zA-Z]{2}|[0-9]{3}" ;
    private static final String variant      = "[a-zA-Z0-9]{5,8}" ;
    private static final String extension1   = "(?:[a-zA-Z0-9]-[a-zA-Z0-9]{2,8})" ;
    private static final String extension    = extension1 + "(?:-" + extension1 + ")*" ;

    // Five capturing groups, in the order of the idx* constants:
    // language, script, region, variant, extension.
    private static final String langtag =
        String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))?(?:-(%s))?$",
                      language, script, region, variant, extension) ;

    // Private use forms "x-"
    private static final String privateuseRE = "^[xX](-[a-zA-Z0-9]{1,8})*$" ;

    // In general, this can look like a langtag but there are no registered
    // forms that do so.
    // This is for the "i-" forms only.
    // The expression carries no anchors of its own; it is only ever applied
    // with Matcher.matches(), which requires the whole input to match.
    private static final String grandfatheredRE = "i(?:-[a-zA-Z0-9]{2,8}){1,2}" ;

    private static final Pattern pattern              = Pattern.compile(langtag) ;
    private static final Pattern patternPrivateuse    = Pattern.compile(privateuseRE) ;
    private static final Pattern patternGrandfathered = Pattern.compile(grandfatheredRE) ;

    /**
     * Validate - basic syntax check for a language tag: [a-zA-Z]+ ('-'
     * [a-zA-Z0-9]+)*
     * <p>
     * The character classes are decided by {@code RiotChars.isA2Z} (first
     * part) and {@code RiotChars.isA2ZN} (later parts); parts are separated by
     * {@code Chars.CH_DASH}.
     * <p>
     * Note: the empty string is accepted (the scan loop is never entered).
     * This long-standing behaviour is kept for existing callers.
     *
     * @param languageTag the string to check; must not be null
     * @return true if the string passes the basic syntax check
     */
    public static boolean check(String languageTag) {
        int len = languageTag.length() ;
        int idx = 0 ;
        boolean first = true ;
        while (idx < len) {
            // Scan one part; the result is the index of the first character
            // that does not belong to it.
            int idx2 = checkPart(languageTag, idx, first) ;
            first = false ;
            if ( idx2 == idx )
                // zero length part.
                return false ;
            idx = idx2 ;
            if ( idx == len )
                return true ;
            if ( languageTag.charAt(idx) != Chars.CH_DASH )
                return false ;
            idx++ ;
            if ( idx == len )
                // trailing DASH
                return false ;
        }
        return true ;
    }

    /**
     * Scan forward from {@code idx} over the characters acceptable in one part.
     *
     * @param languageTag string being scanned
     * @param idx         index to start scanning from
     * @param leader      true for the first part (tested with
     *                    {@code RiotChars.isA2Z}), false for later parts
     *                    (tested with {@code RiotChars.isA2ZN})
     * @return index of the first unacceptable character, or the string length
     *         if the scan ran off the end
     */
    private static int checkPart(String languageTag, int idx, boolean leader) {
        for (; idx < languageTag.length(); idx++) {
            int ch = languageTag.charAt(idx) ;
            boolean ok = leader ? RiotChars.isA2Z(ch) : RiotChars.isA2ZN(ch) ;
            if ( !ok )
                // Not acceptable.
                return idx ;
        }
        // Off end.
        return idx ;
    }

    /**
     * Parse a langtag string and return its parts in canonical case. See
     * constants for the array contents. Parts not present cause a null in the
     * return array.
     * <p>
     * Private use ("x-...") and grandfathered ("i-...") tags are returned
     * unchanged in the {@link #idxExtension} slot with all other slots null.
     * <p>
     * The whole input must match one of the three forms; a tag that merely
     * contains a valid form somewhere inside it does not parse.
     *
     * @param languageTag the string to parse; null yields null
     * @return Langtag parts, or null if the input string does not parse as a
     *         lang tag.
     */
    public static String[] parse(String languageTag) {
        if ( languageTag == null )
            return null ;

        String[] parts = new String[partsLength] ;

        Matcher m = pattern.matcher(languageTag) ;
        if ( m.matches() ) {
            int gc = m.groupCount() ;
            for (int i = 0; i < gc; i++)
                parts[i] = m.group(i + 1) ;
            parts[idxLanguage] = lowercase(parts[idxLanguage]) ;
            parts[idxScript]   = strcase(parts[idxScript]) ;
            parts[idxRegion]   = strcase(parts[idxRegion]) ;
            parts[idxVariant]  = strcase(parts[idxVariant]) ;
            // Extensions are deliberately left as written.
            return parts ;
        }

        // matches() (not find()) so that e.g. "123-i-klingon" is rejected
        // instead of being silently rewritten to the embedded "i-klingon".
        if ( patternPrivateuse.matcher(languageTag).matches()
             || patternGrandfathered.matcher(languageTag).matches() ) {
            // Place in the "extension" part
            parts[idxExtension] = languageTag ;
            return parts ;
        }

        // Give up.
        return null ;
    }

    /**
     * Canonicalize with the rules of RFC 4646.
     *
     * @param str language tag string, may be null
     * @return the tag in canonical case; the input itself if it does not parse
     *         as a language tag; null if the input is null
     */
    public static String canonical(String str) {
        if ( str == null )
            return null ;
        String[] parts = parse(str) ;
        String x = canonical(parts) ;
        if ( x == null ) {
            // Could try to apply the case-setting rules
            // even though it's not a conforming langtag.
            return str ;
        }
        return x ;
    }

    /**
     * Canonicalize with the rules of RFC 4646 "In this format, all non-initial
     * two-letter subtags are uppercase, all non-initial four-letter subtags are
     * titlecase, and all other subtags are lowercase." In addition, leave
     * extensions unchanged.
     *
     * @param parts array as returned by {@link #parse(String)}, may be null
     * @return the non-null parts joined with "-"; the extension slot alone when
     *         there is no language part; null if {@code parts} is null
     */
    public static String canonical(String[] parts) {
        // We canonicalised parts on parsing.
        // RFC 5646 is slightly different.
        if ( parts == null )
            return null ;

        if ( parts[idxLanguage] == null ) {
            // Private use or grandfathered: parse() stores these, unchanged,
            // in the extension slot.
            return parts[idxExtension] ;
        }

        StringBuilder sb = new StringBuilder() ;
        sb.append(parts[idxLanguage]) ;
        for (int i = 1; i < parts.length; i++) {
            if ( parts[i] != null ) {
                sb.append("-") ;
                sb.append(parts[i]) ;
            }
        }
        return sb.toString() ;
    }

    /**
     * Apply the length-based case rule to a non-initial subtag: two characters
     * become uppercase, four become titlecase, anything else lowercase.
     * Null is passed through.
     */
    private static String strcase(String string) {
        if ( string == null )
            return null ;
        if ( string.length() == 2 )
            return uppercase(string) ;
        if ( string.length() == 4 )
            return titlecase(string) ;
        return lowercase(string) ;
    }

    /** Locale-independent lowercase; null is passed through. */
    private static String lowercase(String string) {
        if ( string == null )
            return null ;
        return string.toLowerCase(Locale.ROOT) ;
    }

    /** Locale-independent uppercase; null is passed through. */
    private static String uppercase(String string) {
        if ( string == null )
            return null ;
        return string.toUpperCase(Locale.ROOT) ;
    }

    /**
     * First character uppercase, remainder lowercase; null is passed through.
     * The string must be non-empty (the only caller passes four-character
     * strings).
     */
    private static String titlecase(String string) {
        if ( string == null )
            return null ;
        char ch1 = Character.toUpperCase(string.charAt(0)) ;
        return ch1 + lowercase(string.substring(1)) ;
    }
}