LangTag.java example

Explorer
jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.riot.web ;

import java.util.Locale ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;

import org.apache.jena.atlas.lib.Chars ;
import org.apache.jena.riot.system.RiotChars ;

/**
 * Language tags: support for parsing and canonicalization of case.
 * Grandfathered forms ("i-") are left untouched. Unsupported or syntactically
 * illegal forms are handled in canonicalization by doing nothing.
 * <ul>
 * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC
 * 4646</a></li>
 * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC
 * 4647</a></li>
 * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC
 * 5646</a></li>
 * </ul>
 */

public class LangTag {
    // Valid language tag, not ireegular nor grandfathered.
    /** Index of the language part */
    public static final int  idxLanguage  = 0 ;
    /** Index of the script part */
    public static final int  idxScript    = 1 ;
    /** Index of the region part */
    public static final int  idxRegion    = 2 ;
    /** Index of the variant part */
    public static final int  idxVariant   = 3 ;
    /** Index of all extensions */
    public static final int  idxExtension = 4 ;

    private static final int partsLength  = 5 ;

    private LangTag() {}

    // Defined by BCP 47 which is currently RFC5646 which obsoletes RFC4646.

    // Canonical forms:
    /*
     * RFC 4646 In this format, all non-initial two-letter subtags are
     * uppercase, all non-initial four-letter subtags are titlecase, and all
     * other subtags are lowercase.
     */
    /*
     * RFC 5646 An implementation can reproduce this format without accessing
     * the registry as follows. All subtags, including extension and private use
     * subtags, use lowercase letters with two exceptions: two-letter and
     * four-letter subtags that neither appear at the start of the tag nor occur
     * after singletons. Such two-letter subtags are all uppercase (as in the
     * tags "en-CA-x-ca" or "sgn-BE-FR") and four- letter subtags are titlecase
     * (as in the tag "az-Latn-x-latn").
     */

    /*
     * ABNF definition: <a href="http://www.ietf.org/rfc/rfc4234.txt">RFC
     * 4234</a>
     * 
     * Language-Tag = langtag / privateuse ; private use tag / grandfathered ;
     * grandfathered registrations
     * 
     * langtag = (language ["-" script] ["-" region]("-" variant)("-" extension)
     * ["-" privateuse])
     * 
     * language = (2*3ALPHA [ extlang ]) ; shortest ISO 639 code / 4ALPHA ;
     * reserved for future use / 5*8ALPHA ; registered language subtag
     * 
     * extlang = *3("-" 3ALPHA) ; reserved for future use
     * 
     * script = 4ALPHA ; ISO 15924 code
     * 
     * region = 2ALPHA ; ISO 3166 code / 3DIGIT ; UN M.49 code
     * 
     * variant = 5*8alphanum ; registered variants / (DIGIT 3alphanum)
     * 
     * extension = singleton 1*("-" (2*8alphanum))
     * 
     * singleton = %x41-57 / %x59-5A / %x61-77 / %x79-7A / DIGIT ; "a"-"w" /
     * "y"-"z" / "A"-"W" / "Y"-"Z" / "0"-"9" ; Single letters: x/X is reserved
     * for private use
     * 
     * privateuse = ("x"/"X") 1*("-" (1*8alphanum))
     * 
     * grandfathered = 1*3ALPHA 1*2("-" (2*8alphanum)) ; grandfathered
     * registration ; Note: i is the only singleton ; that starts a
     * grandfathered tag
     * 
     * alphanum = (ALPHA / DIGIT) ; letters and numbers
     */

    private static final String languageRE_1         = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})" ;
    private static final String languageRE_2         = "[a-zA-Z]{4}" ;
    private static final String languageRE_3         = "[a-zA-Z]{5,8}" ;
    private static final String language             = languageRE_1 + "|" + languageRE_2 + "|" + languageRE_3 ;

    private static final String script               = "[a-zA-Z]{4}" ;
    private static final String region               = "[a-zA-Z]{2}|[0-9]{3}" ;
    private static final String variant              = "[a-zA-Z0-9]{5,8}" ;
    private static final String extension1           = "(?:[a-zA-Z0-9]-[a-zA-Z0-9]{2,8})" ;
    private static final String extension            = extension1 + "(?:-" + extension1 + ")*" ;

    // private static final String singleton = null ;
    // private static final String privateuse = null ;
    // private static final String grandfathered = null ;

    private static final String langtag              = String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))?(?:-(%s))?$",
                                                                     language, script, region, variant, extension) ;

    // Private use forms "x-"
    private static final String privateuseRE         = "^[xX](-[a-zA-Z0-9]{1,8})*$" ;
    // In general, this can look like a langtag but there are no registered
    // forms that do so.
    // This is for the "i-" forms only.
    private static final String grandfatheredRE      = "i(?:-[a-zA-Z0-9]{2,8}){1,2}" ;

    private static Pattern      pattern              = Pattern.compile(langtag) ;
    private static Pattern      patternPrivateuse    = Pattern.compile(privateuseRE) ;
    private static Pattern      patternGrandfathered = Pattern.compile(grandfatheredRE) ;

    /**
     * Validate - basic syntax check for a language tags: [a-zA-Z]+ ('-'
     * [a-zA-Z0-9]+)*
     */
    public static boolean check(String languageTag) {
        int len = languageTag.length() ;
        int idx = 0 ;
        boolean first = true ;
        while (idx < languageTag.length()) {
            int idx2 = checkPart(languageTag, idx, first) ;
            first = false ;
            if ( idx2 == idx )
                // zero length part.
                return false ;
            idx = idx2 ;
            if ( idx == len )
                return true ;
            if ( languageTag.charAt(idx) != Chars.CH_DASH )
                return false ;
            idx++ ;
            if ( idx == len )
                // trailing DASH
                return false ;
        }
        return true ;
    }

    private static int checkPart(String languageTag, int idx, boolean leader) {
        for (; idx < languageTag.length(); idx++) {
            int ch = languageTag.charAt(idx) ;
            if ( leader ) {
                if ( RiotChars.isA2Z(ch) )
                    continue ;
            } else {
                if ( RiotChars.isA2ZN(ch) )
                    continue ;
            }
            // Not acceptable.
            return idx ;
        }
        // Off end.
        return idx ;
    }

    /**
     * Parse a langtag string and return it's parts in canonical case. See
     * constants for the array contents. Parts not present cause a null in the
     * return array.
     * 
     * @return Langtag parts, or null if the input string does not poarse as a
     *         lang tag.
     */
    public static String[] parse(String languageTag) {
        String[] parts = new String[partsLength] ;

        String x = pattern.toString() ;

        Pattern.compile(languageRE_1) ;

        Matcher m = pattern.matcher(languageTag) ;
        if ( !m.find() ) {
            m = patternPrivateuse.matcher(languageTag) ;
            if ( m.find() ) {
                // Place in the "extension" part
                parts[idxExtension] = m.group(0) ;
                return parts ;
            }

            m = patternGrandfathered.matcher(languageTag) ;

            if ( m.find() ) {
                // Place in the "extension" part
                parts[idxExtension] = m.group(0) ;
                return parts ;
            }

            // Give up.
            return null ;
        }

        int gc = m.groupCount() ;
        for (int i = 0; i < gc; i++)
            parts[i] = m.group(i + 1) ;

        parts[idxLanguage] = lowercase(parts[idxLanguage]) ;
        parts[idxScript] = strcase(parts[idxScript]) ;
        parts[idxRegion] = strcase(parts[idxRegion]) ;
        parts[idxVariant] = strcase(parts[idxVariant]) ;
        // parts[idxExtension] = strcase(parts[idxExtension]) ; // Leave
        // extensions alone.
        return parts ;
    }

    /** Canonicalize with the rules of RFC 4646 */
    public static String canonical(String str) {
        if ( str == null )
            return null ;
        String[] parts = parse(str) ;
        String x = canonical(parts) ;
        if ( x == null ) {
            // Could try to apply the rule case-seeting rules
            // even through it's not a conforming langtag.
            return str ;
        }
        return x ;
    }

    /**
     * Canonicalize with the rules of RFC 4646 "In this format, all non-initial
     * two-letter subtags are uppercase, all non-initial four-letter subtags are
     * titlecase, and all other subtags are lowercase." In addition, leave
     * extensions unchanged.
     */
    public static String canonical(String[] parts) {
        // We canonicalised parts on parsing.
        // RFC 5646 is slightly different.
        if ( parts == null )
            return null ;

        if ( parts[0] == null ) {
            // Grandfathered
            return parts[idxExtension] ;
        }

        StringBuilder sb = new StringBuilder() ;
        sb.append(parts[0]) ;
        for (int i = 1; i < parts.length; i++) {
            if ( parts[i] != null ) {
                sb.append("-") ;
                sb.append(parts[i]) ;
            }
        }
        return sb.toString() ;
    }

    private static String strcase(String string) {
        if ( string == null )
            return null ;
        if ( string.length() == 2 )
            return uppercase(string) ;
        if ( string.length() == 4 )
            return titlecase(string) ;
        return lowercase(string) ;
    }

    private static String lowercase(String string) {
        if ( string == null )
            return null ;
        return string.toLowerCase(Locale.ROOT) ;
    }

    private static String uppercase(String string) {
        if ( string == null )
            return null ;
        return string.toUpperCase(Locale.ROOT) ;
    }

    private static String titlecase(String string) {
        if ( string == null )
            return null ;
        char ch1 = string.charAt(0) ;
        ch1 = Character.toUpperCase(ch1) ;
        string = lowercase(string.substring(1)) ;
        return ch1 + string ;
    }
}