// Util.java example
//
// Explorer
// jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.rdf.model.impl;
import java.util.Objects ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;

import org.apache.jena.JenaRuntime ;
import org.apache.jena.datatypes.RDFDatatype ;
import org.apache.jena.datatypes.xsd.XSDDatatype ;
import org.apache.jena.graph.Node ;
import org.apache.jena.rdf.model.Literal ;
import org.apache.jena.shared.CannotEncodeCharacterException ;
import org.apache.jena.util.SplitIRI ;
import org.apache.xerces.util.XMLChar ;

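/**
 * Utility functions for the RDF model implementation: splitting a URI into
 * namespace and local-name parts, escaping text for use in XML, and
 * classifying literals as simple strings or language-tagged strings.
 */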
public class Util extends Object {

    /**
     * Given an absolute URI, determine the split point between the namespace
     * part and the localname part. If there is no valid localname part then the
     * length of the string is returned. The algorithm tries to find the longest
     * NCName at the end of the uri.
     * <p>
     * Two adjustments are applied to the candidate split point:
     * <ul>
     * <li>it is moved forward so that a %-encoding is not cut in two;</li>
     * <li>a localname never starts directly after the {@code "mailto:"}
     *     prefix: at least one character is kept after it, so
     *     {@code "mailto:me"} splits as {@code "mailto:m"} and {@code "e"}.</li>
     * </ul>
     * <p>
     * This operation follows XML QName rules which are more complicated than
     * needed for Turtle and TriG. For example, a QName can't start with a digit.
     * <p>
     * The result is always in the range {@code 0..uri.length()} inclusive, so it
     * is safe to pass to {@link String#substring(int)}.
     *
     * @param uri the URI to split; must not be {@code null}
     * @return the index of the first character of the localname, or
     *         {@code uri.length()} if there is no valid localname
     * @see SplitIRI
     */
    public static int splitNamespaceXML(String uri) {

        // XML Namespaces 1.0:
        // A qname name is NCName ':' NCName
        // NCName             ::=      NCNameStartChar NCNameChar*
        // NCNameChar         ::=      NameChar - ':'
        // NCNameStartChar    ::=      Letter | '_'
        //
        // XML 1.0
        // NameStartChar      ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] |
        //                        [#xD8-#xF6] | [#xF8-#x2FF] |
        //                        [#x370-#x37D] | [#x37F-#x1FFF] |
        //                        [#x200C-#x200D] | [#x2070-#x218F] |
        //                        [#x2C00-#x2FEF] | [#x3001-#xD7FF] |
        //                        [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
        // NameChar           ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
        //                        [#x0300-#x036F] | [#x203F-#x2040]
        // Name               ::= NameStartChar (NameChar)*

        int lg = uri.length();
        if (lg == 0)
            return 0;

        // Scan backwards over the trailing run of NCName characters.
        // Index 0 is never examined, so for a non-empty string the split
        // point is always at least 1.
        int i = lg-1 ;
        for ( ; i >= 1 ; i--) {
            if (notNameChar(uri.charAt(i))) break;
        }

        int j = i + 1 ;

        // The last character is not an NCName character: no localname.
        if ( j >= lg )
            return lg ;

        // Check we haven't split up a %-encoding.
        if ( j >= 2 && uri.charAt(j-2) == '%' )
            j = j+1 ;
        if ( j >= 1 && uri.charAt(j-1) == '%' )
            j = j+2 ;

        // The %-adjustment can overshoot the end of a (bad) URI that ends in
        // an incomplete escape such as "%x". Clamp so the result is a valid index.
        if ( j > lg )
            return lg ;

        // Have found the leftmost NCNameChar from the
        // end of the URI string.
        // Now scan forward for an NCNameStartChar
        // The split must start with NCNameStart.
        for (; j < lg; j++) {
            char ch = uri.charAt(j);
            if (XMLChar.isNCNameStart(ch)) {
                // "mailto:" is special.
                // Keep at least one character after "mailto:".
                // The cheap index test comes first to avoid calling startsWith.
                if ( j == 7 && uri.startsWith("mailto:"))
                    continue; // split "mailto:me" as "mailto:m" and "e" !
                break;
            }
        }
        return j;
    }

    /**
     * Answer true iff the character is not a legal NCName character, i.e. it
     * is a possible split-point start.
     *
     * @param ch the character to test
     * @return {@code true} if {@code XMLChar.isNCName} rejects {@code ch}
     */
    public static boolean notNameChar( char ch ) {
        final boolean legalInNCName = XMLChar.isNCName( ch );
        return !legalInNCName;
    }

    protected static Pattern standardEntities = 
    	   Pattern.compile( "&|<|>|\t|\n|\r|\'|\"" );
    
    public static String substituteStandardEntities( String s )
        {
        if (standardEntities.matcher( s ).find())
            {
            return substituteEntitiesInElementContent( s )
                .replaceAll( "'", "'" )
                .replaceAll( "\t","	" )
                .replaceAll( "\n", "
" )
                .replaceAll( "\r", "" )
                .replaceAll( "\"", """ )
                ;
            }
        else
            return s;
        }
    
    protected static Pattern entityValueEntities = 
 	   Pattern.compile( "&|%|\'|\"" );
 
   public static String substituteEntitiesInEntityValue( String s )
     {
     if (entityValueEntities.matcher( s ).find())
         {
         return s
             .replaceAll( "&","&" )
             .replaceAll( "'", "'" )
             .replaceAll( "%", "%" )
             .replaceAll( "\"", """ )
             ;
         }
     else
         return s;
     }
    protected static Pattern elementContentEntities = Pattern.compile( "<|>|&|[\0-\37&&[^\n\t]]|\uFFFF|\uFFFE" );
    /**
        Answer <code>s</code> modified to replace <, >, and & by
        their corresponding entity references. 
        
    <p>
        Implementation note: as a (possibly misguided) performance hack, 
        the obvious cascade of replaceAll calls is replaced by an explicit
        loop that looks for all three special characters at once.
    */
    public static String substituteEntitiesInElementContent( String s ) 
        {
        Matcher m = elementContentEntities.matcher( s );
        if (!m.find())
            return s;
        else
            {
            int start = 0;
            StringBuilder result = new StringBuilder();
            do
                {
                result.append( s.substring( start, m.start() ) );
                char ch = s.charAt( m.start() );
                switch ( ch )
                {
                    case '\r': result.append( "" ); break;
                    case '<': result.append( "<" ); break;
                    case '&': result.append( "&" ); break;
                    case '>': result.append( ">" ); break;
                    default: throw new CannotEncodeCharacterException( ch, "XML" );
                }
                start = m.end();
                } while (m.find( start ));
            result.append( s.substring( start ) );
            return result.toString();
            }
        }

    /**
     * Replace every occurrence of <code>oldString</code> in <code>s</code>
     * with <code>newString</code>. Both strings are treated literally, not as
     * regular expressions; this simply delegates to
     * {@link String#replace(CharSequence, CharSequence)}.
     *
     * @param s the string to search
     * @param oldString the literal text to look for
     * @param newString the literal text to substitute
     * @return the string with all occurrences replaced
     */
    public static String replace(String s, String oldString, String newString) {
        final String replaced = s.replace(oldString, newString) ;
        return replaced ;
    }

    /**
     * Test whether a Node is a simple string. A Node is a simple string if:
     * <ul>
     * <li>(RDF 1.0) it has no datatype and no language tag;</li>
     * <li>(RDF 1.1) its datatype is xsd:string.</li>
     * </ul>
     * A node with no datatype is judged by its language tag alone, whichever
     * RDF version is in effect.
     *
     * @param n the node to test; must not be {@code null}
     * @return {@code true} if the node is a simple string
     */
    public static boolean isSimpleString(Node n) {
        Objects.requireNonNull(n) ;
        final RDFDatatype datatype = n.getLiteralDatatype() ;
        if ( datatype != null )
            // A datatyped literal only counts under RDF 1.1, and only as xsd:string.
            return JenaRuntime.isRDF11 && datatype.equals(XSDDatatype.XSDstring) ;
        return !isLangString(n) ;
    }

    /**
     * Test whether a Node is a language string, that is, whether it has a
     * non-empty language tag. (RDF 1.0 and RDF 1.1)
     *
     * @param n the node to test; must not be {@code null}
     * @return {@code true} if the node's language tag is neither
     *         {@code null} nor the empty string
     */
    public static boolean isLangString(Node n) {
        Objects.requireNonNull(n) ;
        final String tag = n.getLiteralLanguage() ;
        return tag != null && !tag.isEmpty() ;
    }
    
    /**
     * Return true if the literal is a simple string.
     * <p>RDF 1.0: it is a plain literal, with no language tag.
     * <p>RDF 1.1: it has datatype xsd:string.
     * <p>A literal with no datatype URI is judged by its language tag alone,
     * whichever RDF version is in effect.
     *
     * @param lit the literal to test; must not be {@code null}
     * @return {@code true} if the literal is a simple string
     */
    public static boolean isSimpleString(Literal lit) {
        Objects.requireNonNull(lit) ;
        final String datatypeURI = lit.getDatatypeURI() ;
        if ( datatypeURI != null )
            // A datatyped literal only counts under RDF 1.1, and only as xsd:string.
            return JenaRuntime.isRDF11 && datatypeURI.equals(XSDDatatype.XSDstring.getURI()) ;
        return ! isLangString(lit) ;
    }
    
    /**
     * Return true if the literal has a language tag, that is, its language is
     * neither {@code null} nor the empty string. (RDF 1.0 and RDF 1.1)
     *
     * @param lit the literal to test; must not be {@code null}
     * @return {@code true} if the literal carries a non-empty language tag
     */
    public static boolean isLangString(Literal lit) {
        Objects.requireNonNull(lit) ;
        final String tag = lit.getLanguage() ;
        return tag != null && ! tag.isEmpty() ;
    }

}