/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.rdf.model.impl;
import java.util.Objects ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
import org.apache.jena.JenaRuntime ;
import org.apache.jena.datatypes.RDFDatatype ;
import org.apache.jena.datatypes.xsd.XSDDatatype ;
import org.apache.jena.graph.Node ;
import org.apache.jena.rdf.model.Literal ;
import org.apache.jena.shared.CannotEncodeCharacterException ;
import org.apache.jena.util.SplitIRI ;
import org.apache.xerces.util.XMLChar ;
/** Some utility functions.
*/
public class Util extends Object {
/**
* Given an absolute URI, determine the split point between the namespace
* part and the localname part. If there is no valid localname part then the
* length of the string is returned. The algorithm tries to find the longest
* NCName at the end of the uri, not immediately preceeded by the first
* colon in the string.
* <p>
* This operation follows XML QName rules which are more complicated than
* needed for Turtle and TriG. For example, QName can't start with a digit.
*
* @param uri
* @return the index of the first character of the localname
* @see SplitIRI
*/
public static int splitNamespaceXML(String uri) {
// XML Namespaces 1.0:
// A qname name is NCName ':' NCName
// NCName ::= NCNameStartChar NCNameChar*
// NCNameChar ::= NameChar - ':'
// NCNameStartChar ::= Letter | '_'
//
// XML 1.0
// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] |
// [#xD8-#xF6] | [#xF8-#x2FF] |
// [#x370-#x37D] | [#x37F-#x1FFF] |
// [#x200C-#x200D] | [#x2070-#x218F] |
// [#x2C00-#x2FEF] | [#x3001-#xD7FF] |
// [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
// [#x0300-#x036F] | [#x203F-#x2040]
// Name ::= NameStartChar (NameChar)*
char ch;
int lg = uri.length();
if (lg == 0)
return 0;
int i = lg-1 ;
for ( ; i >= 1 ; i--) {
ch = uri.charAt(i);
if (notNameChar(ch)) break;
}
int j = i + 1 ;
if ( j >= lg )
return lg ;
// Check we haven't split up a %-encoding.
if ( j >= 2 && uri.charAt(j-2) == '%' )
j = j+1 ;
if ( j >= 1 && uri.charAt(j-1) == '%' )
j = j+2 ;
// Have found the leftmost NCNameChar from the
// end of the URI string.
// Now scan forward for an NCNameStartChar
// The split must start with NCNameStart.
for (; j < lg; j++) {
ch = uri.charAt(j);
// if (XMLChar.isNCNameStart(ch))
// break ;
if (XMLChar.isNCNameStart(ch))
{
// "mailto:" is special.
// Keep part after mailto: at least one charcater.
// Do a quick test before calling .startsWith
// OLD: if ( uri.charAt(j - 1) == ':' && uri.lastIndexOf(':', j - 2) == -1)
if ( j == 7 && uri.startsWith("mailto:"))
continue; // split "mailto:me" as "mailto:m" and "e" !
else
break;
}
}
return j;
}
/**
answer true iff this is not a legal NCName character, ie, is
a possible split-point start.
*/
public static boolean notNameChar( char ch )
{ return !XMLChar.isNCName( ch ); }
protected static Pattern standardEntities =
Pattern.compile( "&|<|>|\t|\n|\r|\'|\"" );
public static String substituteStandardEntities( String s )
{
if (standardEntities.matcher( s ).find())
{
return substituteEntitiesInElementContent( s )
.replaceAll( "'", "'" )
.replaceAll( "\t"," " )
.replaceAll( "\n", "
" )
.replaceAll( "\r", "
" )
.replaceAll( "\"", """ )
;
}
else
return s;
}
protected static Pattern entityValueEntities =
Pattern.compile( "&|%|\'|\"" );
public static String substituteEntitiesInEntityValue( String s )
{
if (entityValueEntities.matcher( s ).find())
{
return s
.replaceAll( "&","&" )
.replaceAll( "'", "'" )
.replaceAll( "%", "%" )
.replaceAll( "\"", """ )
;
}
else
return s;
}
protected static Pattern elementContentEntities = Pattern.compile( "<|>|&|[\0-\37&&[^\n\t]]|\uFFFF|\uFFFE" );
/**
Answer <code>s</code> modified to replace <, >, and & by
their corresponding entity references.
<p>
Implementation note: as a (possibly misguided) performance hack,
the obvious cascade of replaceAll calls is replaced by an explicit
loop that looks for all three special characters at once.
*/
public static String substituteEntitiesInElementContent( String s )
{
Matcher m = elementContentEntities.matcher( s );
if (!m.find())
return s;
else
{
int start = 0;
StringBuilder result = new StringBuilder();
do
{
result.append( s.substring( start, m.start() ) );
char ch = s.charAt( m.start() );
switch ( ch )
{
case '\r': result.append( "
" ); break;
case '<': result.append( "<" ); break;
case '&': result.append( "&" ); break;
case '>': result.append( ">" ); break;
default: throw new CannotEncodeCharacterException( ch, "XML" );
}
start = m.end();
} while (m.find( start ));
result.append( s.substring( start ) );
return result.toString();
}
}
public static String replace(String s, String oldString, String newString) {
return s.replace(oldString, newString) ;
}
/**
* A Node is a simple string if:
* <li>(RDF 1.0) No datatype and no language tag.
* <li>(RDF 1.1) xsd:string
*/
public static boolean isSimpleString(Node n) {
Objects.requireNonNull(n) ;
RDFDatatype dt = n.getLiteralDatatype() ;
if ( dt == null )
return !isLangString(n) ;
if ( JenaRuntime.isRDF11 )
return dt.equals(XSDDatatype.XSDstring) ;
return false ;
}
/**
* A Node is a language string if it has a language tag.
* (RDF 1.0 and RDF 1.1)
*/
public static boolean isLangString(Node n) {
Objects.requireNonNull(n) ;
String lang = n.getLiteralLanguage() ;
if ( lang == null )
return false ;
return !lang.equals("") ;
}
/** Return true if the literal is a simple string.
* <p>RDF 1.0 => it is a plain literal, with no language tag
* <p>RDF 1.1 => it has datatype xsd:string
*/
public static boolean isSimpleString(Literal lit) {
Objects.requireNonNull(lit) ;
String dtStr = lit.getDatatypeURI() ;
if ( dtStr == null )
return ! isLangString(lit) ;
if ( JenaRuntime.isRDF11 )
return dtStr.equals(XSDDatatype.XSDstring.getURI());
return false ;
}
/** Return true if the literal has a language tag. (RDF 1.0 and RDF 1.1) */
public static boolean isLangString(Literal lit) {
Objects.requireNonNull(lit) ;
String lang = lit.getLanguage() ;
if ( lang == null )
return false ;
return ! lang.equals("") ;
}
}