/*
* Copyright (c) 2013-2015 Josef Hardi <josef.hardi@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.obidea.semantika.util;
/**
 * Static helpers for testing characters and character sequences against the
 * name productions of XML 1.0 (<code>NameStartChar</code>,
 * <code>NameChar</code>) and Namespaces in XML (<code>NCName</code>,
 * <code>QName</code>), and for splitting a character sequence at its longest
 * NCName suffix.
 * <p>
 * All character-level methods take a Unicode code point (not a UTF-16 code
 * unit), so supplementary characters must be passed as a single
 * <code>int</code> value.
 */
public class XmlUtils
{
   /**
    * Determines if the input character <code>c</code> is an XML name start
    * character.
    *
    * Source: http://www.w3.org/TR/xml/#NT-NameStartChar
    *
    * @param c
    *           the Unicode code point to test
    * @return Returns <code>true</code> if the input character is a name start
    *         character, or <code>false</code> otherwise.
    */
   public static boolean isXmlNameStartChar(int c)
   {
      return c == ':'
            || (c >= 'A' && c <= 'Z')
            || c == '_'
            || (c >= 'a' && c <= 'z')
            || (c >= 0xC0 && c <= 0xD6)
            || (c >= 0xD8 && c <= 0xF6)
            || (c >= 0xF8 && c <= 0x2FF)
            || (c >= 0x370 && c <= 0x37D)
            || (c >= 0x37F && c <= 0x1FFF)
            || (c >= 0x200C && c <= 0x200D)
            || (c >= 0x2070 && c <= 0x218F)
            || (c >= 0x2C00 && c <= 0x2FEF)
            || (c >= 0x3001 && c <= 0xD7FF)
            || (c >= 0xF900 && c <= 0xFDCF)
            || (c >= 0xFDF0 && c <= 0xFFFD)
            || (c >= 0x10000 && c <= 0xEFFFF);
   }

   /**
    * Determines if the input character <code>c</code> is an XML name character.
    *
    * Source: http://www.w3.org/TR/xml/#NT-NameChar
    *
    * @param c
    *           the Unicode code point to test
    * @return Returns <code>true</code> if the input character is a name
    *         character, or <code>false</code> otherwise.
    */
   public static boolean isXmlNameChar(int c)
   {
      return isXmlNameStartChar(c)
            || c == '-'
            || c == '.'
            || (c >= '0' && c <= '9')
            || c == 0xB7
            || (c >= 0x0300 && c <= 0x036F)
            || (c >= 0x203F && c <= 0x2040);
   }

   /**
    * Determines if the input character <code>c</code> is an NCName (Non-Colon
    * Name, i.e., an XML Name, minus the ":") start character.
    *
    * @param c
    *           the Unicode code point to test
    * @return Returns <code>true</code> if the input character is a NCName start
    *         character, or <code>false</code> otherwise.
    */
   public static boolean isNCNameStartChar(int c)
   {
      return c != ':' && isXmlNameStartChar(c);
   }

   /**
    * Determines if the input character <code>c</code> is a NCName (Non-Colon
    * Name, i.e., an XML Name, minus the ":") character.
    *
    * @param c
    *           the Unicode code point to test
    * @return Returns <code>true</code> if the input character is a NCName
    *         character, or <code>false</code> otherwise.
    */
   public static boolean isNCNameChar(int c)
   {
      return c != ':' && isXmlNameChar(c);
   }

   /**
    * Determines if the input character sequence <code>cs</code> is a NCName
    * (Non-Colon Name). An NCName is a string which starts with an NCName start
    * character and is followed by zero or more NCName characters.
    *
    * Source: http://www.w3.org/TR/xml-names/#NT-NCName
    *
    * @param cs
    *           The character sequence to test; may be <code>null</code>.
    * @return Returns <code>true</code> if the input character sequence is a
    *         NCName or <code>false</code> otherwise (including when it is
    *         <code>null</code> or empty).
    */
   public static boolean isNCName(CharSequence cs)
   {
      if (isEmpty(cs)) {
         return false;
      }
      int firstChar = Character.codePointAt(cs, 0);
      if (!isNCNameStartChar(firstChar)) {
         return false;
      }
      // Advance by code point so that surrogate pairs are tested as one character.
      for (int i = Character.charCount(firstChar); i < cs.length();) {
         int c = Character.codePointAt(cs, i);
         if (!isNCNameChar(c)) {
            return false;
         }
         i += Character.charCount(c);
      }
      return true;
   }

   /**
    * Determines if a character sequence is a QName.
    * <p>
    * A QName is either:
    * <ul>
    * <li>an NCName (LocalName), or</li>
    * <li>an NCName followed by a colon and by another NCName
    * (PrefixName:LocalName)</li>
    * </ul>
    * In particular, both the prefix and the local part must be non-empty, so
    * <code>":b"</code> and <code>"a:"</code> are not QNames.
    *
    * Source: http://www.w3.org/TR/xml-names/#NT-QName
    *
    * @param s
    *           The character sequence to test; may be <code>null</code>.
    * @return Returns <code>true</code> if the character sequence
    *         <code>s</code> is a QName, or <code>false</code> otherwise
    *         (including when it is <code>null</code> or empty).
    */
   public static boolean isQName(CharSequence s)
   {
      if (isEmpty(s)) {
         return false;
      }
      boolean foundColon = false;
      // true while the scan is inside a (non-empty) NCName part
      boolean inNCName = false;
      for (int i = 0; i < s.length();) {
         int c = Character.codePointAt(s, i);
         if (c == ':') {
            // At most one colon, and it must follow a non-empty prefix.
            if (foundColon || !inNCName) {
               return false;
            }
            foundColon = true;
            inNCName = false;
         }
         else if (!inNCName) {
            // First character of the prefix or of the local part.
            if (!isNCNameStartChar(c)) {
               return false;
            }
            inNCName = true;
         }
         else if (!isNCNameChar(c)) {
            return false;
         }
         i += Character.charCount(c);
      }
      // The scan must end inside an NCName; otherwise the input ended with a
      // colon and the local part is missing.
      return inNCName;
   }

   /**
    * Determines if a character sequence <code>cs</code> has a suffix that is
    * an NCName.
    *
    * @param cs
    *           The character sequence to test; may be <code>null</code>.
    * @return Returns <code>true</code> if the character sequence
    *         <code>cs</code> has a suffix that is an NCName, or
    *         <code>false</code> otherwise (including when it is
    *         <code>null</code>).
    */
   public static boolean hasNCNameSuffix(CharSequence cs)
   {
      return getNCNameSuffixIndex(cs) != -1;
   }

   /**
    * Gets the index of the longest NCName that is the suffix of a character
    * sequence.
    *
    * @param cs
    *           The character sequence; may be <code>null</code>.
    * @return Returns the index of the longest suffix of the specified character
    *         sequence <code>cs</code> that is an NCName, or -1 if the character
    *         sequence <code>cs</code> is <code>null</code> or does not have a
    *         suffix that is an NCName.
    */
   public static int getNCNameSuffixIndex(CharSequence cs)
   {
      if (cs == null) {
         return -1;
      }
      int index = -1;
      // Walk backwards one code point at a time. codePointBefore() combines a
      // valid surrogate pair and returns an unpaired surrogate as itself; the
      // latter is not an NCName character, so it terminates the scan instead
      // of being silently absorbed into the suffix.
      int i = cs.length();
      while (i > 0) {
         int c = Character.codePointBefore(cs, i);
         if (!isNCNameChar(c)) {
            break;
         }
         i -= Character.charCount(c);
         // Only positions holding a start character can begin an NCName.
         if (isNCNameStartChar(c)) {
            index = i;
         }
      }
      return index;
   }

   /**
    * Gets the longest NCName that is a suffix of a character sequence.
    *
    * @param cs
    *           The character sequence; may be <code>null</code>.
    * @return Returns the string which is the longest suffix of the character
    *         sequence <code>cs</code> that is an NCName, or <code>null</code>
    *         if the character sequence <code>cs</code> is <code>null</code> or
    *         does not have a suffix that is an NCName.
    */
   public static String getNCNameSuffix(CharSequence cs)
   {
      int localPartStartIndex = getNCNameSuffixIndex(cs);
      if (localPartStartIndex == -1) {
         return null;
      }
      return cs.toString().substring(localPartStartIndex);
   }

   /**
    * Gets the part of a character sequence that precedes its NCName suffix.
    *
    * @param cs
    *           The character sequence; may be <code>null</code>.
    * @return Returns the leading part of <code>cs</code> up to (but excluding)
    *         its longest NCName suffix, the whole input as a string if no
    *         NCName suffix is found, or <code>null</code> if <code>cs</code>
    *         is <code>null</code>.
    */
   public static String getNCNamePrefix(CharSequence cs)
   {
      if (cs == null) {
         return null;
      }
      int localPartStartIndex = getNCNameSuffixIndex(cs);
      if (localPartStartIndex == -1) {
         return cs.toString();
      }
      return cs.toString().substring(0, localPartStartIndex);
   }

   /**
    * Determines if a character sequence is <code>null</code> or empty.
    *
    * @param cs
    *           The character sequence.
    * @return Returns <code>true</code> if the character sequence is
    *         <code>null</code> or empty, or <code>false</code> otherwise.
    */
   private static boolean isEmpty(CharSequence cs)
   {
      return cs == null || cs.length() == 0;
   }
}