/*
* Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2006.
*
* Licensed under the Aduna BSD-style license.
*/
package org.openrdf.rio.turtle;
import info.aduna.text.ASCIIUtil;
import info.aduna.text.StringUtil;
/**
 * Utility methods for Turtle encoding/decoding: character classification for
 * the Turtle grammar, validation of prefixes and names, splitting of URIs into
 * a namespace and a local name, and escaping/unescaping of strings and URIs.
 */
public class TurtleUtil {

    /**
     * Tries to find an index where the supplied URI can be split into a
     * namespace and a local name that comply with the serialization
     * constraints of the Turtle format. Specifically, the local name should
     * adhere to Turtle's <a
     * href="http://www.dajobe.org/2004/01/turtle/#name">name</a> production
     * rule.
     *
     * @param uri
     *        The URI to split.
     * @return The index of the first character of the local name, or
     *         <tt>-1</tt> if the URI cannot be split. A split at index 0
     *         (empty namespace) is not considered valid.
     */
    public static int findURISplitIndex(String uri) {
        int uriLength = uri.length();

        // Walk backwards to the last character that is not a name character
        int idx = uriLength - 1;
        for (; idx >= 0; idx--) {
            if (!TurtleUtil.isNameChar(uri.charAt(idx))) {
                // Found a non-name character
                break;
            }
        }

        // Move to the first character of the trailing run of name characters
        idx++;

        // Local names need to start with a 'nameStartChar', skip characters
        // that are not nameStartChar's.
        for (; idx < uriLength; idx++) {
            if (TurtleUtil.isNameStartChar(uri.charAt(idx))) {
                break;
            }
        }

        if (idx > 0 && idx < uriLength) {
            // A valid split index has been found
            return idx;
        }

        // No valid local name has been found
        return -1;
    }

    /**
     * Checks whether the supplied character is Turtle whitespace.
     *
     * @param c
     *        The character to check.
     * @return <tt>true</tt> if <tt>c</tt> is a space, tab, newline or carriage
     *         return.
     */
    public static boolean isWhitespace(int c) {
        // Whitespace character are space, tab, newline and carriage return:
        return c == 0x20 || c == 0x9 || c == 0xA || c == 0xD;
    }

    /**
     * Checks whether the supplied character may start a prefix: a letter as
     * determined by <tt>ASCIIUtil.isLetter</tt>, or a character in one of the
     * non-ASCII ranges listed below.
     *
     * @param c
     *        The character (code point) to check.
     * @return <tt>true</tt> if a prefix may start with <tt>c</tt>.
     */
    public static boolean isPrefixStartChar(int c) {
        return
            ASCIIUtil.isLetter(c) ||
            (c >= 0x00C0 && c <= 0x00D6) ||
            (c >= 0x00D8 && c <= 0x00F6) ||
            (c >= 0x00F8 && c <= 0x02FF) ||
            (c >= 0x0370 && c <= 0x037D) ||
            (c >= 0x037F && c <= 0x1FFF) ||
            (c >= 0x200C && c <= 0x200D) ||
            (c >= 0x2070 && c <= 0x218F) ||
            (c >= 0x2C00 && c <= 0x2FEF) ||
            (c >= 0x3001 && c <= 0xD7FF) ||
            (c >= 0xF900 && c <= 0xFDCF) ||
            (c >= 0xFDF0 && c <= 0xFFFD) ||
            (c >= 0x10000 && c <= 0xEFFFF);
    }

    /**
     * Checks whether the supplied character may start a name: an underscore
     * or any character accepted by {@link #isPrefixStartChar(int)}.
     *
     * @param c
     *        The character (code point) to check.
     * @return <tt>true</tt> if a name may start with <tt>c</tt>.
     */
    public static boolean isNameStartChar(int c) {
        return c == '_' || isPrefixStartChar(c);
    }

    /**
     * Checks whether the supplied character may occur inside a name: any
     * name start character, a number as determined by
     * <tt>ASCIIUtil.isNumber</tt>, a hyphen, U+00B7, or a character in the
     * ranges U+0300-U+036F and U+203F-U+2040.
     *
     * @param c
     *        The character (code point) to check.
     * @return <tt>true</tt> if <tt>c</tt> is a name character.
     */
    public static boolean isNameChar(int c) {
        return
            isNameStartChar(c) ||
            ASCIIUtil.isNumber(c) ||
            c == '-' ||
            c == 0x00B7 ||
            (c >= 0x0300 && c <= 0x036F) ||
            (c >= 0x203F && c <= 0x2040);
    }

    /**
     * Checks whether the supplied character may occur inside a prefix. The
     * same characters are allowed as inside a name.
     *
     * @param c
     *        The character (code point) to check.
     * @return <tt>true</tt> if <tt>c</tt> is a prefix character.
     */
    public static boolean isPrefixChar(int c) {
        return isNameChar(c);
    }

    /**
     * Checks whether the supplied character may start a language tag, which
     * is the case for letters as determined by <tt>ASCIIUtil.isLetter</tt>.
     *
     * @param c
     *        The character to check.
     * @return <tt>true</tt> if a language tag may start with <tt>c</tt>.
     */
    public static boolean isLanguageStartChar(int c) {
        return ASCIIUtil.isLetter(c);
    }

    /**
     * Checks whether the supplied character may occur inside a language tag:
     * a letter or number as determined by <tt>ASCIIUtil</tt>, or a hyphen.
     *
     * @param c
     *        The character to check.
     * @return <tt>true</tt> if <tt>c</tt> is a language tag character.
     */
    public static boolean isLanguageChar(int c) {
        return ASCIIUtil.isLetter(c) || ASCIIUtil.isNumber(c) || c == '-';
    }

    /**
     * Checks whether the supplied string is a legal prefix: it must be
     * non-empty, start with a prefix start character and continue with prefix
     * characters only. The string is examined one <tt>char</tt> at a time.
     *
     * @param prefix
     *        The prefix to check.
     * @return <tt>true</tt> if the prefix is legal, <tt>false</tt> otherwise.
     */
    public static boolean isLegalPrefix(String prefix) {
        if (prefix.length() == 0) {
            return false;
        }
        if (!isPrefixStartChar(prefix.charAt(0))) {
            return false;
        }
        for (int i = 1; i < prefix.length(); i++) {
            if (!isPrefixChar(prefix.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks whether the supplied string is a legal name: it must be
     * non-empty, start with a name start character and continue with name
     * characters only. The string is examined one <tt>char</tt> at a time.
     *
     * @param name
     *        The name to check.
     * @return <tt>true</tt> if the name is legal, <tt>false</tt> otherwise.
     */
    public static boolean isLegalName(String name) {
        if (name.length() == 0) {
            return false;
        }
        if (!isNameStartChar(name.charAt(0))) {
            return false;
        }
        for (int i = 1; i < name.length(); i++) {
            if (!isNameChar(name.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Encodes the supplied string for inclusion as a 'normal' string in a
     * Turtle document. Backslashes, tabs, newlines, carriage returns and
     * double quotes are replaced by their backslash-escaped form.
     *
     * @param s
     *        The string to encode.
     * @return The encoded string.
     */
    public static String encodeString(String s) {
        // Backslashes must be handled first, otherwise the backslashes
        // introduced by the substitutions below would be escaped again.
        s = StringUtil.gsub("\\", "\\\\", s);
        s = StringUtil.gsub("\t", "\\t", s);
        s = StringUtil.gsub("\n", "\\n", s);
        s = StringUtil.gsub("\r", "\\r", s);
        s = StringUtil.gsub("\"", "\\\"", s);
        return s;
    }

    /**
     * Encodes the supplied string for inclusion as a long string in a Turtle
     * document. Only backslashes and double quotes are escaped; tabs and line
     * breaks are left as they are.
     *
     * @param s
     *        The string to encode.
     * @return The encoded string.
     */
    public static String encodeLongString(String s) {
        // TODO: not all double quotes need to be escaped. It suffices to encode
        // the ones that form sequences of 3 or more double quotes, and the ones
        // at the end of a string.
        s = StringUtil.gsub("\\", "\\\\", s);
        s = StringUtil.gsub("\"", "\\\"", s);
        return s;
    }

    /**
     * Encodes the supplied string for inclusion as a (relative) URI in a Turtle
     * document. Backslashes and '&gt;' characters are backslash-escaped.
     *
     * @param s
     *        The URI string to encode.
     * @return The encoded URI string.
     */
    public static String encodeURIString(String s) {
        s = StringUtil.gsub("\\", "\\\\", s);
        s = StringUtil.gsub(">", "\\>", s);
        return s;
    }

    /**
     * Decodes an encoded Turtle string. Any \-escape sequences are substituted
     * with their decoded value. Supported are the escapes for tab, carriage
     * return, newline, double quote, '&gt;' and backslash, as well as the
     * 4-digit (lowercase 'u') and 8-digit (uppercase 'U') hexadecimal Unicode
     * escapes. An 8-digit escape may denote any code point up to U+10FFFF;
     * code points above U+FFFF are decoded to a surrogate pair.
     *
     * @param s
     *        An encoded Turtle string.
     * @return The unencoded string.
     * @exception IllegalArgumentException
     *            If the supplied string is not a correctly encoded Turtle
     *            string.
     */
    public static String decodeString(String s) {
        int backSlashIdx = s.indexOf('\\');

        if (backSlashIdx == -1) {
            // No escaped characters found
            return s;
        }

        int startIdx = 0;
        int sLength = s.length();
        StringBuilder sb = new StringBuilder(sLength);

        while (backSlashIdx != -1) {
            // Copy the literal text preceding the escape sequence
            sb.append(s, startIdx, backSlashIdx);

            if (backSlashIdx + 1 >= sLength) {
                throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            char c = s.charAt(backSlashIdx + 1);

            switch (c) {
                case 't':
                    sb.append('\t');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'r':
                    sb.append('\r');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'n':
                    sb.append('\n');
                    startIdx = backSlashIdx + 2;
                    break;
                case '"':
                    sb.append('"');
                    startIdx = backSlashIdx + 2;
                    break;
                case '>':
                    sb.append('>');
                    startIdx = backSlashIdx + 2;
                    break;
                case '\\':
                    sb.append('\\');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'u':
                    // Four hex digits never exceed 0xFFFF, so the value
                    // always fits in a single char.
                    sb.append((char)decodeUnicodeEscape(s, backSlashIdx, 'u', 4));
                    startIdx = backSlashIdx + 6;
                    break;
                case 'U':
                    // Eight hex digits can denote a supplementary code point,
                    // which must be appended as a surrogate pair rather than
                    // being truncated to 16 bits.
                    sb.appendCodePoint(decodeUnicodeEscape(s, backSlashIdx, 'U', 8));
                    startIdx = backSlashIdx + 10;
                    break;
                default:
                    throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            backSlashIdx = s.indexOf('\\', startIdx);
        }

        // Copy whatever follows the last escape sequence
        sb.append(s, startIdx, sLength);

        return sb.toString();
    }

    /**
     * Parses the hexadecimal digits of a Unicode escape sequence.
     *
     * @param s
     *        The string containing the escape sequence.
     * @param backSlashIdx
     *        The index of the backslash that starts the escape sequence; the
     *        digits start two positions further.
     * @param marker
     *        The escape marker character following the backslash, used in
     *        error messages only.
     * @param digitCount
     *        The number of hexadecimal digits that is expected.
     * @return The decoded value, between 0 and U+10FFFF inclusive.
     * @exception IllegalArgumentException
     *            If the string ends before all digits have been read, if one
     *            of the characters is not a hexadecimal digit (signs are not
     *            accepted), or if the value exceeds U+10FFFF.
     */
    private static int decodeUnicodeEscape(String s, int backSlashIdx, char marker, int digitCount) {
        int digitsStart = backSlashIdx + 2;
        int digitsEnd = digitsStart + digitCount;

        if (digitsEnd > s.length()) {
            throw new IllegalArgumentException(
                    "Incomplete Unicode escape sequence in: " + s);
        }

        int codePoint = 0;

        for (int i = digitsStart; i < digitsEnd; i++) {
            int digit = Character.digit(s.charAt(i), 16);

            if (digit >= 0) {
                // codePoint is at most 0x10FFFF here, so this cannot overflow
                codePoint = (codePoint << 4) | digit;
            }

            if (digit < 0 || codePoint > Character.MAX_CODE_POINT) {
                throw new IllegalArgumentException(
                        "Illegal Unicode escape sequence '\\" + marker
                        + s.substring(digitsStart, digitsEnd) + "' in: " + s);
            }
        }

        return codePoint;
    }
}