/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.waveprotocol.wave.model.util;
/**
 * Some Unicode-related tools based on Unicode 5.1.0.
 *
 * Chapter 16 ( http://www.unicode.org/versions/Unicode5.0.0/ch16.pdf )
 * defines terminology like surrogates, noncharacters, control codes, etc.
 *
 * This is a static utility class and cannot be instantiated.
 */
public final class Utf16Util {

  private Utf16Util() {}

  /**
   * Unicode character 'REPLACEMENT CHARACTER'.
   */
  public static final char REPLACEMENT_CHARACTER = 0xFFFD;

  /**
   * The visitor interface used by traverseUtf16String.
   *
   * Returning null from codePoint() or unpairedSurrogate() lets the traversal
   * continue; any non-null value stops the traversal and becomes its result.
   *
   * @param <T> the result type of the traversal
   */
  public interface CodePointHandler<T> {
    /**
     * Called for every non-surrogate code unit and for every code point
     * decoded from a high/low surrogate pair.
     */
    T codePoint(int cp);

    /**
     * Called for every surrogate code unit that is not part of a
     * high-surrogate/low-surrogate pair.
     */
    T unpairedSurrogate(char c);

    /**
     * Called when the end of the string is reached; the returned value is the
     * result of the traversal.
     */
    T endOfString();
  }

  // java.lang.Character also has some of these, but not everything we want. So
  // we just have our own code for everything to reduce unnecessary indirections
  // that obscure the actual values of the numbers and their relations.

  /**
   * Returns whether c lies in the code point range 0..0x10ffff (inclusive).
   */
  public static boolean isCodePoint(int c) {
    return 0 <= c && c <= 0x10ffff;
  }

  /**
   * Returns whether the code unit c is a surrogate (0xd800..0xdfff).
   */
  public static boolean isSurrogate(char c) {
    return 0xd800 <= c && c <= 0xdfff;
  }

  /**
   * Returns whether the code unit c is a low (trailing) surrogate
   * (0xdc00..0xdfff).
   */
  public static boolean isLowSurrogate(char c) {
    return 0xdc00 <= c && c <= 0xdfff;
  }

  /**
   * Returns whether the code unit c is a high (leading) surrogate
   * (0xd800..0xdbff).
   */
  public static boolean isHighSurrogate(char c) {
    return 0xd800 <= c && c <= 0xdbff;
  }

  /**
   * Returns whether the code point c is a surrogate (0xd800..0xdfff).
   *
   * Values outside the code point range are reported through
   * Preconditions.illegalArgument.
   */
  public static boolean isSurrogate(int c) {
    checkCodePoint(c);
    return 0xd800 <= c && c <= 0xdfff;
  }

  /**
   * Returns whether the code point c lies outside the Basic Multilingual
   * Plane, i.e. is 0x10000 or greater.
   *
   * Values outside the code point range are reported through
   * Preconditions.illegalArgument.
   */
  public static boolean isSupplementaryCodePoint(int c) {
    checkCodePoint(c);
    return c >= 0x10000;
  }

  /**
   * Reports c through Preconditions.illegalArgument if it is not in the code
   * point range.
   */
  private static void checkCodePoint(int c) {
    if (!isCodePoint(c)) {
      Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c));
    }
  }

  /**
   * Reports c through Preconditions.illegalArgument if it is not in the code
   * point range or if it is a surrogate. The range check comes first so that
   * out-of-range values always get the "Not a code point" message.
   */
  private static void checkNonSurrogateCodePoint(int c) {
    checkCodePoint(c);
    if (isSurrogate(c)) {
      Preconditions.illegalArgument("Code point is a surrogate: 0x" + Integer.toHexString(c));
    }
  }

  /**
   * Returns whether c is a noncharacter: the last two code points of any
   * plane (0x..FFFE, 0x..FFFF) or one of 0xFDD0..0xFDEF.
   *
   * Performs no validation; callers check the argument first.
   */
  private static boolean isNoncharacter(int c) {
    int low16 = c & 0xFFFF;
    if (low16 == 0xFFFE || low16 == 0xFFFF) {
      return true;
    }
    return 0xFDD0 <= c && c <= 0xFDEF;
  }

  /**
   * Traverses the given UTF-16 string from left to right, decoding surrogates
   * into code points, and calls the handler for each code point and unmatched
   * surrogate.
   *
   * The return values of the handler's methods determine whether
   * to continue traversal and the return value of traverseUtf16String.
   *
   * Traversal continues as long as the handler returns null. If the handler
   * returns a non-null value, traversal immediately terminates, and
   * traverseUtf16String returns the value the handler returned.
   * If the end of the string is reached, traverseUtf16String calls
   * handler.endOfString() and returns its value.
   *
   * @param s the string to traverse; null is rejected through
   *        Preconditions.checkNotNull
   * @param handler the visitor that receives code points and unpaired
   *        surrogates
   * @return the first non-null value returned by the handler, or the value of
   *         handler.endOfString() if the whole string was traversed
   */
  public static <T> T traverseUtf16String(String s, CodePointHandler<T> handler) {
    Preconditions.checkNotNull(s, "Null string");
    final int length = s.length();
    int i = 0;
    while (i < length) {
      char c = s.charAt(i);
      T result;
      if (!isSurrogate(c)) {
        // Plain BMP code unit: it is its own code point.
        result = handler.codePoint(c);
        i++;
      } else if (isHighSurrogate(c) && i + 1 < length && isLowSurrogate(s.charAt(i + 1))) {
        // Well-formed pair: consume both code units as one code point.
        result = handler.codePoint(Character.toCodePoint(c, s.charAt(i + 1)));
        i += 2;
      } else {
        // Either a stray low surrogate, or a high surrogate that is followed
        // by the end of the string or by something other than a low
        // surrogate. Only this code unit is consumed, so whatever follows is
        // examined on its own in the next iteration.
        result = handler.unpairedSurrogate(c);
        i++;
      }
      if (result != null) {
        return result;
      }
    }
    return handler.endOfString();
  }

  /**
   * Returns the index of the first surrogate character in the given string,
   * or -1 if there aren't any.
   *
   * Does not check whether surrogates in s are paired correctly.
   */
  public static int firstSurrogate(String s) {
    for (int i = 0; i < s.length(); i++) {
      if (isSurrogate(s.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the index of the first non-surrogate character in the given string,
   * or -1 if all characters in s are surrogates.
   *
   * Does not check whether surrogates in s are paired correctly.
   */
  public static int firstNonSurrogate(String s) {
    for (int i = 0; i < s.length(); i++) {
      if (!isSurrogate(s.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Handler that yields true at the first unpaired surrogate and false if the
   * end of the string is reached without finding one.
   */
  private static final CodePointHandler<Boolean> UNPAIRED_SURROGATES =
      new CodePointHandler<Boolean>() {
        @Override
        public Boolean codePoint(int cp) {
          return null;
        }

        @Override
        public Boolean unpairedSurrogate(char c) {
          return true;
        }

        @Override
        public Boolean endOfString() {
          return false;
        }
      };

  /**
   * Returns whether s contains a surrogate code unit that is not part of a
   * high-surrogate/low-surrogate pair.
   *
   * @deprecated this check is too weak to be useful. We should always also
   *             check for noncharacter code points etc.; see isValidUtf16().
   */
  @Deprecated
  public static boolean containsUnpairedSurrogates(String s) {
    return traverseUtf16String(s, UNPAIRED_SURROGATES);
  }

  /**
   * Returns whether the given code point is a valid character, as opposed to
   * a noncharacter.
   *
   * Values outside the code point range and surrogates are reported through
   * Preconditions.illegalArgument.
   *
   * @param c the code point to examine; must be in the code point range and
   *        must not be a surrogate
   * @return true if the code point is valid, false if it is a non-character
   */
  public static boolean isCodePointValid(int c) {
    checkNonSurrogateCodePoint(c);
    return !isNoncharacter(c);
  }

  /**
   * Classification of a code point with respect to blip text, as computed by
   * isCodePointGoodForBlip().
   */
  public enum BlipCodePointResult {
    /** Character OK for blip text. All others are not OK. */
    OK,
    /** Control characters */
    CONTROL,
    /** Deprecated format characters */
    DEPRECATED,
    /** Bidi markers. This restriction may be lifted in the future. */
    BIDI,
    /** Tag characters */
    TAG,
    /** Non-characters */
    NONCHARACTER
  }

  /**
   * Returns whether the given code point is acceptable for blip content.
   *
   * This definition is based on RFC5198 (section 2 in particular)
   * and a few internal discussions.
   *
   * It may turn out to be overly restrictive, but relaxing it in the
   * future is easy.
   *
   * Values outside the code point range and surrogates are reported through
   * Preconditions.illegalArgument.
   *
   * @param c the code point to classify; must be in the code point range and
   *        must not be a surrogate
   * @return BlipCodePointResult.OK if c is acceptable, otherwise the reason
   *         it is not
   */
  public static BlipCodePointResult isCodePointGoodForBlip(int c) {
    checkNonSurrogateCodePoint(c);
    if (isNoncharacter(c)) {
      return BlipCodePointResult.NONCHARACTER;
    }
    // control codes
    if ((0 <= c && c <= 0x1f) || (0x7f <= c && c <= 0x9f)) {
      return BlipCodePointResult.CONTROL;
    }
    // Private use code points (0xE000..0xF8FF, 0xF0000..0xFFFFD and
    // 0x100000..0x10FFFD) are deliberately permitted; they can be used for
    // things like emoji.
    // deprecated format characters
    if (0x206A <= c && c <= 0x206F) {
      return BlipCodePointResult.DEPRECATED;
    }
    // TODO: investigate whether we can lift some of these restrictions
    // bidi markers
    if (c == 0x200E || c == 0x200F) {
      return BlipCodePointResult.BIDI;
    }
    if (0x202A <= c && c <= 0x202E) {
      return BlipCodePointResult.BIDI;
    }
    // tag characters, strongly discouraged
    if (0xE0000 <= c && c <= 0xE007F) {
      return BlipCodePointResult.TAG;
    }
    return BlipCodePointResult.OK;
  }

  /**
   * Returns whether the given code point is acceptable for data document
   * content.
   *
   * For now, it allows any valid Unicode, i.e. it is equivalent to
   * isCodePointValid(), including the way invalid arguments are reported.
   */
  public static boolean isCodePointGoodForDataDocument(int c) {
    return isCodePointValid(c);
  }

  /**
   * Returns whether a given code point is a NameStartChar according to XML
   * and valid Unicode.
   *
   * See http://www.w3.org/TR/xml/#NT-NameStartChar and isCodePointValid().
   *
   * Unlike isXmlNameChar(), this method simply returns false for surrogates
   * and for values outside the code point range, because none of the ranges
   * below contains them.
   */
  public static boolean isXmlNameStartChar(int c) {
    // There are some obvious ways to speed this up, but let's not bother until
    // profiles show that it matters.
    // NameStartChar ::= ":" | [A-Z] | "_" | [a-z]
    return c == ':' || ('A' <= c && c <= 'Z') || c == '_' || ('a' <= c && c <= 'z')
        // | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
        || (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || (0xF8 <= c && c <= 0x2FF)
        // | [#x370-#x37D] | [#x37F-#x1FFF]
        || (0x370 <= c && c <= 0x37D) || (0x37F <= c && c <= 0x1FFF)
        // | [#x200C-#x200D] | [#x2070-#x218F]
        || (0x200C <= c && c <= 0x200D) || (0x2070 <= c && c <= 0x218F)
        // | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
        || (0x2C00 <= c && c <= 0x2FEF) || (0x3001 <= c && c <= 0xD7FF)
        // | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
        || (0xF900 <= c && c <= 0xFDCF) || (0xFDF0 <= c && c <= 0xFFFD)
        // | [#x10000-#xEFFFF]
        // The supplementary range still contains noncharacters (0x..FFFE and
        // 0x..FFFF of each plane), hence the extra validity check.
        || ((0x10000 <= c && c <= 0xEFFFF) && isCodePointValid(c));
  }

  /**
   * Returns whether a given code point is a NameChar according to XML
   * and valid Unicode.
   *
   * See http://www.w3.org/TR/xml/#NT-NameChar and isCodePointValid().
   *
   * Note that the argument is passed to isCodePointValid() first, so
   * surrogates and values outside the code point range are reported through
   * Preconditions.illegalArgument instead of yielding false.
   */
  public static boolean isXmlNameChar(int c) {
    // There are some obvious ways to speed this up, but let's not bother until
    // profiles show that it matters.
    if (!isCodePointValid(c)) {
      return false;
    }
    // NameChar ::= NameStartChar | "-" | "." | [0-9]
    return isXmlNameStartChar(c) || c == '-' || c == '.' || ('0' <= c && c <= '9')
        // | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        || c == 0xB7 || (0x0300 <= c && c <= 0x036F) || (0x203F <= c && c <= 0x2040);
  }

  /**
   * Returns whether a given string is a Name according to XML and valid
   * Unicode.
   *
   * See http://www.w3.org/TR/xml/#NT-Name and isValidUtf16().
   *
   * @param s the candidate name; null is rejected through
   *        Preconditions.checkNotNull
   * @return true if s is non-empty, contains no unpaired surrogates, starts
   *         with a NameStartChar and otherwise consists of NameChars
   */
  public static boolean isXmlName(String s) {
    // Name ::= NameStartChar (NameChar)*
    Preconditions.checkNotNull(s, "Null XML name string");
    if (s.isEmpty()) {
      // The traversal below would accept an empty string, but a Name needs at
      // least one character.
      return false;
    }
    return traverseUtf16String(s, new CodePointHandler<Boolean>() {
      // True until the first code point has been accepted.
      boolean first = true;

      @Override
      public Boolean codePoint(int cp) {
        if (first) {
          if (!isXmlNameStartChar(cp)) {
            return false;
          }
          first = false;
        } else {
          if (!isXmlNameChar(cp)) {
            return false;
          }
        }
        return null;
      }

      @Override
      public Boolean unpairedSurrogate(char c) {
        return false;
      }

      @Override
      public Boolean endOfString() {
        return true;
      }
    });
  }

  /**
   * Base for handlers that accept a string only if it contains no unpaired
   * surrogates and every code point satisfies accepts(). The traversal result
   * is false at the first offending code unit and true at the end of the
   * string.
   */
  private abstract static class EveryCodePointHandler implements CodePointHandler<Boolean> {
    /** Returns whether the given code point is acceptable. */
    abstract boolean accepts(int cp);

    @Override
    public final Boolean codePoint(int cp) {
      if (!accepts(cp)) {
        return false;
      }
      // null tells the traversal to keep going.
      return null;
    }

    @Override
    public final Boolean unpairedSurrogate(char c) {
      return false;
    }

    @Override
    public final Boolean endOfString() {
      return true;
    }
  }

  private static final CodePointHandler<Boolean> VALID_UTF16 =
      new EveryCodePointHandler() {
        @Override
        boolean accepts(int cp) {
          return isCodePointValid(cp);
        }
      };

  /**
   * Returns whether s contains no unpaired surrogates and only code points
   * for which isCodePointValid() is true.
   */
  public static boolean isValidUtf16(String s) {
    return traverseUtf16String(s, VALID_UTF16);
  }

  private static final CodePointHandler<Boolean> GOOD_UTF16_FOR_BLIP =
      new EveryCodePointHandler() {
        @Override
        boolean accepts(int cp) {
          return isCodePointGoodForBlip(cp) == BlipCodePointResult.OK;
        }
      };

  /**
   * Returns whether s contains no unpaired surrogates and only code points
   * for which isCodePointGoodForBlip() yields BlipCodePointResult.OK.
   */
  public static boolean isGoodUtf16ForBlip(String s) {
    return traverseUtf16String(s, GOOD_UTF16_FOR_BLIP);
  }

  private static final CodePointHandler<Boolean> GOOD_UTF16_FOR_DATA_DOCUMENT =
      new EveryCodePointHandler() {
        @Override
        boolean accepts(int cp) {
          return isCodePointGoodForDataDocument(cp);
        }
      };

  /**
   * Returns whether s contains no unpaired surrogates and only code points
   * for which isCodePointGoodForDataDocument() is true.
   */
  public static boolean isGoodUtf16ForDataDocument(String s) {
    return traverseUtf16String(s, GOOD_UTF16_FOR_DATA_DOCUMENT);
  }
}