/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.waveprotocol.wave.model.util; /** * Some Unicode-related tools based on Unicode 5.1.0. * * Chapter 16 ( http://www.unicode.org/versions/Unicode5.0.0/ch16.pdf ) * defines terminology like surrogates, noncharacters, control codes, etc. */ public final class Utf16Util { private Utf16Util() {} /** * Unicode character 'REPLACEMENT CHARACTER'. */ public static final char REPLACEMENT_CHARACTER = 0xFFFD; /** * The visitor interface used by traverseUtf16String. */ public interface CodePointHandler<T> { T codePoint(int cp); T unpairedSurrogate(char c); T endOfString(); } // java.lang.Character also has some of these, but not everything we want. So // we just have our own code for everything to reduce unnecessary indirections // that obscure the actual values of the numbers and their relations. public static boolean isCodePoint(int c) { return 0 <= c && c <= 0x10ffff; } public static boolean isSurrogate(char c) { return 0xd800 <= c && c <= 0xdfff; } public static boolean isLowSurrogate(char c) { return 0xdc00 <= c && c <= 0xdfff; } public static boolean isHighSurrogate(char c) { return 0xd800 <= c && c <= 0xdbff; } public static boolean isSurrogate(int c) { if (!isCodePoint(c)) { Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c)); } return 0xd800 <= c && c <= 0xdfff; } public static boolean isSupplementaryCodePoint(int c) { if (!isCodePoint(c)) { Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c)); } return c >= 0x10000; } /** * Traverses the given UTF-16 string from left to right, decoding surrogates * into code points, and calls the handler for each code point and unmatched * surrogate. * * The return values of the handler's methods determine whether * to continue traversal and the return value of traverseUtf16String. * * Traversal continues as long as the handler returns null. If handler * returns a non-null value, traversal immediately terminates, and * traverseUtf16String returns the value the hander returned. * If the end of the string is reached, traverseUtf16String calls * handler.endOfString() and returns its value. */ public static <T> T traverseUtf16String(String s, CodePointHandler<T> handler) { Preconditions.checkNotNull(s, "Null string"); nextCodeUnit: for (int i = 0; i < s.length(); i++) { int cp; char c = s.charAt(i); if (isSurrogate(c)) { if (isLowSurrogate(c)) { // unexpected trailing (low) surrogate T v = handler.unpairedSurrogate(c); if (v != null) { return v; } continue nextCodeUnit; } // leading (high) surrogate i++; if (i >= s.length()) { T v = handler.unpairedSurrogate(c); if (v != null) { return v; } break nextCodeUnit; } char c2 = s.charAt(i); if (isLowSurrogate(c2)) { // low surrogate as expected cp = Character.toCodePoint(c, c2); } else { // either not a surrogate, or a high surrogate T v = handler.unpairedSurrogate(c); if (v != null) { return v; } i--; continue nextCodeUnit; } } else { cp = c; } T v = handler.codePoint(cp); if (v != null) { return v; } } return handler.endOfString(); } /** * Returns the index of the first surrogate character in the given string, * or -1 if there aren't any. * * Does not check whether surrogates in s are paired correctly. */ public static int firstSurrogate(String s) { for (int i = 0; i < s.length(); i++) { if (isSurrogate(s.charAt(i))) { return i; } } return -1; } /** * Returns the index of the first non-surrogate character in the given string, * or -1 if all characters in s are surrogates. * * Does not check whether surrogates in s are paired correctly. */ public static int firstNonSurrogate(String s) { for (int i = 0; i < s.length(); i++) { if (!isSurrogate(s.charAt(i))) { return i; } } return -1; } private static final CodePointHandler<Boolean> UNPAIRED_SURROGATES = new CodePointHandler<Boolean>() { @Override public Boolean codePoint(int cp) { return null; } @Override public Boolean unpairedSurrogate(char c) { return true; } @Override public Boolean endOfString() { return false; }}; // Deprecated because I think this check is too weak to be useful. We should // always also check for noncharacter codepoints etc. @Deprecated public static boolean containsUnpairedSurrogates(String s) { return traverseUtf16String(s, UNPAIRED_SURROGATES); } /** * @param c * @return true if the code point is valid, false if it is a non-character */ public static boolean isCodePointValid(int c) { if (!isCodePoint(c)) { Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c)); } if (isSurrogate(c)) { Preconditions.illegalArgument("Code point is a surrogate: 0x" + Integer.toHexString(c)); } // noncharacters { int d = c & 0xFFFF; if (d == 0xFFFE || d == 0xFFFF) { return false; } } if (0xFDD0 <= c && c <= 0xFDEF) { return false; } return true; } public enum BlipCodePointResult { /** Character OK for blip text. All others are not OK. */ OK, /** Control characters */ CONTROL, /** Deprecated format characters */ DEPRECATED, /** Bidi markers. This restriction may be lifted in the future. */ BIDI, /** Tag characters */ TAG, /** Non-characters */ NONCHARACTER } /** * Returns whether the given code point is acceptable for blip content. * * This definition is based on RFC5198 (section 2 in particular) * and a few internal discussions. * * It may turn out to be overly restrictive, but relaxing it in the * future is easy. */ public static BlipCodePointResult isCodePointGoodForBlip(int c) { if (!isCodePoint(c)) { Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c)); } if (isSurrogate(c)) { Preconditions.illegalArgument("Code point is a surrogate: 0x" + Integer.toHexString(c)); } if (!isCodePointValid(c)) { return BlipCodePointResult.NONCHARACTER; } // control codes if (0 <= c && c <= 0x1f || 0x7f <= c && c <= 0x9f) { return BlipCodePointResult.CONTROL; } // private use // we permit these, they can be used for things like emoji //if (0xE000 <= c && c <= 0xF8FF) { return false; } //if (0xF0000 <= c && c <= 0xFFFFD) { return false; } //if (0x100000 <= c && c <= 0x10FFFD) { return false; } // deprecated format characters if (0x206A <= c && c <= 0x206F) { return BlipCodePointResult.DEPRECATED; } // TODO: investigate whether we can lift some of these restrictions // bidi markers if (c == 0x200E || c == 0x200F) { return BlipCodePointResult.BIDI; } if (0x202A <= c && c <= 0x202E) { return BlipCodePointResult.BIDI; } // tag characters, strongly discouraged if (0xE0000 <= c && c <= 0xE007F) { return BlipCodePointResult.TAG; } return BlipCodePointResult.OK; } /** * Returns whether the given code point is acceptable for data document * content. * * For now, it allows any valid Unicode. */ public static boolean isCodePointGoodForDataDocument(int c) { if (!isCodePoint(c)) { Preconditions.illegalArgument("Not a code point: 0x" + Integer.toHexString(c)); } if (isSurrogate(c)) { Preconditions.illegalArgument("Code point is a surrogate: 0x" + Integer.toHexString(c)); } if (!isCodePointValid(c)) { return false; } return true; } /** * Returns whether a given code point is a NameStartChar according to XML * and valid Unicode. * * See http://www.w3.org/TR/xml/#NT-NameStartChar and isCodePointValid(). */ public static boolean isXmlNameStartChar(int c) { // There are some obvious ways to speed this up, but let's not bother until // profiles show that it matters. // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] return c == ':' || ('A' <= c && c <= 'Z') || c == '_' | ('a' <= c && c <= 'z') // | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] || (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) || (0xF8 <= c && c <= 0x2FF) // | [#x370-#x37D] | [#x37F-#x1FFF] || (0x370 <= c && c <= 0x37D) || (0x37F <= c && c <= 0x1FFF) // | [#x200C-#x200D] | [#x2070-#x218F] || (0x200C <= c && c <= 0x200D) || (0x2070 <= c && c <= 0x218F) // | [#x2C00-#x2FEF] | [#x3001-#xD7FF] || (0x2C00 <= c && c <= 0x2FEF) || (0x3001 <= c && c <= 0xD7FF) // | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] || (0xF900 <= c && c <= 0xFDCF) || (0xFDF0 <= c && c <= 0xFFFD) // | [#x10000-#xEFFFF] || ((0x10000 <= c && c <= 0xEFFFF) && isCodePointValid(c)); } /** * Returns whether a given code point is a NameChar according to XML * and valid Unicode. * * See http://www.w3.org/TR/xml/#NT-NameChar and isCodePointValid(). */ public static boolean isXmlNameChar(int c) { // There are some obvious ways to speed this up, but let's not bother until // profiles show that it matters. if (!isCodePointValid(c)) { return false; } // NameChar ::= NameStartChar | "-" | "." | [0-9] return isXmlNameStartChar(c) || c == '-' || c == '.' || ('0' <= c && c <= '9') // | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] || c == 0xB7 || (0x0300 <= c && c <= 0x036F) || (0x203F <= c && c <= 0x2040); } /** * Returns whether a given string is a Name according to XML and valid * Unicode. * * See http://www.w3.org/TR/xml/#NT-Name and isValidUtf16(). */ public static boolean isXmlName(String s) { // Name ::= NameStartChar (NameChar)* Preconditions.checkNotNull(s, "Null XML name string"); if (s.isEmpty()) { return false; } return traverseUtf16String(s, new CodePointHandler<Boolean>() { boolean first = true; @Override public Boolean codePoint(int cp) { if (first) { if (!isXmlNameStartChar(cp)) { return false; } first = false; } else { if (!isXmlNameChar(cp)) { return false; } } return null; } @Override public Boolean unpairedSurrogate(char c) { return false; } @Override public Boolean endOfString() { return true; }}); } private static final CodePointHandler<Boolean> VALID_UTF16 = new CodePointHandler<Boolean>() { @Override public Boolean codePoint(int cp) { if (!isCodePointValid(cp)) { return false; } return null; } @Override public Boolean unpairedSurrogate(char c) { return false; } @Override public Boolean endOfString() { return true; }}; public static boolean isValidUtf16(String s) { return traverseUtf16String(s, VALID_UTF16); } private static final CodePointHandler<Boolean> GOOD_UTF16_FOR_BLIP = new CodePointHandler<Boolean>() { @Override public Boolean codePoint(int cp) { if (isCodePointGoodForBlip(cp) != BlipCodePointResult.OK) { return false; } return null; } @Override public Boolean unpairedSurrogate(char c) { return false; } @Override public Boolean endOfString() { return true; }}; public static boolean isGoodUtf16ForBlip(String s) { return traverseUtf16String(s, GOOD_UTF16_FOR_BLIP); } private static final CodePointHandler<Boolean> GOOD_UTF16_FOR_DATA_DOCUMENT = new CodePointHandler<Boolean>() { @Override public Boolean codePoint(int cp) { if (!isCodePointGoodForDataDocument(cp)) { return false; } return null; } @Override public Boolean unpairedSurrogate(char c) { return false; } @Override public Boolean endOfString() { return true; }}; public static boolean isGoodUtf16ForDataDocument(String s) { return traverseUtf16String(s, GOOD_UTF16_FOR_DATA_DOCUMENT); } }