/*
* Copyright 2009 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.internal.i18n;
import com.google.common.annotations.VisibleForTesting;
import com.google.template.soy.data.Dir;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
/** Utility functions for performing common Bidi tests on strings. */
public class BidiUtils {
/** Static-only utility holder; instances are never created. */
private BidiUtils() {
  // Intentionally empty.
}
/**
 * Holder for the Unicode bidi formatting characters, plus string forms of the two directional
 * marks.
 */
static final class Format {
  /** Constants only; never instantiated. */
  private Format() {}

  /** U+202A LEFT-TO-RIGHT EMBEDDING (LRE). */
  public static final char LRE = '\u202A';

  /** U+202B RIGHT-TO-LEFT EMBEDDING (RLE). */
  public static final char RLE = '\u202B';

  /** U+202C POP DIRECTIONAL FORMATTING (PDF). */
  public static final char PDF = '\u202C';

  /** U+200E LEFT-TO-RIGHT MARK (LRM). */
  public static final char LRM = '\u200E';

  /** U+200F RIGHT-TO-LEFT MARK (RLM). */
  public static final char RLM = '\u200F';

  /** {@link #LRM} as a one-character string, for callers that need a {@code String}. */
  public static final String LRM_STRING = String.valueOf(LRM);

  /** {@link #RLM} as a one-character string, for callers that need a {@code String}. */
  public static final String RLM_STRING = String.valueOf(RLM);
}
/**
 * Maps a locale to its writing direction.
 *
 * @param locale the locale to classify
 * @return {@code Dir.RTL} if {@link #isRtlLanguage(ULocale)} reports the locale as RTL, otherwise
 *     {@code Dir.LTR}
 */
public static Dir languageDir(ULocale locale) {
  if (isRtlLanguage(locale)) {
    return Dir.RTL;
  }
  return Dir.LTR;
}
/**
 * Maps a locale, given as a string in the ICU syntax, to its writing direction.
 *
 * @param locale the locale identifier to classify
 * @return {@code Dir.RTL} if {@link #isRtlLanguage(String)} reports the locale as RTL, otherwise
 *     {@code Dir.LTR}
 */
public static Dir languageDir(String locale) {
  if (isRtlLanguage(locale)) {
    return Dir.RTL;
  }
  return Dir.LTR;
}
/**
 * Returns whether a locale is RTL.
 *
 * <p>The locale is expanded with its likely subtags, the resulting script name is converted to
 * its {@code UProperty.SCRIPT} value, and that script is tested with
 * {@code UScript.isRightToLeft}.
 *
 * @param locale the locale to test
 * @return whether the locale's likely script is right-to-left; {@code false} if an
 *     {@link IllegalArgumentException} is raised while resolving the script
 */
@SuppressWarnings("deprecation")
public static boolean isRtlLanguage(ULocale locale) {
  try {
    String scriptName = ULocale.addLikelySubtags(locale).getScript();
    int scriptCode = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptName);
    return UScript.isRightToLeft(scriptCode);
  } catch (IllegalArgumentException ignored) {
    // The script could not be resolved; such a locale is reported as not RTL.
    return false;
  }
}
/**
 * Returns whether a locale, given as a string in the ICU syntax, is RTL.
 *
 * @param locale the locale identifier; it is wrapped in a {@code ULocale} and handed to
 *     {@link #isRtlLanguage(ULocale)}
 * @return whether the locale is right-to-left
 */
public static boolean isRtlLanguage(String locale) {
  ULocale parsedLocale = new ULocale(locale);
  return isRtlLanguage(parsedLocale);
}
/** "right" string constant (the lowercase English word). */
public static final String RIGHT = "right";
/** "left" string constant (the lowercase English word). */
public static final String LEFT = "left";
/** An object that estimates the directionality of a given string by various methods. */
@VisibleForTesting
static class DirectionalityEstimator {
// Internal static variables and constants.
/**
 * Number of code points, counted from zero, whose UCharacter.getDirectionality() result is
 * precomputed and kept in {@link #DIR_TYPE_CACHE} for speed. The 0x700 value is designed to
 * leave all the European and Near Eastern languages in the cache. It can be reduced to 0x180,
 * restricting the cache to the Western European languages.
 */
private static final int DIR_TYPE_CACHE_SIZE = 0x700;

/** The bidi character class cache, indexed by code point. */
private static final byte[] DIR_TYPE_CACHE;

static {
  // Fill a local array first and publish it once it is complete.
  byte[] cache = new byte[DIR_TYPE_CACHE_SIZE];
  for (int codePoint = 0; codePoint < cache.length; codePoint++) {
    cache[codePoint] = UCharacter.getDirectionality(codePoint);
  }
  DIR_TYPE_CACHE = cache;
}
/**
 * The current classification of a word, for the word count direction estimation algorithm. As
 * we progress our examination through a word, the type may increase in value e.g.: NEUTRAL ->
 * EN | AN -> STRONG or NEUTRAL -> PLUS -> SIGNED_EN | PLUS_AN -> STRONG. It will only decrease
 * when going back down to NEUTRAL at a word break, and when a neutral character (other than a
 * plus or minus sign) appears after a plus or minus sign. Please note that STRONG, URL, and
 * EMBEDDED are terminal, i.e. do not change into another word type until the end of the word is
 * reached.
 *
 * <p>The numeric order of the constants is significant: the estimator compares word types with
 * {@code <}, {@code <=} and {@code >=} (e.g. {@code wordType < WordType.STRONG}), so values must
 * not be reordered.
 */
private static final class WordType {
  /** Constants only; never instantiated. */
  private WordType() {}

  /** Word so far - if any - contains no LTR, RTL, or numeric characters. */
  public static final int NEUTRAL = 0;

  /** Word so far is a plus sign. */
  public static final int PLUS = 1;

  /** Word so far is a minus sign. */
  public static final int MINUS = 2;

  /**
   * Word so far started with a European numeral, and had no LTR or RTL or plus/minus before the
   * number; enWordCount has been incremented.
   */
  public static final int EN = 3;

  /**
   * Word so far started with an Arabic numeral, and had no LTR or RTL or plus/minus before the
   * number.
   */
  public static final int AN = 4;

  /**
   * Word so far has been a signed European number, which has to be displayed in LTR;
   * signedEnWordCount has been incremented.
   */
  public static final int SIGNED_EN = 5;

  /**
   * Word so far has been an Arabic number with a leading plus sign, which we may choose to
   * interpret as an international phone number, which has to be displayed in LTR;
   * plusAnWordCount has been incremented.
   */
  public static final int PLUS_AN = 6;

  /**
   * Word so far has been a negative Arabic number, which has to be displayed in RTL;
   * minusAnWordCount has been incremented.
   */
  public static final int MINUS_AN = 7;

  /** Word had an LTR or RTL character; ltrWordCount or rtlWordCount has been incremented. */
  public static final int STRONG = 8;

  /**
   * Word started with a URL prefix (http:// or https://); urlWordCount has been incremented.
   */
  public static final int URL = 9;

  /** A "word" between LRE/LRO/RLE/RLO and matching PDF. */
  public static final int EMBEDDED = 10;
}
/**
* If more than RTL_THRESHOLD of the words containing strong LTR or RTL in the string start with
* RTL, the word count direction estimation algorithm judges the string as a whole to be RTL. The
* comparison in compareCounts() is strict: a string whose RTL share is exactly RTL_THRESHOLD is
* not judged RTL by this rule.
*/
private static final double RTL_THRESHOLD = 0.4;
// Internal instance variables.
/** The text to be scanned. */
private final String text;
/**
* Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and entities
* when looking for the next / preceding dir type.
*/
private final boolean isHtml;
/** The length of the text in chars (UTF-16 code units), captured once in the constructor. */
private final int length;
/**
* The current position in the text, as a char index. dirTypeForward() reads the char at
* charIndex; dirTypeBackward() reads the char at charIndex - 1.
*/
private int charIndex;
/**
* The char encountered by the last dirTypeForward or dirTypeBackward call. If it encountered a
* supplementary codepoint, this contains a char that is not a valid codepoint. This is ok,
* because this member is only used to detect some well-known ASCII syntax, e.g. "http://" and
* the beginning of an HTML tag or entity. The skipTag and skipEntity helpers also overwrite it
* while they scan through HTML mark-up.
*/
private char lastChar;
// The counters below are reset at the start of estimateDirectionByWordCount().
/** Number of LTR words found so far by the word count direction estimation algorithm. */
private int ltrWordCount;
/** Number of RTL words found so far by the word count direction estimation algorithm. */
private int rtlWordCount;
/** Number of URLs found so far by the word count direction estimation algorithm. */
private int urlWordCount;
/**
* Number of unsigned EN numbers found so far by the word count direction estimation algorithm.
*/
private int enWordCount;
/**
* Number of signed EN numbers found so far by the word count direction estimation algorithm.
*/
private int signedEnWordCount;
/**
* Number of plus-signed AN numbers found so far by the word count direction estimation
* algorithm.
*/
private int plusAnWordCount;
/**
* Number of minus-signed AN numbers found so far by the word count direction estimation
* algorithm.
*/
private int minusAnWordCount;
/**
* Type (so far) of the word continuing at charIndex in the string, for the word count direction
* estimation algorithm. Holds one of the WordType constants.
*/
private int wordType;
// Methods intended for use by BidiUtils.
/**
 * Creates an estimator over the given text. The text length is captured once here.
 *
 * @param text The string to scan.
 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
 *     tags and entities.
 */
DirectionalityEstimator(String text, boolean isHtml) {
  this.isHtml = isHtml;
  this.text = text;
  this.length = text.length();
}
/**
 * Checks if the (whole) string has any LTR characters in it.
 *
 * <p>When {@code countEmbedding} is true, an LRE/LRO opened outside of any embedding counts as
 * LTR content, and LTR characters nested inside an embedding are not examined individually. A
 * PDF that has no open embedding to close is ignored (as the Unicode Bidirectional Algorithm
 * does for an unmatched PDF), so it cannot push the tracked level below zero and hide LTR
 * characters that follow it.
 *
 * @param countEmbedding Whether LRE/RLE/LRO/RLO/PDF characters should be taken into account.
 * @return Whether any LTR characters were encountered.
 */
boolean hasAnyLtr(boolean countEmbedding) {
  charIndex = 0;
  int embeddingLevel = 0;
  while (charIndex < length) {
    switch (dirTypeForward()) {
      case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT:
        if (embeddingLevel == 0) {
          return true;
        }
        break;
      case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
      case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
        // A top-level LTR embedding is itself LTR content; a nested one only deepens the level.
        if (countEmbedding && embeddingLevel++ == 0) {
          return true;
        }
        break;
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
        if (countEmbedding) {
          ++embeddingLevel;
        }
        break;
      case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
        // Only pop an embedding that is actually open; a stray PDF is a no-op.
        if (countEmbedding && embeddingLevel > 0) {
          --embeddingLevel;
        }
        break;
      default:
        break;
    }
  }
  return false;
}
/**
 * Checks if the (whole) string has any RTL characters in it.
 *
 * <p>When {@code countEmbedding} is true, an RLE/RLO opened outside of any embedding counts as
 * RTL content, and RTL characters nested inside an embedding are not examined individually. A
 * PDF that has no open embedding to close is ignored (as the Unicode Bidirectional Algorithm
 * does for an unmatched PDF), so it cannot push the tracked level below zero and hide RTL
 * characters that follow it.
 *
 * @param countEmbedding Whether LRE/RLE/LRO/RLO/PDF characters should be taken into account.
 * @return Whether any RTL characters were encountered.
 */
boolean hasAnyRtl(boolean countEmbedding) {
  charIndex = 0;
  int embeddingLevel = 0;
  while (charIndex < length) {
    switch (dirTypeForward()) {
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT:
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
        if (embeddingLevel == 0) {
          return true;
        }
        break;
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
      case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
        // A top-level RTL embedding is itself RTL content; a nested one only deepens the level.
        if (countEmbedding && embeddingLevel++ == 0) {
          return true;
        }
        break;
      case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
      case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
        if (countEmbedding) {
          ++embeddingLevel;
        }
        break;
      case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
        // Only pop an embedding that is actually open; a stray PDF is a no-op.
        if (countEmbedding && embeddingLevel > 0) {
          --embeddingLevel;
        }
        break;
      default:
        break;
    }
  }
  return false;
}
/**
 * Scans forward from the start of the string and reports the directionality of the first
 * strongly directional character (L, R, or AL), or Dir.NEUTRAL if the string has none.
 * LRE/RLE/LRO/RLO/PDF characters have no effect on the result.
 *
 * @return Dir.LTR, Dir.RTL, or Dir.NEUTRAL
 */
Dir getUnicodeDir() {
  charIndex = 0;
  while (charIndex < length) {
    byte dirType = dirTypeForward();
    if (dirType == UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT) {
      return Dir.LTR;
    }
    if (dirType == UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT
        || dirType == UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) {
      return Dir.RTL;
    }
    // Anything else is not strong; keep scanning.
  }
  return Dir.NEUTRAL;
}
/**
* Returns the directionality of the first character with strong directionality in the string,
* or Dir.NEUTRAL if none was encountered. Treats a non-BN character between an LRE/RLE/LRO/RLO
* and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The
* results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
*
* <p>Implementation outline: a forward scan stops either at the first strong character outside
* of any embedding (whose direction is returned immediately) or at the first non-BN character
* inside an embedding. In the latter case the direction of the embedding directly enclosing
* that character is returned; if that direction is no longer remembered because a PDF was seen
* after the embedding was opened, a backward scan from the current position finds the
* embedding's opening character.
*
* @return Dir.LTR, Dir.RTL, or Dir.NEUTRAL as described above
*/
Dir getEntryDir() {
// The reason for this method name, as opposed to getFirstStrongDir(), is that "first strong"
// is a commonly used description of Unicode's estimation algorithm (getUnicodeDir() above),
// but the two must treat formatting characters quite differently. Thus, we are staying away
// from both "first" and "last" in these method names to avoid confusion.
charIndex = 0;
// Number of embeddings currently open at charIndex.
int embeddingLevel = 0;
// Direction of the most recently opened embedding, or null once a PDF has been seen since.
Dir embeddingLevelDir = null;
// Embedding level at which the first non-BN embedded character was found; 0 while none found.
int firstNonEmptyEmbeddingLevel = 0;
while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
switch (dirTypeForward()) {
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
++embeddingLevel;
embeddingLevelDir = Dir.LTR;
break;
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
++embeddingLevel;
embeddingLevelDir = Dir.RTL;
break;
case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
--embeddingLevel;
// To restore embeddingLevelDir to its previous value, we would need a stack, which we
// want to avoid. Thus, at this point we do not know the current embedding's
// directionality.
embeddingLevelDir = null;
break;
case UCharacter.BOUNDARY_NEUTRAL:
// BN characters do not make an embedding "non-empty".
break;
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT:
if (embeddingLevel == 0) {
return Dir.LTR;
}
firstNonEmptyEmbeddingLevel = embeddingLevel;
break;
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT:
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
if (embeddingLevel == 0) {
return Dir.RTL;
}
firstNonEmptyEmbeddingLevel = embeddingLevel;
break;
default:
// Any other character: outside of an embedding this assigns 0 and the scan continues;
// inside an embedding it marks that embedding as non-empty and ends the scan.
firstNonEmptyEmbeddingLevel = embeddingLevel;
break;
}
}
// We have either found a non-empty embedding or scanned the entire string finding neither a
// non-empty embedding nor a strong character outside of an embedding.
if (firstNonEmptyEmbeddingLevel == 0) {
// We have not found a non-empty embedding. Thus, the string contains neither a non-empty
// embedding nor a strong character outside of an embedding.
return Dir.NEUTRAL;
}
// We have found a non-empty embedding.
if (embeddingLevelDir != null) {
// We know the directionality of the non-empty embedding.
return embeddingLevelDir;
}
// We do not remember the directionality of the non-empty embedding we found. So, we go
// backwards to find the start of the non-empty embedding and get its directionality.
while (charIndex > 0) {
switch (dirTypeBackward()) {
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
// Going backwards, an opening character at the target level is the one we are after.
if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
return Dir.LTR;
}
--embeddingLevel;
break;
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
return Dir.RTL;
}
--embeddingLevel;
break;
case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
// Going backwards, a PDF re-enters an embedding that was closed before our position.
++embeddingLevel;
break;
}
}
// We should never get here.
return Dir.NEUTRAL;
}
/**
* Returns the directionality of the last character with strong directionality in the string, or
* Dir.NEUTRAL if none was encountered. For efficiency, actually scans backwards from the end of
* the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
* strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
* string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
*
* <p>Because the scan runs backwards, a PDF raises the tracked embedding level and an
* LRE/RLE/LRO/RLO lowers it.
*
* @return Dir.LTR, Dir.RTL, or Dir.NEUTRAL as described above
*/
Dir getExitDir() {
// The reason for this method name, as opposed to getLastStrongDir(), is that "last strong"
// sounds like the exact opposite of "first strong", which is a commonly used description of
// Unicode's estimation algorithm (getUnicodeDir() above), but the two must treat formatting
// characters quite differently. Thus, we are staying away from both "first" and "last" in
// these method names to avoid confusion.
charIndex = length;
// Number of embeddings entered (via their PDF) and not yet left, while scanning backwards.
int embeddingLevel = 0;
// Embedding level at which the first (i.e. last in the text) non-BN embedded character was
// seen; 0 while none has been seen.
int lastNonEmptyEmbeddingLevel = 0;
while (charIndex > 0) {
switch (dirTypeBackward()) {
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT:
if (embeddingLevel == 0) {
return Dir.LTR;
}
if (lastNonEmptyEmbeddingLevel == 0) {
lastNonEmptyEmbeddingLevel = embeddingLevel;
}
break;
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
// The opening character of the embedding that held the last non-BN character decides.
if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
return Dir.LTR;
}
--embeddingLevel;
break;
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT:
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
if (embeddingLevel == 0) {
return Dir.RTL;
}
if (lastNonEmptyEmbeddingLevel == 0) {
lastNonEmptyEmbeddingLevel = embeddingLevel;
}
break;
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
return Dir.RTL;
}
--embeddingLevel;
break;
case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
++embeddingLevel;
break;
case UCharacter.BOUNDARY_NEUTRAL:
// BN characters do not make an embedding "non-empty".
break;
default:
// Any other character: outside of an embedding this assigns 0 and changes nothing;
// inside an embedding it marks that embedding as non-empty.
if (lastNonEmptyEmbeddingLevel == 0) {
lastNonEmptyEmbeddingLevel = embeddingLevel;
}
break;
}
}
return Dir.NEUTRAL;
}
/**
 * Estimates the directionality of the (whole) string based on relative word counts. See {@link
 * #estimateDirection(String str)} for full description.
 *
 * <p>Resets all word counters, scans the text forward one dir type at a time while maintaining
 * the current {@code wordType}, and finally lets {@link #compareCounts()} pick the direction. A
 * PDF that has no open embedding to close is ignored (as the Unicode Bidirectional Algorithm
 * does for an unmatched PDF), so it cannot drive the embedding level negative and prevent a
 * later LRE/RLE/LRO/RLO from starting an embedded area.
 *
 * @return the string's directionality
 */
@SuppressWarnings("fallthrough")
Dir estimateDirectionByWordCount() {
  charIndex = 0;
  ltrWordCount = 0;
  rtlWordCount = 0;
  urlWordCount = 0;
  enWordCount = 0;
  signedEnWordCount = 0;
  plusAnWordCount = 0;
  minusAnWordCount = 0;
  int embedLevel = 0;
  wordType = WordType.NEUTRAL;
  while (charIndex < length) {
    byte dirType = dirTypeForward();
    // The DIRECTIONALITY_LEFT_TO_RIGHT case is taken out of the switch statement below to
    // improve the performance for LTR text (i.e. the vast majority of the content encountered
    // on the web).
    if (dirType == UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT) {
      // Strongly LTR. Convert numeric word to LTR, and a neutral word either to LTR or, if
      // the character just scanned and the characters following it are a URL, to a URL.
      processStrong(false /* isRtl */);
    } else {
      switch (dirType) {
        case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT:
        case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
          // Strongly RTL. Convert neutral or numeric word to RTL.
          processStrong(true /* isRtl */);
          break;
        case UCharacter.DIRECTIONALITY_EUROPEAN_NUMBER:
          // A European digit. Convert NEUTRAL to EN, and PLUS and MINUS to SIGNED_EN.
          processEuropeanDigit();
          break;
        case UCharacter.DIRECTIONALITY_ARABIC_NUMBER:
          // An Arabic digit. Convert NEUTRAL to AN, PLUS to PLUS_AN, and MINUS to MINUS_AN.
          processArabicDigit();
          break;
        case UCharacter.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
          // Plus or minus sign. Treat as end of a numeric word, and convert NEUTRAL to PLUS or
          // MINUS.
          if (wordType < WordType.STRONG) {
            if (wordType <= WordType.MINUS) {
              switch (lastChar) {
                case 0x002B: // PLUS SIGN
                case 0x207A: // SUPERSCRIPT PLUS SIGN
                case 0x208A: // SUBSCRIPT PLUS SIGN
                case 0xFB29: // HEBREW LETTER ALTERNATIVE PLUS SIGN
                case 0xFE62: // SMALL PLUS SIGN
                case 0xFF0B: // FULLWIDTH PLUS SIGN
                  wordType = WordType.PLUS;
                  break;
                default:
                  wordType = WordType.MINUS;
                  break;
              }
            } else {
              wordType = WordType.NEUTRAL;
            }
          }
          break;
        case UCharacter.COMMON_NUMBER_SEPARATOR:
          // Neutral used to format numbers that (with the exception of a slash, due to a
          // Microsoft bug) can be relied upon to keep the digits around it displayed LTR. Reset
          // PLUS and MINUS back to NEUTRAL, and treat a slash as the end of a numeric word.
          if (wordType < WordType.STRONG && (wordType <= WordType.MINUS || lastChar == '/')) {
            wordType = WordType.NEUTRAL;
          }
          break;
        case UCharacter.OTHER_NEUTRAL:
        case UCharacter.EUROPEAN_NUMBER_TERMINATOR:
          // Neutrals not used for formatting inside numbers. Treat as end of a numeric word.
          if (wordType < WordType.STRONG) {
            wordType = WordType.NEUTRAL;
          }
          break;
        case UCharacter.DIRECTIONALITY_WHITESPACE:
        case UCharacter.DIRECTIONALITY_SEGMENT_SEPARATOR:
          // Whitespace. Treat as end of word, unless embedded.
          if (wordType < WordType.EMBEDDED) {
            wordType = WordType.NEUTRAL;
          }
          break;
        case UCharacter.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
          // Paragraph break. Treat as end of word, and reset embedding level.
          embedLevel = 0;
          wordType = WordType.NEUTRAL;
          break;
        case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
          // LRO overrides the directionality of the characters inside it, so treat them as
          // strongly LTR.
          processStrong(false /* isRtl */);
          // Fall through to LRE processing.
        case UCharacter.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
          // Start LTR embedded area.
          if (embedLevel++ == 0) {
            wordType = WordType.EMBEDDED;
          }
          break;
        case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
          // RLO overrides the directionality of the characters inside it, so treat them as
          // a strongly RTL word.
          processStrong(true /* isRtl */);
          // Fall through to RLE processing.
        case UCharacter.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
          // Start RTL embedded area.
          if (embedLevel++ == 0) {
            wordType = WordType.EMBEDDED;
          }
          break;
        case UCharacter.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
          // End embedded area. A PDF with nothing to close is ignored, so the level never
          // drops below zero.
          if (embedLevel > 0 && --embedLevel == 0) {
            wordType = WordType.NEUTRAL;
          }
          break;
        default:
          // Ignore control characters (DIRECTIONALITY_BOUNDARY_NEUTRAL) and non-spacing marks
          // (DIRECTIONALITY_NON_SPACING_MARKS).
          break;
      }
    }
  }
  return compareCounts();
}
// Internal methods
/**
 * Makes the final choice of estimated direction from the word counts gathered so far.
 *
 * <p>Order of precedence: RTL when the RTL words exceed RTL_THRESHOLD of all strong words; LTR
 * when there is any LTR word, URL, or signed European number, or more than one unsigned
 * European number; RTL when there is a minus-signed Arabic number; LTR when there is a
 * plus-signed Arabic number; otherwise neutral.
 *
 * @return the estimated directionality
 */
Dir compareCounts() {
  int strongWordCount = ltrWordCount + rtlWordCount;
  if (rtlWordCount > strongWordCount * RTL_THRESHOLD) {
    return Dir.RTL;
  }
  // From here on rtlWordCount is necessarily zero when ltrWordCount is zero, so without LTR
  // words the verdict rests on the "weak" words only - URLs and numbers.
  boolean hasLtrWord = ltrWordCount + urlWordCount + signedEnWordCount > 0;
  boolean hasSeveralPlainNumbers = enWordCount > 1;
  if (hasLtrWord || hasSeveralPlainNumbers) {
    return Dir.LTR;
  }
  if (minusAnWordCount > 0) {
    return Dir.RTL;
  }
  return plusAnWordCount > 0 ? Dir.LTR : Dir.NEUTRAL;
}
/**
 * Handles a strongly directional character. A word whose type is already final (STRONG or
 * above) is left alone. A NEUTRAL word that continues with "http://" or "https://" becomes a
 * URL. Any other word becomes STRONG, after undoing whatever numeric count the word had already
 * contributed, and the LTR or RTL word count is incremented.
 *
 * @param isRtl whether the strong character is RTL (otherwise LTR)
 */
private void processStrong(boolean isRtl) {
  if (wordType >= WordType.STRONG) {
    // Current word's type is final.
    return;
  }
  if (wordType == WordType.NEUTRAL) {
    // The 'h' has just been consumed; the rest of the scheme must follow immediately. On a
    // match, charIndex is advanced past the prefix.
    boolean startsUrl =
        !isRtl
            && lastChar == 'h'
            && (matchForward("ttp://", true) || matchForward("ttps://", true));
    if (startsUrl) {
      wordType = WordType.URL;
      ++urlWordCount;
      return;
    }
  } else if (wordType == WordType.SIGNED_EN) {
    // Take back the signed European number counted earlier for this word.
    --signedEnWordCount;
  } else if (wordType == WordType.PLUS_AN) {
    // Take back the plus-signed Arabic number counted earlier for this word.
    --plusAnWordCount;
  } else if (wordType == WordType.MINUS_AN) {
    // Take back the minus-signed Arabic number counted earlier for this word.
    --minusAnWordCount;
  } else if (wordType == WordType.EN) {
    // Take back the unsigned European number counted earlier for this word.
    --enWordCount;
  }
  // The remaining word types (PLUS, MINUS, AN) never incremented a counter.
  wordType = WordType.STRONG;
  if (isRtl) {
    ++rtlWordCount;
  } else {
    ++ltrWordCount;
  }
}
/**
 * Handles a European digit: a NEUTRAL word becomes EN, a PLUS or MINUS word becomes SIGNED_EN,
 * and the matching word count is incremented. Words of any other type are left unchanged.
 */
private void processEuropeanDigit() {
  if (wordType == WordType.NEUTRAL) {
    // First significant character of the word is a digit: an unsigned "European" number.
    wordType = WordType.EN;
    ++enWordCount;
  } else if (wordType == WordType.PLUS || wordType == WordType.MINUS) {
    // A digit right after a sign: a signed "European" number.
    wordType = WordType.SIGNED_EN;
    ++signedEnWordCount;
  }
}
/**
 * Handles an Arabic digit: a NEUTRAL word becomes AN, a PLUS word becomes PLUS_AN, and a MINUS
 * word becomes MINUS_AN; the signed variants increment their word count. Words of any other
 * type are left unchanged.
 */
private void processArabicDigit() {
  if (wordType == WordType.NEUTRAL) {
    // An unsigned "Arabic" number. These currently play no part in deciding the overall
    // directionality, but are still identified so the policy can easily be changed.
    wordType = WordType.AN;
  } else if (wordType == WordType.PLUS) {
    // A digit right after a plus sign: a plus-signed "Arabic" number.
    wordType = WordType.PLUS_AN;
    ++plusAnWordCount;
  } else if (wordType == WordType.MINUS) {
    // A digit right after a minus sign: a minus-signed "Arabic" number.
    wordType = WordType.MINUS_AN;
    ++minusAnWordCount;
  }
}
/**
 * Returns whether the text at charIndex going forward is equal to a given string. Does NOT skip
 * over HTML mark-up.
 *
 * <p>The comparison is delegated to {@link String#startsWith(String, int)}, which also returns
 * false when fewer than {@code match.length()} chars remain.
 *
 * @param match The string to match.
 * @param advance Whether to advance charIndex to the end of a successful match.
 * @return Whether the text at charIndex going forward is equal to the given string.
 */
@VisibleForTesting
boolean matchForward(String match, boolean advance) {
  if (!text.startsWith(match, charIndex)) {
    return false;
  }
  if (advance) {
    charIndex += match.length();
  }
  return true;
}
/**
 * Looks up the bidi character class of a char, i.e. its UCharacter.getDirectionality() value.
 * Chars below DIR_TYPE_CACHE_SIZE are answered from the cache; all others go to ICU directly.
 * Not designed for supplementary codepoints, whose results we do not cache.
 *
 * @param c the char to classify
 * @return the char's UCharacter.DIRECTIONALITY_... value
 */
private static byte getCachedDirectionality(char c) {
  if (c < DIR_TYPE_CACHE_SIZE) {
    return DIR_TYPE_CACHE[c];
  }
  return UCharacter.getDirectionality(c);
}
/**
 * Reads the next codepoint, advances charIndex past it, and returns its
 * UCharacter.DIRECTIONALITY_... value. If isHtml, and the codepoint is '<' or '&', advances
 * through the tag/entity instead and returns the dirtype chosen for it.
 *
 * @return the dirtype of the codepoint, tag, or entity just consumed
 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
 */
@VisibleForTesting
byte dirTypeForward() {
  char current = text.charAt(charIndex);
  lastChar = current;
  if (UCharacter.isHighSurrogate(current)) {
    // Possibly a supplementary codepoint: classify the whole codepoint, bypassing the cache.
    int codePoint = UCharacter.codePointAt(text, charIndex);
    charIndex += UCharacter.charCount(codePoint);
    return UCharacter.getDirectionality(codePoint);
  }
  charIndex++;
  if (isHtml) {
    // Process tags and entities.
    if (current == '<') {
      return skipTagForward();
    }
    if (current == '&') {
      return skipEntityForward();
    }
  }
  return getCachedDirectionality(current);
}
/**
 * Reads the codepoint preceding charIndex, moves charIndex back before it, and returns its
 * UCharacter.DIRECTIONALITY_... value. If isHtml, and the codepoint is the end of a complete
 * HTML tag or entity, moves back over the whole tag/entity instead and returns the dirtype
 * chosen for it.
 *
 * @return the dirtype of the codepoint, tag, or entity just consumed
 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
 */
@VisibleForTesting
byte dirTypeBackward() {
  char current = text.charAt(charIndex - 1);
  lastChar = current;
  if (UCharacter.isLowSurrogate(current)) {
    // Possibly a supplementary codepoint: classify the whole codepoint, bypassing the cache.
    int codePoint = UCharacter.codePointBefore(text, charIndex);
    charIndex -= UCharacter.charCount(codePoint);
    return UCharacter.getDirectionality(codePoint);
  }
  charIndex--;
  if (isHtml) {
    // Process tags and entities.
    if (current == '>') {
      return skipTagBackward();
    }
    if (current == ';') {
      return skipEntityBackward();
    }
  }
  return getCachedDirectionality(current);
}
/**
 * Called right after an opening '<' has been read. Advances charIndex forward to just past the
 * tag's closing '>' (ignoring any '>' inside a quoted attribute value) and returns
 * UCharacter.DIRECTIONALITY_BOUNDARY_NEUTRAL for the tag. If no closing '>' is found, charIndex
 * is restored to where it was and UCharacter.DIRECTIONALITY_OTHER_NEUTRALS is returned (for the
 * '<' that was not part of a tag after all).
 */
private byte skipTagForward() {
  final int resumeIndex = charIndex;
  while (charIndex < length) {
    lastChar = text.charAt(charIndex);
    charIndex++;
    if (lastChar == '>') {
      // The end of the tag.
      // BN is returned because the tags we mostly expect - and handle best - are inline ones
      // such as <span>, <b>, <i>, <a>. Those imply neither a word break (as WS would) nor
      // punctuation (as ON would); they behave most like control codes. Ideally the actual tag
      // would be inspected and B returned for <br> and block elements, but multi-paragraph
      // input is not worth perfecting: one direction estimated over several paragraphs is
      // futile anyway, since each paragraph should get its own. More importantly, the dir
      // attribute should be honoured by returning an embedding, override, or isolate initiator
      // class, with the matching closing dirtype for the closing tag - but locating the closing
      // tag is not easy. A poor man's approach needing no stack could ignore dir on elements
      // nested inside an element that has dir, and find the closing tag by counting nesting of
      // that element type only. That still would not work in skipTagBackward() - see the note
      // there.
      // TODO(user): Consider checking the tag and returning BN, B, or one of the explicit
      // directional formatting dirtypes, as appropriate.
      return UCharacter.DIRECTIONALITY_BOUNDARY_NEUTRAL;
    }
    if (lastChar == '"' || lastChar == '\'') {
      // Skip over a quoted attribute value inside the tag, up to and including the closing
      // quote (or to the end of the text if the quote is never closed).
      final char quote = lastChar;
      while (charIndex < length) {
        lastChar = text.charAt(charIndex);
        charIndex++;
        if (lastChar == quote) {
          break;
        }
      }
    }
  }
  // The original '<' wasn't the start of a tag after all.
  charIndex = resumeIndex;
  lastChar = '<';
  return UCharacter.DIRECTIONALITY_OTHER_NEUTRALS;
}
/**
 * Called right after a closing '>' has been read going backwards. Moves charIndex back to the
 * tag's opening '<' (ignoring any '<' inside a quoted attribute value) and returns
 * UCharacter.DIRECTIONALITY_BOUNDARY_NEUTRAL for the tag. If no opening '<' is found, charIndex
 * is restored to where it was and UCharacter.DIRECTIONALITY_OTHER_NEUTRALS is returned (for the
 * '>' that was not part of a tag after all). The search also gives up as soon as it meets
 * another '>', which keeps the running time of calling skipTagBackward() in a loop linear in
 * the size of the text, even for a text like ">>>>".
 */
private byte skipTagBackward() {
  final int resumeIndex = charIndex;
  while (charIndex > 0) {
    charIndex--;
    lastChar = text.charAt(charIndex);
    if (lastChar == '<') {
      // The start of the tag. See note in skipTagForward() regarding the dirtype we return.
      // The "poor man's approach" described there for the dir attribute would not work here:
      // going backwards we meet the closing tag first, with no indication of whether its
      // matching opening tag carries the dir attribute.
      return UCharacter.DIRECTIONALITY_BOUNDARY_NEUTRAL;
    }
    if (lastChar == '>') {
      // Another '>' before any '<': stop searching.
      break;
    }
    if (lastChar == '"' || lastChar == '\'') {
      // Skip over a quoted attribute value inside the tag, back to and including the opening
      // quote (or to the start of the text if there is none).
      final char quote = lastChar;
      while (charIndex > 0) {
        charIndex--;
        lastChar = text.charAt(charIndex);
        if (lastChar == quote) {
          break;
        }
      }
    }
  }
  // The original '>' wasn't the end of a tag after all.
  charIndex = resumeIndex;
  lastChar = '>';
  return UCharacter.DIRECTIONALITY_OTHER_NEUTRALS;
}
/**
 * Called right after an opening '&' has been read. Advances charIndex forward to just past the
 * next ';' - or to the end of the text if there is none - and returns
 * UCharacter.DIRECTIONALITY_WHITESPACE. It would be best to figure out the actual character and
 * return its dirtype, but this is good enough.
 */
private byte skipEntityForward() {
  while (charIndex < length) {
    lastChar = text.charAt(charIndex);
    charIndex++;
    if (lastChar == ';') {
      break;
    }
  }
  return UCharacter.DIRECTIONALITY_WHITESPACE;
}
/**
 * Called right after a closing ';' has been read going backwards. Moves charIndex back to the
 * entity's opening '&' and returns UCharacter.DIRECTIONALITY_WHITESPACE. It would be best to
 * figure out the actual character and return its dirtype, but this is good enough. If there is
 * no matching '&', charIndex is restored to where it was and
 * UCharacter.DIRECTIONALITY_OTHER_NEUTRALS is returned (for the ';' that did not end an entity
 * after all). The search also gives up as soon as it meets another ';', which keeps the running
 * time of calling skipEntityBackward() in a loop linear in the size of the text, even for a
 * text like ";;;;;;;".
 */
private byte skipEntityBackward() {
  final int resumeIndex = charIndex;
  while (charIndex > 0) {
    charIndex--;
    lastChar = text.charAt(charIndex);
    if (lastChar == '&') {
      return UCharacter.DIRECTIONALITY_WHITESPACE;
    }
    if (lastChar == ';') {
      // Another ';' before any '&': stop searching.
      break;
    }
  }
  // The original ';' wasn't the end of an entity after all.
  charIndex = resumeIndex;
  lastChar = ';';
  return UCharacter.DIRECTIONALITY_OTHER_NEUTRALS;
}
}
/**
 * Tells whether the given string contains at least one LTR character. LRE/RLE/LRO/RLO/PDF
 * characters are ignored.
 *
 * @param str the string to be tested
 * @param isHtml whether str is HTML / HTML-escaped
 * @return whether the string contains any LTR characters
 */
public static boolean hasAnyLtr(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.hasAnyLtr(false /* countEmbedding */);
}
/**
 * Plain-text variant of {@link #hasAnyLtr(String, boolean)}: {@code str} is assumed not to be
 * HTML / HTML-escaped.
 *
 * @param str the string to be tested
 * @return whether the string contains any LTR characters
 */
public static boolean hasAnyLtr(String str) {
  final boolean isHtml = false;
  return hasAnyLtr(str, isHtml);
}
/**
 * Tells whether the given string contains at least one RTL character. LRE/RLE/LRO/RLO/PDF
 * characters are ignored.
 *
 * @param str the string to be tested
 * @param isHtml whether str is HTML / HTML-escaped
 * @return whether the string contains any RTL characters
 */
public static boolean hasAnyRtl(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.hasAnyRtl(false /* countEmbedding */);
}
/**
 * Plain-text convenience overload of {@link #hasAnyRtl(String, boolean)}: {@code str} is assumed
 * not to be HTML / HTML-escaped.
 *
 * @param str the string to be tested
 * @return whether the string contains any RTL characters
 */
public static boolean hasAnyRtl(String str) {
  final boolean isHtml = false;
  return hasAnyRtl(str, isHtml);
}
/**
 * Determines the directionality of a string per rules P2 and P3 of the UBA, i.e. the
 * directionality of its first strong (L, R, or AL) character, with LRE/RLE/LRO/RLO/PDF having no
 * effect. Unlike P3, which says a string without strong characters should be treated as LTR,
 * this method returns Dir.NEUTRAL when no strong character is encountered.
 *
 * @param str the string to check
 * @param isHtml whether str is HTML / HTML-escaped
 * @return the directionality of the first strong character, or Dir.NEUTRAL if there is none
 */
public static Dir getUnicodeDir(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.getUnicodeDir();
}
/**
 * Plain-text convenience overload of {@link #getUnicodeDir(String, boolean)}: {@code str} is
 * assumed not to be HTML or HTML-escaped.
 */
public static Dir getUnicodeDir(String str) {
  final boolean isHtml = false;
  return getUnicodeDir(str, isHtml);
}
/**
 * Returns the "entry" directionality of a string: the directionality of its first character with
 * strong directionality, or Dir.NEUTRAL if none was encountered.
 *
 * <p>A non-BN character between an LRE/RLE/LRO/RLO and its matching PDF is treated as a strong
 * character: LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a string
 * containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
 *
 * <p>The intended use is to check whether a logically separate item that ends with a character
 * of the string's entry directionality and precedes the string inline (not counting any neutral
 * characters in between) would "stick" to it in an opposite-directionality context, thus being
 * displayed in an incorrect position. An LRM or RLM character (the one of the context's
 * directionality) between the two will prevent such sticking.
 *
 * @param str the string to check
 * @param isHtml whether str is HTML / HTML-escaped
 */
public static Dir getEntryDir(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.getEntryDir();
}
/**
 * Plain-text convenience overload of {@link #getEntryDir(String, boolean)}: {@code str} is
 * assumed not to be HTML or HTML-escaped.
 */
public static Dir getEntryDir(String str) {
  final boolean isHtml = false;
  return getEntryDir(str, isHtml);
}
/**
 * Returns the "exit" directionality of a string: the directionality of its last character with
 * strong directionality, or Dir.NEUTRAL if none was encountered. For efficiency, the string is
 * actually scanned backwards from its end.
 *
 * <p>A non-BN character between an LRE/RLE/LRO/RLO and its matching PDF is treated as a strong
 * character: LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a string
 * containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
 *
 * <p>The intended use is to check whether a logically separate item that starts with a number or
 * a character of the string's exit directionality and follows this string inline (not counting
 * any neutral characters in between) would "stick" to it in an opposite-directionality context,
 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
 * context's directionality) between the two will prevent such sticking.
 *
 * @param str the string to check
 * @param isHtml whether str is HTML / HTML-escaped
 */
public static Dir getExitDir(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.getExitDir();
}
/**
 * Plain-text convenience overload of {@link #getExitDir(String, boolean)}: {@code str} is
 * assumed not to be HTML or HTML-escaped.
 */
public static Dir getExitDir(String str) {
  final boolean isHtml = false;
  return getExitDir(str, isHtml);
}
/**
* Estimates the directionality of a string based on relative word counts, as detailed below.
*
* <p>The parts of the text embedded between LRE/RLE and the matching PDF are ignored, since the
* directionality in which the string as a whole is displayed will not affect their display
* anyway, and we want to base it on the remainder.
*
* <p>The parts of the text embedded between LRO/RLO and the matching PDF are considered LTR/RTL
* "words". This is primarily in order to treat "fake bidi" pseudolocalized text as RTL.
*
* <p>The remaining parts of the text are divided into "words" on whitespace and, inside numbers,
* on neutral characters that break the LTR flow around them when used inside a number in an RTL
* context. (This is most of them, the primary exceptions being period, comma, NBSP and colon,
* i.e. bidi class CS not including slash, which a long-standing Microsoft bug treats as ES.)
*
* <p>Each word is assigned a type - LTR, RTL, URL, signed "European" number, unsigned "European"
* number, negative "Arabic" number, "Arabic" number with leading plus sign, and unsigned "Arabic"
* number - as follows:
*
* <p>- Words that start with "http[s]://" (possibly preceded by some neutrals) are URLs.
*
* <p>- Of the remaining words, those that contain any strongly directional characters are
* classified as LTR or RTL based on their first strongly directional character.
*
* <p>- Of the remaining words, those that contain any digits are classified as an "European" or
* "Arabic" number based on the type of its first digit, and signed or unsigned depending on
* whether the first digit was immediately preceded by a plus or minus sign (bidi class ES).
*
* <p>- The remaining words are classified as "neutral" and ignored.
*
* <p>Once the words of each type have been counted, the directionality is decided as follows:
*
* <p>If the number of RTL words exceeds 40% of the total of LTR and RTL words, return Dir.RTL.
* The threshold favors RTL because LTR words and phrases are used in RTL sentences more commonly
* than RTL in LTR.
*
* <p>Otherwise, if there are any LTR words, return Dir.LTR.
*
* <p>Otherwise (i.e. if there are no LTR or RTL words), if there are any URLs, or any signed
* "European" numbers, or an "Arabic" number with a leading plus sign, or more than one unsigned
* "European" number, return Dir.LTR. This ensures that the text is displayed LTR even in an RTL
* context, where things like "http://www.google.com/", "-5", "+١٢٣٤٢٣٤٦٧٨٩" (assuming it is
* intended as an international phone number, not an explicitly signed positive number, which is a
* very rare use case), "3 - 2 = 1", "(03) 123 4567", and, when preceded by an Arabic letter, even
* "123-4567" and "400×300" are displayed incorrectly. (Most neutrals, including those in the last
* two examples, are treated as ending a number in order to treat such expressions as containing
* more than one "European" number, and thus to force their display in LTR.) Considering a string
* containing more than one "European" number to be LTR also makes sense because math expressions in
* "European" digits need to be displayed LTR even in RTL languages. However, that probably isn't
* a very important consideration, since math expressions would usually also contain strongly LTR
* or RTL variable names that should set the overall directionality. Ranges like "$1 - $5" *are*
* an important consideration, but their preferred direction unfortunately varies among the RTL
* languages. Since LTR is preferred for ranges in Persian and Urdu, and is the more widespread
* usage in Hebrew, it seems like an OK choice. Please note that native Persian digits are
* included in the "European" class because the unary minus is preferred on the left in Persian,
* and Persian math is written LTR.
*
* <p>Otherwise, if there are any negative "Arabic" numbers, return Dir.RTL. This is because the
* unary minus is supposed to be displayed to the right of a number written in "Arabic" digits.
*
* <p>Otherwise, return Dir.NEUTRAL. This includes the common case of a single unsigned number,
* which will display correctly in either "European" or "Arabic" digits in either directionality,
* so it is best not to force it to either. It also includes an otherwise neutral string
* containing two or more "Arabic" numbers. We do *not* consider it to be RTL because it is
* unclear that it is important to display "Arabic"-digit math and ranges in RTL even in an LTR
* context, and because we have no idea how to handle phone numbers spelled (or, more likely,
* misspelled) in "Arabic" digits with non-CS separators. But it is quite clear that we do not
* want to force it to LTR.
*
* @param str the string to check
* @return the string's directionality
*/
public static Dir estimateDirection(String str) {
  // Plain-text variant: the input is never treated as HTML / HTML-escaped.
  final boolean isHtml = false;
  return estimateDirection(str, isHtml);
}
/**
 * Variant of {@link #estimateDirection(String)} that can treat {@code str} as HTML, in which
 * case HTML tags and escapes that would otherwise be mistaken for LTR text are ignored.
 *
 * @param str the string to check
 * @param isHtml whether str is HTML / HTML-escaped
 * @return the string's estimated directionality
 */
public static Dir estimateDirection(String str, boolean isHtml) {
  DirectionalityEstimator estimator = new DirectionalityEstimator(str, isHtml);
  return estimator.estimateDirectionByWordCount();
}
}