/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* $Id$ */ package org.apache.fop.complexscripts.bidi; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.fop.traits.Direction; import org.apache.fop.util.CharUtilities; // CSOFF: LineLengthCheck /** * <p>The <code>UnicodeBidiAlgorithm</code> class implements functionality prescribed by * the Unicode Bidirectional Algorithm, Unicode Standard Annex #9.</p> * * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p> */ public final class UnicodeBidiAlgorithm implements BidiConstants { /** * logging instance */ private static final Log log = LogFactory.getLog(UnicodeBidiAlgorithm.class); private UnicodeBidiAlgorithm() { } /** * Resolve the directionality levels of each character in a character seqeunce. * If some character is encoded in the character sequence as a Unicode Surrogate Pair, * then the directionality level of each of the two members of the pair will be identical. * @return null if bidirectional processing is not required; otherwise, returns an array * of integers, where each integer corresponds to exactly one UTF-16 * encoding element present in the input character sequence, and where each integer denotes * the directionality level of the corresponding encoding element * @param cs input character sequence representing a UTF-16 encoded string * @param defaultLevel the default paragraph level, which must be zero (LR) or one (RL) */ public static int[] resolveLevels(CharSequence cs, Direction defaultLevel) { int[] chars = new int [ cs.length() ]; if (convertToScalar(cs, chars) || (defaultLevel == Direction.RL)) { return resolveLevels(chars, (defaultLevel == Direction.RL) ? 1 : 0, new int [ chars.length ]); } else { return null; } } /** * Resolve the directionality levels of each character in a character seqeunce. * @return null if bidirectional processing is not required; otherwise, returns an array * of integers, where each integer corresponds to exactly one UTF-16 * encoding element present in the input character sequence, and where each integer denotes * the directionality level of the corresponding encoding element * @param chars array of input characters represented as unicode scalar values * @param defaultLevel the default paragraph level, which must be zero (LR) or one (RL) * @param levels array to receive levels, one for each character in chars array */ public static int[] resolveLevels(int[] chars, int defaultLevel, int[] levels) { return resolveLevels(chars, getClasses(chars), defaultLevel, levels, false); } /** * Resolve the directionality levels of each character in a character seqeunce. * @return null if bidirectional processing is not required; otherwise, returns an array * of integers, where each integer corresponds to exactly one UTF-16 * encoding element present in the input character sequence, and where each integer denotes * the directionality level of the corresponding encoding element * @param chars array of input characters represented as unicode scalar values * @param classes array containing one bidi class per character in chars array * @param defaultLevel the default paragraph level, which must be zero (LR) or one (RL) * @param levels array to receive levels, one for each character in chars array * @param useRuleL1 true if rule L1 should be used */ public static int[] resolveLevels(int[] chars, int[] classes, int defaultLevel, int[] levels, boolean useRuleL1) { int[] ica = classes; int[] wca = copySequence(ica); int[] ea = new int [ levels.length ]; resolveExplicit(wca, defaultLevel, ea); resolveRuns(wca, defaultLevel, ea, levelsFromEmbeddings(ea, levels)); if (useRuleL1) { resolveSeparators(ica, wca, defaultLevel, levels); } dump("RL: CC(" + ((chars != null) ? chars.length : -1) + ")", chars, classes, defaultLevel, levels); return levels; } private static int[] copySequence(int[] ta) { int[] na = new int [ ta.length ]; System.arraycopy(ta, 0, na, 0, na.length); return na; } private static void resolveExplicit(int[] wca, int defaultLevel, int[] ea) { int[] es = new int [ MAX_LEVELS ]; /* embeddings stack */ int ei = 0; /* embeddings stack index */ int ec = defaultLevel; /* current embedding level */ for (int i = 0, n = wca.length; i < n; i++) { int bc = wca [ i ]; /* bidi class of current char */ int el; /* embedding level to assign to current char */ switch (bc) { case LRE: // start left-to-right embedding case RLE: // start right-to-left embedding case LRO: // start left-to-right override case RLO: // start right-to-left override int en; /* new embedding level */ if ((bc == RLE) || (bc == RLO)) { en = ((ec & ~OVERRIDE) + 1) | 1; } else { en = ((ec & ~OVERRIDE) + 2) & ~1; } if (en < (MAX_LEVELS + 1)) { es [ ei++ ] = ec; if ((bc == LRO) || (bc == RLO)) { ec = en | OVERRIDE; } else { ec = en & ~OVERRIDE; } } else { // max levels exceeded, so don't change level or override } el = ec; break; case PDF: // pop directional formatting el = ec; if (ei > 0) { ec = es [ --ei ]; } else { // ignore isolated PDF } break; case B: // paragraph separator el = ec = defaultLevel; ei = 0; break; default: el = ec; break; } switch (bc) { case BN: break; case LRE: case RLE: case LRO: case RLO: case PDF: wca [ i ] = BN; break; default: if ((el & OVERRIDE) != 0) { wca [ i ] = directionOfLevel(el); } break; } ea [ i ] = el; } } private static int directionOfLevel(int level) { return ((level & 1) != 0) ? R : L; } private static int levelOfEmbedding(int embedding) { return embedding & ~OVERRIDE; } private static int[] levelsFromEmbeddings(int[] ea, int[] la) { assert ea != null; assert la != null; assert la.length == ea.length; for (int i = 0, n = la.length; i < n; i++) { la [ i ] = levelOfEmbedding(ea [ i ]); } return la; } private static void resolveRuns(int[] wca, int defaultLevel, int[] ea, int[] la) { if (la.length != wca.length) { throw new IllegalArgumentException("levels sequence length must match classes sequence length"); } else if (la.length != ea.length) { throw new IllegalArgumentException("levels sequence length must match embeddings sequence length"); } else { for (int i = 0, n = ea.length, lPrev = defaultLevel; i < n; ) { int s = i; int e = s; int l = findNextNonRetainedFormattingLevel(wca, ea, s, lPrev); while (e < n) { if (la [ e ] != l) { if (startsWithRetainedFormattingRun(wca, ea, e)) { e += getLevelRunLength(ea, e); } else { break; } } else { e++; } } lPrev = resolveRun(wca, defaultLevel, ea, la, s, e, l, lPrev); i = e; } } } private static int findNextNonRetainedFormattingLevel(int[] wca, int[] ea, int start, int lPrev) { int s = start; int e = wca.length; while (s < e) { if (startsWithRetainedFormattingRun(wca, ea, s)) { s += getLevelRunLength(ea, s); } else { break; } } if (s < e) { return levelOfEmbedding(ea [ s ]); } else { return lPrev; } } private static int getLevelRunLength(int[] ea, int start) { assert start < ea.length; int nl = 0; for (int s = start, e = ea.length, l0 = levelOfEmbedding(ea [ start ]); s < e; s++) { if (levelOfEmbedding(ea [ s ]) == l0) { nl++; } else { break; } } return nl; } private static boolean startsWithRetainedFormattingRun(int[] wca, int[] ea, int start) { int nl = getLevelRunLength(ea, start); if (nl > 0) { int nc = getRetainedFormattingRunLength(wca, start); return (nc >= nl); } else { return false; } } private static int getRetainedFormattingRunLength(int[] wca, int start) { assert start < wca.length; int nc = 0; for (int s = start, e = wca.length; s < e; s++) { if (wca [ s ] == BidiConstants.BN) { nc++; } else { break; } } return nc; } private static int resolveRun(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int levelPrev) { // determine start of run direction int sor = directionOfLevel(max(levelPrev, level)); // determine end of run direction int le = -1; if (end == wca.length) { le = max(level, defaultLevel); } else { for (int i = end; i < wca.length; i++) { if (wca [ i ] != BidiConstants.BN) { le = max(level, la [ i ]); break; } } if (le < 0) { le = max(level, defaultLevel); } } int eor = directionOfLevel(le); if (log.isDebugEnabled()) { log.debug("BR[" + padLeft(start, 3) + "," + padLeft(end, 3) + "] :" + padLeft(level, 2) + ": SOR(" + getClassName(sor) + "), EOR(" + getClassName(eor) + ")"); } resolveWeak(wca, defaultLevel, ea, la, start, end, level, sor, eor); resolveNeutrals(wca, defaultLevel, ea, la, start, end, level, sor, eor); resolveImplicit(wca, defaultLevel, ea, la, start, end, level, sor, eor); // if this run is all retained formatting, then return prior level, otherwise this run's level return isRetainedFormatting(wca, start, end) ? levelPrev : level; } private static void resolveWeak(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) { // W1 - X BN* NSM -> X BN* X for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (bc == NSM) { wca [ i ] = bcPrev; } else if (bc != BN) { bcPrev = bc; } } // W2 - AL ... EN -> AL ... AN for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (bc == EN) { if (bcPrev == AL) { wca [ i ] = AN; } } else if (isStrong(bc)) { bcPrev = bc; } } // W3 - AL -> R for (int i = start, n = end; i < n; i++) { int bc = wca [ i ]; if (bc == AL) { wca [ i ] = R; } } // W4 - EN BN* ES BN* EN -> EN BN* EN BN* EN; XN BN* CS BN* XN -> XN BN* XN BN* XN for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (bc == ES) { int bcNext = eor; for (int j = i + 1; j < n; j++) { if ((bc = wca [ j ]) != BN) { bcNext = bc; break; } } if ((bcPrev == EN) && (bcNext == EN)) { wca [ i ] = EN; } } else if (bc == CS) { int bcNext = eor; for (int j = i + 1; j < n; j++) { if ((bc = wca [ j ]) != BN) { bcNext = bc; break; } } if ((bcPrev == EN) && (bcNext == EN)) { wca [ i ] = EN; } else if ((bcPrev == AN) && (bcNext == AN)) { wca [ i ] = AN; } } if (bc != BN) { bcPrev = bc; } } // W5 - EN (ET|BN)* -> EN (EN|BN)*; (ET|BN)* EN -> (EN|BN)* EN for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (bc == ET) { int bcNext = eor; for (int j = i + 1; j < n; j++) { bc = wca [ j ]; if ((bc != BN) && (bc != ET)) { bcNext = bc; break; } } if ((bcPrev == EN) || (bcNext == EN)) { wca [ i ] = EN; } } else if (bc != BN) { bcPrev = bc; } } // W6 - BN* (ET|ES|CS) BN* -> ON* ON ON* for (int i = start, n = end; i < n; i++) { int bc = wca [ i ]; if ((bc == ET) || (bc == ES) || (bc == CS)) { wca [ i ] = ON; resolveAdjacentBoundaryNeutrals(wca, start, end, i, ON); } } // W7 - L ... EN -> L ... L for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (bc == EN) { if (bcPrev == L) { wca [ i ] = L; } } else if ((bc == L) || (bc == R)) { bcPrev = bc; } } } private static void resolveNeutrals(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) { // N1 - (L|R) N+ (L|R) -> L L+ L | R R+ R; (AN|EN) N+ R -> (AN|EN) R+ R; R N+ (AN|EN) -> R R+ (AN|EN) for (int i = start, n = end, bcPrev = sor; i < n; i++) { int bc = wca [ i ]; if (isNeutral(bc)) { int bcNext = eor; for (int j = i + 1; j < n; j++) { bc = wca [ j ]; if ((bc == L) || (bc == R)) { bcNext = bc; break; } else if ((bc == AN) || (bc == EN)) { bcNext = R; break; } else if (isNeutral(bc)) { continue; } else if (isRetainedFormatting(bc)) { continue; } else { break; } } if (bcPrev == bcNext) { wca [ i ] = bcPrev; resolveAdjacentBoundaryNeutrals(wca, start, end, i, bcPrev); } } else if ((bc == L) || (bc == R)) { bcPrev = bc; } else if ((bc == AN) || (bc == EN)) { bcPrev = R; } } // N2 - N -> embedding level for (int i = start, n = end; i < n; i++) { int bc = wca [ i ]; if (isNeutral(bc)) { int bcEmbedding = directionOfLevel(levelOfEmbedding(ea [ i ])); wca [ i ] = bcEmbedding; resolveAdjacentBoundaryNeutrals(wca, start, end, i, bcEmbedding); } } } private static void resolveAdjacentBoundaryNeutrals(int[] wca, int start, int end, int index, int bcNew) { if ((index < start) || (index >= end)) { throw new IllegalArgumentException(); } else { for (int i = index - 1; i >= start; i--) { int bc = wca [ i ]; if (bc == BN) { wca [ i ] = bcNew; } else { break; } } for (int i = index + 1; i < end; i++) { int bc = wca [ i ]; if (bc == BN) { wca [ i ] = bcNew; } else { break; } } } } private static void resolveImplicit(int[] wca, int defaultLevel, int[] ea, int[] la, int start, int end, int level, int sor, int eor) { for (int i = start, n = end; i < n; i++) { int bc = wca [ i ]; // bidi class int el = la [ i ]; // embedding level int ed = 0; // embedding level delta if ((el & 1) == 0) { // even if (bc == R) { ed = 1; } else if (bc == AN) { ed = 2; } else if (bc == EN) { ed = 2; } } else { // odd if (bc == L) { ed = 1; } else if (bc == EN) { ed = 1; } else if (bc == AN) { ed = 1; } } la [ i ] = el + ed; } } /** * Resolve separators and boundary neutral levels to account for UAX#9 3.4 L1 while taking into * account retention of formatting codes (5.2). * @param ica original input class array (sequence) * @param wca working copy of original intput class array (sequence), as modified by prior steps * @param dl default paragraph level * @param la array of output levels to be adjusted, as produced by bidi algorithm */ private static void resolveSeparators(int[] ica, int[] wca, int dl, int[] la) { // steps (1) through (3) for (int i = 0, n = ica.length; i < n; i++) { int ic = ica[i]; if ((ic == BidiConstants.S) || (ic == BidiConstants.B)) { la[i] = dl; for (int k = i - 1; k >= 0; k--) { int pc = ica[k]; if (isRetainedFormatting(pc)) { continue; } else if (pc == BidiConstants.WS) { la[k] = dl; } else { break; } } } } // step (4) - consider end of input sequence to be end of line, but skip any trailing boundary neutrals and retained formatting codes for (int i = ica.length; i > 0; i--) { int k = i - 1; int ic = ica[k]; if (isRetainedFormatting(ic)) { continue; } else if (ic == BidiConstants.WS) { la[k] = dl; } else { break; } } // step (5) - per section 5.2 for (int i = 0, n = ica.length; i < n; i++) { int ic = ica[i]; if (isRetainedFormatting(ic)) { if (i == 0) { la[i] = dl; } else { la[i] = la [ i - 1 ]; } } } } private static boolean isStrong(int bc) { switch (bc) { case L: case R: case AL: return true; default: return false; } } private static boolean isNeutral(int bc) { switch (bc) { case WS: case ON: case S: case B: return true; default: return false; } } private static boolean isRetainedFormatting(int bc) { switch (bc) { case LRE: case LRO: case RLE: case RLO: case PDF: case BN: return true; default: return false; } } private static boolean isRetainedFormatting(int[] ca, int s, int e) { for (int i = s; i < e; i++) { if (!isRetainedFormatting(ca[i])) { return false; } } return true; } private static int max(int x, int y) { if (x > y) { return x; } else { return y; } } private static int[] getClasses(int[] chars) { int[] classes = new int [ chars.length ]; int bc; for (int i = 0, n = chars.length; i < n; i++) { int ch = chars [ i ]; if (ch >= 0) { bc = BidiClass.getBidiClass(chars [ i ]); } else { bc = SURROGATE; } classes [ i ] = bc; } return classes; } /** * Convert character sequence (a UTF-16 encoded string) to an array of unicode scalar values * expressed as integers. If a valid UTF-16 surrogate pair is encountered, it is converted to * two integers, the first being the equivalent unicode scalar value, and the second being * negative one (-1). This special mechanism is used to track the use of surrogate pairs while * working with unicode scalar values, and permits maintaining indices that apply both to the * input UTF-16 and out scalar value sequences. * @return a boolean indicating that content is present that triggers bidirectional processing * @param cs a UTF-16 encoded character sequence * @param chars an integer array to accept the converted scalar values, where the length of the * array must be the same as the length of the input character sequence * @throws IllegalArgumentException if the input sequence is not a valid UTF-16 string, e.g., * if it contains an isolated UTF-16 surrogate */ private static boolean convertToScalar(CharSequence cs, int[] chars) throws IllegalArgumentException { boolean triggered = false; if (chars.length != cs.length()) { throw new IllegalArgumentException("characters array length must match input sequence length"); } for (int i = 0, n = chars.length; i < n; ) { int chIn = cs.charAt(i); int chOut; if (chIn < 0xD800) { chOut = chIn; } else if (chIn < 0xDC00) { int chHi = chIn; int chLo; if ((i + 1) < n) { chLo = cs.charAt(i + 1); if ((chLo >= 0xDC00) && (chLo <= 0xDFFF)) { chOut = convertToScalar(chHi, chLo); } else { throw new IllegalArgumentException("isolated high surrogate"); } } else { throw new IllegalArgumentException("truncated surrogate pair"); } } else if (chIn < 0xE000) { throw new IllegalArgumentException("isolated low surrogate"); } else { chOut = chIn; } if (!triggered && triggersBidi(chOut)) { triggered = true; } if ((chOut & 0xFF0000) == 0) { chars [ i++ ] = chOut; } else { chars [ i++ ] = chOut; chars [ i++ ] = -1; } } return triggered; } /** * Convert UTF-16 surrogate pair to unicode scalar valuee. * @return a unicode scalar value * @param chHi high (most significant or first) surrogate * @param chLo low (least significant or second) surrogate * @throws IllegalArgumentException if one of the input surrogates is not valid */ private static int convertToScalar(int chHi, int chLo) { if ((chHi < 0xD800) || (chHi > 0xDBFF)) { throw new IllegalArgumentException("bad high surrogate"); } else if ((chLo < 0xDC00) || (chLo > 0xDFFF)) { throw new IllegalArgumentException("bad low surrogate"); } else { return (((chHi & 0x03FF) << 10) | (chLo & 0x03FF)) + 0x10000; } } /** * Determine of character CH triggers bidirectional processing. Bidirectional * processing is deemed triggerable if CH is a strong right-to-left character, * an arabic letter or number, or is a right-to-left embedding or override * character. * @return true if character triggers bidirectional processing * @param ch a unicode scalar value */ private static boolean triggersBidi(int ch) { switch (BidiClass.getBidiClass(ch)) { case R: case AL: case AN: case RLE: case RLO: return true; default: return false; } } private static void dump(String header, int[] chars, int[] classes, int defaultLevel, int[] levels) { log.debug(header); log.debug("BD: default level(" + defaultLevel + ")"); StringBuffer sb = new StringBuffer(); if (chars != null) { for (int i = 0, n = chars.length; i < n; i++) { int ch = chars [ i ]; sb.setLength(0); if ((ch > 0x20) && (ch < 0x7F)) { sb.append((char) ch); } else { sb.append(CharUtilities.charToNCRef(ch)); } for (int k = sb.length(); k < 12; k++) { sb.append(' '); } sb.append(": " + padRight(getClassName(classes[i]), 4) + " " + levels[i]); log.debug(sb); } } else { for (int i = 0, n = classes.length; i < n; i++) { sb.setLength(0); for (int k = sb.length(); k < 12; k++) { sb.append(' '); } sb.append(": " + padRight(getClassName(classes[i]), 4) + " " + levels[i]); log.debug(sb); } } } private static String getClassName(int bc) { switch (bc) { case L: // left-to-right return "L"; case LRE: // left-to-right embedding return "LRE"; case LRO: // left-to-right override return "LRO"; case R: // right-to-left return "R"; case AL: // right-to-left arabic return "AL"; case RLE: // right-to-left embedding return "RLE"; case RLO: // right-to-left override return "RLO"; case PDF: // pop directional formatting return "PDF"; case EN: // european number return "EN"; case ES: // european number separator return "ES"; case ET: // european number terminator return "ET"; case AN: // arabic number return "AN"; case CS: // common number separator return "CS"; case NSM: // non-spacing mark return "NSM"; case BN: // boundary neutral return "BN"; case B: // paragraph separator return "B"; case S: // segment separator return "S"; case WS: // whitespace return "WS"; case ON: // other neutrals return "ON"; case SURROGATE: // placeholder for low surrogate return "SUR"; default: return "?"; } } private static String padLeft(int n, int width) { return padLeft(Integer.toString(n), width); } private static String padLeft(String s, int width) { StringBuffer sb = new StringBuffer(); for (int i = s.length(); i < width; i++) { sb.append(' '); } sb.append(s); return sb.toString(); } /* not used yet private static String padRight ( int n, int width ) { return padRight ( Integer.toString ( n ), width ); } */ private static String padRight(String s, int width) { StringBuffer sb = new StringBuffer(s); for (int i = sb.length(); i < width; i++) { sb.append(' '); } return sb.toString(); } }