/* * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ****************************************************************************** * * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** */ package sun.text.normalizer; import java.util.ArrayList; import sun.text.normalizer.UnicodeSet.SpanCondition; /* * Implement span() etc. for a set with strings. * Avoid recursion because of its exponential complexity. * Instead, try multiple paths at once and track them with an IndexList. */ class UnicodeSetStringSpan { /* * Which span() variant will be used? The object is either built for one variant and used once, * or built for all and may be used many times. 
*/ public static final int WITH_COUNT = 0x40; // spanAndCount() may be called public static final int FWD = 0x20; public static final int BACK = 0x10; // public static final int UTF16 = 8; public static final int CONTAINED = 2; public static final int NOT_CONTAINED = 1; public static final int ALL = 0x7f; public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; /** * Special spanLength short values. (since Java has not unsigned byte type) * All code points in the string are contained in the parent set. */ static final short ALL_CP_CONTAINED = 0xff; /** The spanLength is >=0xfe. */ static final short LONG_SPAN = ALL_CP_CONTAINED - 1; /** Set for span(). Same as parent but without strings. */ private UnicodeSet spanSet; /** * Set for span(not contained). * Same as spanSet, plus characters that start or end strings. */ private UnicodeSet spanNotSet; /** The strings of the parent set. */ private ArrayList<String> strings; /** The lengths of span(), spanBack() etc. for each string. */ private short[] spanLengths; /** Maximum lengths of relevant strings. */ private int maxLength16; /** Are there strings that are not fully contained in the code point set? */ private boolean someRelevant; /** Set up for all variants of span()? */ private boolean all; /** Span helper */ private OffsetList offsets; /** * Constructs for all variants of span(), or only for any one variant. * Initializes as little as possible, for single use. */ public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { spanSet = new UnicodeSet(0, 0x10ffff); // TODO: With Java 6, just take the parent set's strings as is, // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 
// Then iterate via the first() and higher() methods. // (We do not want to create multiple Iterator objects in each span().) // See ICU ticket #7454. strings = setStrings; all = (which == ALL); spanSet.retainAll(set); if (0 != (which & NOT_CONTAINED)) { // Default to the same sets. // addToSpanNotSet() will create a separate set if necessary. spanNotSet = spanSet; } offsets = new OffsetList(); // Determine if the strings even need to be taken into account at all for span() etc. // If any string is relevant, then all strings need to be used for // span(longest match) but only the relevant ones for span(while contained). // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH // and do not store UTF-8 strings if !thisRelevant and CONTAINED. // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) // Also count the lengths of the UTF-8 versions of the strings for memory allocation. int stringsLength = strings.size(); int i, spanLength; someRelevant = false; for (i = 0; i < stringsLength; ++i) { String string = strings.get(i); int length16 = string.length(); spanLength = spanSet.span(string, SpanCondition.CONTAINED); if (spanLength < length16) { // Relevant string. someRelevant = true; } if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { maxLength16 = length16; } } if (!someRelevant && (which & WITH_COUNT) == 0) { return; } // Freeze after checking for the need to use strings at all because freezing // a set takes some time and memory which are wasted if there are no relevant strings. if (all) { spanSet.freeze(); } int spanBackLengthsOffset; // Allocate a block of meta data. int allocSize; if (all) { // 2 sets of span lengths allocSize = stringsLength * (2); } else { allocSize = stringsLength; // One set of span lengths. } spanLengths = new short[allocSize]; if (all) { // Store span lengths for all span() variants. 
spanBackLengthsOffset = stringsLength; } else { // Store span lengths for only one span() variant. spanBackLengthsOffset = 0; } // Set the meta data and spanNotSet and write the UTF-8 strings. for (i = 0; i < stringsLength; ++i) { String string = strings.get(i); int length16 = string.length(); spanLength = spanSet.span(string, SpanCondition.CONTAINED); if (spanLength < length16) { // Relevant string. if (true /* 0 != (which & UTF16) */) { if (0 != (which & CONTAINED)) { if (0 != (which & FWD)) { spanLengths[i] = makeSpanLengthByte(spanLength); } if (0 != (which & BACK)) { spanLength = length16 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); } } else /* not CONTAINED, not all, but NOT_CONTAINED */{ spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant // flag. } } if (0 != (which & NOT_CONTAINED)) { // Add string start and end code points to the spanNotSet so that // a span(while not contained) stops before any string. int c; if (0 != (which & FWD)) { c = string.codePointAt(0); addToSpanNotSet(c); } if (0 != (which & BACK)) { c = string.codePointBefore(length16); addToSpanNotSet(c); } } } else { // Irrelevant string. if (all) { spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; } else { // All spanXYZLengths pointers contain the same address. spanLengths[i] = ALL_CP_CONTAINED; } } } // Finish. if (all) { spanNotSet.freeze(); } } /** * Do the strings need to be checked in span() etc.? * * @return true if strings need to be checked (call span() here), * false if not (use a BMPSet for best performance). */ public boolean needsStringSpanUTF16() { return someRelevant; } /** For fast UnicodeSet::contains(c). */ public boolean contains(int c) { return spanSet.contains(c); } /** * Adds a starting or ending string character to the spanNotSet * so that a character span ends before any string. 
*/ private void addToSpanNotSet(int c) { if (spanNotSet == null || spanNotSet == spanSet) { if (spanSet.contains(c)) { return; // Nothing to do. } spanNotSet = spanSet.cloneAsThawed(); } spanNotSet.add(c); } /* * Note: In span() when spanLength==0 * (after a string match, or at the beginning after an empty code point span) * and in spanNot() and spanNotUTF8(), * string matching could use a binary search because all string matches are done * from the same start index. * * For UTF-8, this would require a comparison function that returns UTF-16 order. * * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets * with strings have very few very short strings. For cases with many strings, it might be better to use a different * API and implementation with a DFA (state machine). */ /* * Algorithm for span(SpanCondition.CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Either recursively span for each code point or string match, or recursively span * for all but the shortest one and iteratively continue the span with the shortest local match. * + Remember the longest recursive span (the farthest end point). * + If there is no match at the current position, * neither for the code point there nor for any set string, * then stop and return the longest recursive span length. * * Optimized implementation: * * (We assume that most sets will have very few very short strings. * A span using a string-less set is extremely fast.) * * Create and cache a spanSet which contains all of the single code points of the original set * but none of its strings. * * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. 
* ~ Set strings that start with set-contained code points * must be matched with a partial overlap * because the recursive algorithm would have tried to match them at every position. * ~ Set strings that entirely consist of set-contained code points * are irrelevant for span(SpanCondition.CONTAINED) * because the recursive algorithm would continue after them anyway and * find the longest recursive match from their end. * ~ Rather than recursing, note each end point of a set string match. * + If no set string matched after spanSet.span(), * then return with where the spanSet.span() ended. * + If at least one set string matched after spanSet.span(), * then pop the shortest string match end point and continue the loop, * trying to match all set strings from there. * + If at least one more set string matched after a previous string match, then test if the * code point after the previous string match is also contained in the set. * Continue the loop with the shortest end point of * either this code point or a matching set string. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). * Stop if spanLength==0, otherwise continue the loop. * * By noting each end point of a set string match, the function visits each string position at most once and * finishes in linear time. * * The recursive algorithm may visit the same string position many times * if multiple paths lead to it and finishes in exponential time. */ /* * Algorithm for span(SIMPLE) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then remember to continue after it. * + If a set string matches at the current position, then remember to continue after it. * + Continue from the farthest match position and ignore all others. * + If there is no match at the current position, then stop and return the current position. 
* * Optimized implementation: * * (Same assumption and spanSet as above.) * * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). * - Loop: * + Try to match each set string at the end of the spanLength. * ~ Set strings that start with set-contained code points * must be matched with a partial overlap * because the standard algorithm would have tried to match them earlier. * ~ Set strings that entirely consist of set-contained code points * must be matched with a full overlap because the longest-match algorithm * would hide set string matches that end earlier. * Such set strings need not be matched earlier inside the code point span * because the standard algorithm would then have * continued after the set string match anyway. * ~ Remember the longest set string match (farthest end point) * from the earliest starting point. * + If no set string matched after spanSet.span(), * then return with where the spanSet.span() ended. * + If at least one set string matched, * then continue the loop after the longest match from the earliest position. * + If no more set string matched after a previous string match, * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). * Stop if spanLength==0, otherwise continue the loop. */ /** * Spans a string. * * @param s The string to be spanned * @param start The start index that the span begins * @param spanCondition The span condition * @return the limit (exclusive end) of the span */ public int span(CharSequence s, int start, SpanCondition spanCondition) { if (spanCondition == SpanCondition.NOT_CONTAINED) { return spanNot(s, start, null); } int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); if (spanLimit == s.length()) { return spanLimit; } return spanWithStrings(s, start, spanLimit, spanCondition); } /** * Synchronized method for complicated spans using the offsets. * Avoids synchronization for simple cases. 
* * @param spanLimit = spanSet.span(s, start, CONTAINED) */ private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition) { // Consider strings; they may overlap with the span. int initSize = 0; if (spanCondition == SpanCondition.CONTAINED) { // Use offset list to try all possibilities. initSize = maxLength16; } offsets.setMaxLength(initSize); int length = s.length(); int pos = spanLimit, rest = length - spanLimit; int spanLength = spanLimit - start; int i, stringsLength = strings.size(); for (;;) { if (spanCondition == SpanCondition.CONTAINED) { for (i = 0; i < stringsLength; ++i) { int overlap = spanLengths[i]; if (overlap == ALL_CP_CONTAINED) { continue; // Irrelevant string. } String string = strings.get(i); int length16 = string.length(); // Try to match this string at pos-overlap..pos. if (overlap >= LONG_SPAN) { overlap = length16; // While contained: No point matching fully inside the code point span. overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code // point. } if (overlap > spanLength) { overlap = spanLength; } int inc = length16 - overlap; // Keep overlap+inc==length16. for (;;) { if (inc > rest) { break; } // Try to match if the increment is not listed already. if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { if (inc == rest) { return length; // Reached the end of the string. } offsets.addOffset(inc); } if (overlap == 0) { break; } --overlap; ++inc; } } } else /* SIMPLE */{ int maxInc = 0, maxOverlap = 0; for (i = 0; i < stringsLength; ++i) { int overlap = spanLengths[i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the earliest start. String string = strings.get(i); int length16 = string.length(); // Try to match this string at pos-overlap..pos. 
if (overlap >= LONG_SPAN) { overlap = length16; // Longest match: Need to match fully inside the code point span // to find the match from the earliest start. } if (overlap > spanLength) { overlap = spanLength; } int inc = length16 - overlap; // Keep overlap+inc==length16. for (;;) { if (inc > rest || overlap < maxOverlap) { break; } // Try to match if the string is longer or starts earlier. if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) && matches16CPB(s, pos - overlap, length, string, length16)) { maxInc = inc; // Longest match from earliest start. maxOverlap = overlap; break; } --overlap; ++inc; } } if (maxInc != 0 || maxOverlap != 0) { // Longest-match algorithm, and there was a string match. // Simply continue after it. pos += maxInc; rest -= maxInc; if (rest == 0) { return length; // Reached the end of the string. } spanLength = 0; // Match strings from after a string match. continue; } } // Finished trying to match all strings at pos. if (spanLength != 0 || pos == 0) { // The position is after an unlimited code point span (spanLength!=0), // not after a string match. // The only position where spanLength==0 after a span is pos==0. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if (offsets.isEmpty()) { return pos; // No strings matched after a span. } // Match strings from after the next string match. } else { // The position is after a string match (or a single code point). if (offsets.isEmpty()) { // No more strings matched after a previous string match. // Try another code point span from after the last string match. spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); spanLength = spanLimit - pos; if (spanLength == rest || // Reached the end of the string, or spanLength == 0 // neither strings nor span progressed. 
) { return spanLimit; } pos += spanLength; rest -= spanLength; continue; // spanLength>0: Match strings from after a span. } else { // Try to match only one code point from after a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot. spanLength = spanOne(spanSet, s, pos, rest); if (spanLength > 0) { if (spanLength == rest) { return length; // Reached the end of the string. } // Match strings after this code point. // There cannot be any increments below it because UnicodeSet strings // contain multiple code points. pos += spanLength; rest -= spanLength; offsets.shift(spanLength); spanLength = 0; continue; // Match strings from after a single code point. } // Match strings from after the next string match. } } int minOffset = offsets.popMinimum(null); pos += minOffset; rest -= minOffset; spanLength = 0; // Match strings from after a string match. } } /** * Spans a string and counts the smallest number of set elements on any path across the span. * * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. * * <p>If the set does not have any fully-contained strings, then we could optimize this * like span(), but such sets are likely rare, and this is at least still linear. * * @param s The string to be spanned * @param start The start index that the span begins * @param spanCondition The span condition * @param outCount The count * @return the limit (exclusive end) of the span */ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { if (spanCondition == SpanCondition.NOT_CONTAINED) { return spanNot(s, start, outCount); } // Consider strings; they may overlap with the span, // and they may result in a smaller count that with just code points. 
if (spanCondition == SpanCondition.CONTAINED) { return spanContainedAndCount(s, start, outCount); } // SIMPLE (not synchronized, does not use offsets) int stringsLength = strings.size(); int length = s.length(); int pos = start; int rest = length - start; int count = 0; while (rest != 0) { // Try to match the next code point. int cpLength = spanOne(spanSet, s, pos, rest); int maxInc = (cpLength > 0) ? cpLength : 0; // Try to match all of the strings. for (int i = 0; i < stringsLength; ++i) { String string = strings.get(i); int length16 = string.length(); if (maxInc < length16 && length16 <= rest && matches16CPB(s, pos, length, string, length16)) { maxInc = length16; } } // We are done if there is no match beyond pos. if (maxInc == 0) { outCount.value = count; return pos; } // Continue from the longest match. ++count; pos += maxInc; rest -= maxInc; } outCount.value = count; return pos; } private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { // Use offset list to try all possibilities. offsets.setMaxLength(maxLength16); int stringsLength = strings.size(); int length = s.length(); int pos = start; int rest = length - start; int count = 0; while (rest != 0) { // Try to match the next code point. int cpLength = spanOne(spanSet, s, pos, rest); if (cpLength > 0) { offsets.addOffsetAndCount(cpLength, count + 1); } // Try to match all of the strings. for (int i = 0; i < stringsLength; ++i) { String string = strings.get(i); int length16 = string.length(); // Note: If the strings were sorted by length, then we could also // avoid trying to match if there is already a match of the same length. if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && matches16CPB(s, pos, length, string, length16)) { offsets.addOffsetAndCount(length16, count + 1); } } // We are done if there is no match beyond pos. if (offsets.isEmpty()) { outCount.value = count; return pos; } // Continue from the nearest match. 
int minOffset = offsets.popMinimum(outCount); count = outCount.value; pos += minOffset; rest -= minOffset; } outCount.value = count; return pos; } /** * Span a string backwards. * * @param s The string to be spanned * @param spanCondition The span condition * @return The string index which starts the span (i.e. inclusive). */ public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { if (spanCondition == SpanCondition.NOT_CONTAINED) { return spanNotBack(s, length); } int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); if (pos == 0) { return 0; } int spanLength = length - pos; // Consider strings; they may overlap with the span. int initSize = 0; if (spanCondition == SpanCondition.CONTAINED) { // Use offset list to try all possibilities. initSize = maxLength16; } offsets.setMaxLength(initSize); int i, stringsLength = strings.size(); int spanBackLengthsOffset = 0; if (all) { spanBackLengthsOffset = stringsLength; } for (;;) { if (spanCondition == SpanCondition.CONTAINED) { for (i = 0; i < stringsLength; ++i) { int overlap = spanLengths[spanBackLengthsOffset + i]; if (overlap == ALL_CP_CONTAINED) { continue; // Irrelevant string. } String string = strings.get(i); int length16 = string.length(); // Try to match this string at pos-(length16-overlap)..pos-length16. if (overlap >= LONG_SPAN) { overlap = length16; // While contained: No point matching fully inside the code point span. int len1 = 0; len1 = string.offsetByCodePoints(0, 1); overlap -= len1; // Length of the string minus the first code point. } if (overlap > spanLength) { overlap = spanLength; } int dec = length16 - overlap; // Keep dec+overlap==length16. for (;;) { if (dec > pos) { break; } // Try to match if the decrement is not listed already. if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { if (dec == pos) { return 0; // Reached the start of the string. 
} offsets.addOffset(dec); } if (overlap == 0) { break; } --overlap; ++dec; } } } else /* SIMPLE */{ int maxDec = 0, maxOverlap = 0; for (i = 0; i < stringsLength; ++i) { int overlap = spanLengths[spanBackLengthsOffset + i]; // For longest match, we do need to try to match even an all-contained string // to find the match from the latest end. String string = strings.get(i); int length16 = string.length(); // Try to match this string at pos-(length16-overlap)..pos-length16. if (overlap >= LONG_SPAN) { overlap = length16; // Longest match: Need to match fully inside the code point span // to find the match from the latest end. } if (overlap > spanLength) { overlap = spanLength; } int dec = length16 - overlap; // Keep dec+overlap==length16. for (;;) { if (dec > pos || overlap < maxOverlap) { break; } // Try to match if the string is longer or ends later. if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) && matches16CPB(s, pos - dec, length, string, length16)) { maxDec = dec; // Longest match from latest end. maxOverlap = overlap; break; } --overlap; ++dec; } } if (maxDec != 0 || maxOverlap != 0) { // Longest-match algorithm, and there was a string match. // Simply continue before it. pos -= maxDec; if (pos == 0) { return 0; // Reached the start of the string. } spanLength = 0; // Match strings from before a string match. continue; } } // Finished trying to match all strings at pos. if (spanLength != 0 || pos == length) { // The position is before an unlimited code point span (spanLength!=0), // not before a string match. // The only position where spanLength==0 before a span is pos==length. // Otherwise, an unlimited code point span is only tried again when no // strings match, and if such a non-initial span fails we stop. if (offsets.isEmpty()) { return pos; // No strings matched before a span. } // Match strings from before the next string match. } else { // The position is before a string match (or a single code point). 
if (offsets.isEmpty()) { // No more strings matched before a previous string match. // Try another code point span from before the last string match. int oldPos = pos; pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); spanLength = oldPos - pos; if (pos == 0 || // Reached the start of the string, or spanLength == 0 // neither strings nor span progressed. ) { return pos; } continue; // spanLength>0: Match strings from before a span. } else { // Try to match only one code point from before a string match if some // string matched beyond it, so that we try all possible positions // and don't overshoot. spanLength = spanOneBack(spanSet, s, pos); if (spanLength > 0) { if (spanLength == pos) { return 0; // Reached the start of the string. } // Match strings before this code point. // There cannot be any decrements below it because UnicodeSet strings // contain multiple code points. pos -= spanLength; offsets.shift(spanLength); spanLength = 0; continue; // Match strings from before a single code point. } // Match strings from before the next string match. } } pos -= offsets.popMinimum(null); spanLength = 0; // Match strings from before a string match. } } /** * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) * * Theoretical algorithm: * - Iterate through the string, and at each code point boundary: * + If the code point there is in the set, then return with the current position. * + If a set string matches at the current position, then return with the current position. * * Optimized implementation: * * (Same assumption as for span() above.) * * Create and cache a spanNotSet which contains * all of the single code points of the original set but none of its strings. * For each set string add its initial code point to the spanNotSet. * (Also add its final code point for spanNotBack().) * * - Loop: * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). * + If the current code point is in the original set, then return the current position. 
* + If any set string matches at the current position, then return the current position. * + If there is no match at the current position, neither for the code point * there nor for any set string, then skip this code point and continue the loop. * This happens for set-string-initial code points that were added to spanNotSet * when there is not actually a match for such a set string. * * @param s The string to be spanned * @param start The start index that the span begins * @param outCount If not null: Receives the number of code points across the span. * @return the limit (exclusive end) of the span */ private int spanNot(CharSequence s, int start, OutputInt outCount) { int length = s.length(); int pos = start, rest = length - start; int stringsLength = strings.size(); int count = 0; do { // Span until we find a code point from the set, // or a code point that starts or ends some string. int spanLimit; if (outCount == null) { spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); } else { spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); outCount.value = count = count + outCount.value; } if (spanLimit == length) { return length; // Reached the end of the string. } pos = spanLimit; rest = length - spanLimit; // Check whether the current code point is in the original set, // without the string starts and ends. int cpLength = spanOne(spanSet, s, pos, rest); if (cpLength > 0) { return pos; // There is a set element at pos. } // Try to match the strings at pos. for (int i = 0; i < stringsLength; ++i) { if (spanLengths[i] == ALL_CP_CONTAINED) { continue; // Irrelevant string. } String string = strings.get(i); int length16 = string.length(); if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { return pos; // There is a set element at pos. } } // The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. 
// cpLength<0 pos -= cpLength; rest += cpLength; ++count; } while (rest != 0); if (outCount != null) { outCount.value = count; } return length; // Reached the end of the string. } private int spanNotBack(CharSequence s, int length) { int pos = length; int i, stringsLength = strings.size(); do { // Span until we find a code point from the set, // or a code point that starts or ends some string. pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); if (pos == 0) { return 0; // Reached the start of the string. } // Check whether the current code point is in the original set, // without the string starts and ends. int cpLength = spanOneBack(spanSet, s, pos); if (cpLength > 0) { return pos; // There is a set element at pos. } // Try to match the strings at pos. for (i = 0; i < stringsLength; ++i) { // Use spanLengths rather than a spanLengths pointer because // it is easier and we only need to know whether the string is irrelevant // which is the same in either array. if (spanLengths[i] == ALL_CP_CONTAINED) { continue; // Irrelevant string. } String string = strings.get(i); int length16 = string.length(); if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { return pos; // There is a set element at pos. } } // The span(while not contained) ended on a string start/end which is // not in the original set. Skip this code point and continue. // cpLength<0 pos += cpLength; } while (pos != 0); return 0; // Reached the start of the string. } static short makeSpanLengthByte(int spanLength) { // 0xfe==UnicodeSetStringSpan::LONG_SPAN return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; } // Compare strings without any argument checks. Requires length>0. 
private static boolean matches16(CharSequence s, int start, final String t, int length) {
    // Compare from the last code unit toward the first.
    for (int offset = length - 1; offset >= 0; --offset) {
        if (s.charAt(start + offset) != t.charAt(offset)) {
            return false;
        }
    }
    return true;
}

/**
 * Compare 16-bit Unicode strings (which may be malformed UTF-16)
 * at code point boundaries.
 * That is, each edge of a match must not be in the middle of a surrogate pair.
 * @param s       The string to match in.
 * @param start   The start index of s.
 * @param limit   The limit of the subsequence of s being spanned.
 * @param t       The substring to be matched in s.
 * @param tlength The length of t.
 * @return true if t occurs in s at start and neither edge of the match
 *         splits a surrogate pair
 */
static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
    if (!matches16(s, start, t, tlength)) {
        return false;
    }
    // Leading edge: reject if start sits between a high and a low surrogate.
    if (0 < start
            && Character.isHighSurrogate(s.charAt(start - 1))
            && Character.isLowSurrogate(s.charAt(start))) {
        return false;
    }
    // Trailing edge: reject if the match end sits between a high and a low surrogate.
    final int matchEnd = start + tlength;
    if (matchEnd < limit
            && Character.isHighSurrogate(s.charAt(matchEnd - 1))
            && Character.isLowSurrogate(s.charAt(matchEnd))) {
        return false;
    }
    return true;
}

/**
 * Does the set contain the next code point?
 * If so, return its length; otherwise return its negative length.
 *
 * @param set    The set to test against.
 * @param s      The text.
 * @param start  Index of the first code unit of the code point.
 * @param length Number of code units available from start; a surrogate pair
 *               is only assembled when at least 2 are available.
 * @return 1 or 2 if the code point is contained, -1 or -2 if it is not
 */
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
    final char first = s.charAt(start);
    final boolean firstIsLead = first >= 0xd800 && first <= 0xdbff;
    if (firstIsLead && length >= 2) {
        final char second = s.charAt(start + 1);
        if (UTF16.isTrailSurrogate(second)) {
            int supplementary = UCharacterProperty.getRawSupplementary(first, second);
            if (set.contains(supplementary)) {
                return 2;
            }
            return -2;
        }
    }
    // BMP code point or unpaired surrogate: a single code unit.
    if (set.contains(first)) {
        return 1;
    }
    return -1;
}

/**
 * Does the set contain the code point that ends just before index {@code length}?
 * If so, return its length; otherwise return its negative length.
 *
 * @param set    The set to test against.
 * @param s      The text.
 * @param length Index just past the code point; a surrogate pair is only
 *               assembled when this is at least 2.
 * @return 1 or 2 if the code point is contained, -1 or -2 if it is not
 */
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
    final char last = s.charAt(length - 1);
    final boolean lastIsTrail = last >= 0xdc00 && last <= 0xdfff;
    if (lastIsTrail && length >= 2) {
        final char previous = s.charAt(length - 2);
        if (UTF16.isLeadSurrogate(previous)) {
            int supplementary = UCharacterProperty.getRawSupplementary(previous, last);
            if (set.contains(supplementary)) {
                return 2;
            }
            return -2;
        }
    }
    // BMP code point or unpaired surrogate: a single code unit.
    if (set.contains(last)) {
        return 1;
    }
    return -1;
}

/**
 * Helper class for UnicodeSetStringSpan.
*
 * <p>List of offsets from the current position from where to try matching
 * a code point or a string.
 * Stores offsets rather than indexes to simplify the code and use the same list
 * for both increments (in span()) and decrements (in spanBack()).
 *
 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
 * are relatively dense, that is,
 * there are normally no gaps of hundreds or thousands of offset values.
 *
 * <p>This class optionally also tracks the minimum non-negative count for each position,
 * intended to count the smallest number of elements of any path leading to that position.
 *
 * <p>The implementation uses a circular buffer of count integers,
 * each indicating whether the corresponding offset is in the list,
 * and its path element count.
 * This avoids inserting into a sorted list of offsets (or absolute indexes)
 * and physically moving part of the list.
 *
 * <p>Note: In principle, the caller should setMaxLength() to
 * the maximum of the max string length and U16_LENGTH/U8_LENGTH
 * to account for "long" single code points.
 *
 * <p>Note: An earlier version did not track counts and stored only byte flags.
 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
 * the list could be stored as bit flags in a single integer.
 * Rather than handling a circular buffer with a start list index,
 * the integer would simply be shifted when lower offsets are removed.
 * UnicodeSet does not have a limit on the lengths of strings.
 */
private static final class OffsetList {
    /** Circular buffer; a non-zero entry means "offset present" and holds its count. */
    private int[] list;
    /** Number of offsets currently stored. */
    private int length;
    /** Buffer index that corresponds to offset 0 (the current position). */
    private int start;

    public OffsetList() {
        list = new int[16];  // default size
    }

    /**
     * Grows the buffer if it is smaller than maxLength, then empties the list.
     */
    public void setMaxLength(int maxLength) {
        if (list.length < maxLength) {
            list = new int[maxLength];
        }
        clear();
    }

    /** Removes all offsets and resets the buffer origin. */
    public void clear() {
        for (int slot = 0; slot < list.length; ++slot) {
            list[slot] = 0;
        }
        start = 0;
        length = 0;
    }

    public boolean isEmpty() {
        return length == 0;
    }

    /**
     * Maps an offset relative to the current position to its buffer index,
     * wrapping around the end of the buffer once.
     */
    private int slotFor(int offset) {
        int slot = start + offset;
        return (slot >= list.length) ? slot - list.length : slot;
    }

    /**
     * Reduces all stored offsets by delta, used when the current position moves by delta.
     * There must not be any offsets lower than delta.
     * If there is an offset equal to delta, it is removed.
     *
     * @param delta [1..maxLength]
     */
    public void shift(int delta) {
        final int slot = slotFor(delta);
        if (list[slot] != 0) {
            list[slot] = 0;
            --length;
        }
        start = slot;
    }

    /**
     * Adds an offset. The list must not contain it yet.
     * @param offset [1..maxLength]
     */
    public void addOffset(int offset) {
        final int slot = slotFor(offset);
        assert list[slot] == 0;
        list[slot] = 1;
        ++length;
    }

    /**
     * Adds an offset and updates its count.
     * The list may already contain the offset.
     * @param offset [1..maxLength]
     */
    public void addOffsetAndCount(int offset, int count) {
        assert count > 0;
        final int slot = slotFor(offset);
        final int existing = list[slot];
        if (existing == 0) {
            // New offset: record it with the given count.
            list[slot] = count;
            ++length;
        } else if (count < existing) {
            // Known offset: keep the smaller count.
            list[slot] = count;
        }
    }

    /**
     * @param offset [1..maxLength]
     */
    public boolean containsOffset(int offset) {
        return list[slotFor(offset)] != 0;
    }

    /**
     * Tests whether the offset is stored with a count no greater than the given one.
     * @param offset [1..maxLength]
     */
    public boolean hasCountAtOffset(int offset, int count) {
        final int stored = list[slotFor(offset)];
        if (stored == 0) {
            return false;
        }
        return stored <= count;
    }

    /**
     * Finds the lowest stored offset from a non-empty list, removes it,
     * and reduces all other offsets by this minimum.
     * @param outCount If not null: receives the count stored for the removed offset.
     * @return min=[1..maxLength]
     */
    public int popMinimum(OutputInt outCount) {
        final int capacity = list.length;

        // First look in list[start+1..capacity-1].
        int slot = start + 1;
        while (slot < capacity && list[slot] == 0) {
            ++slot;
        }

        int minimum;
        if (slot < capacity) {
            minimum = slot - start;
        } else {
            // Wrap around and look in list[0..start].
            // Since the list is not empty, there will be one.
            slot = 0;
            while (list[slot] == 0) {
                ++slot;
            }
            minimum = capacity - start + slot;
        }

        // Remove the entry and make it the new origin.
        final int count = list[slot];
        list[slot] = 0;
        --length;
        start = slot;
        if (outCount != null) {
            outCount.value = count;
        }
        return minimum;
    }
}
}