/* * Copyright 2015, 2016 Tagir Valeev * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package one.util.streamex; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.function.BiConsumer; import java.util.function.BinaryOperator; import java.util.function.Function; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.stream.Collector; import java.util.stream.Collectors; import static one.util.streamex.StreamExInternals.*; /** * An advanced implementation of joining {@link Collector}. This collector is * capable to join the input {@code CharSequence} elements with given delimiter * optionally wrapping into given prefix and suffix and optionally limiting the * length of the resulting string (in Unicode code units, code points or * grapheme clusters) adding the specified ellipsis sequence. This collector * supersedes the standard JDK * {@link Collectors#joining(CharSequence, CharSequence, CharSequence)} * collectors family. * * <p> * This collector is <a * href="package-summary.html#ShortCircuitReduction">short-circuiting</a> when * the string length is limited in either of ways. Otherwise it's not * short-circuiting. * * <p> * Every specific collector represented by this class is immutable, so you can * share it. A bunch of methods is provided to create a new collector based on * this one. * * <p> * To create {@code Joining} collector use {@link #with(CharSequence)} static * method and specify the delimiter. For further setup use specific instance * methods which return new {@code Joining} objects like this: * * <pre>{@code * StreamEx.of(source).collect(Joining.with(", ").wrap("[", "]") * .maxCodePoints(100).cutAtWord()); * }</pre> * * <p> * The intermediate accumulation type of this collector is the implementation * detail and not exposed to the API. If you want to cast it to * {@code Collector} type, use ? as accumulator type variable: * * <pre>{@code * Collector<CharSequence, ?, String> joining = Joining.with(", "); * }</pre> * * @author Tagir Valeev * @since 0.4.1 */ public class Joining extends CancellableCollector<CharSequence, Joining.Accumulator, String> { static final class Accumulator { List<CharSequence> data = new ArrayList<>(); int chars = 0, count = 0; } private static final int CUT_ANYWHERE = 0; private static final int CUT_CODEPOINT = 1; private static final int CUT_GRAPHEME = 2; private static final int CUT_WORD = 3; private static final int CUT_BEFORE_DELIMITER = 4; private static final int CUT_AFTER_DELIMITER = 5; private static final int LENGTH_CHARS = 0; private static final int LENGTH_CODEPOINTS = 1; private static final int LENGTH_GRAPHEMES = 2; private final String delimiter, ellipsis, prefix, suffix; private final int cutStrategy, lenStrategy, maxLength; private int limit, delimCount = -1; private Joining(String delimiter, String ellipsis, String prefix, String suffix, int cutStrategy, int lenStrategy, int maxLength) { this.delimiter = delimiter; this.ellipsis = ellipsis; this.prefix = prefix; this.suffix = suffix; this.cutStrategy = cutStrategy; this.lenStrategy = lenStrategy; this.maxLength = maxLength; } private void init() { if (delimCount == -1) { limit = maxLength - length(prefix) - length(suffix); delimCount = length(delimiter); } } private int length(CharSequence s) { switch (lenStrategy) { case LENGTH_CHARS: return s.length(); case LENGTH_CODEPOINTS: if (s instanceof String) return ((String) s).codePointCount(0, s.length()); return (int) s.codePoints().count(); case LENGTH_GRAPHEMES: BreakIterator bi = BreakIterator.getCharacterInstance(); bi.setText(s.toString()); int count = 0; for (int end = bi.next(); end != BreakIterator.DONE; end = bi.next()) count++; return count; default: throw new InternalError(); } } private static int copy(char[] buf, int pos, String str) { str.getChars(0, str.length(), buf, pos); return pos + str.length(); } private int copyCut(char[] buf, int pos, String str, int limit, int cutStrategy) { if (limit <= 0) return pos; int endPos = str.length(); switch (lenStrategy) { case LENGTH_CHARS: if (limit < str.length()) endPos = limit; break; case LENGTH_CODEPOINTS: if (limit < str.codePointCount(0, str.length())) endPos = str.offsetByCodePoints(0, limit); break; case LENGTH_GRAPHEMES: BreakIterator bi = BreakIterator.getCharacterInstance(); bi.setText(str); int count = limit, end; while (true) { end = bi.next(); if (end == BreakIterator.DONE) break; if (--count == 0) { endPos = end; break; } } break; default: throw new InternalError(); } if (endPos < str.length()) { BreakIterator bi; switch (cutStrategy) { case CUT_BEFORE_DELIMITER: case CUT_AFTER_DELIMITER: endPos = 0; break; case CUT_WORD: bi = BreakIterator.getWordInstance(); bi.setText(str); endPos = bi.preceding(endPos + 1); break; case CUT_GRAPHEME: bi = BreakIterator.getCharacterInstance(); bi.setText(str); endPos = bi.preceding(endPos + 1); break; case CUT_ANYWHERE: break; case CUT_CODEPOINT: if (Character.isHighSurrogate(str.charAt(endPos - 1)) && Character.isLowSurrogate(str.charAt(endPos))) endPos--; break; default: throw new InternalError(); } } str.getChars(0, endPos, buf, pos); return pos + endPos; } private String finisherNoOverflow(Accumulator acc) { char[] buf = new char[acc.chars + prefix.length() + suffix.length()]; int size = acc.data.size(); int pos = copy(buf, 0, prefix); for (int i = 0; i < size; i++) { if (i > 0) { pos = copy(buf, pos, delimiter); } pos = copy(buf, pos, acc.data.get(i).toString()); } copy(buf, pos, suffix); return new String(buf); } private Joining withLimit(int lenStrategy, int maxLength) { if (maxLength < 0) throw new IllegalArgumentException(maxLength + ": must be positive"); return new Joining(delimiter, ellipsis, prefix, suffix, cutStrategy, lenStrategy, maxLength); } private Joining withCut(int cutStrategy) { return new Joining(delimiter, ellipsis, prefix, suffix, cutStrategy, lenStrategy, maxLength); } /** * Returns a {@code Collector} that concatenates the input elements, * separated by the specified delimiter, in encounter order. * * <p> * This collector is similar to {@link Collectors#joining(CharSequence)}, * but can be further set up in a flexible way, for example, specifying the * maximal allowed length of the resulting {@code String}. * * @param delimiter the delimiter to be used between each element * @return A {@code Collector} which concatenates CharSequence elements, * separated by the specified delimiter, in encounter order * @see Collectors#joining(CharSequence) */ public static Joining with(CharSequence delimiter) { return new Joining(delimiter.toString(), "...", "", "", CUT_GRAPHEME, LENGTH_CHARS, -1); } /** * Returns a {@code Collector} which behaves like this collector, but * additionally wraps the result with the specified prefix and suffix. * * <p> * The collector returned by * {@code Joining.with(delimiter).wrap(prefix, suffix)} is equivalent to * {@link Collectors#joining(CharSequence, CharSequence, CharSequence)}, but * can be further set up in a flexible way, for example, specifying the * maximal allowed length of the resulting {@code String}. * * <p> * If length limit is specified for the collector, the prefix length and the * suffix length are also counted towards this limit. If the length of the * prefix and the suffix exceed the limit, the resulting collector will not * accumulate any elements and produce the same output. For example, * {@code stream.collect(Joining.with(",").wrap("prefix", "suffix").maxChars(9))} * will produce {@code "prefixsuf"} string regardless of the input stream * content. * * <p> * You may wrap several times: * {@code Joining.with(",").wrap("[", "]").wrap("(", ")")} is equivalent to * {@code Joining.with(",").wrap("([", "])")}. * * @param prefix the sequence of characters to be used at the beginning of * the joined result * @param suffix the sequence of characters to be used at the end of the * joined result * @return a new {@code Collector} which wraps the result with the specified * prefix and suffix. */ public Joining wrap(CharSequence prefix, CharSequence suffix) { return new Joining(delimiter, ellipsis, prefix.toString().concat(this.prefix), this.suffix.concat(suffix .toString()), cutStrategy, lenStrategy, maxLength); } /** * Returns a {@code Collector} which behaves like this collector, but uses * the specified ellipsis {@code CharSequence} instead of default * {@code "..."} when the string limit (if specified) is reached. * * @param ellipsis the sequence of characters to be used at the end of the * joined result to designate that not all of the input elements are * joined due to the specified string length restriction. * @return a new {@code Collector} which will use the specified ellipsis * instead of current setting. */ public Joining ellipsis(CharSequence ellipsis) { return new Joining(delimiter, ellipsis.toString(), prefix, suffix, cutStrategy, lenStrategy, maxLength); } /** * Returns a {@code Collector} which behaves like this collector, but sets * the maximal length of the resulting string to the specified number of * UTF-16 characters (or Unicode code units). This setting overwrites any * limit previously set by {@code maxChars(int)}, * {@link #maxCodePoints(int)} or {@link #maxGraphemes(int)} call. * * <p> * The {@code String} produced by the resulting collector is guaranteed to * have {@link String#length() length} which does not exceed the specified * limit. An ellipsis sequence (by default {@code "..."}) is used to * designate whether the limit was reached. Use * {@link #ellipsis(CharSequence)} to set custom ellipsis sequence. * * <p> * The collector returned by this method is <a * href="package-summary.html#ShortCircuitReduction">short-circuiting</a>: * it may not process all the input elements if the limit is reached. * * @param limit the maximal number of UTF-16 characters in the resulting * String. * @return a new {@code Collector} which will produce String no longer than * given limit. */ public Joining maxChars(int limit) { return withLimit(LENGTH_CHARS, limit); } /** * Returns a {@code Collector} which behaves like this collector, but sets * the maximal number of Unicode code points of the resulting string. This * setting overwrites any limit previously set by {@link #maxChars(int)}, * {@code maxCodePoints(int)} or {@link #maxGraphemes(int)} call. * * <p> * The {@code String} produced by the resulting collector is guaranteed to * have no more code points than the specified limit. An ellipsis sequence * (by default {@code "..."}) is used to designate whether the limit was * reached. Use {@link #ellipsis(CharSequence)} to set custom ellipsis * sequence. * * <p> * The collector returned by this method is <a * href="package-summary.html#ShortCircuitReduction">short-circuiting</a>: * it may not process all the input elements if the limit is reached. * * @param limit the maximal number of code points in the resulting String. * @return a new {@code Collector} which will produce String no longer than * given limit. */ public Joining maxCodePoints(int limit) { return withLimit(LENGTH_CODEPOINTS, limit); } /** * Returns a {@code Collector} which behaves like this collector, but sets * the maximal number of grapheme clusters. This setting overwrites any * limit previously set by {@link #maxChars(int)}, * {@link #maxCodePoints(int)} or {@code maxGraphemes(int)} call. * * <p> * The grapheme cluster is defined in <a * href="http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries" * >Unicode Text Segmentation</a> technical report. Basically, it counts * base character and the following combining characters as single object. * The {@code String} produced by the resulting collector is guaranteed to * have no more grapheme clusters than the specified limit. An ellipsis * sequence (by default {@code "..."}) is used to designate whether the * limit was reached. Use {@link #ellipsis(CharSequence)} to set custom * ellipsis sequence. * * <p> * The collector returned by this method is <a * href="package-summary.html#ShortCircuitReduction">short-circuiting</a>: * it may not process all the input elements if the limit is reached. * * @param limit the maximal number of grapheme clusters in the resulting * String. * @return a new {@code Collector} which will produce String no longer than * given limit. */ public Joining maxGraphemes(int limit) { return withLimit(LENGTH_GRAPHEMES, limit); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string at any point when limit is reached. * * <p> * The resulting collector will produce {@code String} which length is * exactly equal to the specified limit if the limit is reached. If used * with {@link #maxChars(int)}, the resulting string may be cut in the * middle of surrogate pair. * * @return a new {@code Collector} which cuts the resulting string at any * point when limit is reached. */ public Joining cutAnywhere() { return withCut(CUT_ANYWHERE); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string between any code points when limit is reached. * * <p> * The resulting collector will not split the surrogate pair when used with * {@link #maxChars(int)} or {@link #maxCodePoints(int)}. However it may * remove the combining character which may result in incorrect rendering of * the last displayed grapheme. * * @return a new {@code Collector} which cuts the resulting string between * code points. */ public Joining cutAtCodePoint() { return withCut(CUT_CODEPOINT); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string at grapheme cluster boundary when limit is reached. * This is the default behavior. * * <p> * The grapheme cluster is defined in <a * href="http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries" * >Unicode Text Segmentation</a> technical report. Thus the resulting * collector will not split the surrogate pair and will preserve any * combining characters or remove them with the base character. * * @return a new {@code Collector} which cuts the resulting string at * grapheme cluster boundary. */ public Joining cutAtGrapheme() { return withCut(CUT_GRAPHEME); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string at word boundary when limit is reached. * * <p> * The beginning and end of every input stream element or delimiter is * always considered as word boundary, so the stream of * {@code "one", "two three"} collected with * {@code Joining.with("").maxChars(n).ellipsis("").cutAtWord()} may produce * the following strings depending on {@code n}: * * <pre>{@code * "" * "one" * "onetwo" * "onetwo " * "onetwo three" * }</pre> * * @return a new {@code Collector} which cuts the resulting string at word * boundary. */ public Joining cutAtWord() { return withCut(CUT_WORD); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string before the delimiter when limit is reached. * * @return a new {@code Collector} which cuts the resulting string at before * the delimiter. */ public Joining cutBeforeDelimiter() { return withCut(CUT_BEFORE_DELIMITER); } /** * Returns a {@code Collector} which behaves like this collector, but cuts * the resulting string after the delimiter when limit is reached. * * @return a new {@code Collector} which cuts the resulting string at after * the delimiter. */ public Joining cutAfterDelimiter() { return withCut(CUT_AFTER_DELIMITER); } @Override public Supplier<Accumulator> supplier() { return Accumulator::new; } @Override public BiConsumer<Accumulator, CharSequence> accumulator() { if (maxLength == -1) return (acc, str) -> { if (!acc.data.isEmpty()) acc.chars += delimiter.length(); acc.chars += str.length(); acc.data.add(str); }; init(); return (acc, str) -> { if (acc.count <= limit) { if (!acc.data.isEmpty()) { acc.chars += delimiter.length(); acc.count += delimCount; } acc.chars += str.length(); acc.count += length(str); acc.data.add(str); } }; } @Override public BinaryOperator<Accumulator> combiner() { if (maxLength == -1) return (acc1, acc2) -> { if (acc1.data.isEmpty()) return acc2; if (acc2.data.isEmpty()) return acc1; acc1.chars += delimiter.length() + acc2.chars; acc1.data.addAll(acc2.data); return acc1; }; init(); BiConsumer<Accumulator, CharSequence> accumulator = accumulator(); return (acc1, acc2) -> { if (acc1.data.isEmpty()) return acc2; if (acc2.data.isEmpty()) return acc1; int len = acc1.count + acc2.count + delimCount; if (len <= limit) { acc1.count = len; acc1.chars += delimiter.length() + acc2.chars; acc1.data.addAll(acc2.data); } else { for (CharSequence s : acc2.data) { if (acc1.count > limit) break; accumulator.accept(acc1, s); } } return acc1; }; } @Override public Function<Accumulator, String> finisher() { if (maxLength == -1) { return this::finisherNoOverflow; } init(); if (limit <= 0) { char[] buf = new char[prefix.length() + suffix.length()]; int pos = copyCut(buf, 0, prefix, maxLength, cutStrategy); pos = copyCut(buf, pos, suffix, maxLength - length(prefix), cutStrategy); String result = new String(buf, 0, pos); return acc -> result; } return acc -> { if (acc.count <= limit) return finisherNoOverflow(acc); char[] buf = new char[acc.chars + prefix.length() + suffix.length()]; int size = acc.data.size(); int pos = copy(buf, 0, prefix); int ellipsisCount = length(ellipsis); int rest = limit - ellipsisCount; if (rest < 0) { pos = copyCut(buf, pos, ellipsis, limit, CUT_ANYWHERE); } else { for (int i = 0; i < size; i++) { String s = acc.data.get(i).toString(); int count = length(s); if (i > 0) { if (cutStrategy == CUT_BEFORE_DELIMITER && delimCount + count > rest) { break; } if (delimCount > rest) { pos = copyCut(buf, pos, delimiter, rest, cutStrategy); break; } rest -= delimCount; pos = copy(buf, pos, delimiter); } if (cutStrategy == CUT_AFTER_DELIMITER && delimCount + count > rest) { break; } if (count > rest) { pos = copyCut(buf, pos, s, rest, cutStrategy); break; } pos = copy(buf, pos, s); rest -= count; } pos = copy(buf, pos, ellipsis); } pos = copy(buf, pos, suffix); return new String(buf, 0, pos); }; } @Override public Set<Characteristics> characteristics() { init(); if (limit <= 0) return Collections.singleton(Characteristics.UNORDERED); return Collections.emptySet(); } @Override Predicate<Accumulator> finished() { if (maxLength == -1) return null; init(); if (limit <= 0) return acc -> true; return acc -> acc.count > limit; } }