/*
 * Copyright (c) 2013-2017 Cinchapi Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.util;

import java.util.List;
import java.util.NoSuchElementException;

import com.google.common.collect.Lists;

import static com.cinchapi.concourse.util.SplitOption.*;

/**
 * An in-place utility to traverse and split a string into substrings.
 * <p>
 * Unlike the {@link String#split(String)} method, this utility returns tokens
 * as they are split on the fly so the caller can process them in place. The
 * traditional {@link String#split(String)} approach splits the entire string
 * before the caller can look at any token, so the data is traversed at least
 * twice, whereas this approach is guaranteed to make a single pass over the
 * characters.
 * </p>
 * <h2>Usage</h2>
 *
 * <pre>
 * String string = "Please split this string by space";
 * StringSplitter splitter = new StringSplitter(string);
 * while (splitter.hasNext()) {
 *     String next = splitter.next();
 * }
 * </pre>
 * <p>
 * The splitter always keeps one token of look-ahead: the token that the
 * following call to {@link #next()} will return is computed eagerly (in the
 * constructor, in {@link #next()} and in {@link #reset()}).
 * </p>
 *
 * @author Jeff Nelson
 */
public class StringSplitter {

    /**
     * An integer that contains bits representing {@link SplitOption split
     * options} that have been enabled. To check whether an option is enabled do
     *
     * <pre>
     * return (options & (1 << option.mask())) != 0;
     * </pre>
     */
    protected final int options;

    /**
     * The current position of the splitter.
     */
    protected int pos = 0;

    /**
     * The char array of the string that is being split.
     */
    private final char[] chars;

    /**
     * The delimiter to use for splitting.
     */
    private final char delimiter;

    /**
     * A flag that controls whether an attempt to split on a newline character
     * sequence should ignore the line feed character ('\n') because the
     * previous character was a carriage return (\r). Typically, a sequence of
     * \r\n is used by Windows to signify a newline.
     *
     * <p>
     * This flag is only relevant if {@link SplitOption#SPLIT_ON_NEWLINE} is
     * enabled.
     * </p>
     */
    private boolean ignoreLF = false;

    /**
     * A flag that is set in the {@link #next()} method whenever it grabs a
     * {@link #next} token that was determined to be at the end of line. This
     * means that calls to {@link #atEndOfLine()} will return {@code true} until
     * the next call to {@link #next()}.
     */
    private boolean lastEOL = false;

    /**
     * The next string to return, or {@code null} if there are no more tokens.
     */
    private String next = null;

    /**
     * A flag that is set in the {@link #findNext()} method whenever it
     * determines that the {@link #next} token to be returned is at the end of
     * line.
     */
    private boolean nextEOL = false;

    /**
     * A flag that controls whether we should allow {@link #findNext()} to set
     * {@link #next} to an empty string. Normally, whenever two delimiters
     * appear back to back, the splitter will return an empty string (i.e.
     * "foo,,bar,car" means that there is an empty token in the 2nd column).
     * However, when additional {@link #options} are passed to the splitter, it
     * may be unintuitive to return an empty string when a character that is
     * relevant for one of the options and the delimiter appear back-to-back.
     */
    private boolean overrideEmptyNext = false;

    /**
     * The start of the next token.
     */
    private int start = 0;

    /**
     * Construct a new instance that splits on a single space character.
     *
     * @param string the string to split
     */
    public StringSplitter(String string) {
        this(string, ' ');
    }

    /**
     * Construct a new instance.
     *
     * @param string the string to split
     * @param delimiter the delimiter upon which to split
     */
    public StringSplitter(String string, char delimiter) {
        this(string, delimiter, SplitOption.NONE);
    }

    /**
     * Construct a new instance.
     * <p>
     * The first token is located immediately, which means that the
     * overridable hooks ({@link #isReadyToSplit()},
     * {@link #updateIsReadyToSplit(char)} and {@link #confirmSetNext()}) are
     * invoked from within this constructor.
     * </p>
     *
     * @param string the string to split
     * @param delimiter the delimiter upon which to split
     * @param options an array of {@link SplitOption options} to supplement the
     *            split behaviour
     */
    public StringSplitter(String string, char delimiter,
            SplitOption... options) {
        this.chars = string.toCharArray();
        this.delimiter = delimiter;
        // Fold every requested option into a single bit mask
        int opts = 0;
        for (SplitOption option : options) {
            opts |= 1 << option.mask();
        }
        this.options = opts;
        findNext();
    }

    /**
     * Construct a new instance that splits on a single space character.
     *
     * @param string the string to split
     * @param options an array of {@link SplitOption options} to supplement the
     *            split behaviour
     */
    public StringSplitter(String string, SplitOption... options) {
        this(string, ' ', options);
    }

    /**
     * Return {@code true} if {@link SplitOption#SPLIT_ON_NEWLINE} is
     * {@link SplitOption#isEnabled(StringSplitter) enabled} and the last token
     * returned by {@link #next()} is followed immediately by a line break.
     * Otherwise, return {@code false}.
     *
     * @return {@code true} if the last token returned was at the end of line
     */
    public boolean atEndOfLine() {
        return lastEOL;
    }

    /**
     * Return {@code true} if this splitter has any remaining substrings.
     *
     * @return {@code true} if there is another element
     */
    public boolean hasNext() {
        return next != null;
    }

    /**
     * Return the next substring that results from splitting the original source
     * string.
     *
     * @return the new substring
     * @throws NoSuchElementException if there are no more substrings
     */
    public String next() {
        if(next == null) {
            throw new NoSuchElementException();
        }
        else {
            String result = next;
            // The token being handed out is at the end of a line exactly when
            // #findNext() flagged it as such; #findNext() clears #nextEOL
            // itself before it looks for the following token.
            lastEOL = nextEOL;
            findNext();
            return result;
        }
    }

    /**
     * Reset the splitter so that traversal starts over from the beginning of
     * the original string.
     * <p>
     * All of the scanning state is cleared and the look-ahead token is
     * recomputed, so after this method returns, {@link #hasNext()} and
     * {@link #next()} behave exactly as they do on a freshly constructed
     * instance (even if the splitter had been exhausted or only partially
     * traversed).
     * </p>
     * <p>
     * Recomputing the look-ahead token invokes the overridable hooks
     * ({@link #isReadyToSplit()}, {@link #updateIsReadyToSplit(char)} and
     * {@link #confirmSetNext()}). A subclass that overrides this method and
     * keeps its own scanning state should clear that state <em>before</em>
     * delegating to {@code super.reset()}.
     * </p>
     */
    public void reset() {
        pos = 0;
        start = 0;
        ignoreLF = false;
        lastEOL = false;
        nextEOL = false;
        overrideEmptyNext = false;
        // Discard the stale look-ahead token and prime the first one again
        findNext();
    }

    /**
     * Return an array that contains all the remaining tokens after traversing
     * through the entire split process. This consumes the splitter.
     *
     * @return the tokens
     */
    public String[] toArray() {
        List<String> toks = Lists.newArrayList();
        while (hasNext()) {
            toks.add(next());
        }
        return toks.toArray(new String[0]);
    }

    /**
     * Before an attempt is made to {@link #setNext() set the next token} do
     * some analysis on the internal state of the splitter to see if it's
     * actually appropriate to do so. If the next token should not be set,
     * return {@code false} from this method and also optionally change the
     * {@link #pos} pointer to rewind the splitter.
     * <p>
     * When this method returns {@code false} the splitter searches for the
     * next token again from the current {@link #pos}.
     * </p>
     *
     * @return {@code true} if the splitter is indeed ready to set the next
     *         token
     */
    protected boolean confirmSetNext() {
        return true;
    }

    /**
     * Determine, based on state factors that are recorded within the class, if
     * the splitter is actually ready to split the string on an instance of the
     * delimiter. By default, this method always returns {@code true}, but a
     * subclass can use it for awareness of certain conditions that would mean a
     * string should not be split on an instance of the delimiter (i.e. if the
     * delimiter occurs within quotes).
     *
     * @return {@code true} if the splitter is actually ready to perform a split
     */
    protected boolean isReadyToSplit() {
        return true;
    }

    /**
     * Given a character {@code c} that is processed by the splitter, update the
     * state that determines whether the splitter would actually be ready to
     * split in the event that it encounters a delimiter character. By default,
     * this method does nothing.
     *
     * @param c the character that was just processed
     */
    protected void updateIsReadyToSplit(char c) {/* noop */}

    /**
     * Find the next element to return and store it in {@link #next}, or set
     * {@link #next} to {@code null} if there are no more elements.
     */
    private void findNext() {
        nextEOL = false;
        next = null;
        boolean resetOverrideEmptyNext = true;
        boolean processOverrideEmptyNext = true;
        while (pos < chars.length && next == null) {
            boolean resetIgnoreLF = true;
            char c = chars[pos];
            ++pos;
            if(c == delimiter && isReadyToSplit()) {
                setNext();
            }
            else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\n'
                    && isReadyToSplit()) {
                if(ignoreLF) {
                    // This line feed completes a \r\n sequence that was
                    // already treated as a line break, so just skip over it
                    start = pos;
                }
                else {
                    setNext();
                    nextEOL = true;
                }
            }
            else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\r'
                    && isReadyToSplit()) {
                ignoreLF = true;
                resetIgnoreLF = false;
                setNext();
                nextEOL = true;
            }
            else if(TOKENIZE_PARENTHESIS.isEnabled(this)
                    && (c == '(' || c == ')') && isReadyToSplit()) {
                setNext();
                // NOTE(review): this dereference assumes #setNext() left
                // #next non-null, which holds whenever #confirmSetNext()
                // returns true -- confirm for subclasses that reject.
                if(next.isEmpty()) {
                    next = Strings.valueOfCached(c);
                    overrideEmptyNext = true;
                    processOverrideEmptyNext = false;
                    resetOverrideEmptyNext = false;
                }
                else {
                    // Need to undo the modifications from #setNext() in order
                    // to look at the parenthesis char again so it can be
                    // returned as a single token via the if block above
                    pos--;
                    start = pos;
                }
            }
            // For SPLIT_ON_NEWLINE, we must reset #ignoreLF if the current char
            // is not == '\r'
            ignoreLF = resetIgnoreLF ? false : ignoreLF;
            updateIsReadyToSplit(c);
        }
        if(pos == chars.length && next == null) {
            // If we reach the end of the string without finding the delimiter,
            // then set next to be all the remaining chars.
            if(confirmSetNext()) {
                int length = pos - start;
                if(length == 0) {
                    next = "";
                }
                else {
                    length = trim(length);
                    next = String.valueOf(chars, start, length);
                }
                // Move past the end so that this branch is not taken again on
                // the following call
                ++pos;
            }
            else {
                findNext();
            }
        }
        if(next != null && next.isEmpty()) {
            // For compatibility with String#split, we must detect if an empty
            // token occurs at the end of a string by trying to find the next
            // occurrence of a non delimiter char.
            boolean atEnd = true;
            for (int i = pos; i < chars.length; ++i) {
                if(chars[i] != delimiter) {
                    atEnd = false;
                    break;
                }
            }
            next = atEnd ? null : next;
        }
        // FOR TOKENIZE_PARENTHESIS, we must #overrideEmptyNext if the last
        // next was a single parenthesis in case the next char is a delimiter.
        // This prevents the appearance of having back-to-back delimiters.
        if(overrideEmptyNext && processOverrideEmptyNext) {
            if(next != null && next.isEmpty()) {
                findNext();
            }
            resetOverrideEmptyNext = true;
        }
        overrideEmptyNext = resetOverrideEmptyNext ? false : overrideEmptyNext;
        if(next != null && DROP_QUOTES.isEnabled(this)
                && Strings.isWithinQuotes(next)
                && this instanceof QuoteAwareStringSplitter) {
            // Strip the first and last character (the surrounding quotes)
            next = next.substring(1, next.length() - 1);
        }
    }

    /**
     * Set the {@link #next} element based on the current {@link #pos} and the
     * {@link #start} of the search.
     * <p>
     * If {@link #confirmSetNext()} returns {@code true}, the side effects of
     * this method are:
     * <ul>
     * <li>{@code next} is set equal to all the chars from {@link #start}
     * through {@link #pos} - 2 (subject to {@link #trim(int) trimming})</li>
     * <li>{@code start} is set equal to {@link #pos}</li>
     * <li>The char at {@link #pos} - 1 is "dropped". This character is usually
     * the delimiter, so it is okay to do this, but if there is a corner case,
     * the caller must explicitly handle that character</li>
     * </ul>
     * Otherwise, the search for the next element is restarted via
     * {@link #findNext()}.
     * </p>
     */
    private void setNext() {
        if(confirmSetNext()) {
            int length = pos - start - 1;
            if(length == 0) {
                next = "";
            }
            else {
                length = trim(length);
                next = String.valueOf(chars, start, length);
            }
            start = pos;
        }
        else {
            findNext();
        }
    }

    /**
     * Given the desired {@code length} for the {@link #next} token, perform any
     * trimming of leading and trailing white space if
     * {@link SplitOption#TRIM_WHITESPACE}
     * {@link SplitOption#isEnabled(StringSplitter) is enabled}.
     * <p>
     * This method will modify the global {@link #start} position for the
     * {@link #next} string. It returns the appropriate length to assign after
     * the trimming has been done. Trimming never reduces the length below 1,
     * so a token that consists solely of white space is reduced to a single
     * white space character.
     * </p>
     *
     * @param length the length of the untrimmed {@link #next} string; must be
     *            at least 1
     * @return the appropriate length after the trimming
     */
    private int trim(int length) {
        if(SplitOption.TRIM_WHITESPACE.isEnabled(this)) {
            // Drop leading white space by advancing the start of the token
            while (Character.isWhitespace(chars[start]) && length > 1) {
                start++;
                length--;
            }
            // Drop trailing white space by shortening the token
            while (Character.isWhitespace(chars[(start + length) - 1])
                    && length > 1) {
                length--;
            }
        }
        return length;
    }

}