/*license*\ XBN-Java: Copyright (C) 2014, Jeff Epstein (aliteralmind __DASH__ github __AT__ yahoo __DOT__ com) This software is dual-licensed under the: - Lesser General Public License (LGPL) version 3.0 or, at your option, any later version; - Apache Software License (ASL) version 2.0. Either license may be applied at your discretion. More information may be found at - http://en.wikipedia.org/wiki/Multi-licensing. The text of both licenses is available in the root directory of this project, under the names "LICENSE_lgpl-3.0.txt" and "LICENSE_asl-2.0.txt". The latest copies may be downloaded at: - LGPL 3.0: https://www.gnu.org/licenses/lgpl-3.0.txt - ASL 2.0: http://www.apache.org/licenses/LICENSE-2.0.txt \*license*/ package com.github.xbn.regexutil; import com.github.xbn.io.SimpleDebuggable; import com.github.xbn.lang.Copyable; import com.github.xbn.lang.CrashIfObject; import com.github.xbn.lang.IllegalArgumentStateException; import com.github.xbn.regexutil.z.RegexTokenizer_Fieldable; import java.util.Iterator; import java.util.NoSuchElementException; import java.util.regex.Matcher; import java.util.regex.Pattern; /** <p>Splits a string based on a regular-expression separator, returning the matches, "betweens", or both.</p> <p><i>Derived from {@code RETokenize}: <a href="http://www.exampledepot.com/egs/java.util.regex/Tokenize.html">{@code http://www.exampledepot.com/egs/java.util.regex/Tokenize.html}</a>, downloaded 8/13/2010.</i></p> <A NAME="cfg"></a><h3>Builder Configuration: {@link com.github.xbn.regexutil.z.RegexTokenizer_Cfg RegexTokenizer_Cfg}</h3> <p><ul> <li><b>Regex separator:</b> <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separator(Pattern) separator}(p)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separator(String, int) separator}(s,i)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separator(String) separator}(s)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separatorLiteral(String) separatorLiteral}(s)</code></li> <li><b>What to return:</b><ul> <li><b>Only:</b> <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#allBetweensOnly() allBetweensOnly}()</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#nonEmptyBetweensOnly() nonEmptyBetweensOnly}()</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separatorsOnly() separatorsOnly}()</code></li> <li><b>Non-only:</b> <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#separators() separators}()</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#emptyBetweens() emptyBetweens}()</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#nonEmptyBetweens() nonEmptyBetweens}()</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#allBetweens() allBetweens}()</code></li> </ul></li> <li><b>Other:</b> <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#toTokenize(Object) toTokenize}(o)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#lineNumber(int) lineNumber}(i)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#debugTo(Appendable) debugTo}(apbl)</code>, <code>{@link com.github.xbn.regexutil.z.RegexTokenizer_CfgForNeeder#chainID(boolean, Object) chainID}(b,o)</code></li> </ul></p> {@.codelet.and.out com.github.xbn.examples.regexutil.TemplateTokenizer%eliminateCommentBlocksAndPackageDecl()} {@.codelet.and.out com.github.xbn.examples.regexutil.RegexTokenizerXmpl%eliminateCommentBlocksAndPackageDecl()} <p>An interesting problem solved with both java.util.regex and {@code RegexTokenizer}. From (viewed 12/31/2013) <br/>     {@code <a href="http://stackoverflow.com/questions/20859278/regex-split-up-parentheses-group">http://stackoverflow.com/questions/20859278/regex-split-up-parentheses-group</a>}</p> * @since 0.1.0 * @author Copyright (C) 2014, Jeff Epstein ({@code aliteralmind __DASH__ github __AT__ yahoo __DOT__ com}), dual-licensed under the LGPL (version 3.0 or later) or the ASL (version 2.0). See source code for details. <a href="http://xbnjava.aliteralmind.com">{@code http://xbnjava.aliteralmind.com}</a>, <a href="https://github.com/aliteralmind/xbnjava">{@code https://github.com/aliteralmind/xbnjava}</a> **/ public class RegexTokenizer extends SimpleDebuggable implements Iterator<TokenizerElement>, Copyable, PatternHaser { //config: immutable private final SimplePatternHaser sph; //config: mutable private int iLnNum = -1 ; private boolean bRtnSep = false; private boolean bRtnNEBtw = false; private boolean bRtnEBtw = false; //state...START private String sOrig = null; private Matcher m = null; private TokenizerElement rteNextBetween = null; private TokenizerElement rteNextSeparator = null; //Wanted to replace this with rteNextSeparator.getIdxEndX(), but that //wouldn't exist for a moment after calling next() private int ixLastMEnd = 0; //state...END //internal /** <p>Create a new regex tokenizer, where every match is manipulated before returned by {@code next()}.</p> * @param fieldable May not be {@code null}. */ public RegexTokenizer(RegexTokenizer_Fieldable fieldable) { iLnNum = fieldable.getLineNumber(); bRtnSep = fieldable.doReturnSeparators(); bRtnNEBtw = fieldable.doReturnNonEmptyBetweens(); bRtnEBtw = fieldable.doReturnEmptyBetweens(); if(!bRtnSep && !bRtnNEBtw && !bRtnEBtw) { throw new IllegalArgumentStateException("fieldable.doReturnSeparators(), fieldable.doReturnNonEmptyBetweens(), and fieldable.doReturnEmptyBetweens() are all false. Nothing to do."); } sph = (new SimplePatternHaser()).pattern(fieldable.getPattern(), "fieldable.getPattern()"). matcherUses(MatcherUses.FIND); sOrig = fieldable.getToTokenize(); try { m = getPattern().matcher(sOrig); } catch(RuntimeException rx) { throw CrashIfObject.nullOrReturnCause(sOrig, "fieldable.getToTokenize()", null, rx); } setDebug(fieldable.getDebugDestOnIfNonNull(), (fieldable.getDebugDestOnIfNonNull() != null)); initState(); hasNext(); } private void initState() { rteNextBetween = null; rteNextSeparator = null; ixLastMEnd = 0; sph.declareNotMatched(); } public RegexTokenizer(RegexTokenizer to_copy) { this(to_copy, (Pattern)null, null, -1); } public RegexTokenizer(RegexTokenizer to_copy, String findWhat_regex, Object string_toSearch, int line_num) { this(to_copy, NewPatternFor.regex(findWhat_regex, "findWhat_regex"), string_toSearch, line_num); } public RegexTokenizer(RegexTokenizer to_copy, String findWhat_regex, int bit_flags, Object string_toSearch, int line_num) { this(to_copy, NewPatternFor.regex(findWhat_regex, bit_flags, "findWhat_regex"), string_toSearch, line_num); } /** <p>Create a new {@code RegexTokenizer} as a duplicate of another, but for a new search-string. This leaves debugging on, if it is already on in the original. This goes against the "exception".......................</p> * @param to_copy May not be {@code null}. */ public RegexTokenizer(RegexTokenizer to_copy, Pattern new_findWhat, Object string_toSearch, int line_num) { super(to_copy); try { bRtnSep = to_copy.bRtnSep; } catch(RuntimeException rx) { throw CrashIfObject.nullOrReturnCause(to_copy, "to_copy", null, rx); } bRtnNEBtw = to_copy.bRtnNEBtw; bRtnEBtw = to_copy.bRtnEBtw; sph = (new SimplePatternHaser(to_copy, new_findWhat)); iLnNum = line_num; sOrig = ((string_toSearch == null) ? to_copy.getOriginal() : string_toSearch.toString()); m = getPattern().matcher(sOrig); initState(); hasNext(); } public void setNewSearch(Object string_toSearch, int line_num) { iLnNum = line_num; try { sOrig = string_toSearch.toString(); } catch(RuntimeException rx) { throw new NullPointerException("string_toSearch"); } m.reset(sOrig); initState(); hasNext(); } public void setNewSearch(Pattern find_whatPtrn, Object string_toSearch, int line_num) { sph.pattern(find_whatPtrn, "find_whatPtrn"); iLnNum = line_num; try { sOrig = string_toSearch.toString(); } catch(RuntimeException rx) { throw new NullPointerException("string_toSearch"); } m = getPattern().matcher(sOrig); initState(); hasNext(); } public void setNewSearch(String findWhat_regex, Object string_toSearch, int line_num) { setNewSearch(NewPatternFor.regex(findWhat_regex, "findWhat_regex"), string_toSearch, line_num); } public void setNewSearch(String findWhat_regex, int bit_flags, Object string_toSearch, int line_num) { setNewSearch(NewPatternFor.regex(findWhat_regex, bit_flags, "findWhat_regex"), string_toSearch, line_num); } public void setNewSearchLiteral(String findWhat_literal, Object string_toSearch, int line_num) { setNewSearch(NewPatternFor.literal(findWhat_literal, "findWhat_literal"), string_toSearch, line_num); } //Composition implementation: null...START public Pattern getPattern() { return sph.getPattern(); } public int getMatchedIndex() { return sph.getMatchedIndex(); } public int getMatchCount() { return sph.getMatchCount(); } public boolean wasJustMatched() { return sph.wasJustMatched(); } /** <p>Should the string be matched as a whole?. Should <code><i>[{@link java.util.regex.Matcher}]</i>.{@link java.util.regex.Matcher#matches() matches}()</code> be used?.</p> * @return {@code true} The string is matched as a whole. * @see com.github.xbn.regexutil.z.RegexGroupExtractor_Cfg#useMatches() Cfg.useMatches() * @see com.github.xbn.regexutil.z.RegexGroupExtractor_Cfg#useFind() Cfg.useFind() * @see com.github.xbn.regexutil.z.RegexGroupExtractor_Cfg#useLookingAt() Cfg.useLookingAt() */ public MatcherUses getMatcherUses() { return sph.getMatcherUses(); } //Composition implementation: null...END /** <p>Get the original string-to-tokenize.</p> * @return <b>{@code string_toSearch}</b>, as provided to the constructor. */ public final String getOriginal() { return sOrig; } /** <p>Are regex-separators retrieved?.</p> */ public final boolean doReturnSeparators() { return bRtnSep; } /** <p>Are betweens retrieved?.</p> */ public final boolean doReturnNonEmptyBetweens() { return bRtnNEBtw; } /** <p>Are empty-string betweens retrieved?.</p> */ public final boolean doReturnEmptyBetweens() { return bRtnEBtw; } /** <p>Is there another token (match) or between to get?.</p> * @return <b>{@code true}</b> If there is another match or between remaining in the {@link #getOriginal() string-to-tokenize}, and it is wanted. */ public final boolean hasNext() { if (rteNextBetween != null || rteNextSeparator != null) { return true; } if (m == null) { return false; } sph.declareNotMatched(); int i2tLen = getOriginal().length(); if (m.find()) { sph.matchedIndex(m.start()); if (doReturnNonEmptyBetweens()) { String sBt = getOriginal().subSequence(ixLastMEnd, m.start()).toString(); if(sBt.length() != 0 || doReturnEmptyBetweens()) { rteNextBetween = new TokenizerElement(iLnNum, false, ixLastMEnd, sBt, isDebugOn(), getDebugAptr()); } } if (doReturnSeparators()) { rteNextSeparator = new TokenizerElement(iLnNum, true, m.start(), m.group(), isDebugOn(), getDebugAptr()); } ixLastMEnd = m.end(); } else if (doReturnNonEmptyBetweens() && ixLastMEnd < i2tLen) { //The very last one :) String sBt = getOriginal().subSequence(ixLastMEnd, i2tLen).toString(); rteNextBetween = new TokenizerElement(iLnNum, false, ixLastMEnd, sBt, isDebugOn(), getDebugAptr()); ixLastMEnd = i2tLen; // Need to remove the m since it appears to automatically // reset itself once it reaches the end. m = null; } return !(rteNextBetween == null && rteNextSeparator == null); } /** <p>Get the next regex-match or between.</p> * @exception NoSuchElementException If {@link #hasNext() hasNext}{@code ()} is false. * @see #isNextASeparator() * @see #isNextABetween() */ public final TokenizerElement next() { TokenizerElement rte = null; if(rteNextBetween != null) { rte = rteNextBetween; rteNextBetween = null; } else if (rteNextSeparator != null) { rte = rteNextSeparator; rteNextSeparator = null; } else { throw new NoSuchElementException("next"); } hasNext(); return rte; } /** <p>Is the next token a regex-match?.</p> * @see #next() * @see #isNextABetween() */ public boolean isNextASeparator() { return (rteNextBetween == null && rteNextSeparator != null); } /** <p>Is the next token a between?.</p> * @see #next() * @see #isNextASeparator() */ public boolean isNextABetween() { //Betweens have precedence return (rteNextBetween != null); } public String toString() { return "doReturnSeparators()=" + doReturnSeparators() + ", doReturnNonEmptyBetweens()=" + doReturnNonEmptyBetweens() + ", doReturnEmptyBetweens()=" + doReturnEmptyBetweens(); } /** <p>Unsupported.</p> * @exception UnsupportedOperationException */ public final void remove() { throw new UnsupportedOperationException("remove"); } /** <p>Duplicate this <code>RegexTokenizer</code>.</p> * @return <code>(new <a href="#RegexTokenizer(RegexTokenizer)">RegexTokenizer</a>(this))</code> */ public RegexTokenizer getObjectCopy() { return (new RegexTokenizer(this)); } }