package squidpony; import regexodus.*; import squidpony.annotation.Beta; import squidpony.squidmath.CrossHash; import squidpony.squidmath.IntVLA; import java.io.Reader; import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Iterator; import static squidpony.ArrayTools.letters; /** * A simple format parser for String-based configuration or data files where JSON is overkill. * Supports only one type, String, but allows each String to have arbitrary nested levels of * String children as if in sub-lists. You can interpret the Strings however you want, and * quoting each String isn't necessary if they are just one word ("bare words" are allowed). * <br> * The main way of using this is to get an ObText.ItemIterator value using {@link #iterator()}, * which acts as a normal Iterator over the top-level Strings (not children of anything), but * to call its {@link ItemIterator#hasChild()} method when you expect potential child elements, * then {@link ItemIterator#children()} to get another ItemIterator over the child elements if * you want to explore deeper. * <br> * This implements Collection of String but is (mostly) unmodifiable; you can call * {@link #parse(CharSequence)} to append the results of parsing more formatted text, or call * {@link #clear()} to remove all data. {@link #add(Object)} and {@link #remove(Object)} are * not implemented and throw exceptions. A quirk of how this implements Collection is that it * only considers the top-level Strings to be part of the Collection for length and for * {@link #contains(Object)}, and will ignore child strings unless you access them via * {@link ItemIterator#children()} on an item that has at least one child. * <br> * Format example: * <pre> * hello world * 'how are you today?' [just great thanks] * hooray! * * complexity? * [it is possible [yes this is a good example] * 'escapes like \[\'\] all work' * ] * * comments are allowed // like this * comments can have different forms # like this * // block comments like in c are allowed * / * but because this example is in javadoc, this example is not actually a comment * / * // remove the spaces between each slash and asterisk to make the last line a comment. * /[delimit/or block comments with delimiters/delimit]/ * * ''' * raw strings (heredocs) look like this normally. * they permit characters without escapes, ]][][[ \/\/\ , * except for triple quotes. * they keep newlines and indentation intact, * except for up to one newline ignored adjacent to each triple quote. * ''' * * [[different[ * if you may need triple quotes * in the raw string, use a different syntax that allows delimiters. * here, the delimiter is '''different''', just to be different.]different]] * </pre> * <br> * Inspired strongly by STOB, http://stobml.org/ , but no code is shared and the format is * slightly different. The main differences are that ObText supports nested block comments * using the syntax {@code /[delimiter/contents/delimiter]/} where delimiter may be empty * but must match on both sides, and contents is the body of the comment. ObText uses Python- * like "heredoc" syntax for raw strings surrounded by triple-apostrophes '''like so''' with * optional initial and final newlines in the raw string ignored. An alternate raw string * syntax is present that allows delimiters, using the syntax * {@code [[delimiter[contents]delimiter]]}, where again delimiter may be empty and contents * is the body of the raw string. We use square brackets in place of STOB's curly braces to * mark children associated with a string. */ @Beta public class ObText extends AbstractCollection<String>{ public static final Pattern pattern = Pattern.compile( "(?>'''(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?({=s}.*?)(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?''')" + "|(?>\\[\\[({=q}[^\\[\\]]*)\\[(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?({=s}.*?)(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?\\]{\\q}\\]\\])" + "|(?>({=q}[\"'])({=s}.*?)(?<!\\\\){\\q})" + "|(?>(?>//|#)(?>\\V*))" + "|(?>/\\*(?:.*?)\\*/)" + "|(?>/\\[({=q}\\S*)/(?:.*?)/{\\q}\\]/)" + "|({=s}[^\\s\\[\\]\"'#\\\\]+)" + "|({=o}\\[)" + "|({=c}\\])", REFlags.DOTALL | REFlags.UNICODE ), patternRelaxed = Pattern.compile( "(?>'''(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?({=s}.*?)(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?''')" + "|(?>\\[\\[({=q}[^\\[\\]]*)\\[(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?({=s}.*?)(?:[\n\u000C\f\r\u0085\u2028\u2029]|\r\n)?\\]{\\q}\\]\\])" + "|(?>({=q}[\"'])({=s}.*?)(?<!\\\\){\\q})" + //"|(?>(?>//|#)(?>\\V*))" + //"|(?>/\\*(?:.*?)\\*/)" + //"|(?>/\\[({=q}\\S*)/(?:.*?)/{\\q}\\]/)" + "|({=s}[^\\s\\[\\]\"'\\\\]+)" , REFlags.DOTALL | REFlags.UNICODE ); public static final int stringId = pattern.groupId("s"), openId = pattern.groupId("o"), closeId = pattern.groupId("c"); protected static final Pattern illegalBareWord = Pattern.compile("[\\s\\[\\]\"'#\\\\]|(?:/[/\\*])"), reallyIllegalBareWord = Pattern.compile("[\\s\\[\\]\"'\\\\]"), needsRaw = Pattern.compile("(?<!\\\\)[\\[\\]]|\\\\$"); protected static final Matcher m = pattern.matcher(); protected static final Matcher bare = illegalBareWord.matcher(), raw = needsRaw.matcher(), reallyBare = reallyIllegalBareWord.matcher(); protected final ArrayList<String> strings = new ArrayList<String>(64); protected final IntVLA neighbors = new IntVLA(64); private final IntVLA nesting = new IntVLA(16); protected int length = 0; public ObText() { } public ObText(CharSequence text) { parse(text); } /** * Parses the given text (a String or other CharSequence) and appends it into this ObText. * @param text a CharSequence (such as a String) using ObText formatting, as described in this class' JavaDocs * @return this ObText object after appending the parsed text, for chaining */ public ObText parse(CharSequence text) { m.setTarget(text); nesting.clear(); int t = -1; while (m.find()) { if (m.isCaptured(stringId)) { strings.add(m.group(stringId)); neighbors.add(1); if(nesting.isEmpty()) length++; } else if(m.isCaptured(openId)) { nesting.add(neighbors.size - 1); } else if(m.isCaptured(closeId)) { neighbors.incr(t = nesting.pop(), neighbors.size - t - 1); if(t < neighbors.size - 1) neighbors.set(neighbors.size-1, 0); } } return this; } @Override public void clear() { strings.clear(); neighbors.clear(); length = 0; } @Override public int size() { return length; } /** * Returns an iterator over elements of type {@code T}. * * @return an Iterator. */ @Override public ItemIterator iterator() { return new ItemIterator(); } /** * A one-way iterator through this ObText's String items. Note that this can get * an iterator into a child sequence with {@link #children()}, which should * only be called after checking that a child exists with {@link #hasChild()}. * {@link #remove()} is not supported, but {@link #next()} and {@link #hasNext()} * are, of course. */ public class ItemIterator implements Iterator<String> { ItemIterator() { } ItemIterator(int i) { current = i % neighbors.size; index = current; } int index = 0, current = -1; /** * Returns {@code true} if the iteration has more elements. * (In other words, returns {@code true} if {@link #next} would * return an element rather than throwing an exception.) * * @return {@code true} if the iteration has more elements */ @Override public boolean hasNext() { return index < neighbors.size && (current < 0 || neighbors.get(current) > 0); } /** * Returns {@code true} if the ObText.Item has any child elements. * (In other words, returns {@code true} if {@link #children} would * return an ItemIterator rather than throwing an exception.) * * @return {@code true} if this has any children */ public boolean hasChild() { return index < neighbors.size - 1 && ((current < 0 && neighbors.get(0) > 1) || neighbors.get(current) > 1); } /** * Returns the next element in the iteration. * * @return the next element in the iteration * @throws java.util.NoSuchElementException if the iteration has no more elements */ @Override public String next() { if(current < 0) { if(strings.isEmpty()) throw new java.util.NoSuchElementException("No more sibling items in ObText object"); current = 0; index = neighbors.get(0); return strings.get(0); } if(index >= neighbors.size || neighbors.get(current) <= 0) throw new java.util.NoSuchElementException("No more sibling items in ObText object"); index = neighbors.get(index) + (current = index); return strings.get(current); } /** * Returns the first child of this ObText.Item and descends into the sequence of child elements. * * @return the first child of this item * @throws java.util.NoSuchElementException if the iteration has no children */ public ItemIterator children() { if(current < 0) { if(neighbors.size <= 0 || neighbors.get(0) == 1) throw new java.util.NoSuchElementException("No current children in ObText object"); return new ItemIterator(1); } if(current >= neighbors.size - 2 || neighbors.get(current) == 1) throw new java.util.NoSuchElementException("No current children in ObText object"); return new ItemIterator(current+1); } @Override public void remove() { throw new UnsupportedOperationException("remove() not supported"); } } // Used to generate randomized delimiters using up to 9 non-English letters. // call while assigning your state with randomChars(state += 0x9E3779B97F4A7C15L, myChars) // that assumes you have a 9-element char[] called myChars // as long as z/state is deterministic (i.e. based on a hash), this should be too private static void randomChars(long z, char[] mut) { z = (z ^ (z >>> 30)) * 0xBF58476D1CE4E5B9L; z = (z ^ (z >>> 27)) * 0x94D049BB133111EBL; z ^= (z >>> 31); mut[0] = letters[(int)(128 + (z & 127))]; mut[1] = letters[(int)(128 + (z >>> 7 & 127))]; mut[2] = letters[(int)(128 + (z >>> 14 & 127))]; mut[3] = letters[(int)(128 + (z >>> 21 & 127))]; mut[4] = letters[(int)(128 + (z >>> 28 & 127))]; mut[5] = letters[(int)(128 + (z >>> 35 & 127))]; mut[6] = letters[(int)(128 + (z >>> 42 & 127))]; mut[7] = letters[(int)(128 + (z >>> 49 & 127))]; mut[8] = letters[(int)(128 + (z >>> 56 & 127))]; } public static void appendQuoted(StringBuilder sb, String text) { appendQuoted(sb, text, reallyBare); } public static void appendQuotedObText(StringBuilder sb, String text) { appendQuoted(sb, text, bare); } protected static void appendQuoted(StringBuilder sb, String text, Matcher bareFinder) { if(text == null || text.isEmpty()) { sb.append("''"); return; } bareFinder.setTarget(text); if(!bareFinder.find()) sb.append(text); else { raw.setTarget(text); if(raw.find()) { if (text.contains("'''")) { long state = CrossHash.Wisp.hash64(text); char[] myChars = new char[9]; int count; do { randomChars(state += 0x9E3779B97F4A7C15L, myChars); count = StringKit.containsPart(text, myChars, "]", "]]"); } while (count == 12); sb.append("[[").append(myChars, 0, count).append("[\n").append(text).append("\n]") .append(myChars, 0, count).append("]]"); } else { sb.append("'''\n").append(text).append("\n'''"); } } else if(!text.contains("'")) { sb.append('\'').append(text).append('\''); } else { if(text.contains("\"")) { if(text.contains("'''")) { long state = CrossHash.Wisp.hash64(text); char[] myChars = new char[9]; int count; do { randomChars(state += 0x9E3779B97F4A7C15L, myChars); count = StringKit.containsPart(text, myChars); }while(count == 9); sb.append("[[").append(myChars, 0, count).append("[\n").append(text).append("\n]") .append(myChars, 0, count).append("]]"); } else { sb.append("'''\n").append(text).append("\n'''"); } } else { sb.append('"').append(text).append('"'); } } } } @Override public String toString() { return "ObText object: [[[[\n" + serializeToString() + "\n]]]]"; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ObText o2 = (ObText) o; if (!strings.equals(o2.strings)) return false; return neighbors.equals(o2.neighbors); } @Override public int hashCode() { return CrossHash.Wisp.hash(strings) + CrossHash.Wisp.hash(neighbors.items); } public String serializeToString() { StringBuilder sb = new StringBuilder(100); iterate(sb, iterator()); return sb.toString(); } /** * Deserializes an ObText that was serialized by {@link #serializeToString()} or {@link #toString()}, and will * ignore the prefix and suffix that toString appends for readability (these are "ObText object: [[[[ " and " ]]]]", * for reference). This is otherwise the same as calling the constructor {@link #ObText(CharSequence)}. * @param data a String that is usually produced by serializeToString or toString on an ObText * @return a new ObText produced by parsing data (disregarding any prefix or suffix from toString() ) */ public static ObText deserializeFromString(String data) { if(data.startsWith("ObText object: [[[[\n")) { return new ObText(data.substring(20, data.length() - 5)); } return new ObText(data); } private static void iterate(StringBuilder sb, ObText.ItemIterator it) { while (it.hasNext()) { appendQuotedObText(sb, it.next()); sb.append('\n'); if (it.hasChild()) { sb.append("[\n"); iterate(sb, it.children()); sb.append("]\n"); } } } /** * Can be used to help reading sequences of Strings with ObText-style quotation marking their boundaries. * This returns a {@link ContentMatcher} object that you must call setTarget on before using it. * The argument(s) to setTarget should be the text that might contain quotes, heredoc-style quotes, or just bare * words. Calling {@link ContentMatcher#find()} will try to find the next String, returning false if there's nothing * left or returning true and advancing the search if a String was found. The String might be a special term in some * cases, like "[" and "]" without quotes being syntax in ObText that don't contain usable Strings. That's why, * after a String was found with find(), you should check {@link ContentMatcher#hasMatch()} to verify that a match * was successful, and if that's true, then you can call {@link ContentMatcher#getMatch()} to get the un-quoted * contents of the next String in the target. * @return a {@link ContentMatcher} that must have one of its setTarget() methods called before it can be used */ public static ContentMatcher makeMatcher() { return new ContentMatcher(); } /** * Can be used to help reading sequences of Strings with ObText-style quotation marking their boundaries. * This returns a {@link ContentMatcher} object that is already configured to read from {@code text}. * The {@code text} should contain Strings that may be surrounded by quotes, heredoc-style quotes, or just bare * words. Calling {@link ContentMatcher#find()} will try to find the next String, returning false if there's nothing * left or returning true and advancing the search if a String was found. The String might be a special term in some * cases, like "[" and "]" without quotes being syntax in ObText that don't contain usable Strings. That's why, * after a String was found with find(), you should check {@link ContentMatcher#hasMatch()} to verify that a match * was successful, and if that's true, then you can call {@link ContentMatcher#getMatch()} to get the un-quoted * contents of the next String in the target. * @param text the target String that should probably have at least one sub-string that might be quoted * @return a {@link ContentMatcher} that can be used immediately by calling {@link ContentMatcher#find()} */ public static ContentMatcher makeMatcher(CharSequence text) { return new ContentMatcher(text); } /** * Can be used to help reading sequences of Strings with ObText-style quotation marking their boundaries, but no * comments (which allows some additional characters to be used in bare words, like '#'). * This returns a {@link ContentMatcher} object that is already configured to read from {@code text}. * The {@code text} should contain Strings that may be surrounded by quotes, heredoc-style quotes, or just bare * words. Calling {@link ContentMatcher#find()} will try to find the next String, returning false if there's nothing * left or returning true and advancing the search if a String was found. Unlike the ContentMatcher produced by * {@link #makeMatcher(CharSequence)}, you can call {@link ContentMatcher#getMatch()} after any successful call to * {@link ContentMatcher#find()}, which will get the un-quoted contents of the next String in the target. * @param text the target String that should probably have at least one sub-string that might be quoted * @return a {@link ContentMatcher} that can be used immediately by calling {@link ContentMatcher#find()} */ public static ContentMatcher makeMatcherNoComments(CharSequence text) { return new ContentMatcher(text, patternRelaxed); } public static class ContentMatcher extends Matcher { /** * Constructs a ContentMatcher that will need to have its target set with {@link #setTarget(CharSequence)} or * one of its overloads. The target should contain multiple substrings that may have quotation around them; this * class is meant to skip the quotation in ObText's style. */ public ContentMatcher() { super(pattern); } /** * Constructs a ContentMatcher that already has its target set to {@code text}. * @param text the CharSequence, such as a String, to find possibly-quoted Strings in. */ public ContentMatcher(CharSequence text) { super(pattern, text); } /** * Constructs a ContentMatcher that already has its target set to {@code text} and uses an alternate Pattern. */ ContentMatcher(CharSequence text, Pattern altPattern) { super(altPattern, text); } /** * Supplies a text to search in/match with. * Resets current search position to zero. * * @param text - a data * @see Matcher#setTarget(Matcher, int) * @see Matcher#setTarget(CharSequence, int, int) * @see Matcher#setTarget(char[], int, int) * @see Matcher#setTarget(Reader, int) */ @Override public void setTarget(CharSequence text) { super.setTarget(text); } /** * Supplies a text to search in/match with, as a part of String. * Resets current search position to zero. * * @param text - a data source * @param start - where the target starts * @param len - how long is the target * @see Matcher#setTarget(Matcher, int) * @see Matcher#setTarget(CharSequence) * @see Matcher#setTarget(char[], int, int) * @see Matcher#setTarget(Reader, int) */ @Override public void setTarget(CharSequence text, int start, int len) { super.setTarget(text, start, len); } /** * Supplies a text to search in/match with, as a part of char array. * Resets current search position to zero. * * @param text - a data source * @param start - where the target starts * @param len - how long is the target * @see Matcher#setTarget(Matcher, int) * @see Matcher#setTarget(CharSequence) * @see Matcher#setTarget(CharSequence, int, int) * @see Matcher#setTarget(Reader, int) */ @Override public void setTarget(char[] text, int start, int len) { super.setTarget(text, start, len); } /** * Returns true if {@link #find()} has returned true and the found text is a usable String (not some syntax). * If this returns true, you can reasonably get a (possibly empty) String using {@link #getMatch()}. * @return true if there is a usable String found that can be obtained with {@link #getMatch()} */ public boolean hasMatch() { return isCaptured(stringId); } /** * Returns the contents of the latest String successfully found with {@link #find()}, without quotation. * You should typically call {@link #hasMatch()} even if find() has returned true, to ensure there is a valid * String that can be acquired (this will return an empty String if hasMatch() returns false, but an empty * String is also potentially a valid result in a successful match, so it should be distinguished). * @return the contents of the latest String successfully found with {@link #find()} */ public String getMatch() { return group(stringId); } } }