package com.fasterxml.aalto.util; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.ext.LexicalHandler; import org.codehaus.stax2.typed.Base64Variant; import org.codehaus.stax2.typed.TypedArrayDecoder; import org.codehaus.stax2.typed.TypedXMLStreamException; import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder; import com.fasterxml.aalto.in.ReaderConfig; /** * Class conceptually similar to {@link java.lang.StringBuilder}, but * that allows for bit more efficient building, using segmented internal * buffers, and direct access to these buffers. */ public final class TextBuilder { final static char[] sNoChars = new char[0]; /** * Size of the first text segment buffer to allocate. Need not contain * the biggest segment, since new ones will get allocated as needed. * However, it's sensible to use something that often is big enough * to contain typical segments. */ final static int DEF_INITIAL_BUFFER_SIZE = 500; // 1k final static int MAX_SEGMENT_LENGTH = 256 * 1024; final static int INT_SPACE = 0x0020; // // // Configuration: private final ReaderConfig _config; // // // Internal non-shared collector buffers: /** * List of segments prior to currently active segment. */ private ArrayList<char[]> _segments; // // // Currently used segment; not (yet) contained in _segments /** * Amount of characters in segments in {@link _segments} */ private int _segmentSize; private char[] _currentSegment; /** * Number of characters in currently active (last) segment */ private int _currentSize; // // // Temporary caching for Objects to return /** * String that will be constructed when the whole contents are * needed; will be temporarily stored in case asked for again. */ private String _resultString; private char[] _resultArray; /** * Indicator for length of data with <code>_resultArray</code>, iff * the primary indicator (_currentSize) is invalid (-1). */ private int _resultLen; /* /********************************************************************** /* Support for decoding, for Typed Access API /********************************************************************** */ private char[] _decodeBuffer; private int _decodePtr; private int _decodeEnd; /* /********************************************************************** /* Support for optimizating indentation segments: /********************************************************************** */ /** * Marker to know if the contents currently stored were created * using "indentation detection". If so, it's known to be all * white space */ private boolean _isIndentation = false; // // // Canonical indentation objects (up to 32 spaces, 8 tabs) public final static int MAX_INDENT_SPACES = 32; public final static int MAX_INDENT_TABS = 8; // Let's add one more space at the end, for safety... private final static String sIndSpaces = // 123456789012345678901234567890123 "\n "; private final static char[] sIndSpacesArray = sIndSpaces.toCharArray(); private final static String[] sIndSpacesStrings = new String[sIndSpacesArray.length]; private final static String sIndTabs = // 1 2 3 4 5 6 7 8 9 "\n\t\t\t\t\t\t\t\t\t"; private final static char[] sIndTabsArray = sIndTabs.toCharArray(); private final static String[] sIndTabsStrings = new String[sIndTabsArray.length]; /* /********************************************************************** /* Life-cycle /********************************************************************** */ private TextBuilder(ReaderConfig cfg) { _config = cfg; } public static TextBuilder createRecyclableBuffer(ReaderConfig cfg) { return new TextBuilder(cfg); } /** * Method called to indicate that the underlying buffers should now * be recycled if they haven't yet been recycled. Although caller * can still use this text buffer, it is not advisable to call this * method if that is likely, since next time a buffer is needed, * buffers need to reallocated. * Note: calling this method automatically also clears contents * of the buffer. */ public void recycle(boolean force) { if (_config != null && _currentSegment != null) { if (force) { /* shouldn't call resetWithEmpty, as that would allocate * initial buffer; but need to inline */ _resultString = null; _resultArray = null; } else { /* But if there's non-shared data (ie. buffer is still * in use), can't return it yet: */ if ((_segmentSize + _currentSize) > 0) { return; } } // If no data (or only shared data), can continue if (_segments != null && _segments.size() > 0) { // No need to use anything from list, curr segment not null _segments.clear(); _segmentSize = 0; } char[] buf = _currentSegment; _currentSegment = null; _config.freeMediumCBuffer(buf); } } /** * Method called to clear out any content text buffer may have, and * initializes and returns the first segment to add characters to. */ public char[] resetWithEmpty() { _resultString = null; _resultArray = null; _isIndentation = false; // And then reset internal input buffers, if necessary: if (_segments != null && _segments.size() > 0) { /* Since the current segment should be the biggest one * (as we allocate 50% bigger each time), let's retain it, * and clear others */ _segments.clear(); _segmentSize = 0; } _currentSize = 0; if (_currentSegment == null) { _currentSegment = allocBuffer(0); } return _currentSegment; } public void resetWithIndentation(int indCharCount, char indChar) { // First reset internal input buffers, if necessary: if (_segments != null && _segments.size() > 0) { _segments.clear(); _segmentSize = 0; } _currentSize = -1; _isIndentation = true; String text; int strlen = indCharCount+1; _resultLen = strlen; if (indChar == '\t') { // tabs? _resultArray = sIndTabsArray; text = sIndTabsStrings[indCharCount]; if (text == null) { sIndTabsStrings[indCharCount] = text = sIndTabs.substring(0, strlen); } } else { // nope, spaces (should assert indChar?) _resultArray = sIndSpacesArray; text = sIndSpacesStrings[indCharCount]; if (text == null) { sIndSpacesStrings[indCharCount] = text = sIndSpaces.substring(0, strlen); } } _resultString = text; } /** * Method called to initialize the buffer with just a single char */ public void resetWithChar(char c) { _resultString = null; _resultArray = null; _isIndentation = false; // And then reset internal input buffers, if necessary: if (_segments != null && _segments.size() > 0) { _segments.clear(); _segmentSize = 0; } _currentSize = 1; if (_currentSegment == null) { _currentSegment = allocBuffer(1); } _currentSegment[0] = c; } public void resetWithSurrogate(int c) { _resultString = null; _resultArray = null; _isIndentation = false; // And then reset internal input buffers, if necessary: if (_segments != null && _segments.size() > 0) { _segments.clear(); _segmentSize = 0; } _currentSize = 2; if (_currentSegment == null) { _currentSegment = allocBuffer(2); } _currentSegment[0] = (char) (0xD800 | (c >> 10)); _currentSegment[1] = (char) (0xDC00 | (c & 0x3FF)); } public char[] getBufferWithoutReset() { return _currentSegment; } /* /********************************************************************** /* Accessors for implementing StAX interface: /********************************************************************** */ /** * @return Number of characters currently stored by this collector */ public int size() { int size = _currentSize; // Will be -1 only if we have shared white space if (size < 0) { return _resultLen; } return size + _segmentSize; } public char[] getTextBuffer() { // Does it fit in just one segment? if (_segments == null || _segments.size() == 0) { // But is it whitespace, actually? if (_resultArray != null) { return _resultArray; } return _currentSegment; } // Nope, need to have/create a non-segmented array and return it return contentsAsArray(); } /* /********************************************************************** /* Accessors for text contained /********************************************************************** */ public String contentsAsString() { if (_resultString == null) { // Has array been requested? Can make a shortcut, if so: if (_resultArray != null) { _resultString = new String(_resultArray); } else { // Let's optimize common case: nothing in extra segments: int segLen = _segmentSize; int currLen = _currentSize; if (segLen == 0) { _resultString = (currLen == 0) ? "" : new String(_currentSegment, 0, currLen); return _resultString; } // Nope, need to combine: StringBuilder sb = new StringBuilder(segLen + currLen); // First stored segments if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] curr = (char[]) _segments.get(i); sb.append(curr, 0, curr.length); } } // And finally, current segment: sb.append(_currentSegment, 0, currLen); _resultString = sb.toString(); } } return _resultString; } public char[] contentsAsArray() { char[] result = _resultArray; if (result == null) { _resultArray = result = buildResultArray(); } return result; } public int contentsToArray(int srcStart, char[] dst, int dstStart, int len) { /* Could also check if we have array, but that'd only help with * brain dead clients that get full array first, then segments... * which hopefully aren't that common */ // Copying from segmented array is bit more involved: int totalAmount = 0; if (_segments != null) { for (int i = 0, segc = _segments.size(); i < segc; ++i) { char[] segment = (char[]) _segments.get(i); int segLen = segment.length; int amount = segLen - srcStart; if (amount < 1) { // nothing from this segment? srcStart -= segLen; continue; } if (amount >= len) { // can get rest from this segment? System.arraycopy(segment, srcStart, dst, dstStart, len); return (totalAmount + len); } // Can get some from this segment, offset becomes zero: System.arraycopy(segment, srcStart, dst, dstStart, amount); totalAmount += amount; dstStart += amount; len -= amount; srcStart = 0; } } // Need to copy anything from last segment? if (len > 0) { int maxAmount = _currentSize - srcStart; if (len > maxAmount) { len = maxAmount; } if (len > 0) { // should always be true System.arraycopy(_currentSegment, srcStart, dst, dstStart, len); totalAmount += len; } } return totalAmount; } /** * Method that will stream contents of this buffer into specified * Writer. */ public int rawContentsTo(Writer w) throws IOException { // Let's first see if we have created helper objects: if (_resultArray != null) { w.write(_resultArray); return _resultArray.length; } if (_resultString != null) { w.write(_resultString); return _resultString.length(); } // Nope, need to do full segmented output int rlen = 0; if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] ch = (char[]) _segments.get(i); w.write(ch); rlen += ch.length; } } if (_currentSize > 0) { w.write(_currentSegment, 0, _currentSize); rlen += _currentSize; } return rlen; } public boolean isAllWhitespace() { if (_isIndentation) { return true; } // Need to do full segmented output, otherwise if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] buf = (char[]) _segments.get(i); for (int j = 0, len2 = buf.length; j < len2; ++j) { if (buf[j] > 0x0020) { return false; } } } } char[] buf = _currentSegment; for (int i = 0, len = _currentSize; i < len; ++i) { if (buf[i] > 0x0020) { return false; } } return true; } /** * Method that can be used to check if the contents of the buffer end * in specified String. * * @return True if the textual content buffer contains ends with the * specified String; false otherwise */ public boolean endsWith(String str) { int segIndex = (_segments == null) ? 0 : _segments.size(); int inIndex = str.length() - 1; char[] buf = _currentSegment; int bufIndex = _currentSize-1; while (inIndex >= 0) { if (str.charAt(inIndex) != buf[bufIndex]) { return false; } if (--inIndex == 0) { break; } if (--bufIndex < 0) { if (--segIndex < 0) { // no more data? return false; } buf = (char[]) _segments.get(segIndex); bufIndex = buf.length-1; } } return true; } /** * Note: it is assumed that this method is not used often enough to * be a bottleneck, or for long segments. Based on this, it is optimized * for common simple cases where there is only one single character * segment to use; fallback for other cases is to create such segment. */ public boolean equalsString(String str) { int expLen = str.length(); // Otherwise, segments: if (expLen != size()) { return false; } char[] seg; if (_segments == null || _segments.size() == 0) { // just one segment, still easy seg = _currentSegment; } else { /* Ok; this is the sub-optimal case. Could obviously juggle through * segments, but probably not worth the hassle, we seldom if ever * get here... */ seg = contentsAsArray(); } for (int i = 0; i < expLen; ++i) { if (seg[i] != str.charAt(i)) { return false; } } return true; } /* /********************************************************************** /* Methods for generating SAX events /********************************************************************** */ /** * This is a specialized "accessor" method, which is basically * to fire SAX characters() events in an optimal way, based on * which internal buffers are being used */ public void fireSaxCharacterEvents(ContentHandler h) throws SAXException { if (_resultArray != null) { // only happens for indentation h.characters(_resultArray, 0, _resultLen); } else { if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] ch = (char[]) _segments.get(i); h.characters(ch, 0, ch.length); } } if (_currentSize > 0) { h.characters(_currentSegment, 0, _currentSize); } } } public void fireSaxSpaceEvents(ContentHandler h) throws SAXException { if (_resultArray != null) { // only happens for indentation h.ignorableWhitespace(_resultArray, 0, _resultLen); } else { if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] ch = (char[]) _segments.get(i); h.ignorableWhitespace(ch, 0, ch.length); } } if (_currentSize > 0) { h.ignorableWhitespace(_currentSegment, 0, _currentSize); } } } public void fireSaxCommentEvent(LexicalHandler h) throws SAXException { // Comment can not be split, so may need to combine the array if (_resultArray != null) { // only happens for indentation h.comment(_resultArray, 0, _resultLen); } else if (_segments != null && _segments.size() > 0) { char[] ch = contentsAsArray(); h.comment(ch, 0, ch.length); } else { h.comment(_currentSegment, 0, _currentSize); } } /* /********************************************************************** /* Support for validation /********************************************************************** */ /* public void validateText(XMLValidator vld, boolean lastSegment) throws XMLValidationException { // Can either create a combine buffer, or construct // a String. While former could be more efficient, let's do latter // for now since current validator implementations work better // with Strings. vld.validateText(contentsAsString(), lastSegment); } */ /* /********************************************************************** /* Public mutators: /********************************************************************** */ public void append(char c) { _resultString = null; _resultArray = null; // Room in current segment? char[] curr = _currentSegment; if (_currentSize >= curr.length) { expand(1); } curr[_currentSize++] = c; } public void appendSurrogate(int surr) { append((char) (0xD800 | (surr >> 10))); append((char) (0xDC00 | (surr & 0x3FF))); } public void append(char[] c, int start, int len) { _resultString = null; _resultArray = null; // Room in current segment? char[] curr = _currentSegment; int max = curr.length - _currentSize; if (max >= len) { System.arraycopy(c, start, curr, _currentSize, len); _currentSize += len; } else { // No room for all, need to copy part(s): if (max > 0) { System.arraycopy(c, start, curr, _currentSize, max); start += max; len -= max; } /* And then allocate new segment; we are guaranteed to now * have enough room in segment. */ expand(len); // note: curr != _currentSegment after this System.arraycopy(c, start, _currentSegment, 0, len); _currentSize = len; } } public void append(String str) { _resultString = null; _resultArray = null; int len = str.length(); // Room in current segment? char[] curr = _currentSegment; int max = curr.length - _currentSize; if (max >= len) { str.getChars(0, len, curr, _currentSize); _currentSize += len; } else { // No room for all, need to copy part(s): if (max > 0) { str.getChars(0, max, curr, _currentSize); len -= max; } /* And then allocate new segment; we are guaranteed to now * have enough room in segment. */ expand(len); str.getChars(max, max+len, _currentSegment, 0); _currentSize = len; } } /* /********************************************************************** /* Raw access, for high-performance use: /********************************************************************** */ public int getCurrentLength() { return _currentSize; } public void setCurrentLength(int len) { _currentSize = len; } public char[] finishCurrentSegment() { if (_segments == null) { _segments = new ArrayList<char[]>(); } _segments.add(_currentSegment); int oldLen = _currentSegment.length; _segmentSize += oldLen; char[] curr = new char[calcNewSize(oldLen)]; _currentSize = 0; _currentSegment = curr; return curr; } private int calcNewSize(int latestSize) { // Let's grow segments by 50%, when over 8k int incr = (latestSize < 8000) ? latestSize : (latestSize >> 1); int size = latestSize + incr; // but let's not create too big chunks return Math.min(size, MAX_SEGMENT_LENGTH); } /* /********************************************************************** /* Methods for implementing Typed Access API /********************************************************************** */ /** * Method called by the stream reader to decode space-separated tokens * that are part of the current text event (contents of which * are stored within this buffer), using given decoder. */ public int decodeElements(TypedArrayDecoder tad, boolean reset) throws TypedXMLStreamException { if (reset) { resetForDecode(); } int ptr = _decodePtr; final char[] buf = _decodeBuffer; int count = 0; // And then let's decode int start = ptr; try { final int end = _decodeEnd; decode_loop: while (ptr < end) { // First, any space to skip? while (buf[ptr] <= INT_SPACE) { if (++ptr >= end) { break decode_loop; } } // Then let's figure out non-space char (token) start = ptr; ++ptr; while (ptr < end && buf[ptr] > INT_SPACE) { ++ptr; } ++count; int tokenEnd = ptr; ++ptr; // to skip trailing space (or, beyond end) // And there we have it if (tad.decodeValue(buf, start, tokenEnd)) { break; } _decodePtr = ptr; } _decodePtr = ptr; } catch (IllegalArgumentException iae) { // Need to convert to a checked stream exception to return lexical // -1 to move it back after being advanced earlier (to skip trailing space) String lexical = new String(buf, start, (ptr-start-1)); throw new TypedXMLStreamException(lexical, iae.getMessage(), iae); } return count; } /** * Method called to initialize given base64 decoder with data * contained in this text buffer (for the current event). */ public void resetForBinaryDecode(Base64Variant v, CharArrayBase64Decoder dec, boolean firstChunk) { // just one special case, indentation... if (_segments == null || _segments.size() == 0) { // single segment if (_isIndentation) { // but special one, indent/ws dec.init(v, firstChunk, _resultArray, 0, _resultArray.length, null); return; } } dec.init(v, firstChunk, _currentSegment, 0, _currentSize, _segments); } private final void resetForDecode() { /* This is very similar to getTextBuffer(), except * for assignment to _decodeXxx fields */ _decodePtr = 0; if (_segments == null || _segments.size() == 0) { // single segment if (_isIndentation) { // but special one, indent/ws _decodeBuffer = _resultArray; _decodeEnd = _resultArray.length; } else { // nope, just a regular buffer _decodeBuffer = _currentSegment; _decodeEnd = _currentSize; } } else { // Nope, need to have/create a non-segmented array and return it _decodeBuffer = contentsAsArray(); _decodeEnd = _decodeBuffer.length; } } /* /********************************************************************** /* Standard methods: /********************************************************************** */ /** * Note: calling this method may not be as efficient as calling * {@link #contentsAsString}, since it is guaranteed that resulting * String is NOT cached (to ensure we see no stale data) */ @Override public String toString() { _resultString = null; _resultArray = null; return contentsAsString(); } /* /********************************************************************** /* Internal methods: /********************************************************************** */ private final char[] allocBuffer(int minNeeded) { int size = Math.max(DEF_INITIAL_BUFFER_SIZE, minNeeded); char[] buf = null; if (_config != null) { buf = _config.allocMediumCBuffer(size); if (buf != null) { return buf; } } return new char[size]; } /** * Method called when current segment is full, to allocate new * segment. */ private void expand(int roomNeeded) { // First, let's move current segment to segment list: if (_segments == null) { _segments = new ArrayList<char[]>(); } char[] curr = _currentSegment; _segments.add(curr); int oldLen = curr.length; _segmentSize += oldLen; int newSize = Math.max(roomNeeded, calcNewSize(oldLen)); curr = new char[newSize]; _currentSize = 0; _currentSegment = curr; } private char[] buildResultArray() { if (_resultString != null) { // Can take a shortcut... return _resultString.toCharArray(); } char[] result; int size = size(); if (size < 1) { return sNoChars; } int offset = 0; result = new char[size]; if (_segments != null) { for (int i = 0, len = _segments.size(); i < len; ++i) { char[] curr = _segments.get(i); int currLen = curr.length; System.arraycopy(curr, 0, result, offset, currLen); offset += currLen; } } System.arraycopy(_currentSegment, 0, result, offset, _currentSize); return result; } }