package folioxml.core;

import folioxml.utils.Stopwatch;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Base class for tokenizing readers which need to operate on a stream instead of a String.
 * Provides line/col counting, buffering, and regex matching. Uses the {@code hitEnd()} property
 * of regex matchers to determine whether more data needs to be buffered for a complete token match.
 *
 * @author nathanael
 */
public abstract class TokenReaderBase {

    /**
     * Subclasses should override this and return an array of the token patterns for
     * {@link #getNextMatch()} to iterate through. Each pattern must start with \\G so that it
     * only matches at the current parse position. (The original author's note warns about a
     * java.util.regex issue involving ^ versus \\G; the exact nature of that issue is not
     * recorded here - verify before switching anchors.) Order only matters if one of the
     * patterns is a subset of another - like tags are with comments.
     * <p>
     * This method is invoked from the constructor, i.e. before the subclass's own field
     * initializers have run, so implementations should not depend on instance state.
     *
     * @return the token patterns, tried in array order
     */
    protected abstract Pattern[] getTokenPatterns();

    /**
     * The number of chars that get transferred at a time when more data is needed. Should
     * optimally contain the largest comment or plaintext segment in a file.
     * This much contiguous memory must be available.
     */
    protected int readSize = 0;

    protected static int READ_SIZE_DEFAULT = 2048;//32768;

    /**
     * The index within 'textWindow' that we are parsing at. Increases each time a token is
     * parsed, resets to 0 on cleanup of textWindow.
     */
    protected int index = 0;

    /**
     * Dynamically growing parsing window. New data is pulled in when a regex hits the end
     * (regardless of success).
     */
    protected StringBuilder textWindow = null;

    /**
     * The underlying reader.
     */
    protected Reader reader;

    /**
     * True if reader is at end of file. Becomes true when a reader.read() returns no data.
     * Doesn't mean that there isn't data left in textWindow.
     */
    protected boolean atEOF = false;

    /**
     * Used to transfer data from the underlying reader.
     */
    protected char[] readerToBufferBuffer = null;

    /**
     * Keeps track of our line/col position within the reader.
     */
    protected LineColTracker tracker = null;

    /**
     * The token regexps. Cached from getTokenPatterns().
     */
    private Pattern[] tokenPatterns = null;

    /**
     * @param reader        Should be at position 0. The token reader tracks line/col positions at
     *                      the tokenizing level, so offset readers or any interference will throw
     *                      that off.
     * @param readBlockSize Should (optimally) be the length of the largest comment or text segment
     *                      in the file. Must be at least 1.
     * @throws IllegalArgumentException if readBlockSize is less than 1
     */
    public TokenReaderBase(Reader reader, int readBlockSize) {
        // A zero-length transfer buffer makes Reader.read() return 0, which bufferMore() treats as
        // end-of-file - the reader would silently produce no tokens. Fail loudly instead.
        if (readBlockSize < 1) {
            throw new IllegalArgumentException("readBlockSize must be at least 1, was " + readBlockSize);
        }
        this.reader = reader;
        //Initialize buffers
        this.readSize = readBlockSize;
        this.textWindow = new StringBuilder(this.readSize * 2);
        this.readerToBufferBuffer = new char[this.readSize];
        this.tokenPatterns = getTokenPatterns();
        this.tracker = new LineColTracker();
    }

    public TokenReaderBase(Reader reader) {
        this(reader, READ_SIZE_DEFAULT);
    }

    /**
     * Returns false once the underlying reader has reported end-of-file and everything in
     * textWindow up to its end has been consumed. Per the original author's note, this can turn
     * false without a subclass read() call having returned null first (for example when the last
     * successful read also discovers the end of the file), so always null-check read().
     *
     * @return true while unparsed text remains or the reader may still supply more
     */
    public boolean canRead() {
        return !(atEOF && index >= textWindow.length());
    }

    /* public Reader getReader(){ return reader; }*/

    /**
     * Closes the underlying reader.
     *
     * @throws java.io.IOException
     */
    public void close() throws IOException {
        reader.close();
    }

    public Stopwatch bufferTime = new Stopwatch();
    public Stopwatch matchTime = new Stopwatch();
    public Stopwatch getNextMatchTime = new Stopwatch();
    public int getNextMatchLoops = 0;
    public int matchLoops = 0;

    /**
     * Pulls one block of characters from the underlying reader into textWindow and updates atEOF.
     * Cleans up (drops already-parsed text) whenever a reallocation is pending. Cleanup will
     * cause index = 0.
     *
     * @return true if more text was added to textWindow, false if the reader is at EOF
     * @throws IOException if the underlying reader fails
     */
    protected boolean bufferMore() throws IOException {
        bufferTime.start();
        try {
            //Fill as much of readerToBufferBuffer as possible, returns the number of characters filled.
            int result = reader.read(readerToBufferBuffer);
            //Check for end of file
            atEOF = (result < 1);
            if (!atEOF) {
                //Determine the most efficient way. If we already have enough space, let StringBuilder do its thing.
                if (result <= textWindow.capacity() - textWindow.length()) {
                    //Add - we already have the allocated space.
                    textWindow.append(readerToBufferBuffer, 0, result);
                } else {
                    //The StringBuilder would have to reallocate to add the new data.
                    //Let's try to avoid that. Can we make enough room by cleaning up?
                    if (result <= index) {
                        //Dropping the parsed prefix frees 'index' chars, which is enough for the new data.
                        textWindow.delete(0, index);
                        textWindow.append(readerToBufferBuffer, 0, result);
                    } else {
                        //Looks like we have to reallocate.
                        //Let's clean up at the same time, so we don't get an insanely long string.
                        //Calculate the minimum amount of space needed to hold the unparsed+new data
                        int minSize = textWindow.length() - index + result;
                        //Double it. Once a StringBuilder is big enough, this shouldn't run again - cleanup will happen as a delete.
                        StringBuilder newSB = new StringBuilder(minSize * 2);
                        newSB.append(textWindow, index, textWindow.length());
                        newSB.append(readerToBufferBuffer, 0, result);
                        textWindow = newSB;
                    }
                    //Both cleanup paths discard the parsed prefix, so parsing restarts at 0.
                    index = 0;
                }
            }
        } finally {
            //Stop the timer even when the read throws, mirroring getNextMatch().
            bufferTime.stop();
        }
        return !atEOF;
    }

    /**
     * Finds the next token at the current parse position, buffering more text from the reader
     * whenever a pattern runs into the end of textWindow before EOF. Does not advance index;
     * on success the matched span is fed to the line/col tracker.
     * <p>
     * WATCH OUT!!!! textWindow is mutable, so Matcher instances will become corrupt if
     * bufferMore() is called.
     *
     * @return the matcher for the next token, or null if we have already parsed the last token
     * @throws IOException            if the underlying reader fails
     * @throws InvalidMarkupException if text remains but no token pattern matches it
     */
    protected Matcher getNextMatch() throws IOException, InvalidMarkupException {
        //If we're out of text in the reader and the buffer, return null
        if (!canRead()) return null;

        //The constructor calls getTokenPatterns() before subclass field initializers have run,
        //so a subclass that builds its patterns from instance fields may have returned null then.
        //Ask again now that construction is complete.
        if (tokenPatterns == null) {
            tokenPatterns = getTokenPatterns();
        }

        getNextMatchTime.start();
        getNextMatchLoops++;
        try {
            Matcher match = null;
            //If the buffer is empty, we really need to pull in more data
            boolean needsData = (index >= textWindow.length()) && !atEOF;
            do {
                if (needsData) {
                    assert (!atEOF); //needsData should never be set true if we're at the end of the file.
                    bufferMore();
                    //If we're out of text in the reader and the buffer, return null
                    if (!canRead()) return null;
                    needsData = false;
                }
                //Loop through the token types, add data when needed.
                for (int i = 0; i < tokenPatterns.length; i++) {
                    //Create a matcher for the current textWindow.
                    //Optimizer note: Matchers are only cacheable if the pattern and text are the same.
                    //You're only saving on the initialization cost of 2 tiny int[] arrays. Not worth it.
                    Matcher m = tokenPatterns[i].matcher(textWindow);

                    //Remember the stopwatch reading before this attempt so the slow-match check
                    //below can look at the difference.
                    long start = matchTime.hasValue() ? matchTime.toValue() : 0;
                    matchTime.start();
                    matchLoops++;
                    //region() resets the matcher itself, so no separate reset() is needed.
                    m.region(index, textWindow.length());
                    boolean isMatch = m.find();
                    matchTime.stop();

                    //Diagnostic dump for pathologically slow matches (threshold 1000; printed as ms).
                    if (matchTime.toValue() - start > 1000) {
                        String region = textWindow.substring(index, textWindow.length());
                        String all = textWindow.toString();
                        System.out.println(region);
                        System.out.println(all);
                        System.out.println(m.pattern().toString());
                        System.out.println(matchTime.toValue() + " ms");
                    }

                    //If the regex bumped into the end of textWindow, buffer more text and try again.
                    if (m.hitEnd() && !atEOF) {
                        needsData = true;
                        break;
                    }
                    //Only return a successful match that doesn't hit the end (or hits eof)
                    if (isMatch) {
                        match = m;
                        break;
                    }
                }
            } while (needsData);

            //Invalid token: text remains but none of the patterns matched it.
            if (match == null && canRead()) {
                //Quote at most 20 chars of the offending text.
                int end = Math.min(index + 20, textWindow.length());
                throw new InvalidMarkupException("Invalid token:" + textWindow.substring(index, end));
            }

            //Increment line/col numbers
            if (match != null) tracker.add(textWindow, match.start(), match.end());
            return match;
        } finally {
            getNextMatchTime.stop();
        }
    }
}