TokenReaderBase.java example

Explorer

folioxml-master
- commandline
  - src
    - folioxml
      - command
        Main.java
      - export
        ExportRunner.java
  - testsrc
    - folioxml
      - export
        TestExportRunner.java
- contrib
  - folioxml-lucene
    - src
      - folioxml
        export
        plugins
        ResolveHyperlinks.java
        lucene
        FieldCollector.java
        IndexFieldOpts.java
        IndexFieldOptsProvider.java
        InfobaseFieldOptsSet.java
        InfobaseSetIndexer.java
        analysis
        AnalyzerPicker.java
        DynamicAnalyzer.java
        ListAnalyzer.java
        ListTokenizer.java
        LowercaseKeywordAnalyzer.java
        folio
        FolioEnuAnalyzer.java
        FolioEnuPhraseAnalyzer.java
        FolioEnuTokenizer.java
        LookAroundCharTokenizer.java
        TokenCombiner.java
        folioQueryParser
        QueryParser.java
        QueryToken.java
        QueryTokenReader.java
    - testsrc
      - apache
        lucene
        CharTokenizer.java
      - folioxml
        directexport
        SimultaneousTest.java
        lucene
        analysis
        folio
        TokenCombinerTest.java
        folioQueryParser
        QueryParserTest.java
        tests
        Indexer.java
- core
  - folioxml
- diff_match_patch
  - oldtest
    - name
      - fraser
        neil
        plaintext
        diff_match_patch_test.java
  - src
    - name
      - fraser
        neil
        plaintext
        diff_match_patch.java

package folioxml.core;

import folioxml.utils.Stopwatch;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Base class for tokenizing readers which need to operate on a stream instead of a String.
 * Provides line/col counting, buffering, and regex matching. Uses the {@link Matcher#hitEnd()}
 * property of regexes to determine whether more data needs to be buffered before a token match
 * can be trusted as complete.
 *
 * @author nathanael
 */
public abstract class TokenReaderBase {

    /**
     * Subclasses should override this and return the token patterns for {@link #getNextMatch()}
     * to iterate through. Order only matters if one of the patterns is a subset of another -
     * like tags are with comments.
     * <p>
     * Each pattern has to be anchored at the start of the matcher region so that find() cannot
     * skip over unrecognised text. The original author's note asks for \G anchoring but also says
     * ^ is used instead because of a problem with \G in java.util.regex.
     * NOTE(review): confirm which anchor the subclasses actually use.
     * <p>
     * This method is invoked lazily by the first {@link #getNextMatch()} call that needs the
     * patterns (not from the constructor), and the returned array is cached.
     *
     * @return the token patterns, in the order they should be tried; must not be null
     */
    protected abstract Pattern[] getTokenPatterns();

    /**
     * The number of chars that get transferred at a time when more data is needed. Should optimally
     * contain the largest comment or plaintext segment in a file.
     * This much contiguous memory must be available.
     */
    protected int readSize = 0;
    protected static int READ_SIZE_DEFAULT = 2048;//32768;

    /**
     * The index within 'textWindow' that we are parsing at. Reset to 0 whenever bufferMore()
     * compacts or reallocates textWindow. This class never advances it itself.
     * NOTE(review): presumably subclasses move it past each consumed token - confirm.
     */
    protected int index = 0;
    /**
     * Dynamically growing parsing window. New data is pulled in when a regex hits the end
     * (regardless of success).
     */
    protected StringBuilder textWindow = null;
    /**
     * The underlying reader
     */
    protected Reader reader;


    /**
     * True if reader is at end of file. Becomes true when a reader.read() returns no data.
     * Doesn't mean that there isn't data left in textWindow.
     */
    protected boolean atEOF = false;

    /**
     * Used to transfer data from the underlying reader.
     */
    protected char[] readerToBufferBuffer = null;

    /**
     * Keeps track of our line/col position within the reader.
     */
    protected LineColTracker tracker = null;

    /**
     * The token regexps. Cached from getTokenPatterns() on first use; null until then.
     */
    private Pattern[] tokenPatterns = null;

    /**
     * @param reader        Should be at position 0. The token reader tracks line/col positions at
     *                      the tokenizing level, so offset readers or any interference will throw
     *                      that off.
     * @param readBlockSize Should (optimally) be the length of the largest comment or text segment
     *                      in the file. Values below 1 fall back to READ_SIZE_DEFAULT, because a
     *                      zero-length transfer buffer makes every read return 0 chars, which would
     *                      be mistaken for end-of-file.
     */
    public TokenReaderBase(Reader reader, int readBlockSize) {
        this.reader = reader;
        //Initialize buffers
        this.readSize = readBlockSize > 0 ? readBlockSize : READ_SIZE_DEFAULT;
        this.textWindow = new StringBuilder(this.readSize * 2);
        this.readerToBufferBuffer = new char[this.readSize];
        this.tracker = new LineColTracker();
        //tokenPatterns is deliberately NOT fetched here: getTokenPatterns() is overridable and the
        //subclass's fields are not initialized yet while this constructor runs.
    }

    public TokenReaderBase(Reader reader) {
        this(reader, READ_SIZE_DEFAULT);
    }

    /**
     * Returns false only when the underlying reader has reported end-of-file AND everything in
     * textWindow has been consumed (index is at or past its end). May still return true when no
     * token is left - EOF is only discovered by an actual read - so always null-check the result
     * of getNextMatch().
     */
    public boolean canRead() {
        return !(atEOF && index >= textWindow.length());
    }

    /**
     * Closes the underlying reader.
     *
     * @throws java.io.IOException
     */
    public void close() throws IOException {
        reader.close();
    }

    public Stopwatch bufferTime = new Stopwatch();
    public Stopwatch matchTime = new Stopwatch();
    public Stopwatch getNextMatchTime = new Stopwatch();
    public int getNextMatchLoops = 0;
    public int matchLoops = 0;

    /**
     * Pulls up to readSize more chars from the reader into textWindow.
     * Cleans up (drops the already-parsed prefix) whenever appending would otherwise force the
     * StringBuilder to reallocate. Cleanup will cause index = 0.
     * Any Matcher created over textWindow before this call must be considered invalid afterwards.
     *
     * @return true if more text was added to textWindow, false if EOF was reached
     * @throws IOException if the underlying reader fails
     */
    protected boolean bufferMore() throws IOException {
        bufferTime.start();
        try {
            //Fill as much of readerToBufferBuffer as possible, returns the number of characters filled.
            int result = reader.read(readerToBufferBuffer);
            //Check for end of file
            atEOF = (result < 1);
            if (atEOF) return false;

            int free = textWindow.capacity() - textWindow.length();
            if (result <= free) {
                //Add - we already have the allocated space.
                textWindow.append(readerToBufferBuffer, 0, result);
            } else if (result <= free + index) {
                //Dropping the parsed prefix frees 'index' more chars, which together with the
                //existing slack is enough to hold the new data without reallocating.
                textWindow.delete(0, index);
                textWindow.append(readerToBufferBuffer, 0, result);
                index = 0;
            } else {
                //Looks like we have to reallocate.
                //Let's clean up at the same time, so we don't get an insanely long string.

                //calculate the minimum amount of space needed to hold the unparsed+new data
                int minSize = textWindow.length() - index + result;

                //Double it. Once a StringBuilder is big enough, this shouldn't run again - cleanup will happen as a delete.
                StringBuilder newSB = new StringBuilder(minSize * 2);
                newSB.append(textWindow, index, textWindow.length());
                newSB.append(readerToBufferBuffer, 0, result);
                textWindow = newSB;
                index = 0;
            }
            return true;
        } finally {
            //Stop the timer even when the read throws, so later timings are not skewed.
            bufferTime.stop();
        }
    }

    /**
     * Returns the cached token patterns, fetching them from getTokenPatterns() on first use.
     */
    private Pattern[] cachedTokenPatterns() {
        if (tokenPatterns == null) {
            tokenPatterns = getTokenPatterns();
            if (tokenPatterns == null) {
                throw new IllegalStateException("getTokenPatterns() returned null");
            }
        }
        return tokenPatterns;
    }

    /**
     * Diagnostic dump written to stdout when a single regex attempt is slow.
     */
    private void reportSlowMatch(Matcher m) {
        String region = textWindow.substring(index, textWindow.length());
        String all = textWindow.toString();
        System.out.println(region);
        System.out.println(all);
        System.out.println(m.pattern().toString());
        System.out.println(matchTime.toValue() + " ms");
    }

    /**
     * Tries each token pattern, in order, against textWindow starting at 'index' and returns the
     * first successful Matcher. Whenever an attempt touches the end of the window while the
     * reader still has data, more text is buffered and all patterns are retried, so a token is
     * never accepted on the strength of a truncated window.
     * On success the matched span is also passed to the line/col tracker.
     * <p>
     * WATCH OUT!!!! textWindow is mutable, so Matcher instances will become corrupt if
     * bufferMore() is called. Use the returned Matcher before asking for the next one.
     *
     * @return the matcher for the next token, or null if we have already parsed the last token
     * @throws IOException            if the underlying reader fails
     * @throws InvalidMarkupException if text remains but no pattern matches at the current index
     */
    protected Matcher getNextMatch() throws IOException, InvalidMarkupException {
        //If we're out of text in the reader and the buffer, return null
        if (!canRead())
            return null;

        getNextMatchTime.start();
        getNextMatchLoops++;
        try {
            Pattern[] patterns = cachedTokenPatterns();
            Matcher match = null;

            //If the buffer is empty, we really need to pull in more data
            boolean needsData = (index >= textWindow.length()) && !atEOF;

            do {
                if (needsData) {
                    assert (!atEOF); //needsData is never set true once we're at the end of the file.
                    bufferMore();
                    //If we're out of text in the reader and the buffer, return null
                    if (!canRead())
                        return null;
                    needsData = false;
                }
                //Loop through the token types, add data when needed.
                for (Pattern pattern : patterns) {
                    //A fresh matcher per attempt: textWindow may have been replaced by bufferMore().
                    Matcher m = pattern.matcher(textWindow);

                    long start = matchTime.hasValue() ? matchTime.toValue() : 0;
                    matchTime.start();
                    matchLoops++;
                    //region() also resets the matcher, so no separate reset() is required.
                    m.region(index, textWindow.length());
                    boolean isMatch = m.find();
                    matchTime.stop();

                    if (matchTime.toValue() - start > 1000) {
                        reportSlowMatch(m);
                    }

                    //If the regex bumped into the end of textWindow, buffer more text and try again.
                    if (m.hitEnd() && !atEOF) {
                        needsData = true;
                        break;
                    }
                    //Only return a successful match that doesn't hit the end (or hits eof)
                    if (isMatch) {
                        match = m;
                        break;
                    }
                }
            } while (needsData);

            //invalid token
            if (match == null && canRead()) {
                int end = Math.min(index + 20, textWindow.length());
                throw new InvalidMarkupException("Invalid token:" + textWindow.substring(index, end));
            }

            //Increment line/col numbers
            if (match != null) tracker.add(textWindow, match.start(), match.end());

            return match;
        } finally {
            getNextMatchTime.stop();
        }
    }


}