FolioTokenReader.java example

Explorer

folioxml-master
- commandline
  - src
    - folioxml
      - command
        Main.java
      - export
        ExportRunner.java
  - testsrc
    - folioxml
      - export
        TestExportRunner.java
- contrib
  - folioxml-lucene
    - src
      - folioxml
        export
        plugins
        ResolveHyperlinks.java
        lucene
        FieldCollector.java
        IndexFieldOpts.java
        IndexFieldOptsProvider.java
        InfobaseFieldOptsSet.java
        InfobaseSetIndexer.java
        analysis
        AnalyzerPicker.java
        DynamicAnalyzer.java
        ListAnalyzer.java
        ListTokenizer.java
        LowercaseKeywordAnalyzer.java
        folio
        FolioEnuAnalyzer.java
        FolioEnuPhraseAnalyzer.java
        FolioEnuTokenizer.java
        LookAroundCharTokenizer.java
        TokenCombiner.java
        folioQueryParser
        QueryParser.java
        QueryToken.java
        QueryTokenReader.java
    - testsrc
      - apache
        lucene
        CharTokenizer.java
      - folioxml
        directexport
        SimultaneousTest.java
        lucene
        analysis
        folio
        TokenCombinerTest.java
        folioQueryParser
        QueryParserTest.java
        tests
        Indexer.java
- core
  - folioxml
- diff_match_patch
  - oldtest
    - name
      - fraser
        neil
        plaintext
        diff_match_patch_test.java
  - src
    - name
      - fraser
        neil
        plaintext
        diff_match_patch.java

package folioxml.folio;

import folioxml.core.FileIncludeResolver;
import folioxml.core.IIncludeResolutionService;
import folioxml.core.InvalidMarkupException;
import folioxml.core.TokenInfo;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads a series of FolioToken instances from the specified Reader input stream.
 * Fetches DI and FI preprocessor includes using the specified IIncludeResolutionService:
 * when such a tag is read, a nested FolioTokenReader is opened on the included document and
 * its tokens are returned in place of the include tag.
 *
 * @author nathanael
 */
public class FolioTokenReader extends folioxml.core.TokenReaderBase {

    /**
     * Jan 20, 09. Can't use possessive quantifiers here - sorry.
     * <p>
     * Note: the compiled pattern is built once when this class is initialized, so reassigning
     * this field afterwards does not change how this reader tokenizes.
     */
    public static String CommentRegex = "<CM>(.*?)</CM>";
    /**
     * Matches a comment tag and any intermediate comments. Lazy, of course.
     */
    private static final Pattern rComment = Pattern.compile("^" + CommentRegex, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /**
     * Jan 20, 09 : Added possessive quantifiers throughout. previously (?:[^<]+|<[^A-Za-z/])+
     */
    public static String TextRegex = "(?:[^<]++|<[^A-Za-z/])++";
    /**
     * Matches text that doesn't contain any open brackets that are directly followed by a letter or a closing slash.
     */
    private static final Pattern rText = Pattern.compile("^" + TextRegex); //non <, expect doubles

    /**
     * Matches any two-letter tag (and +/-), and captures (optional) options. Group 1 is the
     * closing slash (if any), group 2 the tag name, group 3 the options.
     * Tag options must have matching quote pairs, (single quotes are encoded like "").
     * Opening brackets can be entered by entering two.
     * Opening and closing brackets can be used literally as long as they exist in pairs, are not nested, and don't contain quotes.
     * Opening and closing brackets can be used arbitrarily within a quoted string.
     * {@code <BR:AL:0.15,0.0291667,FC:255,255,} caused problems.
     * Jan 20, 09. Added possessive quantifiers throughout regex.
     */
    public static String TagRegex = "<(/)?+([A-Z-a-z][A-Z-a-z][\\+\\-]?+)(?:\\s*+[:,;]++\\s*+((?:[^><\"]++|<<|\"(?:[^\"]|(?:\"\"))*+\"|<[^<>\"]*+>)++))?+>";

    /* old regex 86 seconds on <BR:AL:0.15,0.0291667,FC:255,255,
      public static String TagRegex = "<(/)?([A-Z-a-z][A-Za-z][\\+\\-]?)(?:\\s*[:,;]+\\s*((?:[^><\"]+|<<|\"(?:[^\"]|(?:\"\"))*\"|<[^<>\"]*>)+?))?>";
    */

    private static final Pattern rTag = Pattern.compile("^" + TagRegex);

    /**
     * An array of the patterns we look for, in the correct order.
     * rComment should come before rTag, since rTag matches opening comment tags.
     */
    private static final Pattern[] tokenPatterns = new Pattern[]{rText, rComment, rTag};

    /**
     * Matches any single open bracket. Uses negative lookahead and lookbehind assertions.
     * Currently unused: it is reserved for the (not yet implemented) stray-bracket warning
     * on text tokens; see the TODO in {@link #read()}.
     */
    private static final Pattern rSingleBracket = Pattern.compile("^(?<!\\<)<(?!\\<)");

    /**
     * If initialized, this class will be used to perform on-the-fly file includes.
     */
    private IIncludeResolutionService resolver = null;

    /**
     * Used to track and prevent circular references. Holds the hashes of this document and of
     * every document that (transitively) included it. Never null when {@link #resolver} is set.
     */
    private List<String> parentDocumentPaths = null;

    /**
     * Used to read in included files. Null when finished.
     */
    private FolioTokenReader currentInnerReader = null;

    /**
     * Number of calls made to {@link #read()} on this instance. This counts calls that return
     * null at end of input as well as the internal re-entry performed after an include tag,
     * so it can be larger than the number of tokens actually returned.
     */
    public long tokensRead = 0;

    /**
     * Creates a reader without include support; a DI or FI tag will cause {@link #read()} to fail.
     *
     * @param reader        The source of Folio Flat File markup.
     * @param readBlockSize Should (optimally) be the length of the largest comment or text segment in the file.
     */
    public FolioTokenReader(Reader reader, int readBlockSize) {
        super(reader, readBlockSize);
    }

    /**
     * Opens the given file using the Windows-1252 encoding. Includes are resolved with a
     * FileIncludeResolver created from the file's absolute path.
     *
     * @param path The flat file to read.
     */
    public FolioTokenReader(File path) throws UnsupportedEncodingException, FileNotFoundException, IOException {
        this(new InputStreamReader(new FileInputStream(path), "Windows-1252"), new FileIncludeResolver(path.getAbsolutePath()));
    }

    /**
     * Creates a reader with the default read block size.
     *
     * @param reader            The source of Folio Flat File markup.
     * @param referenceResolver Used to resolve DI and FI includes; may be null to disable includes.
     */
    public FolioTokenReader(Reader reader, IIncludeResolutionService referenceResolver) throws IOException {
        this(reader, READ_SIZE_DEFAULT, referenceResolver, null);
    }

    /**
     * @param reader            The source of Folio Flat File markup.
     * @param readBlockSize     Should (optimally) be the length of the largest comment or text segment in the file.
     * @param referenceResolver Used to resolve DI and FI includes; may be null to disable includes.
     */
    public FolioTokenReader(Reader reader, int readBlockSize, IIncludeResolutionService referenceResolver) throws IOException {
        this(reader, readBlockSize, referenceResolver, null);
    }

    /**
     * @param reader              A FileReader or BufferedReader containing Folio Flat File document or definition codes.
     * @param readBlockSize       How much data to add to the buffer from 'reader' when more data is needed. The buffer is not fixed size, and will
     *                            stretch as needed for a large token (such as a massive comment).
     * @param referenceResolver   Used to resolve DI and FI includes; may be null to disable includes.
     * @param parentDocumentPaths Hashes of the documents that are (transitively) including this one; may be null.
     *                            The list is copied, so the caller's list is never modified.
     * @throws java.io.IOException
     */
    public FolioTokenReader(Reader reader, int readBlockSize, IIncludeResolutionService referenceResolver, List<String> parentDocumentPaths) throws IOException {
        super(reader, readBlockSize);
        this.resolver = referenceResolver;

        //Work on a private copy so the caller's list is left untouched (and may be unmodifiable).
        List<String> paths = null;
        if (parentDocumentPaths != null) {
            paths = new ArrayList<String>(parentDocumentPaths);
        }

        if (referenceResolver != null) {
            //Cannot be null if resolver exists
            if (paths == null) {
                paths = new ArrayList<String>();
            }
            //Add base document path always - we don't want 2nd level files to re-reference the first.
            String ownHash = referenceResolver.getHash();
            if (!paths.contains(ownHash)) {
                paths.add(ownHash);
            }
        }
        this.parentDocumentPaths = paths;
    }

    /**
     * Creates a reader with the default read block size and without include support.
     *
     * @param reader The source of Folio Flat File markup.
     */
    public FolioTokenReader(Reader reader) {
        this(reader, READ_SIZE_DEFAULT);
    }

    protected Pattern[] getTokenPatterns() {
        return tokenPatterns;
    }

    /**
     * Reads the next token. While an included document is being read, its tokens are returned
     * first; once it is exhausted it is closed and reading resumes from this reader's own stream.
     * DI and FI tags are never returned: they are replaced by the tokens of the included document.
     *
     * @return the next token, or null at end of input.
     * @throws IOException            if reading from the underlying stream (or an included one) fails.
     * @throws InvalidMarkupException on a stray comment tag, on an include when no resolver was supplied,
     *                                or on a circular include.
     */
    public FolioToken read() throws IOException, InvalidMarkupException {
        tokensRead++;

        //Delegate if ready. Delete reference when done
        if (this.currentInnerReader != null) {
            FolioToken included = this.currentInnerReader.read();
            if (included != null) {
                return included;
            }
            this.currentInnerReader.close();
            this.currentInnerReader = null;
        }

        //Store current position. After getNextMatch() is called, these values will be incremented to the *next* token.
        TokenInfo ti = tracker.getTokenInfo();

        //Or read from main stream
        Matcher m = getNextMatch();
        if (m == null) {
            return null; //eof
        }

        FolioToken ft;
        if (m.pattern() == rComment) {
            //Build comment tokens
            ft = new FolioToken(FolioToken.TokenType.Comment);
            ft.text = m.group(1);
        } else if (m.pattern() == rText) {
            //Build text tokens
            ft = new FolioToken(FolioToken.TokenType.Text);
            ft.text = m.group();
            //TODO: warn about single brackets (not pairs) in text using rSingleBracket. They
            //shouldn't be in text, but we still parse them.
        } else if (m.pattern() == rTag) {
            ft = new FolioToken(FolioToken.TokenType.Tag);
            if (m.group(1) != null) {
                ft.isClosing(true);
            }
            ft.text = m.group();
            ft.tagName = m.group(2);
            ft.tagOptions = m.group(3);
        } else {
            //Fail with a clear message instead of a NullPointerException further down.
            throw new IllegalStateException("Matched an unrecognized token pattern: " + m.pattern());
        }

        //Save debugging info
        ft.info = ti;
        ft.info.length = m.end() - m.start();
        ft.info.parentService = this.resolver;
        if (m.pattern() == rComment) {
            ft.info.text = m.group(); //the full match, including the surrounding CM tags
        } else {
            ft.info.text = ft.text; //it's already parsed
        }

        index = m.end();

        //Check for stray comment tags.
        if (ft.matches("CM")) {
            throw new InvalidMarkupException("Comment tags cannot specify options and must be present in pairs.", ft);
        }

        //We may have locking issues here...
        //Insert both Definition and Flat File includes inline. We parse them the same
        if (ft.type == FolioToken.TokenType.Tag && ft.matches("DI|FI")) {
            openInclude(ft);
            //Recursive - we've set up the delegation reader, so re-call this function.
            //If the file is empty, it will start where it left off.
            return this.read();
        }

        return ft;
    }

    /**
     * Resolves the document referenced by a DI or FI tag and installs a nested reader for it
     * as {@link #currentInnerReader}.
     *
     * @param includeTag the DI or FI tag token that requested the include.
     * @throws InvalidMarkupException if no resolver is available or the include is circular.
     */
    private void openInclude(FolioToken includeTag) throws IOException, InvalidMarkupException {
        if (this.resolver == null) {
            throw new InvalidMarkupException("File include requested, but no IncludeResolutionService was specified.", includeTag);
        }

        assert (includeTag.count() == 1);
        String path = includeTag.get(0);
        IIncludeResolutionService child = this.resolver.getChild(path);

        //Check for circular references!!!
        String hash = child.getHash();
        if (this.parentDocumentPaths.contains(hash)) {
            //That's right, the child is the circular reference that is also the parent.
            throw new InvalidMarkupException("Circular reference: " + this.resolver.getDescription() + " contains a reference to parent document " + child.getDescription() + "... which is including " + this.resolver.getDescription(), includeTag);
        }

        List<String> newPathChain = new ArrayList<String>(this.parentDocumentPaths);
        newPathChain.add(hash);

        Reader r = child.getReader();
        boolean attached = false;
        try {
            this.currentInnerReader = new FolioTokenReader(r, this.readSize, child, newPathChain);
            attached = true;
        } finally {
            //Don't leak the included document's stream if the nested reader could not be created.
            if (!attached && r != null) {
                try {
                    r.close();
                } catch (IOException ignored) {
                    //Best effort only; let the original failure propagate.
                }
            }
        }
    }

    /**
     * @return true if the included document currently being read can still be read, otherwise
     * whatever the base class reports for this reader's own stream.
     */
    @Override
    public boolean canRead() {
        if (this.currentInnerReader != null && this.currentInnerReader.canRead()) {
            return true;
        }
        return super.canRead();
    }

    /**
     * Closes the reader of any included document that is still open, then this reader.
     */
    @Override
    public void close() throws IOException {
        if (this.currentInnerReader != null) {
            this.currentInnerReader.close();
            this.currentInnerReader = null;
        }

        super.close();
    }
}