package com.fulmicoton.multiregexp; import java.io.CharArrayReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; public class Scanner<T extends Enum> { private static final int BUFFER_NUM_BITS = 8; private static final int MASK = (1 << BUFFER_NUM_BITS) - 1; private final MultiPatternAutomaton automaton; private final char[] circularBuffer = new char[1 << BUFFER_NUM_BITS]; public Reader reader; private boolean endOfReader = false; private final ArrayList<T> tokenTypes; // private int cursor; private int readUntil; public T type; public int start = 0; public int end = 0; public int readerLength = Integer.MAX_VALUE; public void reset(final Reader reader) { this.reader = reader; this.start = 0; this.end = 0; this.endOfReader = false; this.type = null; this.readerLength = Integer.MAX_VALUE; this.readUntil = 0; } private static Reader readerFromCharSequence(final CharSequence charSeq) { final int numChars = charSeq.length(); final char[] chars = new char[numChars]; for (int i=0; i<numChars; i++) { chars[i] = charSeq.charAt(i); } return new CharArrayReader(chars); } public Scanner(final MultiPatternAutomaton automaton, final CharSequence charSequence, final ArrayList<T> tokenTypes) { this(automaton, readerFromCharSequence(charSequence), tokenTypes); } public Scanner(final MultiPatternAutomaton automaton, final Reader reader, final ArrayList<T> tokenTypes) { this.automaton = automaton; this.reader = reader; this.tokenTypes = tokenTypes; } /** * Same as next(), but throws unchecked Exception. */ boolean nextUnchecked() { try { return this.next(); } catch (final ScanException | IOException e) { throw new RuntimeException(e); } } private void put(final int i, final char c) { this.circularBuffer[i & MASK] = c; } private char get(final int i) { return this.circularBuffer[i & MASK]; } private char readOne(final int i) throws IOException { if (i < this.readUntil) { return this.circularBuffer[i & MASK]; } if (i == this.readUntil) { if (this.endOfReader) { return 0; } final int cInt = this.reader.read(); if (cInt < 0) { this.endOfReader = true; this.readerLength = i; return 0; } else { this.readUntil += 1; final char chr = (char)cInt; this.put(i, chr); return chr; } } throw new IOException(""); } public boolean next() throws ScanException, IOException { // we start at the end of the last emitted token if (this.end == this.readerLength) { return false; } this.start = this.end; int p = 0; int highestPriorityMatch = Integer.MAX_VALUE; int lastLetter = start; for (int cursor = start; cursor < this.readerLength; cursor++) { final char chr = this.readOne(cursor); if (chr == 0) { break; } p = this.automaton.step(p, chr); if (p == -1) { break; } else { final int[] accept = this.automaton.accept[p]; if (accept.length > 0) { final int minAccept = accept[0]; if (minAccept <= highestPriorityMatch) { // HighPriority = low value. // If we find a match with a higher priority // we prefer than one, // If it is the same pattern which is // match we take that too for the sake of greediness. highestPriorityMatch = minAccept; lastLetter = cursor; } } // when a match is found, we keep matching // as a longer prefix might match a pattern // with a higher priority. } } // No tokens have been found. Raised an expression // with a bit of context, and the offset in the string. if (highestPriorityMatch == Integer.MAX_VALUE) { if (this.start == 0) { return false; } final int contextStart = Math.max(0, this.start - 10); final int contextEnd = Math.min(this.start + 10, this.readUntil); final String context = this.subSequence(contextStart, this.start) + "|" + this.subSequence(this.start, contextEnd); throw new ScanException(context, this.start); } this.end = lastLetter + 1; this.type = this.tokenTypes.get(highestPriorityMatch); return true; } private CharSequence subSequence(final int start, final int end) { return new CharSeq(this.circularBuffer, start, end-start); } public static class CharSeq implements CharSequence { private final char[] buffer; private final int start; private final int length; public CharSeq(final char[] buffer, final int start, final int length) { this.buffer = buffer; this.start = start; this.length = length; } public String toString() { return new StringBuilder(this).toString(); } @Override public int length() { return this.length; } @Override public char charAt(final int index) { return this.buffer[(this.start + index) & MASK]; } @Override public CharSequence subSequence(final int newStart, final int newEnd) { return new CharSeq(this.buffer, (this.start + newStart) & MASK, newEnd - newStart); } } public CharSequence tokenString() { return new CharSeq(this.circularBuffer, this.start, this.end - this.start); } }