package com.fulmicoton.multiregexp;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
/**
 * Splits the characters of a {@link Reader} into tokens using a
 * {@link MultiPatternAutomaton}.
 *
 * <p>Each successful call to {@link #next()} publishes the token through the
 * public fields {@link #type}, {@link #start} and {@link #end} (end is
 * exclusive). Characters are kept in a circular buffer of
 * {@code 1 << BUFFER_NUM_BITS} slots addressed with {@code offset & MASK}, so
 * only the most recently read characters remain available: text returned by
 * {@link #tokenString()} is a view on that buffer, not a copy, and a token
 * (or look-ahead) longer than the buffer overwrites its own beginning.
 *
 * @param <T> type of the token labels
 */
public class Scanner<T extends Enum> {

    private static final int BUFFER_NUM_BITS = 8;
    private static final int MASK = (1 << BUFFER_NUM_BITS) - 1;

    /** Value returned by {@link #readOne(int)} once the reader is exhausted. */
    private static final int END_OF_INPUT = -1;

    private final MultiPatternAutomaton automaton;
    private final char[] circularBuffer = new char[1 << BUFFER_NUM_BITS];
    public Reader reader;
    private boolean endOfReader = false;
    private final ArrayList<T> tokenTypes;

    /** Number of characters pulled from the reader so far. */
    private int readUntil;

    /** Type of the last token emitted by {@link #next()}. */
    public T type;
    /** Offset (inclusive) of the first character of the last token. */
    public int start = 0;
    /** Offset (exclusive) of the end of the last token. */
    public int end = 0;
    /**
     * Total number of characters in the reader; stays at
     * {@code Integer.MAX_VALUE} until the end of the reader has been observed.
     */
    public int readerLength = Integer.MAX_VALUE;

    /**
     * Re-targets this scanner at a new reader and clears all scanning state.
     *
     * @param reader the new source of characters
     */
    public void reset(final Reader reader) {
        this.reader = reader;
        this.start = 0;
        this.end = 0;
        this.endOfReader = false;
        this.type = null;
        this.readerLength = Integer.MAX_VALUE;
        this.readUntil = 0;
    }

    /** Copies the char sequence into an array and wraps it in a reader. */
    private static Reader readerFromCharSequence(final CharSequence charSeq) {
        final int numChars = charSeq.length();
        final char[] chars = new char[numChars];
        for (int i = 0; i < numChars; i++) {
            chars[i] = charSeq.charAt(i);
        }
        return new CharArrayReader(chars);
    }

    /**
     * Creates a scanner over an in-memory character sequence.
     *
     * @param automaton    automaton recognizing the token patterns
     * @param charSequence text to scan
     * @param tokenTypes   token type for each pattern id reported by the automaton
     */
    public Scanner(final MultiPatternAutomaton automaton,
                   final CharSequence charSequence,
                   final ArrayList<T> tokenTypes) {
        this(automaton, readerFromCharSequence(charSequence), tokenTypes);
    }

    /**
     * Creates a scanner over a reader.
     *
     * @param automaton  automaton recognizing the token patterns
     * @param reader     source of characters
     * @param tokenTypes token type for each pattern id reported by the automaton
     */
    public Scanner(final MultiPatternAutomaton automaton,
                   final Reader reader,
                   final ArrayList<T> tokenTypes) {
        this.automaton = automaton;
        this.reader = reader;
        this.tokenTypes = tokenTypes;
    }

    /**
     * Same as {@link #next()}, but wraps the checked {@link ScanException}
     * and {@link IOException} into a {@link RuntimeException}.
     */
    boolean nextUnchecked() {
        try {
            return this.next();
        } catch (final ScanException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Stores {@code c} in the buffer slot associated with offset {@code i}. */
    private void put(final int i, final char c) {
        this.circularBuffer[i & MASK] = c;
    }

    /** Returns the buffered character associated with offset {@code i}. */
    private char get(final int i) {
        return this.circularBuffer[i & MASK];
    }

    /**
     * Returns the character at offset {@code i}, reading it from the reader
     * if it has not been buffered yet.
     *
     * <p>The result is an {@code int} so that the end of input
     * ({@link #END_OF_INPUT}) cannot be confused with a legitimate
     * {@code '\u0000'} character of the input.
     *
     * @param i offset of the character, at most {@code readUntil}
     * @return the character as a non-negative int, or {@link #END_OF_INPUT}
     * @throws IOException if the reader fails, or if {@code i} lies beyond
     *                     the next unread character
     */
    private int readOne(final int i) throws IOException {
        if (i < this.readUntil) {
            // Already consumed from the reader: replay it from the buffer.
            return this.get(i);
        }
        if (i > this.readUntil) {
            throw new IOException("Cannot read offset " + i
                    + ": only " + this.readUntil + " characters have been read so far.");
        }
        if (this.endOfReader) {
            return END_OF_INPUT;
        }
        final int cInt = this.reader.read();
        if (cInt < 0) {
            this.endOfReader = true;
            this.readerLength = i;
            return END_OF_INPUT;
        }
        this.readUntil += 1;
        this.put(i, (char) cInt);
        return cInt;
    }

    /**
     * Advances to the next token, starting where the previous one ended.
     *
     * <p>The automaton is run from state 0 until it reports {@code -1} or the
     * input ends. Among the accepting states met on the way, the one whose
     * first accepted pattern id is lowest wins; on ties the longest prefix is
     * kept. (The first element of {@code accept[state]} is taken to be the
     * highest-priority pattern of that state — presumably the array is sorted
     * ascending; confirm against {@code MultiPatternAutomaton}.)
     *
     * @return {@code true} if a token was found and {@link #type},
     *         {@link #start} and {@link #end} were updated; {@code false} if
     *         the end of the input has been reached
     * @throws ScanException if no pattern matches at the current offset while
     *                       input remains (including at offset 0)
     * @throws IOException   if the underlying reader fails
     */
    public boolean next() throws ScanException, IOException {
        // We start at the end of the last emitted token.
        if (this.end == this.readerLength) {
            return false;
        }
        this.start = this.end;
        int state = 0;
        int highestPriorityMatch = Integer.MAX_VALUE;
        int lastLetter = this.start;
        for (int cursor = this.start; cursor < this.readerLength; cursor++) {
            final int read = this.readOne(cursor);
            if (read == END_OF_INPUT) {
                break;
            }
            state = this.automaton.step(state, (char) read);
            if (state == -1) {
                break;
            }
            final int[] accept = this.automaton.accept[state];
            if (accept.length > 0) {
                final int minAccept = accept[0];
                // High priority = low value. A strictly better pattern is
                // preferred; the same pattern matching again is also taken,
                // for the sake of greediness.
                if (minAccept <= highestPriorityMatch) {
                    highestPriorityMatch = minAccept;
                    lastLetter = cursor;
                }
            }
            // Even after a match we keep going: a longer prefix might match
            // a pattern with a higher priority.
        }
        if (highestPriorityMatch == Integer.MAX_VALUE) {
            // No token was found.
            if (this.start == this.readerLength) {
                // The end of the input was discovered exactly at the start
                // offset (e.g. empty input): there is simply nothing left.
                return false;
            }
            // Otherwise report the failure with a bit of context around the
            // offending offset.
            final int contextStart = Math.max(0, this.start - 10);
            final int contextEnd = Math.min(this.start + 10, this.readUntil);
            final String context = this.subSequence(contextStart, this.start) + "|" + this.subSequence(this.start, contextEnd);
            throw new ScanException(context, this.start);
        }
        this.end = lastLetter + 1;
        this.type = this.tokenTypes.get(highestPriorityMatch);
        return true;
    }

    /** Returns a view of the buffered characters in {@code [start, end)}. */
    private CharSequence subSequence(final int start, final int end) {
        return new CharSeq(this.circularBuffer, start, end - start);
    }

    /**
     * Read-only view over a window of a circular character buffer. Positions
     * are wrapped with {@code MASK}, so the buffer is expected to have
     * {@code MASK + 1} slots. The view does not copy the characters.
     */
    public static class CharSeq implements CharSequence {

        private final char[] buffer;
        private final int start;
        private final int length;

        /**
         * @param buffer circular buffer backing the view
         * @param start  offset of the first character (wrapped on access)
         * @param length number of characters in the view
         */
        public CharSeq(final char[] buffer, final int start, final int length) {
            this.buffer = buffer;
            this.start = start;
            this.length = length;
        }

        /** Copies the characters currently visible through the view into a String. */
        @Override
        public String toString() {
            return new StringBuilder(this).toString();
        }

        @Override
        public int length() {
            return this.length;
        }

        /**
         * @throws IndexOutOfBoundsException if {@code index} is negative or
         *                                   not less than {@link #length()}
         */
        @Override
        public char charAt(final int index) {
            if (index < 0 || index >= this.length) {
                throw new IndexOutOfBoundsException("index " + index + ", length " + this.length);
            }
            return this.buffer[(this.start + index) & MASK];
        }

        /**
         * @throws IndexOutOfBoundsException if the bounds are negative,
         *                                   reversed, or exceed {@link #length()}
         */
        @Override
        public CharSequence subSequence(final int newStart, final int newEnd) {
            if (newStart < 0 || newEnd > this.length || newStart > newEnd) {
                throw new IndexOutOfBoundsException(
                        "start " + newStart + ", end " + newEnd + ", length " + this.length);
            }
            return new CharSeq(this.buffer, (this.start + newStart) & MASK, newEnd - newStart);
        }
    }

    /**
     * Returns the text of the last emitted token as a view on the internal
     * circular buffer. The view is not a copy: convert it with
     * {@code toString()} before scanning further if the text must be kept.
     */
    public CharSequence tokenString() {
        return new CharSeq(this.circularBuffer, this.start, this.end - this.start);
    }
}