package folioxml.folio;
import folioxml.core.FileIncludeResolver;
import folioxml.core.IIncludeResolutionService;
import folioxml.core.InvalidMarkupException;
import folioxml.core.TokenInfo;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads a series of FolioToken instances from the specified Reader input stream.
 * Fetches DI and FI preprocessor includes using the specified IIncludeResolutionService.
 *
 * @author nathanael
 */
public class FolioTokenReader extends folioxml.core.TokenReaderBase {
    /**
     * Creates a reader with no include resolver; any DI/FI tag encountered by {@link #read()} is
     * reported as invalid markup.
     *
     * @param reader        the character source to tokenize.
     * @param readBlockSize Should (optimally) be the length of the largest comment or text segment in the file.
     */
    public FolioTokenReader(Reader reader, int readBlockSize) {
        super(reader, readBlockSize);
    }

    /**
     * Opens the given file using the Windows-1252 encoding and resolves includes through a
     * FileIncludeResolver built from the file's absolute path.
     *
     * @param path the file to read.
     * @throws UnsupportedEncodingException if the JVM does not provide the Windows-1252 charset.
     * @throws FileNotFoundException        if the file cannot be opened.
     * @throws IOException                  if setting up the reader fails.
     */
    public FolioTokenReader(File path) throws UnsupportedEncodingException, FileNotFoundException, IOException {
        this(openWindows1252(path), new FileIncludeResolver(path.getAbsolutePath()));
    }

    /**
     * Creates a reader that uses the default read block size.
     *
     * @param reader            the character source to tokenize.
     * @param referenceResolver used to resolve DI/FI includes; may be null to disallow includes.
     */
    public FolioTokenReader(Reader reader, IIncludeResolutionService referenceResolver) throws IOException {
        this(reader, READ_SIZE_DEFAULT, referenceResolver, null);
    }

    /**
     * @param reader            the character source to tokenize.
     * @param readBlockSize     Should (optimally) be the length of the largest comment or text segment in the file.
     * @param referenceResolver used to resolve DI/FI includes; may be null to disallow includes.
     */
    public FolioTokenReader(Reader reader, int readBlockSize, IIncludeResolutionService referenceResolver) throws IOException {
        this(reader, readBlockSize, referenceResolver, null);
    }

    /**
     * @param reader              A FileReader or BufferedReader containing Folio Flat File document or definition codes.
     * @param readBlockSize       How much data to add to the buffer from 'reader' when more data is needed. The buffer is not fixed size, and will
     *                            stretch as needed for a large token (such as a massive comment). The buffer will first clean out used-up space.
     * @param referenceResolver   used to resolve DI/FI includes; may be null to disallow includes.
     * @param parentDocumentPaths hashes of the documents that (transitively) include this one; may be null.
     *                            The list is copied, so the caller's list is never modified.
     * @throws java.io.IOException
     */
    public FolioTokenReader(Reader reader, int readBlockSize, IIncludeResolutionService referenceResolver, List<String> parentDocumentPaths) throws IOException {
        super(reader, readBlockSize);
        this.resolver = referenceResolver;
        // Defensive copy: the chain is extended below and must not leak back into the caller's list
        // (which might also be unmodifiable).
        if (parentDocumentPaths != null) {
            this.parentDocumentPaths = new ArrayList<String>(parentDocumentPaths);
        }
        if (this.resolver != null) {
            // The chain cannot be null if a resolver exists, since read() consults it for every include.
            if (this.parentDocumentPaths == null) {
                this.parentDocumentPaths = new ArrayList<String>();
            }
            // Add base document path always - we don't want 2nd level files to re-reference the first.
            String ownHash = this.resolver.getHash();
            if (!this.parentDocumentPaths.contains(ownHash)) {
                this.parentDocumentPaths.add(ownHash);
            }
        }
    }

    /**
     * Creates a reader with the default read block size and no include resolver.
     *
     * @param reader the character source to tokenize.
     */
    public FolioTokenReader(Reader reader) {
        this(reader, READ_SIZE_DEFAULT);
    }

    /**
     * Opens a file as a Windows-1252 character stream, closing the underlying byte stream again
     * if the charset turns out to be unavailable so the file handle is not leaked.
     */
    private static Reader openWindows1252(File path) throws UnsupportedEncodingException, FileNotFoundException {
        FileInputStream in = new FileInputStream(path);
        try {
            return new InputStreamReader(in, "Windows-1252");
        } catch (UnsupportedEncodingException e) {
            try {
                in.close();
            } catch (IOException ignored) {
                // The encoding failure is the error worth reporting; a failed close adds nothing.
            }
            throw e;
        }
    }

    /**
     * If initialized, this class will be used to perform on-the-fly file includes.
     */
    private IIncludeResolutionService resolver = null;
    /**
     * Used to track and prevent circular references
     */
    private List<String> parentDocumentPaths = null;
    /**
     * Used to read in included files. Null when finished.
     */
    private FolioTokenReader currentInnerReader = null;
    /**
     * Jan 20, 09. Can't use possessive quantifiers here - sorry.
     */
    public static String CommentRegex = "<CM>(.*?)</CM>";
    /**
     * Matches a comment tag and any intermediate comments. Lazy, of course.
     */
    private static final Pattern rComment = Pattern.compile("^" + CommentRegex, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
    /**
     * Jan 20, 09 : Added possessive quantifiers throughout. previously (?:[^<]+|<[^A-Za-z/])+
     */
    public static String TextRegex = "(?:[^<]++|<[^A-Za-z/])++";
    /**
     * Matches text that doesn't contain any open brackets that are directly followed by a letter or a closing slash.
     */
    private static final Pattern rText = Pattern.compile("^" + TextRegex); //non <, except doubles
    /**
     * Matches any two-letter tag (and +/-), and captures (optional) options. group 2 and 3, respectively
     * (group 1 captures the closing slash, if present).
     * Tag options must have matching quote pairs, (single quotes are encoded like "").
     * Opening brackets can be entered by entering two.
     * Opening and closing brackets can be used literally as long as they exist in pairs, are not nested, and don't contain quotes.
     * Opening and closing brackets can be used arbitrarily within a quoted string.
     * <BR:AL:0.15,0.0291667,FC:255,255, caused problems.
     * Jan 20, 09. Added possessive quantifiers throughout regex.
     */
    public static String TagRegex = "<(/)?+([A-Z-a-z][A-Z-a-z][\\+\\-]?+)(?:\\s*+[:,;]++\\s*+((?:[^><\"]++|<<|\"(?:[^\"]|(?:\"\"))*+\"|<[^<>\"]*+>)++))?+>";
    /* old regex 86 seconds on <BR:AL:0.15,0.0291667,FC:255,255,
    public static String TagRegex = "<(/)?([A-Z-a-z][A-Za-z][\\+\\-]?)(?:\\s*[:,;]+\\s*((?:[^><\"]+|<<|\"(?:[^\"]|(?:\"\"))*\"|<[^<>\"]*>)+?))?>";
    */
    private static final Pattern rTag = Pattern.compile("^" + TagRegex);
    /**
     * An array of the patterns we look for, in the correct order.
     */
    private static final Pattern[] tokenPatterns = new Pattern[]{rText, rComment, rTag}; //rComment should come before rTag, since rTag matches opening comment tags.

    protected Pattern[] getTokenPatterns() {
        return tokenPatterns;
    }

    /**
     * Matches any single open bracket. Uses negative lookahead and lookbehind assertions.
     * Currently unused: it is kept for the not-yet-implemented single-bracket warning in read().
     */
    private static final Pattern rSingleBracket = Pattern.compile("^(?<!\\<)<(?!\\<)");
    /**
     * Number of times read() has been entered on this reader. This includes calls that return
     * null and the recursive call made after an include is opened, so it is an upper bound on
     * the number of tokens returned rather than an exact count.
     */
    public long tokensRead = 0;

    /**
     * Returns the next token, transparently descending into DI/FI includes.
     * <p>
     * While an included file is being read, tokens come from the nested reader; once it is
     * exhausted it is closed and reading resumes from this reader's own input.
     *
     * @return the next token, or null when getNextMatch() reports no further match (end of input).
     * @throws IOException            if reading the input or an included file fails.
     * @throws InvalidMarkupException on a stray comment tag, on an include when no resolver was
     *                                supplied, or on a circular include.
     */
    public FolioToken read() throws IOException, InvalidMarkupException {
        tokensRead++;
        //Delegate if ready. Delete reference when done
        if (this.currentInnerReader != null) {
            FolioToken st = this.currentInnerReader.read();
            if (st != null) return st;
            this.currentInnerReader.close();
            this.currentInnerReader = null;
        }
        //Store current position. After getNextMatch() is called, these values will be incremented to the *next* token.
        TokenInfo ti = tracker.getTokenInfo();
        //Or read from main stream
        Matcher m = getNextMatch();
        if (m == null) return null; //eof

        FolioToken ft = null;
        if (m.pattern() == rComment) {
            //Build comment tokens - only the text between <CM> and </CM> is kept as the token text.
            ft = new FolioToken(FolioToken.TokenType.Comment);
            ft.text = m.group(1);
        } else if (m.pattern() == rText) {
            //Build text tokens
            ft = new FolioToken(FolioToken.TokenType.Text);
            ft.text = m.group();
            //TODO: warn about single brackets (not pairs) using rSingleBracket. They shouldn't be in text,
            //but they are parsed anyway. No matcher is created until the warning is actually implemented.
        } else if (m.pattern() == rTag) {
            ft = new FolioToken(FolioToken.TokenType.Tag);
            if (m.group(1) != null) {
                ft.isClosing(true);
            }
            ft.text = m.group();
            ft.tagName = m.group(2);
            //ft.stackID = ft.tagName;
            ft.tagOptions = m.group(3);
        }

        //Save debugging info
        ft.info = ti;
        ft.info.length = m.end() - m.start();
        ft.info.parentService = this.resolver;
        if (m.pattern() == rComment) {
            //For comments the debugging text is the full match, including the <CM> delimiters.
            ft.info.text = m.group();
        } else {
            ft.info.text = ft.text; //it's already parsed
        }
        //Advance past the consumed token.
        index = m.end();

        //Check for stray comment tags. A well-formed comment is consumed by rComment, so a CM tag token
        //means the pair was broken or carried options.
        if (ft.matches("CM")) {
            throw new InvalidMarkupException("Comment tags cannot specify options and must be present in pairs.", ft);
        }

        //We may have locking issues here...
        //Insert both Definition and Flat File includes inline. We parse them the same
        if (ft.type != FolioToken.TokenType.Tag || !ft.matches("DI|FI")) {
            return ft;
        }
        if (this.resolver == null) {
            throw new InvalidMarkupException("File include requested, but no IncludeResolutionService was specified.", ft);
        }
        assert (ft.count() == 1);
        String path = ft.get(0);
        IIncludeResolutionService child = this.resolver.getChild(path);
        //Check for circular references!!!
        String hash = child.getHash();
        if (this.parentDocumentPaths.contains(hash)) {
            //That's right, the child is the circular reference that is also the parent.
            //The offending token is attached, as for the other markup errors raised by this method.
            throw new InvalidMarkupException("Circular reference: " + this.resolver.getDescription() + " contains a reference to parent document " + child.getDescription() + "... which is including " + this.resolver.getDescription(), ft);
        }
        Reader r = child.getReader();
        List<String> newPathChain = new ArrayList<String>();
        newPathChain.addAll(this.parentDocumentPaths);
        newPathChain.add(hash);
        this.currentInnerReader = new FolioTokenReader(r, this.readSize, child, newPathChain);
        return this.read(); //Recursive - we've set up the delegation reader, so re-call this function. If the file is empty, it will start where it left off.
    }

    /**
     * Reports true while an active include reader can still be read; otherwise defers to the superclass.
     */
    @Override
    public boolean canRead() {
        if (this.currentInnerReader != null && this.currentInnerReader.canRead()) return true;
        return super.canRead();
    }

    /**
     * Closes any include reader that is still open, then closes this reader.
     */
    @Override
    public void close() throws IOException {
        if (this.currentInnerReader != null) {
            this.currentInnerReader.close();
            this.currentInnerReader = null;
        }
        super.close();
    }
}