DTDFilterReader.java example

Explorer
openbd-core-master
- src
/* 
 *  Copyright (C) 2000 - 2010 TagServlet Ltd
 *
 *  This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
 *  
 *  OpenBD is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  Free Software Foundation,version 3.
 *  
 *  OpenBD is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with OpenBD.  If not, see http://www.gnu.org/licenses/
 *  
 *  Additional permission under GNU GPL version 3 section 7
 *  
 *  If you modify this Program, or any covered work, by linking or combining 
 *  it with any of the JARS listed in the README.txt (or a modified version of 
 *  (that library), containing parts covered by the terms of that JAR, the 
 *  licensors of this Program grant you additional permission to convey the 
 *  resulting work. 
 *  README.txt @ http://www.openbluedragon.org/license/README.txt
 *  
 *  http://www.openbluedragon.org/
 */

package com.naryx.tagfusion.cfm.xml.parse;

import java.io.IOException;
import java.io.Reader;

/**
 * FilterReader that handles the <!DOCTYPE ...> element as the document is
 * read, according to the mode. If mode == READ_ADD, this will read the
 * existing <!DOCTYPE ...> element or, when there is none, add a <!DOCTYPE ...>
 * element with a new SYSTEM identifier, CUSTOM_DTD. If mode == REMOVE_MODIFY,
 * any existing <!DOCTYPE ...> element is updated so that it has no
 * SYSTEM/PUBLIC identifier. If mode == REMOVE, the <!DOCTYPE ...> element is
 * simply removed from the document stream and parsing proceeds without it. If
 * mode == NO_CHANGE, no changes will be made.
 */
public class DTDFilterReader extends XmlFilterReader {
	/** Internal SYSTEM identifier for our modified DTD declarations */
	public static final String CUSTOM_DTD = "http://www.newatlanta.com/bluedragondtd";

	/**
	 * Mode: the existing doctype will be read. If no doctype exists in the xml
	 * data, then one will be inserted that contains the CUSTOM_DTD SYSTEM
	 * identifier.
	 */
	public static final byte READ_ADD = 1;

	/**
	 * Mode: any existing doctype declaration is updated so that it has no
	 * SYSTEM/PUBLIC identifier.
	 */
	public static final byte REMOVE_MODIFY = 2;

	/**
	 * Mode: the <!DOCTYPE ...> element is simply removed from the document
	 * stream.
	 */
	public static final byte REMOVE = 3;

	/** Mode: no changes will be made. */
	public static final byte NO_CHANGE = 4;

	/**
	 * Current state of the scanner in parseInput(). Values below 16 mean the
	 * prolog is still being scanned; 16 means filtering is finished (see
	 * stillFiltering()).
	 */
	private int state = 0;

	/**
	 * Index in inputBuffer of the '<' that opened the tag currently being
	 * scanned, or -1 if no tag has been seen yet.
	 */
	private int startPos = -1;

	/**
	 * Index in inputBuffer of the first character of the <!DOCTYPE ...>
	 * element, or -1 if none has been found.
	 */
	private int startDTDPos = -1;

	/**
	 * Index in inputBuffer just past the closing '>' of the <!DOCTYPE ...>
	 * element, or -1 if none has been found.
	 */
	private int endDTDPos = -1;

	/** Set to false once the underlying Reader has reported end of stream. */
	private boolean more = true;

	/** Filter mode; one of READ_ADD, REMOVE_MODIFY, REMOVE or NO_CHANGE. */
	protected byte mode = READ_ADD;

	/**
	 * Characters read from the underlying Reader that have not yet been handed
	 * over to localBuffer.
	 */
	protected StringBuilder inputBuffer = null;

	/**
	 * Optional callback told about the identifier found in the doctype; may be
	 * null.
	 */
	protected DTDListener listener = null;

	/**
	 * Creates a filter over the given Reader that treats the <!DOCTYPE ...>
	 * element according to the given mode:
	 * <ul>
	 * <li>READ_ADD - read the existing <!DOCTYPE ...> element, or add one with
	 * a SYSTEM identifier of CUSTOM_DTD when the document has none;</li>
	 * <li>REMOVE_MODIFY - update any existing doctype declaration so that it has
	 * no SYSTEM/PUBLIC identifier;</li>
	 * <li>REMOVE - drop the <!DOCTYPE ...> element from the document stream;</li>
	 * <li>NO_CHANGE - leave the document untouched.</li>
	 * </ul>
	 * 
	 * @param r
	 *          Reader to filter
	 * @param mode
	 *          either READ_ADD, REMOVE, REMOVE_MODIFY, or NO_CHANGE
	 */
	public DTDFilterReader(Reader r, byte mode) {
		super(r);
		// Start with an empty look-ahead buffer and a scanner that has seen
		// neither a tag nor a doctype yet.
		inputBuffer = new StringBuilder();
		startPos = startDTDPos = endDTDPos = -1;
		state = 0;
		more = true;
		this.mode = mode;
	}

	/**
	 * Reports whether the scanner is still looking for the doctype and the
	 * document element.
	 * 
	 * @return true while the scanner state is below the terminal state (16),
	 *         false once filtering is finished.
	 */
	protected boolean stillFiltering() {
		final int finishedState = 16;
		return state < finishedState;
	}

	/**
	 * Pulls data from the underlying Reader through parseInput() until
	 * localBuffer has grown by at least minCount characters or the underlying
	 * Reader is exhausted. When the end of the stream is reached, whatever is
	 * still held back in inputBuffer is flushed to localBuffer unchanged.
	 * 
	 * @param minCount
	 *          minimum number of characters that should be added to localBuffer
	 *          by this call
	 * @return true if more data can be read, false once the end of the
	 *         underlying stream has been reached
	 * @throws IOException
	 *           if reading or parsing fails
	 */
	protected boolean readUnderlying(int minCount) throws IOException {
		// Length localBuffer must reach before this call is satisfied
		final int wantedLength = localBuffer.length() + minCount;
		final char[] chunk = new char[512];

		while (more && localBuffer.length() < wantedLength) {
			int count = in.read(chunk, 0, chunk.length);
			if (count == -1) {
				// End of stream: nothing further will be filtered, so hand over
				// everything that was being held back.
				localBuffer.append(inputBuffer);
				inputBuffer.setLength(0);
				more = false;
				break;
			}
			// Scan the new data for <?, <!DOCTYPE, or the document element.
			inputBuffer.append(chunk, 0, count);
			parseInput();
		}

		return more;
	}

	/**
	 * Scans inputBuffer with a small state machine, copying prolog content to
	 * localBuffer and holding back the <!DOCTYPE ...> element (and everything
	 * after it) until the document element's start tag has been read, at which
	 * point handleDTD() is applied and the remainder is flushed.
	 * <p>
	 * States:
	 * <ul>
	 * <li>0 - outside any tag, no doctype seen yet; characters are passed
	 * through</li>
	 * <li>1 - just read '<'</li>
	 * <li>2 - inside a PI, xml declaration or comment; passed through up to the
	 * next '>'</li>
	 * <li>3 - just read "<!"</li>
	 * <li>4 - inside the document element's start tag</li>
	 * <li>5 to 10 - matching the letters "OCTYPE" after "<!D"</li>
	 * <li>11 - inside the doctype declaration</li>
	 * <li>12 - inside the doctype's internal subset ([ ... ])</li>
	 * <li>13 - outside any tag, after the doctype (content is held back)</li>
	 * <li>14 - just read '<' after the doctype</li>
	 * <li>15 - inside a PI or comment after the doctype (held back)</li>
	 * <li>16 - done; everything is passed through</li>
	 * </ul>
	 * Known limitation: a PI or comment is considered closed at its first '>',
	 * and the internal subset at its first ']'.
	 * 
	 * @throws IOException
	 *           if handleDTD() rejects the doctype declaration
	 */
	protected void parseInput() throws IOException {
		for (int pos = 0; pos < inputBuffer.length(); pos++) {
			char c = inputBuffer.charAt(pos);
			switch (state) {
			case 0: // Not in any tag
				if (c == '<') {
					// Opening of some tag. Don't add to the output just yet
					startPos = pos;
					state = 1;
				} else {
					// Add whatever it is (should be whitespace) to output
					localBuffer.append(c);
					inputBuffer.deleteCharAt(pos);
					pos = -1;
				}
				break;
			case 1: // In some (unknown tag)
				if (c == '?') {
					// Opening of either a PI or xml decl. Now the open bracket
					// and this char can go to the output.
					emitPendingTag(pos);
					pos = -1;
				} else if (c == '!') {
					// Opening of either a comment or DTD
					state = 3;
				} else {
					// Must be opening of the document element (by elimination)
					state = 4;
				}
				break;
			case 2: // In either a PI, xml decl, or comment tag
				// Add whatever it is to output
				localBuffer.append(c);
				inputBuffer.deleteCharAt(pos);
				pos = -1;
				if (c == '>') {
					// Back to looking for DTD and document element
					state = 0;
				}
				break;
			case 3: // In either a comment or DTD
				if (c == 'D') {
					// Opening of the DTD (most likely).
					state = 5;
				} else {
					// Opening of a comment; pass it through.
					emitPendingTag(pos);
					pos = -1;
				}
				break;
			case 4: // In the document element tag
				if (c == '>') {
					handleDTD(startPos, pos + 1);
					// Flush everything to output buffer
					localBuffer.append(inputBuffer.toString());
					inputBuffer.setLength(0);
					pos = -1;
					// Done filtering/scanning
					state = 16;
				}
				break;
			case 5:
			case 6:
			case 7:
			case 8:
			case 9:
			case 10: // Matching the remaining letters of "DOCTYPE"
				if (c == "OCTYPE".charAt(state - 5)) {
					// Still looks like a DTD; state 11 is reached after the 'E'.
					state++;
				} else {
					// Some other "<!D..." construct; pass it through.
					emitPendingTag(pos);
					pos = -1;
				}
				break;
			case 11: // In a DTD (for sure now)
				if (c == '[') {
					// DTD has internal subset
					state = 12;
				} else if (c == '>') {
					// Closing the DTD. Remember where it sits in inputBuffer.
					startDTDPos = startPos;
					endDTDPos = pos + 1;
					state = 13;
				}
				break;
			case 12: // In a DTD internal subset
				if (c == ']') {
					// Back to just DTD
					state = 11;
				}
				break;
			case 13: // Not in any tag (after finding the DTD)
				if (c == '<') {
					// Opening of some tag. Keep holding the data back.
					startPos = pos;
					state = 14;
				}
				break;
			case 14: // In some (unknown tag) (after finding the DTD)
				if (c == '?' || c == '!') {
					// Opening of a PI or a comment
					state = 15;
				} else {
					// Must be opening of the document element (by elimination)
					state = 4;
				}
				break;
			case 15: // In a PI or comment tag (after finding the DTD)
				if (c == '>') {
					// Tag closed. Stay in the "after the DTD" states: going back
					// to state 0 would emit following characters ahead of the
					// held-back doctype and rescan it with stale DTD positions.
					state = 13;
				}
				break;
			case 16: // No longer filtering
				// Nothing is held back any more; pass the rest straight through.
				localBuffer.append(inputBuffer);
				inputBuffer.setLength(0);
				pos = -1;
				break;
			}
		}
	}

	/**
	 * Moves the tag prefix held in inputBuffer from startPos up to and
	 * including pos over to localBuffer and switches to pass-through state 2.
	 * The caller must restart its scan at the beginning of inputBuffer.
	 * 
	 * @param pos
	 *          index in inputBuffer of the last character of the prefix
	 */
	private void emitPendingTag(int pos) {
		localBuffer.append(inputBuffer.substring(startPos, pos + 1));
		inputBuffer.delete(startPos, pos + 1);
		state = 2;
	}

	/**
	 * Applies the configured mode to the doctype held in inputBuffer once the
	 * document element's start tag has been located.
	 * <ul>
	 * <li>REMOVE - deletes the recorded <!DOCTYPE ...> element, if any.</li>
	 * <li>REMOVE_MODIFY - replaces the recorded <!DOCTYPE ...> element, if any,
	 * with the result of replaceId().</li>
	 * <li>READ_ADD - if a <!DOCTYPE ...> element was recorded it is passed to
	 * replaceId(); otherwise a new one naming the document element and the
	 * CUSTOM_DTD SYSTEM identifier is inserted in front of the document
	 * element.</li>
	 * <li>any other mode (including NO_CHANGE) - nothing is changed.</li>
	 * </ul>
	 * 
	 * @param startDocElem
	 *          index in inputBuffer of the '<' of the document element
	 * @param endDocElem
	 *          index in inputBuffer just past the '>' that closes the document
	 *          element's start tag
	 * @throws IOException
	 *           if replaceId() rejects the doctype declaration
	 */
	protected void handleDTD(int startDocElem, int endDocElem) throws IOException {
		final boolean doctypeFound = (startDTDPos != -1);
		switch (mode) {
		case REMOVE:
			if (doctypeFound) {
				inputBuffer.delete(startDTDPos, endDTDPos);
			}
			break;
		case REMOVE_MODIFY:
			if (doctypeFound) {
				String rewritten = replaceId(inputBuffer.substring(startDTDPos, endDTDPos));
				inputBuffer.replace(startDTDPos, endDTDPos, rewritten);
			}
			break;
		case READ_ADD:
			if (doctypeFound) {
				// The returned declaration is not written back to inputBuffer.
				// NOTE(review): presumably the call is only wanted for the
				// listener notification it performs - confirm.
				replaceId(inputBuffer.substring(startDTDPos, endDTDPos));
			} else {
				String rootName = readElementName(inputBuffer.substring(startDocElem, endDocElem));
				inputBuffer.insert(startDocElem, "<!DOCTYPE " + rootName + " SYSTEM \"" + CUSTOM_DTD + "\">");
			}
			break;
		default:
			// NO_CHANGE (or an unknown mode): leave the document as it is
			break;
		}
	}

	/**
	 * Reads the next xml element name from the specified string and returns it.
	 * Scanning starts right after the first '<' (or at the beginning of the
	 * string if there is none), skips any leading whitespace, and stops at the
	 * first whitespace, '>' or '/' that follows, so that tags without
	 * attributes ("<root>") and empty-element tags ("<root/>") yield just the
	 * name.
	 * 
	 * @param str
	 *          String to parse
	 * @return next xml element name; empty if no name is found
	 */
	protected String readElementName(String str) {
		StringBuilder name = new StringBuilder();
		for (int i = str.indexOf('<') + 1; i < str.length(); i++) {
			char c = str.charAt(i);
			if (Character.isWhitespace(c)) {
				if (name.length() > 0) {
					// Whitespace after the name ends it
					break;
				}
				// Leading whitespace before the name is skipped
				continue;
			}
			if (c == '>' || c == '/') {
				// End of the start tag: not part of the name
				break;
			}
			name.append(c);
		}
		return name.toString();
	}

	/**
	 * Rewrites the external identifier of the specified <!DOCTYPE ...> String
	 * according to the current mode and returns the updated String.
	 * <ul>
	 * <li>READ_ADD - the SYSTEM/PUBLIC identifier is replaced with
	 * <code>SYSTEM "CUSTOM_DTD"</code> (see {@link #CUSTOM_DTD}) and the
	 * listener, if set, is told about the literal that was replaced.</li>
	 * <li>REMOVE_MODIFY - the SYSTEM/PUBLIC keyword and its literal(s) are
	 * dropped.</li>
	 * </ul>
	 * A declaration without an external identifier is returned unchanged (after
	 * trimming).
	 * 
	 * @param str
	 *          <!DOCTYPE ...> String to alter
	 * @return altered <!DOCTYPE ...> String
	 * @throws IOException
	 *           if the declaration is malformed (unexpected token, missing or
	 *           unterminated quoted literal) or the mode is neither READ_ADD nor
	 *           REMOVE_MODIFY when an external identifier is present
	 */
	private String replaceId(String str) throws IOException {
		int localState = 0;
		boolean nameStarted = false;
		StringBuilder buffy = new StringBuilder();
		str = str.trim();
		for (int i = 0; i < str.length(); i++) {
			char c = str.charAt(i);
			switch (localState) {
			case 0: // Reading the <!DOCTYPE token
				if (Character.isWhitespace(c))
					localState = 1;
				buffy.append(c);
				break;
			case 1: // Skipping whitespace before the name, then reading the name
				if (!Character.isWhitespace(c)) {
					nameStarted = true;
				} else if (nameStarted) {
					// Only whitespace AFTER the name ends it; several whitespace
					// characters may separate <!DOCTYPE from the name.
					localState = 2;
				}
				buffy.append(c);
				break;
			case 2: // Reading either SYSTEM or PUBLIC or [ or >
				if (c == 'S') {
					if (!(str.length() > i + 6 && str.startsWith("SYSTEM", i))) {
						throw new IOException("Invalid doctype declaration. Expecting SYSTEM identifier: " + str);
					}
					int sysOpen = findOpeningQuote(str, i + 6, "SYSTEM");
					return rewriteExternalId(buffy, str, sysOpen, false);
				} else if (c == 'P') {
					if (!(str.length() > i + 6 && str.startsWith("PUBLIC", i))) {
						throw new IOException("Invalid doctype declaration. Expecting PUBLIC identifier: " + str);
					}
					// Skip over the first quoted string
					int firstOpen = findOpeningQuote(str, i + 6, "PUBLIC");
					int firstClose = str.indexOf(str.charAt(firstOpen), firstOpen + 1);
					if (firstClose == -1) {
						throw new IOException("Invalid doctype declaration. Unterminated PUBLIC literal: " + str);
					}
					// The second quoted string is the one replaced/removed
					int secondOpen = findOpeningQuote(str, firstClose + 1, "PUBLIC");
					return rewriteExternalId(buffy, str, secondOpen, true);
				} else if (c == '[' || c == '>') {
					// No external identifier; keep the rest untouched
					buffy.append(str.substring(i));
					return buffy.toString();
				} else if (Character.isWhitespace(c)) {
					buffy.append(c);
				} else {
					throw new IOException("Invalid doctype declaration. Expecting SYSTEM/PUBLIC identifier or " + "entity references, or ], or >: " + str);
				}
				break;
			default: // Should not reach here
				throw new IOException("Invalid doctype declaration. Expecting SYSTEM/PUBLIC identifier or " + "entity references, or ], or >: " + str);
			}
		}
		return buffy.toString();
	}

	/**
	 * Returns the index of the quote character (' or ") that opens the next
	 * literal, skipping whitespace only.
	 * 
	 * @param str
	 *          doctype declaration being parsed
	 * @param from
	 *          index at which to start looking
	 * @param keyword
	 *          "SYSTEM" or "PUBLIC", used in the error message
	 * @return index of the opening quote
	 * @throws IOException
	 *           if anything other than whitespace precedes the quote, or no
	 *           quote is found
	 */
	private static int findOpeningQuote(String str, int from, String keyword) throws IOException {
		for (int x = from; x < str.length(); x++) {
			char c = str.charAt(x);
			if (c == '\'' || c == '"') {
				return x;
			}
			if (!Character.isWhitespace(c)) {
				break;
			}
		}
		throw new IOException("Invalid doctype declaration. Expecting quoted " + keyword + " literal: " + str);
	}

	/**
	 * Finishes the rewrite once the opening quote of the literal to replace or
	 * remove has been located: appends the replacement (READ_ADD) or nothing
	 * (REMOVE_MODIFY) to buffy, followed by everything after the closing quote.
	 * 
	 * @param buffy
	 *          output collected so far (everything before the SYSTEM/PUBLIC
	 *          keyword)
	 * @param str
	 *          doctype declaration being parsed
	 * @param openQuote
	 *          index in str of the literal's opening quote
	 * @param reportAsPublicId
	 *          true to pass the literal to the listener as the public id, false
	 *          to pass it as the system id
	 * @return the rewritten declaration
	 * @throws IOException
	 *           if the mode is not READ_ADD or REMOVE_MODIFY, or the literal has
	 *           no closing quote
	 */
	private String rewriteExternalId(StringBuilder buffy, String str, int openQuote, boolean reportAsPublicId) throws IOException {
		if (mode != READ_ADD && mode != REMOVE_MODIFY) {
			// Should never reach here
			throw new IOException("Invalid DTD Filter mode: " + mode + ". Expecting READ_ADD (" + READ_ADD + ") or REMOVE_MODIFY (" + REMOVE_MODIFY + ").");
		}

		char quote = str.charAt(openQuote);
		int closeQuote = str.indexOf(quote, openQuote + 1);
		if (closeQuote == -1) {
			// Without this check substring() would fail with an unchecked
			// exception (READ_ADD) or the whole declaration would be appended a
			// second time (REMOVE_MODIFY).
			throw new IOException("Invalid doctype declaration. Unterminated " + (reportAsPublicId ? "PUBLIC" : "SYSTEM") + " literal: " + str);
		}

		if (mode == READ_ADD) {
			// Replace and read
			String existingId = str.substring(openQuote + 1, closeQuote);
			if (this.listener != null) {
				if (reportAsPublicId) {
					// NOTE(review): for a PUBLIC identifier this is the SECOND
					// quoted literal (the system literal in the XML grammar) and
					// the first literal is discarded. Kept as-is because
					// listeners outside this file may rely on it - confirm.
					this.listener.setDTD(existingId, null);
				} else {
					this.listener.setDTD(null, existingId);
				}
			}
			buffy.append("SYSTEM ");
			buffy.append(quote);
			buffy.append(CUSTOM_DTD);
			buffy.append(quote);
		}
		// For REMOVE_MODIFY nothing is written in place of the identifier.
		buffy.append(str.substring(closeQuote + 1));
		return buffy.toString(); // Done!
	}

	/**
	 * Registers the callback that is told about the identifier found in the
	 * document's <!DOCTYPE ...> element. Replaces any listener set earlier.
	 * 
	 * @param list
	 *          DTDListener for this DTDFilterReader
	 */
	public void setListener(DTDListener list) {
		listener = list;
	}

	/**
	 * Callback implemented by objects that want to be told about the DTD
	 * identifiers found in the filtered xml document.
	 * 
	 * @author mattj
	 * 
	 */
	public interface DTDListener {
		/**
		 * Receives the public id and system id read from the <!DOCTYPE ...>
		 * element.
		 * 
		 * @param publicId
		 *          public id from the <!DOCTYPE ...> element
		 * @param systemId
		 *          system id from the <!DOCTYPE ...> element
		 */
		void setDTD(String publicId, String systemId);
	}
}