/*
* (C) Copyright IBM Corp. 2013
*
* LICENSE: Eclipse Public License v1.0
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.ibm.gaiandb.webservices.scanner.sax;
import java.io.IOException;
import java.io.InputStream;
/**
* The purpose of this class is to generate an XHTML InputStream from an HTML InputStream.
*
* @author remi - IBM Hursley
*
*/
public class HTMLFilterInputStream extends InputStream {

    // Overview: this stream wraps another InputStream containing HTML and, while
    // it is being read byte by byte, applies three filters:
    //   1. tags whose name is listed through setTagContentsToRemove() are dropped
    //      together with everything up to (and including) their end tag;
    //   2. tags whose name is listed through setTagsToRemove() are dropped (the tag
    //      itself only, from '<' to the next '>');
    //   3. every '&' which does not start one of the five predefined XML entities
    //      is rewritten as the "amp" entity.
    // Bytes are handled one by one and mapped to the char of the same value, so
    // any byte which is not part of a filtered construct is passed through as is.

    // ----------------------------------------------------------------------------------
    // ----------------------------------------------------------------------- ATTRIBUTES

    // =========================================================================== Public
    // Use PROPRIETARY notice if class contains a main() method, otherwise use
    // COPYRIGHT notice.
    public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2013";

    // ========================================================================== Private
    /** Value used by {@link InputStream#read()} to signal the end of the stream. */
    private static final int END_OF_STREAM = -1;

    /**
     * The five entities predefined by XML, without their leading ampersand.
     */
    private static final String[] XML_ENTITY_BODIES = {"amp;", "lt;", "gt;", "quot;", "apos;"};

    /** Number of characters of the longest entry of {@link #XML_ENTITY_BODIES}. */
    private static final int ENTITY_LOOKAHEAD = 5;

    /** The stream providing the HTML to filter. */
    private final InputStream inputStreamToFilter;

    /**
     * Characters which have been read ahead from the source (or generated by a
     * filter) and have to be processed again before reading the source further.
     * Only the part starting at {@link #pushedBackIndex} is still to be read.
     */
    private final StringBuilder pushedBack = new StringBuilder();

    /** Index in {@link #pushedBack} of the next character to hand out. */
    private int pushedBackIndex = 0;

    /** Set once the source reported its end, so that it is never read again. */
    private boolean endReached = false;

    private String[] tagsContentToRemove = new String[0];
    private String[] tagsToRemove = new String[0];

    // ----------------------------------------------------------------------------------
    // -------------------------------------------------------------------------- METHODS

    // ===================================================================== Constructors
    /**
     * Creates a filter reading its data from the given stream.
     *
     * @param inputStreamToFilter
     *            the stream containing the HTML to filter.
     */
    public HTMLFilterInputStream(InputStream inputStreamToFilter) {
        super();
        this.inputStreamToFilter = inputStreamToFilter;
    }

    // =========================================================================== Public
    /**
     * Defines the tags which have to be removed together with their content.
     * Names are compared without regard to case.
     *
     * @param contents
     *            names of the tags (without brackets); null means "none".
     */
    public void setTagContentsToRemove(String[] contents) {
        this.tagsContentToRemove = (contents == null) ? new String[0] : contents.clone();
    }

    /**
     * Defines the tags which have to be removed, leaving what follows them
     * untouched. Names are compared without regard to case.
     *
     * @param tagNames
     *            names of the tags (without brackets); null means "none".
     */
    public void setTagsToRemove(String[] tagNames) {
        this.tagsToRemove = (tagNames == null) ? new String[0] : tagNames.clone();
    }

    /**
     * Returns the next byte of the filtered content.
     *
     * @return a value between 0 and 255, or -1 once the end of the source
     *         stream has been reached.
     * @throws IOException if any exceptions occur while reading the source.
     */
    @Override
    public int read() throws IOException {
        // A loop (rather than a recursive call) so that any number of
        // consecutive removed tags can be skipped without growing the stack.
        while (true) {
            int current = this.nextRaw();
            if (current == '&') {
                this.pushBackAmpersandTail();
                return current;
            }
            if (current != '<') {
                // Ordinary byte, or END_OF_STREAM.
                return current;
            }
            if (!this.consumeTagIfRemoved()) {
                // Normal tag: only the '<' is returned here, the remaining
                // characters have been pushed back and will be read next.
                return current;
            }
        }
    }

    /**
     * Closes the source stream.
     *
     * @throws IOException if the source stream fails to close.
     */
    @Override
    public void close() throws IOException {
        this.inputStreamToFilter.close();
        super.close();
    }

    // ========================================================================== Private
    /**
     * Provides the next character to process: a pushed back one if any,
     * otherwise the next byte of the source.
     *
     * @return a value between 0 and 255, or END_OF_STREAM.
     * @throws IOException if any exceptions occur while reading the source.
     */
    private int nextRaw() throws IOException {
        if (this.pushedBackIndex < this.pushedBack.length()) {
            return this.pushedBack.charAt(this.pushedBackIndex++);
        }
        if (this.pushedBackIndex > 0) {
            this.pushedBack.setLength(0);
            this.pushedBackIndex = 0;
        }
        if (this.endReached) {
            return END_OF_STREAM;
        }
        int value = this.inputStreamToFilter.read();
        if (value < 0) {
            this.endReached = true;
            return END_OF_STREAM;
        }
        return value;
    }

    /**
     * Puts characters back in front of what remains to be processed, so that
     * the following calls to nextRaw() return them first.
     *
     * @param chars
     *            the characters to process again, in reading order.
     */
    private void unread(CharSequence chars) {
        this.pushedBack.delete(0, this.pushedBackIndex);
        this.pushedBackIndex = 0;
        this.pushedBack.insert(0, chars);
    }

    /**
     * Has to be called just after a '<' has been read. Reads the name of the
     * tag and, if this tag has to be filtered out, consumes everything which
     * has to disappear. Otherwise everything read is pushed back.
     *
     * @return true if the tag (and possibly its content) has been removed,
     *         false if the '<' has to be returned to the reader.
     * @throws IOException if any exceptions occur while reading the source.
     */
    private boolean consumeTagIfRemoved() throws IOException {
        int current = this.nextRaw();
        if (current == '/') {
            // End tags are never removed on their own.
            this.unread("/");
            return false;
        }

        // Everything read after the '<', to be restored if the tag is kept.
        StringBuilder consumed = new StringBuilder();
        while (isWhitespace(current)) {
            consumed.append((char) current);
            current = this.nextRaw();
        }
        StringBuilder name = new StringBuilder();
        while (current != END_OF_STREAM && current != '>' && !isWhitespace(current)) {
            consumed.append((char) current);
            name.append((char) current);
            current = this.nextRaw();
        }
        if (current != END_OF_STREAM) {
            consumed.append((char) current);
        }

        String tagName = name.toString();
        if (containsIgnoreCase(this.tagsContentToRemove, tagName)) {
            this.ignoreUntilEndTag(tagName);
            return true;
        }
        if (containsIgnoreCase(this.tagsToRemove, tagName)) {
            this.ignoreCurrentTag(current);
            return true;
        }
        this.unread(consumed);
        return false;
    }

    /**
     * Consumes the characters to process until the end of the end tag
     * which name is given as a parameter, or until the end of the stream.
     *
     * @param endTag
     *            end tag name delimiting when the ignoring has to stop.
     * @throws IOException if any exceptions occur while reading the source.
     */
    private void ignoreUntilEndTag(String endTag) throws IOException {
        int current = this.nextRaw();
        while (current != END_OF_STREAM) {
            if (current != '<') {
                current = this.nextRaw();
                continue;
            }
            current = this.nextRaw();
            if (current != '/') {
                // Not an end tag; the character just read is examined by the
                // next iteration since it can itself be a '<'.
                continue;
            }
            // Removes whites before the name
            current = this.nextRaw();
            while (isWhitespace(current)) {
                current = this.nextRaw();
            }
            // Gets the name
            StringBuilder name = new StringBuilder();
            while (current != END_OF_STREAM && current != '>' && !isWhitespace(current)) {
                name.append((char) current);
                current = this.nextRaw();
            }
            if (name.toString().equalsIgnoreCase(endTag)) {
                // Ignores everything until (including) the end of the end tag
                this.ignoreCurrentTag(current);
                return;
            }
            // If it is not the right tag that it is closing, keep going!
        }
    }

    /**
     * Consumes the characters to process until the '>' closing the current
     * tag, or until the end of the stream.
     *
     * @param lastRead
     *            last value read before the call of this method; nothing is
     *            consumed if it already is '>' or END_OF_STREAM.
     * @throws IOException if any exceptions occur while reading the source.
     */
    private void ignoreCurrentTag(int lastRead) throws IOException {
        while (lastRead != END_OF_STREAM && lastRead != '>') {
            lastRead = this.nextRaw();
        }
    }

    /**
     * Has to be called just after a '&' has been read. Looks at the following
     * characters: if they do not complete one of the predefined XML entities,
     * "amp;" is inserted before them so that the '&' becomes a valid XML
     * entity. In both cases the characters looked at are pushed back, which
     * means that they go through the filters like any other character.
     *
     * @throws IOException if any exceptions occur while reading the source.
     */
    private void pushBackAmpersandTail() throws IOException {
        StringBuilder lookahead = new StringBuilder(ENTITY_LOOKAHEAD + 4);
        while (lookahead.length() < ENTITY_LOOKAHEAD) {
            int next = this.nextRaw();
            if (next == END_OF_STREAM) {
                break;
            }
            lookahead.append((char) next);
        }
        if (!startsWithXmlEntityBody(lookahead.toString())) {
            // "amp;" contains neither '&' nor '<': it is safe to let it go
            // through the filters again.
            lookahead.insert(0, "amp;");
        }
        this.unread(lookahead);
    }

    /**
     * Tells if the given text starts with the body of one of the predefined
     * XML entities, ignoring the case as the names of the tags are.
     *
     * @param text
     *            the characters following a '&'.
     * @return true if text starts with "amp;", "lt;", "gt;", "quot;" or "apos;".
     */
    private static boolean startsWithXmlEntityBody(String text) {
        for (String body : XML_ENTITY_BODIES) {
            if (text.regionMatches(true, 0, body, 0, body.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells if one of the given names equals the given tag name, ignoring the
     * case. Null entries never match.
     */
    private static boolean containsIgnoreCase(String[] names, String tagName) {
        for (String candidate : names) {
            if (tagName.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells if the given value is one of the characters separating the name
     * of a tag from what surrounds it.
     */
    private static boolean isWhitespace(int value) {
        return value == ' ' || value == '\t' || value == '\r'
                || value == '\n' || value == '\f';
    }
}