HTMLFilterInputStream.java example

Explorer

gaiandb-master
- java
  - Asset
  - IntegrationSamples
    - com
      - ibm
        gaiandb
        distributed
        SampleProcedures.java
        SampleUDFs.java
        embedding
        SampleApplicationEmbeddingGaian.java
        policy
        SamplePolicyForClusterPrivacy.java
        SamplePolicyNoFilters.java
        vti
        SampleHelloWorldVTI.java
        SamplePluralizedVTI.java
  - PolicyPlugins
    - com
      - ibm
        gaiandb
        plugins
        dsm
        GaianNeighbourProvider.java
        OpinionsBasedRouterDSM.java
        wpml
        GenericPolicyPluginForWPML.java
        PerfTestingPlugin.java
        PolicyEnabledFilter.java
        Util.java
        schema
        AccessLogger.java
        Column.java
        DataSource.java
        IColumn.java
        IRow.java
        QueryContext.java
        RMG_specific_access.java
        Row.java
  - Prototype
    - Connectors
      - com
        ibm
        gaiandb
        clientseeker
        AbstractSeeker.java
        GaianRegistryDescriptorImpl.java
        NetworkUtils.java
        RegistryDescriptor.java
        RegistrySeeker.java
        ResourceNotFoundException.java
        jdbc
        discoveryclient
        ConnectionPoolDataSource.java
        DiscoveryDriver.java
        GaianConnectionSeeker.java
        GaianPooledConnection.java
        lite
        LiteConnection.java
        LiteDriver.java
        LiteGaianStatement.java
        LiteParameterMetaData.java
        LitePreparedStatement.java
        TableResultExtractor.java
        webserver
        WebServer.java
        windowsservice
        WindowsService.java
    - SVA-Security
      - com
        ibm
        gaiandb
        security
        client
        GSSPrivilegedAction.java
        GSSPrivilegedDelegateAction.java
        JavaAuth.java
        common
        KerberosToken.java
        SecurityToken.java
        server
        GSSServerPrivilegedAction.java
        authn
        KerberosUserAuthenticator.java
        TokenUserAuthenticator.java
        TrustedUserAuthenticator.java
    - UDPDriver
      - com
        ibm
        gaiandb
        udpdriver
        client
        ClientListener.java
        UDPConnection.java
        UDPDriver.java
        UDPDriverClientException.java
        UDPParameterMetaData.java
        UDPPreparedStatement.java
        UDPResultSet.java
        UDPResultSetMetaData.java
        UDPStatement.java
        ValuesDecoder.java
        common
        RowsFilter.java
        SocketHelper.java
        Util.java
        protocol
        CloseRequest.java
        ExecuteQueryRequest.java
        ExecuteQueryResponse.java
        Message.java
        MessageFactory.java
        MetaData.java
        NextValuesRequest.java
        NextValuesResponse.java
        QueryRequest.java
        ResponseWithValues.java
        UDPProtocolException.java
        server
        ClientState.java
        ConnectionManager.java
        RunnableWorker.java
        ServerListener.java
        SpecificByteArrayOutputStream.java
        UDPDriverServer.java
        UDPDriverServerException.java
        ValuesEncoder.java

/*
 * (C) Copyright IBM Corp. 2013
 *
 * LICENSE: Eclipse Public License v1.0
 * http://www.eclipse.org/legal/epl-v10.html
 */
package com.ibm.gaiandb.webservices.scanner.sax;

import java.io.IOException;
import java.io.InputStream;

/**
 * Generates an XHTML-friendly InputStream from a HTML InputStream.
 * <p>
 * The stream is filtered on the fly, one byte at a time:
 * <ul>
 * <li>tags registered with {@link #setTagsToRemove(String[])} are dropped
 * (from their {@code '<'} up to the next {@code '>'});</li>
 * <li>tags registered with {@link #setTagContentsToRemove(String[])} are
 * dropped together with everything up to and including their end tag;</li>
 * <li>every {@code '&'} which does not start one of the five predefined XML
 * entities ({@code &amp;}, {@code &lt;}, {@code &gt;}, {@code &quot;},
 * {@code &apos;}) is replaced by {@code &amp;}.</li>
 * </ul>
 * Each byte of the source is handled as one character. Bytes above 0x7F never
 * match any of the ASCII characters the filter looks for, so they are passed
 * through unchanged.
 * <p>
 * Instances are not thread-safe.
 *
 * @author remi - IBM Hursley
 *
 */
public class HTMLFilterInputStream extends InputStream {

	// ----------------------------------------------------------------------------------
	// ----------------------------------------------------------------------- ATTRIBUTES

	// =========================================================================== Public
	// --------------------------------------------------------------------------- Static

	// Use PROPRIETARY notice if class contains a main() method, otherwise use
	// COPYRIGHT notice.
	public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2013";

	// ========================================================================== Private
	// --------------------------------------------------------------------------- Static

	/**
	 * Used as a default value when the index going through the buffer doesn't
	 * point to any values.
	 */
	private static final int NO_VALUES_TO_READ = -1;

	/** Value returned by {@link InputStream#read()} at the end of the stream. */
	private static final int END_OF_STREAM = -1;

	/** The five entities predefined by XML; they must not be escaped again. */
	private static final String[] XML_ENTITIES =
			{ "&amp;", "&lt;", "&gt;", "&quot;", "&apos;" };

	/**
	 * Number of characters to look at after a '&' in order to recognise the
	 * longest predefined entity (6 characters, '&' included).
	 */
	private static final int ENTITY_LOOKAHEAD = 5;

	// -------------------------------------------------------------------------- Dynamic

	private InputStream inputStreamToFilter;

	/** Already filtered values, waiting to be returned by read(). */
	private byte[] valuesToRead;

	/** Index of the next value to return in valuesToRead, or NO_VALUES_TO_READ. */
	private int indexValueToRead;

	/**
	 * Raw characters which have been read ahead from the source and which still
	 * have to go through the filter. They are consumed before the source itself.
	 */
	private final StringBuilder pushedBack = new StringBuilder();

	/** Set once the source has returned END_OF_STREAM. */
	private boolean sourceExhausted = false;

	private String[] tagsContentToRemove = new String[0];

	private String[] tagsToRemove = new String[0];

	// ----------------------------------------------------------------------------------
	// -------------------------------------------------------------------------- METHODS

	// ===================================================================== Constructors
	// --------------------------------------------------------------------------- Public

	/**
	 * Builds a filter reading from the given stream.
	 * @param inputStreamToFilter
	 * 			HTML stream to filter. It is closed when this stream is closed.
	 */
	public HTMLFilterInputStream(InputStream inputStreamToFilter) {
		super();
		this.inputStreamToFilter = inputStreamToFilter;
		this.valuesToRead = new byte[0];
		this.indexValueToRead = NO_VALUES_TO_READ;
	}

	// =========================================================================== Public
	// -------------------------------------------------------------------------- Dynamic

	/**
	 * Defines the tags which have to be removed with all their content.
	 * @param contents
	 * 			names of the tags, compared ignoring case; null means none.
	 */
	public void setTagContentsToRemove(String[] contents) {
		this.tagsContentToRemove = (contents == null) ? new String[0] : contents;
	}

	/**
	 * Defines the tags which have to be removed (the tag only, not what follows).
	 * @param tagNames
	 * 			names of the tags, compared ignoring case; null means none.
	 */
	public void setTagsToRemove(String[] tagNames) {
		this.tagsToRemove = (tagNames == null) ? new String[0] : tagNames;
	}

	/**
	 * Returns the next filtered byte.
	 * @return a value between 0 and 255, or -1 at the end of the stream.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	@Override
	public int read() throws IOException {
		return this.readAndFilter();
	}

	/**
	 * Closes the underlying stream.
	 * @throws IOException if the underlying stream cannot be closed.
	 */
	@Override
	public void close() throws IOException {
		this.inputStreamToFilter.close();
		super.close();
	}

	// ========================================================================== Private
	// -------------------------------------------------------------------------- Dynamic

	/**
	 * Returns the next value of the filtered stream: either the next buffered
	 * value, or the next value of the source once tags and '&' have been handled.
	 * Written as a loop (and not recursively) so that a long run of removed tags
	 * cannot exhaust the stack.
	 * @return a value between 0 and 255, or END_OF_STREAM.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private int readAndFilter() throws IOException {
		while (true) {
			// Values which have already been filtered always go out first
			if (this.indexValueToRead != NO_VALUES_TO_READ) {
				return this.nextBufferedValue();
			}

			int current = this.readRaw();
			if (current == '<') {
				// Either fills the buffer or silently drops the tag
				this.filterTag();
			}
			else if (current == '&') {
				// Always fills the buffer
				this.filterCharAnd();
			}
			else {
				// Ordinary value, or END_OF_STREAM
				return current;
			}
		}
	}

	/**
	 * Pops the next value of the buffer. Must only be called when the buffer
	 * is not empty.
	 * @return the buffered value as an unsigned byte (0 to 255).
	 */
	private int nextBufferedValue() {
		// Mask the sign: read() must never return a negative value for data
		int ret = this.valuesToRead[this.indexValueToRead] & 0xFF;

		this.indexValueToRead++;
		if (this.indexValueToRead >= this.valuesToRead.length) {
			this.valuesToRead = new byte[0];
			this.indexValueToRead = NO_VALUES_TO_READ;
		}
		return ret;
	}

	/**
	 * Stores already filtered characters so that the next calls of read()
	 * return them. Every character holds one byte of the source, so a plain
	 * cast restores the original byte whatever the platform charset is.
	 * @param filtered
	 * 			characters to return next.
	 */
	private void loadBuffer(CharSequence filtered) {
		byte[] values = new byte[filtered.length()];
		for (int i = 0; i < values.length; i++) {
			values[i] = (byte) filtered.charAt(i);
		}
		this.valuesToRead = values;
		this.indexValueToRead = (values.length == 0) ? NO_VALUES_TO_READ : 0;
	}

	/**
	 * Reads the next unfiltered value: first from the characters which have been
	 * pushed back, then from the source.
	 * @return a value between 0 and 255, or END_OF_STREAM.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private int readRaw() throws IOException {
		if (this.pushedBack.length() > 0) {
			char next = this.pushedBack.charAt(0);
			this.pushedBack.deleteCharAt(0);
			return next;
		}
		if (this.sourceExhausted) {
			return END_OF_STREAM;
		}
		int value = this.inputStreamToFilter.read();
		if (value == END_OF_STREAM) {
			this.sourceExhausted = true;
		}
		return value;
	}

	/**
	 * Handles a tag whose '<' has just been read: scans its name, then either
	 * drops it (tag to remove), drops it with its content (tag which content
	 * has to be removed) or buffers what has been read so far (normal tag).
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private void filterTag() throws IOException {
		StringBuilder valueToReadIfNormalTag = new StringBuilder();
		valueToReadIfNormalTag.append('<');

		// --- An end tag is never a tag to remove: output it as it is
		int current = this.readRaw();
		if (current == '/') {
			valueToReadIfNormalTag.append('/');
			this.loadBuffer(valueToReadIfNormalTag);
			return;
		}

		// --- Scans the name of the tag
		// Whites before the name
		while (isWhite(current)) {
			valueToReadIfNormalTag.append((char) current);
			current = this.readRaw();
		}

		// Name of the tag: everything until a white, '>' or the end of the stream
		StringBuilder nameTag = new StringBuilder();
		while (current != END_OF_STREAM && current != '>' && !isWhite(current)) {
			valueToReadIfNormalTag.append((char) current);
			nameTag.append((char) current);
			current = this.readRaw();
		}
		if (current != END_OF_STREAM) {
			valueToReadIfNormalTag.append((char) current);
		}
		String currentTagName = nameTag.toString();

		// --- If it is a tag which content has to be removed
		if (containsIgnoreCase(this.tagsContentToRemove, currentTagName)) {
			this.ignoreUntilEndTag(currentTagName);
			return;
		}

		// --- Else, if it is the name of a tag to remove
		if (containsIgnoreCase(this.tagsToRemove, currentTagName)) {
			this.ignoreCurrentTag(current);
			return;
		}

		// --- Else it is a normal tag: what has been read is output unchanged
		this.loadBuffer(valueToReadIfNormalTag);
	}

	/**
	 * Reads the source until the end of the end tag which name is given as a
	 * parameter, or until the end of the stream if the tag is never closed.
	 * @param endTag
	 * 			end tag name delimiting when the ignoring has to stop.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private void ignoreUntilEndTag(String endTag) throws IOException {

		int stamp = this.readRaw();

		while (stamp != END_OF_STREAM) {
			// Does nothing until it finds a '<'
			while (stamp != END_OF_STREAM && stamp != '<') {
				stamp = this.readRaw();
			}
			if (stamp == END_OF_STREAM) {
				return;
			}
			stamp = this.readRaw();

			// Only closing tags are of interest
			if (stamp != '/') {
				continue;
			}

			// Whites before the name
			stamp = this.readRaw();
			while (isWhite(stamp)) {
				stamp = this.readRaw();
			}

			// Gets the name
			StringBuilder nameTag = new StringBuilder();
			while (stamp != END_OF_STREAM && stamp != '>' && !isWhite(stamp)) {
				nameTag.append((char) stamp);
				stamp = this.readRaw();
			}

			// Checks if it closes the tag we are ignoring
			if (nameTag.toString().equalsIgnoreCase(endTag)) {
				// Ignores everything until (including) the end of the tag
				this.ignoreCurrentTag(stamp);
				return;
			}
			// If it is not the right tag that it is closing, keep going!
		}
	}

	/**
	 * Reads the source until the end of the current tag ('>' included), or
	 * until the end of the stream.
	 * @param lastReadChar
	 * 			last value read before the call of this method.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private void ignoreCurrentTag(int lastReadChar) throws IOException {
		while (lastReadChar != END_OF_STREAM && lastReadChar != '>') {
			lastReadChar = this.readRaw();
		}
	}

	/**
	 * Handles a '&' which has just been read. If it starts one of the
	 * predefined XML entities (compared ignoring case), the entity is output as
	 * it is. Otherwise the '&' is illegal in XML and "&amp;" is output instead.
	 * In both cases the characters read ahead which are not part of the output
	 * are pushed back, so that they go through the filter like any other.
	 * @throws IOException if the underlying stream cannot be read.
	 */
	private void filterCharAnd() throws IOException {

		// Reads ahead enough characters to recognise the longest entity
		StringBuilder candidate = new StringBuilder();
		candidate.append('&');
		for (int i = 0; i < ENTITY_LOOKAHEAD; i++) {
			int next = this.readRaw();
			if (next == END_OF_STREAM) {
				break;
			}
			candidate.append((char) next);
		}
		String readAhead = candidate.toString();

		for (String entity : XML_ENTITIES) {
			// regionMatches: case-insensitive and independent of the locale
			if (readAhead.regionMatches(true, 0, entity, 0, entity.length())) {
				this.loadBuffer(readAhead.substring(0, entity.length()));
				this.pushedBack.insert(0, readAhead.substring(entity.length()));
				return;
			}
		}

		// Illegal '&': escape it and filter again what has been read after it
		this.loadBuffer("&amp;");
		this.pushedBack.insert(0, readAhead.substring(1));
	}

	// --------------------------------------------------------------------------- Static

	/**
	 * @param value
	 * 			value to test.
	 * @return true if the value is a space, tab, carriage return, line feed or
	 * form feed.
	 */
	private static boolean isWhite(int value) {
		return value == ' ' || value == '\t' || value == '\r'
				|| value == '\n' || value == '\f';
	}

	/**
	 * @param names
	 * 			registered tag names (null elements are ignored).
	 * @param tagName
	 * 			name of the tag which has just been scanned.
	 * @return true if tagName equals, ignoring case, one of the names.
	 */
	private static boolean containsIgnoreCase(String[] names, String tagName) {
		for (String name : names) {
			if (tagName.equalsIgnoreCase(name)) {
				return true;
			}
		}
		return false;
	}

}