tagProcessor.java example

Explorer
openbd-core-master
- src
/* 
 *  Copyright (C) 2000 - 2008 TagServlet Ltd
 *
 *  This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
 *  
 *  OpenBD is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  Free Software Foundation,version 3.
 *  
 *  OpenBD is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with OpenBD.  If not, see http://www.gnu.org/licenses/
 *  
 *  Additional permission under GNU GPL version 3 section 7
 *  
 *  If you modify this Program, or any covered work, by linking or combining 
 *  it with any of the JARS listed in the README.txt (or a modified version of 
 *  (that library), containing parts covered by the terms of that JAR, the 
 *  licensors of this Program grant you additional permission to convey the 
 *  resulting work. 
 *  README.txt @ http://www.openbluedragon.org/license/README.txt
 *  
 *  http://www.openbluedragon.org/
 */

package com.nary.net.http;


/**
 * tagProcessor
 *
 * this class, given a url and port, will resolve urls in tags
 * passed to it when a tagFilterInputStream instance calls process tag.
 * Note this does not convert all urls. Only the following attributes are resolved:
 *    - img src 
 *    - a href 
 *    - form action 
 *    - applet code 
 *    - script src 
 *    - embed src 
 *    - embed pluginspace 
 *    - body background 
 *    - frame src 
 *    - bgsound src 
 *    - object data 
 *    - object classid 
 *    - object codebase 
 *    - object usemap 
 *
 */


import java.io.ByteArrayOutputStream;

import com.nary.net.tagFilterInputStream;
import com.nary.net.tagListener;
import com.nary.util.byteArray;


public class tagProcessor implements tagListener{ 
  private ByteArrayOutputStream wordStream;

  private final static byte DEFAULT=0, BASE=1;

  private String sourceURL;
  private String baseURL = null;

  private final static int capsToSmallGap = (int)'a' - (int)'A';
  // for use in reading in and coverting tags
  private ByteArrayOutputStream temp;
  private byte [] buffer;
  private int bufferAt;

	private UrlLinkResolver urlutils;

  public tagProcessor( String _url, int _port ){
    // sourceURL must be absolute
    sourceURL = _url;
    // convert the url so that it contains all /'s as opposed to \'s,
    // and so that it ends with a '/'
    sourceURL.replace('\\', '/');
    // ensure url ends with a /
    if (sourceURL.lastIndexOf('/') <= 7){
      sourceURL += "/";
    }
    if (!sourceURL.endsWith("/")){
      sourceURL = sourceURL.substring(0, sourceURL.lastIndexOf("/") + 1);
    }

    int thirdSlashIndex = sourceURL.indexOf('/', 7);
    // if a port number isn't given in the url, add it in
    if (sourceURL.indexOf(':', sourceURL.indexOf(':') + 1) == -1){
      sourceURL = sourceURL.substring(0, thirdSlashIndex) + ":" + _port + sourceURL.substring(thirdSlashIndex);
    }

    temp 				= new ByteArrayOutputStream();
    wordStream 	= new ByteArrayOutputStream();
    urlutils		= new UrlLinkResolver();
  }//tagProcessor()
  

  public byte[] processTag( byte [] _tag ) {
    buffer = _tag;
    bufferAt = 0;

    // reset stream used for copying new tag into
    temp.reset();

    // notes: first byte should be a '<', last byte should be a '>' (might not be LATER)
    bufferAt = 0;

    // get the '<'
    temp.write(buffer[bufferAt]);
    bufferAt++;

    skipWhitespace();

    byte [] firstWord = getNextWord();

    try{
      temp.write(firstWord);
    }catch(java.io.IOException ignored){}


    // if A
    if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'A'})){
      // get href
      processRestOfTag(new byte[][]{{(byte)'h',(byte)'r',(byte)'e',(byte)'f'}}, DEFAULT);
    }

    // if form
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'F',(byte)'O',(byte)'R',(byte)'M'})){
      // get action
      processRestOfTag(new byte[][]{{(byte)'a',(byte)'c',(byte)'t',(byte)'i',(byte)'o',(byte)'n'}}, DEFAULT);
    }

    // if embed
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'E',(byte)'M',(byte)'B',(byte)'E',(byte)'D'})){
      // get pluginspace, src
      processRestOfTag(new byte[][]{{(byte)'p',(byte)'l',(byte)'u',(byte)'g',(byte)'i',(byte)'n',(byte)'s',(byte)'p',(byte)'a',(byte)'c',(byte)'e'},{(byte)'s',(byte)'r',(byte)'c'}}, DEFAULT);
    }

    // if frame, bgsound, img, script, base
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'F',(byte)'R',(byte)'A',(byte)'M',(byte)'E'})
      || byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'B',(byte)'G',(byte)'S',(byte)'O',(byte)'U',(byte)'N',(byte)'D'})
      || byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'S',(byte)'C',(byte)'R',(byte)'I',(byte)'P',(byte)'T'})
      || byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'I',(byte)'M',(byte)'G'})){
      // get src
      processRestOfTag(new byte[][]{{(byte)'s',(byte)'r',(byte)'c'}}, DEFAULT);
    }

    // if base
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'B',(byte)'A',(byte)'S',(byte)'E'})){
      // get src
      processRestOfTag(new byte[][]{{(byte)'s',(byte)'r',(byte)'c'}}, BASE);
    }

    // if body
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[]{(byte)'B',(byte)'O',(byte)'D',(byte)'Y'})){
      // get background
      processRestOfTag(new byte[][]{{(byte)'b',(byte)'a',(byte)'c',(byte)'g',(byte)'r',(byte)'o',(byte)'u',(byte)'n',(byte)'d'}}, DEFAULT);
    }

    // NOTE: Object and Applet tags are special cases that may involve a codebase
    // if object
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[] {(byte)'O',(byte)'B',(byte)'J',(byte)'E',(byte)'C',(byte)'T'})){
      processObjectTag();
    }

    // if applet
    else if (byteArray.equalsIgnoreCase(firstWord, new byte[] {(byte)'A',(byte)'P',(byte)'P',(byte)'L',(byte)'E',(byte)'T'})){
      processAppletTag();
    }

    else{
      // not a tag that has any urls that require resolving, so just
      // return the untouched buffer
      return buffer;
    }

    // get the '>'
    temp.write(buffer[bufferAt]);
    bufferAt++;

    return temp.toByteArray();
  }// checkTag()


  private void processRestOfTag(byte [][] _keywords, byte _tagType) {
    try{
      int bufferLen = buffer.length;
      // while haven't reached the '>'
      while (bufferAt < bufferLen-1){

        skipWhitespace();
  
        byte [] word = getNextWord();
        temp.write(word);

        // if it's a tag then get the value
        if (isKeyword(word, _keywords) != -1){
          skipWhitespace();
          if (buffer[bufferAt] == '='){
            temp.write(buffer[bufferAt]);
            bufferAt++;
            skipWhitespace();
            processURI(getURI(), _tagType);
          }
        }

      }// while
    }catch (java.io.IOException ignored){}
    
  }//processRestOfTag()


  private void skipWhitespace(){
    // skip LWS
    while ((bufferAt < buffer.length) && buffer[bufferAt] == ' ' ||
          buffer[bufferAt] == '\r' || buffer[bufferAt] == '\n' ||
	        buffer[bufferAt] == '\t'){
      temp.write(buffer[bufferAt]);
      bufferAt++;
    }
  }// skipWhitespace


  /**
   * returns the next word in the buffer (not the stream)
   * [used to parse the buffer]
   */
  private byte[] getNextWord(){
    wordStream.reset();

    // while haven't reached the end of the tag & current character is ok
    while((bufferAt < buffer.length-1) && (isChar(buffer[bufferAt]))){
      wordStream.write(buffer[bufferAt]);
      bufferAt++;
    }

    return wordStream.toByteArray();
  }// getNextWord


  // return if character is a legal character other than '='
  // except in case where '=' is treated as a word itself
  private boolean isChar(byte ch){
    return ((ch < 0) || (ch > 32 && ch < 256 && ch != 61) || (ch == 61 && wordStream.size() == 0));//((ch >= 65 && ch <=90) || (ch >= 97 && ch <= 122));
  }// isChar()


  /**
   * returns true if the given word is a tag keyword from the tag list 'tags'
   */

  private int isKeyword(byte[] word, byte[][] _keywords){
    int keywordIndex = 0;
    int wordIndex = 0;

    // check for each known tag
    for (int keywordNum = 0; keywordNum < _keywords.length; keywordNum++){
      keywordIndex = 0;
      int wordLen = _keywords[keywordNum].length;

      // no point comparing this tag if word lengths don't match
      if (word.length != wordLen)
        continue;

      // while the char in the word matches the char in the tag
      // AND the end of the tag hasn't been reached
      while (keywordIndex < wordLen && (toSmall(word[wordIndex]) == _keywords[keywordNum][keywordIndex])){
        wordIndex++;
        keywordIndex++;
      }

      if (keywordIndex == wordLen){
        return keywordNum;
      }
    }

    // no tags match
    return -1;
  }// isKeyword()


  /**
   * gets the next uri from the byte stream returning it as a byte[]
   */

  private byte[] getURI() {
    wordStream.reset();
    // if next char is " then get next chars up til the next "
    if (buffer[bufferAt] == '"' || buffer[bufferAt] == '\''){
      // don't write the "
      bufferAt++;

      this.skipWhitespace();
      //if the uri given is just "    "
      if (buffer[bufferAt] == '"' || buffer[bufferAt] == '\''){
        return new byte[0];
      }

      // while haven't reached the end '>' or the " for
      while ((bufferAt < buffer.length-1) && (buffer[bufferAt] != '"') &&
            (buffer[bufferAt] != '\'')){
        wordStream.write(buffer[bufferAt]);
	      bufferAt++;
      }

      // if stopped looping because " found
      if (bufferAt != buffer.length-1){
	      // don't write the "
	      bufferAt++;
      }

    }
    // else get the next chars up til the next white space or carriage return
    else{
      // fix this line to make it more efficient
      while ((bufferAt < buffer.length-1) && (buffer[bufferAt] != '"')
        && (buffer[bufferAt] != '\'') && buffer[bufferAt] != '\n' && buffer[bufferAt] != ' '){
	      wordStream.write(buffer[bufferAt]);
	      bufferAt++;
      }

      // if stopped looping because ", or ' found
      if (buffer[bufferAt] == '=' || (buffer[bufferAt] == '\'')){
	      // write the "
        wordStream.write(buffer[bufferAt]);
        bufferAt++;
      }
    }

    return wordStream.toByteArray();

  }// getURI


  /**
   *  processes the given url depending on the operation given
   *  if the op is DEFAULT, then encode the given url
   *  if the op is BASE, then set the BASE url as the given url
   *  @param in - the url to be processed
   *  @param op - the operation to be performed
   **/

  private void processURI(byte[] in, int op) {

    try{
      // if url is not an http url then
      if (!isHttpURL(in)){
        //leave the url as it is
        temp.write('"');
        temp.write(in);
        temp.write('"');
        return;
      }

      switch (op){
        case DEFAULT:

          if (baseURL == null){
            temp.write('"');
            temp.write((urlutils.encode(new String(in), sourceURL)).getBytes());
            temp.write('"');
          }else{
            temp.write('"');
            String resolved1 = urlutils.encode(baseURL, sourceURL);
            temp.write((urlutils.encode(new String(in), resolved1)).getBytes());
            temp.write('"');
          }
          break;
        case BASE:
        	// set BASE
        	temp.write('"');
          temp.write(in);
          temp.write('"');
          baseURL = (urlutils.encode(new String(in), sourceURL));
        	break;
		default:
		  throw new IllegalStateException("invalid op - " + op);
	  }//switch
    }catch (java.io.IOException ignored){}
  }// processURI()


  /**
   * resolves the code uri relative to the codebase uri if one exists
   */

  private void processAppletTag() {
    try{
      int bufferLen = buffer.length;
      // keywords - code, codebase

      byte [] codeURL = null;
      byte [] codebaseURL = null;
      String fullCodebase = null;

      // while haven't reached the '>'
      while (bufferAt < bufferLen-1){

        skipWhitespace();

        byte [] word = getNextWord();

        int wordIndex = isKeyword(word, new byte[][]{{(byte)'c',(byte)'o',(byte)'d',(byte)'e'},{(byte)'c',(byte)'o',(byte)'d',(byte)'e',(byte)'b',(byte)'a',(byte)'s',(byte)'e'}});
      
        // if code
        if (wordIndex == 0){
          skipWhitespace();
          bufferAt++; // skip the '='
          skipWhitespace();
          codeURL = getURI();
        
        // if codebase
        }else if (wordIndex == 1){
          skipWhitespace();
          bufferAt++; // skip the '='
          skipWhitespace();
          codebaseURL = getURI();
          fullCodebase = urlutils.encode(sourceURL, new String(codebaseURL));
          temp.write(word);
          temp.write('=');
          temp.write('"');
          temp.write(fullCodebase.getBytes());
          temp.write('"');
        
        }else{
          temp.write(word);
        }


      }// while

      if (codeURL != null){ // unlikely that it does equal null 
        temp.write(new byte[]{(byte)'C',(byte)'O',(byte)'D',(byte)'E',(byte)'=',(byte)'"'});

        if (fullCodebase != null){
          temp.write((urlutils.encode(new String(codeURL), fullCodebase)).getBytes());
      
        }else{
          temp.write((urlutils.encode(new String(codeURL), sourceURL)).getBytes());
        }

        temp.write('"');
      }

    }catch (java.io.IOException ignored){}  

  }// processAppletTag()


  /**
   * resolves the data uri relative to the codebase uri if one exists
   */

  private void processObjectTag() {
    try{
      // keywords - data, classid, usemap, codebase
      int bufferLen = buffer.length;

      byte [] dataURL = null;
      byte [] codebaseURL = null;
      String fullCodebase = null;

      // while haven't reached the '>'
      while (bufferAt < bufferLen-1){

        skipWhitespace();

        byte [] word = getNextWord();

        int wordIndex = isKeyword(word, new byte[][]{{(byte)'d',(byte)'a',(byte)'t',(byte)'a'},{(byte)'c',(byte)'o',(byte)'d',(byte)'e',(byte)'b',(byte)'a',(byte)'s',(byte)'e'},
          {(byte)'u',(byte)'s',(byte)'e',(byte)'m',(byte)'a',(byte)'p'}, {(byte)'c',(byte)'l',(byte)'a',(byte)'s',(byte)'s',(byte)'i',(byte)'d'}});
      
        // if code
        if (wordIndex == 0){
          skipWhitespace();
          bufferAt++; // skip the '='
          skipWhitespace();
          dataURL = getURI();
        
        // if codebase
        }else if (wordIndex == 1){
          skipWhitespace();
          bufferAt++; // skip the '='
          skipWhitespace();
          codebaseURL = getURI();
          fullCodebase = urlutils.encode(new String(codebaseURL), sourceURL);
          temp.write(word);
          temp.write('=');
          temp.write('"');
          temp.write(fullCodebase.getBytes());
          temp.write('"');
        
        }else if (wordIndex == 2 || wordIndex == 3){
          temp.write(word);
          skipWhitespace();
          bufferAt++; // skip the '='
          temp.write('=');
          skipWhitespace();
          temp.write('"');
          processURI(getURI(), DEFAULT);
          temp.write('"');

        }else{
          temp.write(word);
        }

      }// while

      if (dataURL != null){ // unlikely that it does equal null 
        temp.write(new byte[]{(byte)'D',(byte)'A',(byte)'T',(byte)'A',(byte)'=',(byte)'"'});

        if (fullCodebase != null){
          temp.write((urlutils.encode(new String(dataURL), fullCodebase)).getBytes());
        }else{
          temp.write((urlutils.encode(new String(dataURL), sourceURL)).getBytes());
        }

        temp.write('"');
      }

    }catch (java.io.IOException ignored){}
  }// processObjectTag()



  private static boolean isHttpURL(byte [] in){
    // check first if starts with http:
    // if uri put in is long enough to check that it begins with http:
    if (in.length > 5){
      // check if uri begins with "http:"
      if (in[0] == 'h' && in[1] == 't' && in[2] == 't' && in[3] == 'p' && in[4] == ':'){
        return true;
      }
    }

    // check if this is a relative url i.e. - the uri doesn't specify a protocol
    int index = 0;
    while (index < in.length){
      // if a colon is found then all chars previous to this make up the
      // protocol and hence this isn't http. Note that this colon cannot
      // be the colon preceding the port number since www.somesite.com:80 is
      // an invalid uri without the http://
      if (in[index] == ':'){
        return false;
      }

      // if the character is not a valid char for a protocol then assume
      // this is a relative http url - so return true
      if (!((in[index] >= 'A' && in[index] <= 'Z') ||
            (in[index] >= 'a' && in[index] <= 'z') ||
            (in[index] >= '0' && in[index] <= '9') ||
            in[index] == '+' || in[index] == '-' || in[index] == '.')){
        return true;
      }

      index++;

    }// while
    // if reached the end of the uri without finding the end of a protocol
    // (denoted by a ':') then must be a relative http url
    return true;

  }// isHttpURL()


  private static byte toSmall(byte in){
    if (in >= 'A' && in <= 'Z')
      return (byte)(in + capsToSmallGap);
    else
      return in;
  }

	
	public static void main( String [] args ){
		tagProcessor tp = new tagProcessor( "http://www.n-ary.com", 80 );
		tagFilterInputStream is = new tagFilterInputStream( new java.io.ByteArrayInputStream( ("<IMG SRC=\"/image/pic.gif\"><IMG SRC=\"/image/pic.gif\">").getBytes() ) );
		is.registerTagListener( tp );
		
		try{
			int ch = is.readChar();
			while( ch != -1 ){
				System.out.print( (char) ch );
				ch = is.readChar();
			}
		}catch( java.io.IOException ioe ){
			ioe.printStackTrace();
		}
		
	}

}// tagProcessor