// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: HtmlParser.java,v 1.3 2006/01/08 05:10:10 kyank Exp $
//
package com.salas.bb.utils.htmlparser;
import java.io.Reader;
import java.io.IOException;
/**
 * Simplified, single-pass HTML scanner that reports plain text, tags and entities
 * to a listener as separate events.
 *
 * <p>Input is consumed one character at a time. A '&lt;' switches the scanner into
 * tag mode until the next '&gt;'; a '&amp;' switches it into entity mode until a ';',
 * a whitespace character, a '&lt;' or another '&amp;' is seen. Everything else is
 * collected as text. Scanner state lives in instance fields and is reset at the
 * beginning of every {@link #parse(Reader, IHtmlParserListener)} call.</p>
 *
 * <p>In Swing mode, '/' characters found inside a tag are held back and copied to the
 * tag's full text only when the next non-slash character is not '&gt;'. As a result
 * slashes placed right before the closing '&gt;' are left out of the reported tag text.</p>
 */
public class HtmlParser
{
    /** Scanner is collecting plain text. */
    private static final int STATE_TEXT = 0;
    /** Scanner is between '&lt;' and '&gt;'. */
    private static final int STATE_TAG = 1;
    /** Scanner is inside an entity started by '&amp;'. */
    private static final int STATE_ENTITY = 2;

    /** TRUE when slashes directly preceding '&gt;' have to be left out of tag text. */
    private final boolean swingMode;

    /** Complete source text of the block (text run, tag or entity) being collected. */
    private final StringBuffer raw;
    /** Name of the tag (lower-cased) or of the entity (original case) being collected. */
    private final StringBuffer ident;
    /** Slashes seen in Swing mode that were not yet copied to {@link #raw}. */
    private final StringBuffer heldSlashes;

    /** Current scanner state; one of the STATE_* constants. */
    private int state;
    /** TRUE once the name part of the current block is finished. */
    private boolean identDone;
    /** Number of characters consumed after the opening '&lt;' of the current tag. */
    private int tagLength;
    /** TRUE when the current tag started with '/' right after '&lt;'. */
    private boolean closing;
    /** Receiver of the events of the parse run in progress. */
    private IHtmlParserListener sink;

    /**
     * Creates a parser working in non-Swing mode.
     */
    public HtmlParser()
    {
        this(false);
    }

    /**
     * Creates a parser working in the given mode.
     *
     * @param swingMode TRUE to enable Swing mode.
     */
    public HtmlParser(boolean swingMode)
    {
        this.swingMode = swingMode;

        raw = new StringBuffer();
        ident = new StringBuffer(10);
        heldSlashes = new StringBuffer(2);
    }

    /**
     * Reads HTML from the reader until it is exhausted and reports everything found
     * to the listener. The listener gets <code>onStart()</code> first and
     * <code>onFinish()</code> last. Text or an entity still being collected when
     * the input ends is reported before <code>onFinish()</code>; a tag that was not
     * closed with '&gt;' by then is not reported.
     *
     * @param reader    source of characters.
     * @param l         listener receiving the events.
     *
     * @throws IOException if reading from the reader fails.
     */
    public void parse(Reader reader, IHtmlParserListener l)
        throws IOException
    {
        sink = l;

        // Every run starts in text mode with empty buffers
        state = STATE_TEXT;
        resetBuffers();

        l.onStart();

        for (int next = reader.read(); next != -1; next = reader.read())
        {
            consume((char)next);
        }

        // Flush whatever was still being collected when the input ended
        switch (state)
        {
            case STATE_TEXT:
                emitText();
                break;
            case STATE_ENTITY:
                emitEntity();
                break;
            default:
                // an unfinished tag is discarded
                break;
        }

        l.onFinish();
    }

    /**
     * Routes the character to the handler of the current state.
     *
     * @param ch character to process.
     */
    private void consume(char ch)
    {
        if (state == STATE_TAG)
        {
            inTag(ch);
        } else if (state == STATE_ENTITY)
        {
            inEntity(ch);
        } else
        {
            inText(ch);
        }
    }

    /**
     * Handles a character while collecting plain text.
     *
     * @param ch character to process.
     */
    private void inText(char ch)
    {
        boolean opensTag = ch == '<';
        boolean opensEntity = ch == '&';

        if (!opensTag && !opensEntity)
        {
            raw.append(ch);
            return;
        }

        // The text run ends here; report it before switching state
        emitText();

        if (opensTag)
        {
            beginTag(ch);
        } else
        {
            beginEntity(ch);
        }
    }

    /**
     * Handles a character while inside a tag.
     *
     * @param ch character to process.
     */
    private void inTag(char ch)
    {
        if (swingMode && ch == '/')
        {
            // Hold the slash back until we know what follows it
            heldSlashes.append(ch);
        } else
        {
            if (swingMode)
            {
                // Held slashes are kept unless they directly precede the closing '>'
                if (ch != '>') raw.append(heldSlashes);
                clear(heldSlashes);
            }
            raw.append(ch);
        }

        tagLength++;

        if (ch == '>')
        {
            emitTag();
            resetBuffers();
            state = STATE_TEXT;
            return;
        }

        if (identDone) return;

        if (ch == '/')
        {
            // A slash right after '<' marks a closing tag, any other slash ends the name
            if (tagLength == 1)
            {
                closing = true;
            } else
            {
                identDone = true;
            }
        } else if (Character.isWhitespace(ch))
        {
            identDone = true;
        } else
        {
            ident.append(Character.toLowerCase(ch));
        }
    }

    /**
     * Handles a character while inside an entity.
     *
     * @param ch character to process.
     */
    private void inEntity(char ch)
    {
        boolean endsEntity = ch == ';' || ch == '<' || ch == '&' ||
            Character.isWhitespace(ch);

        if (!endsEntity)
        {
            raw.append(ch);
            ident.append(ch);
            return;
        }

        // Only the semicolon belongs to the entity itself
        if (ch == ';') raw.append(ch);

        emitEntity();
        resetBuffers();

        switch (ch)
        {
            case '<':
                beginTag(ch);
                break;
            case '&':
                beginEntity(ch);
                break;
            case ';':
                state = STATE_TEXT;
                break;
            default:
                // the whitespace terminator opens the following text run
                raw.append(ch);
                state = STATE_TEXT;
                break;
        }
    }

    /**
     * Switches the scanner to tag mode.
     *
     * @param ch character that opened the tag.
     */
    private void beginTag(char ch)
    {
        tagLength = 0;
        closing = false;
        begin(STATE_TAG, ch);
    }

    /**
     * Switches the scanner to entity mode.
     *
     * @param ch character that opened the entity.
     */
    private void beginEntity(char ch)
    {
        begin(STATE_ENTITY, ch);
    }

    /**
     * Empties the buffers, enters the given state and records the opening character
     * as the first character of the new block's full text.
     *
     * @param newState  state to enter.
     * @param ch        character that opened the block.
     */
    private void begin(int newState, char ch)
    {
        resetBuffers();
        identDone = false;
        state = newState;
        raw.append(ch);
    }

    /**
     * Reports the collected text to the listener; empty text is not reported.
     */
    private void emitText()
    {
        if (raw.length() > 0) sink.onText(raw.toString());
    }

    /**
     * Reports the collected tag to the listener: lower-case name, full text and
     * the closing-tag flag.
     */
    private void emitTag()
    {
        sink.onTag(ident.toString(), raw.toString(), closing);
    }

    /**
     * Reports the collected entity to the listener: name in original case and full text.
     */
    private void emitEntity()
    {
        sink.onEntity(ident.toString(), raw.toString());
    }

    /**
     * Empties all working buffers.
     */
    private void resetBuffers()
    {
        clear(raw);
        clear(ident);
        clear(heldSlashes);
    }

    /**
     * Empties a single buffer.
     *
     * @param buf buffer to empty.
     */
    private void clear(StringBuffer buf)
    {
        buf.setLength(0);
    }
}