// HTMLTextParser.java example
//
// Explorer
// AlbiteREADER-master
// - src
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.albite.book.model.parser;

import java.util.Vector;
import org.albite.albite.AlbiteMIDlet;
import org.albite.book.view.StylingConstants;
import org.albite.io.html.HTMLSubstitues;
import org.albite.io.html.XhtmlStreamReader;

////#define DEBUG_PARSER

/**
 *
 * This is a <i>very</i> simple HTML parser made for the specific purpose of
 * parsing HTMLs on the fly, i.e. without conversion. Each call to
 * {@link #parseNext(char[], int)} reports one token through the inherited
 * <code>state</code>, <code>position</code> and <code>length</code> fields.
 *
 * The following markup is recognised (tag names are case-insensitive):
 * - [b] or [strong]: bold
 * - [i] or [em]: italic
 * - [h1] ... [h6]: heading, surrounded by soft line breaks
 * - [center]: centered text, surrounded by soft line breaks
 * - [pre]: turns break processing on for its content
 * - [br]: new line; [p], [div], [tr], [li]: soft new line
 * - [hr]: horizontal ruler (directly repeated rulers are collapsed)
 * - [img src alt] and SVG [image xlink:href]: reported as an image token
 *   carrying the positions of the URL / alternative text inside the buffer
 * - comments are skipped
 * - text inside [head], [style], [form], [frameset], [map], [script],
 *   [object], [applet] and [noscript] is reported as STATE_PASS
 *
 * All other tags are passed over.
 *
 * @author albus
 */
public class HTMLTextParser extends TextParser
        implements HTMLSubstitues, StylingConstants {

    private static final String TAG_P       = "p";
    private static final String TAG_BR      = "br";
    private static final String TAG_DIV     = "div";
    private static final String TAG_TR      = "tr";
    private static final String TAG_LI      = "li";

    private static final String TAG_IMG     = "img";
    private static final String TAG_SVG_IMAGE = "image";

    private static final String TAG_B       = "b";
    private static final String TAG_STRONG  = "strong";
    private static final String TAG_I       = "i";
    private static final String TAG_EM      = "em";

    private static final String TAG_H1      = "h1";
    private static final String TAG_H2      = "h2";
    private static final String TAG_H3      = "h3";
    private static final String TAG_H4      = "h4";
    private static final String TAG_H5      = "h5";
    private static final String TAG_H6      = "h6";

    private static final String TAG_CENTER  = "center";

    private static final String TAG_HR      = "hr";

    private static final String TAG_PRE     = "pre";

    private static final String ATTRIBUTE_SRC   = "src";
    private static final String ATTRIBUTE_ALT   = "alt";
    private static final String ATTRIBUTE_XLINK = "xlink:href";

    /*
     * Nesting depth counters for the stateful tags. A style is switched
     * off only when its counter drops back to zero.
     */
    private int ignoreTag = 0;

    private int pre = 0;

    private int bold = 0;
    private int italic = 0;
    private int heading = 0;

    private int center = 0;

    /*
     * True if the last processed non-image tag was a [hr]
     */
    private boolean hr = false;

    /*
     * Pending states (boxed as Integer) to be reported by the next calls
     * of parseNext(). Used as a stack: the element added last is
     * reported first.
     */
    private Vector instructions = new Vector(20);

    public HTMLTextParser() {
        processBreaks = false;
    }

    /**
     * Clears all tag counters and pending instructions and then resets
     * the base parser.
     */
    public final void reset() {
        ignoreTag = 0;
        pre = 0;
        bold = 0;
        italic = 0;
        heading = 0;
        center = 0;
        hr = false;

        /*
         * NOTE(review): the null check presumably guards against reset()
         * being called before this class's field initializers have run
         * (e.g. from the superclass constructor) - confirm in TextParser.
         */
        if (instructions != null) {
            instructions.removeAllElements();
        }

        super.reset();
    }

    /**
     * Reads the next token from the buffer.
     *
     * Pending instructions are reported first; after that white space,
     * markup and finally plain text are tried, in that order.
     *
     * @param text      the text buffer
     * @param textSize  the number of valid chars in <code>text</code>
     * @return          false if <code>proceed()</code> reports that
     *                  nothing more can be read, true if a token has been
     *                  stored in the parser's fields
     */
    public final boolean parseNext(
            final char[] text,
            final int textSize) {

        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("---------------\nParsing: " + text.length + " / " + textSize);
        //#endif

        if (!instructions.isEmpty()) {
            /*
             * Execute instructions before continuing;
             */
            //#ifdef DEBUG_PARSER
//#             AlbiteMIDlet.LOGGER.log("Executing pareser instructions");
            //#endif

            state = ((Integer) instructions.lastElement()).byteValue();
            instructions.removeElementAt(instructions.size() - 1);
            return true;
        }

        if (!proceed(textSize)) {
            return false;
        }

        if (processWhiteSpace(position, text, textSize)) {
            return true;
        }

        //Parse markup instructions
        if (parseMarkup(text, textSize)) {
            return true;
        }

        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("No markup");
        //#endif

        /*
         * parsing normal text; stopping at stop-chars or end of textbuffer.
         * Text inside an ignored tag is passed over instead of shown.
         */
        state = (ignoreTag > 0 ? STATE_PASS : STATE_TEXT);
        for (int i = position; i < textSize; i++) {
            ch = text[i];
            if (isWhiteSpace(ch) || isNewLine(ch) || ch == START_TAG_CHAR) {
                length = i - position;

                //#ifdef DEBUG_PARSER
//#                 AlbiteMIDlet.LOGGER.log("Stop character.");
                //#endif

                return true;
            }
        }

        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("end of 'parseNext'");
        //#endif

        /*
         * The token runs up to the end of the buffer. The state chosen
         * above is kept, so that trailing text inside an ignored tag
         * is still passed over.
         */
        length = textSize - position;
        return true;
    }

    /**
     * Tries to parse a comment or a tag starting at the current position.
     *
     * @param text      the text buffer
     * @param textSize  the number of valid chars in <code>text</code>
     * @return          false if there is no markup at the current position
     *                  (nothing has been modified then), true if markup
     *                  has been consumed
     */
    private boolean parseMarkup(final char[] text, final int textSize) {

        int pos = position;

        /*
         * At least one char for tags
         */
        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("Trying markup: " + textSize + ", " + pos);
        //#endif

        if (pos >= textSize || text[pos] != START_TAG_CHAR) {
            return false;
        }

        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("parsing markup...");
        //#endif

        state = STATE_PASS;

        /*
         * check if it's a comment tag
         */
        if (pos + 3 < textSize
                && text[pos + 1] == '!'
                && text[pos + 2] == '-'
                && text[pos + 3] == '-') {
            skipComment(text, textSize, pos + 4);
            return true;
        }

        /*
         * Let's start parsing tags content (if such exists)
         */
        pos++;

        if (pos >= textSize) {
            /*
             * It was a single '<' character dangling at the end of the
             * file. Nothing is left to be consumed after it.
             */
            position = pos;
            length = 0;
            return true;
        }

        boolean terminatingTag = false;
        if (text[pos] == '/') {
            terminatingTag = true;
            pos++;
        }

        /*
         * From now on, position points at the first char of the tag name.
         * If the buffer ends right here, the loop is skipped and
         * the tag is handled as an unterminated one.
         */
        position = pos;

        for (int i = pos; i < textSize; i++) {

            ch = text[i];

            if (ch == END_TAG_CHAR) {
                /*
                 * length spans from the tag name up to and including
                 * the closing char, i.e. position + length is the first
                 * char after the tag.
                 */
                length = i - position + 1;
                processTag(text, textSize, terminatingTag);
                return true;
            }
        }

        /*
         * The tag has not been closed before the end of the buffer.
         *
         * TODO: Do not know if next line is OK.
         */
        position = textSize;
        length = 1;
        return true;
    }

    /**
     * Skips a comment. The parser is moved right after the closing
     * sequence or to the end of the buffer if the comment is not closed.
     *
     * @param text      the text buffer
     * @param textSize  the number of valid chars in <code>text</code>
     * @param start     index of the first char after the opening sequence
     */
    private void skipComment(
            final char[] text, final int textSize, final int start) {

        position = start;
        length = 0;

        while (position < textSize) {
            /*
             * Looking back is safe: the two chars before start
             * are the dashes of the opening sequence.
             */
            if (text[position] == END_TAG_CHAR
                    && text[position - 1] == '-'
                    && text[position - 2] == '-') {
                /*
                 * End of comment
                 */
                position++;
                return;
            }
            position++;
        }
    }

    /**
     * Processes a complete tag. Expects <code>position</code> to point at
     * the first char of the tag name and <code>length</code> to span up to
     * and including the closing char.
     *
     * @param text              the text buffer
     * @param textSize          the number of valid chars in the buffer
     * @param terminatingTag    true if the tag started with a slash
     */
    private void processTag(
            final char[] text,
            final int textSize,
            final boolean terminatingTag) {

        /*
         * Parse the name: it ends at the first white space or slash,
         * or right before the closing char
         */
        int len = length - 1;
        final int max = position + length - 1;
        for (int k = position; k < max; k++) {
            ch = text[k];

            if (isWhiteSpace(ch) || isNewLine(ch) || ch == '/') {
                len = k - position;
                break;
            }
        }

        final String name = new String(text, position, len);

        //#ifdef DEBUG_PARSER
//#         AlbiteMIDlet.LOGGER.log("tag: _" + new String(text, position, length) + "_");
//#         AlbiteMIDlet.LOGGER.log("tag name: _" + name + "_");
//#
//#         if (length + position < textSize) {
//#             AlbiteMIDlet.LOGGER.log("next char to read after this: _" + text[length + position] + "_, " + ((int) text[length + position]));
//#         }
        //#endif

        if (TAG_IMG.equalsIgnoreCase(name)) {
            /*
             * Image
             */
            readImage(text, len, ATTRIBUTE_SRC, ATTRIBUTE_ALT);
            return;
        }

        if (TAG_SVG_IMAGE.equalsIgnoreCase(name)) {
            /*
             * SVG Image: it has got no alternative text
             */
            readImage(text, len, ATTRIBUTE_XLINK, null);
            return;
        }

        /*
         * Obviously, the image tag won't affect the `hr` variable
         */
        final boolean hrOld = hr;
        hr = false;

        if (TAG_BR.equalsIgnoreCase(name)) {
            /*
             * New line
             */
            state = STATE_NEW_LINE;
            return;
        }

        if (TAG_P.equalsIgnoreCase(name)
                || TAG_DIV.equalsIgnoreCase(name)
                || TAG_TR.equalsIgnoreCase(name)
                || TAG_LI.equalsIgnoreCase(name)) {
            /*
             * New line
             */
            //#ifdef DEBUG_PARSER
//#             AlbiteMIDlet.LOGGER.log("executed: <P>");
            //#endif
            state = STATE_NEW_SOFT_LINE;
            return;
        }

        if (TAG_HR.equalsIgnoreCase(name)) {
            /*
             * Horizontal ruler. If the previous tag was a ruler, too,
             * this one is skipped.
             */
            hr = true;

            if (!hrOld) {
                instructions.addElement(new Integer(STATE_NEW_SOFT_LINE));
                instructions.addElement(new Integer(STATE_RULER));
                instructions.addElement(new Integer(STATE_NEW_SOFT_LINE));
            }
            state = STATE_PASS;
            return;
        }

        if (terminatingTag) {
            processClosingTag(name);
        } else {
            processOpeningTag(name, text, textSize);
        }
    }

    /**
     * Reports the current tag as an image and stores the positions of
     * its attributes. A missing attribute is stored as position 0 and
     * length 0.
     *
     * @param text          the text buffer
     * @param nameLength    length of the tag name
     * @param urlName       name of the attribute holding the image URL
     * @param altName       name of the attribute holding the alternative
     *                      text or null if the tag has got none
     */
    private void readImage(
            final char[] text,
            final int nameLength,
            final String urlName,
            final String altName) {

        /*
         * The attributes are everything between the tag name and
         * the closing char
         */
        final int offset = position + nameLength;
        final String attributes =
                new String(text, offset, length - 1 - nameLength);

        final int[] urlPositions =
                XhtmlStreamReader.readAttribute(attributes, urlName);

        if (urlPositions == null) {
            imageURLPosition = 0;
            imageURLLength = 0;
        } else {
            /*
             * readAttribute works on the attributes string, so its
             * result is shifted back into the text buffer
             */
            imageURLPosition = offset + urlPositions[0];
            imageURLLength = urlPositions[1];
        }

        int[] altPositions = null;
        if (altName != null) {
            altPositions =
                    XhtmlStreamReader.readAttribute(attributes, altName);
        }

        if (altPositions == null) {
            imageTextPosition = 0;
            imageTextLength = 0;
        } else {
            imageTextPosition = offset + altPositions[0];
            imageTextLength = altPositions[1];
        }

        state = STATE_IMAGE;
    }

    /**
     * Processes a closing tag. Decrements the counter of the tag and
     * switches the style off if it has been closed completely. Unknown
     * tags leave the state as it is.
     *
     * @param name  the tag name
     */
    private void processClosingTag(final String name) {

        if (TAG_B.equalsIgnoreCase(name)
                || TAG_STRONG.equalsIgnoreCase(name)) {
            bold--;

            if (bold <= 0) {
                bold = 0;
                disableBold = true;
                state = STATE_STYLING;
            } else {
                state = STATE_PASS;
            }
            return;
        }

        if (TAG_I.equalsIgnoreCase(name)
                || TAG_EM.equalsIgnoreCase(name)) {
            italic--;

            if (italic <= 0) {
                italic = 0;
                disableItalic = true;
                state = STATE_STYLING;
            } else {
                state = STATE_PASS;
            }
            return;
        }

        if (isHeadingTag(name)) {
            heading--;

            if (heading <= 0) {
                heading = 0;
                disableHeading = true;
                /*
                 * The styling is reported after the line break
                 */
                instructions.addElement(new Integer(STATE_STYLING));
            }

            state = STATE_NEW_SOFT_LINE;
            return;
        }

        if (TAG_CENTER.equalsIgnoreCase(name)) {
            center--;

            if (center <= 0) {
                center = 0;
                disableCenterAlign = true;
                /*
                 * The styling is reported after the line break
                 */
                instructions.addElement(new Integer(STATE_STYLING));
            }

            state = STATE_NEW_SOFT_LINE;
            return;
        }

        if (TAG_PRE.equalsIgnoreCase(name)) {
            pre--;

            if (pre <= 0) {
                pre = 0;
                processBreaks = false;
            }

            state = STATE_PASS;
            return;
        }

        if (isIgnoreTag(name)) {
            ignoreTag--;

            if (ignoreTag < 0) {
                ignoreTag = 0;
            }
        }
    }

    /**
     * Processes an opening tag. Increments the counter of the tag and
     * switches the style on. Unknown tags leave the state as it is.
     *
     * @param name      the tag name
     * @param text      the text buffer
     * @param textSize  the number of valid chars in <code>text</code>
     */
    private void processOpeningTag(
            final String name, final char[] text, final int textSize) {

        if (TAG_B.equalsIgnoreCase(name)
                || TAG_STRONG.equalsIgnoreCase(name)) {
            bold++;

            enableBold = true;
            state = STATE_STYLING;
            return;
        }

        if (TAG_I.equalsIgnoreCase(name)
                || TAG_EM.equalsIgnoreCase(name)) {
            italic++;

            enableItalic = true;
            state = STATE_STYLING;
            return;
        }

        if (isHeadingTag(name)) {
            heading++;

            enableHeading = true;
            /*
             * The line break is reported after the styling
             */
            instructions.addElement(new Integer(STATE_NEW_SOFT_LINE));
            state = STATE_STYLING;
            return;
        }

        if (TAG_CENTER.equalsIgnoreCase(name)) {
            center++;

            enableCenterAlign = true;
            /*
             * The line break is reported after the styling
             */
            instructions.addElement(new Integer(STATE_NEW_SOFT_LINE));
            state = STATE_STYLING;
            return;
        }

        if (TAG_PRE.equalsIgnoreCase(name)) {
            /*
             * Swallow a single line break ('\n', '\r' or "\r\n") that
             * directly follows the tag, so that it's not shown as
             * an empty line. position + length is the first char
             * after the tag.
             */
            int k = position + length;

            if (k < textSize) {
                if (text[k] == '\n') {
                    length++;
                } else if (text[k] == '\r') {
                    length++;
                    k++;
                    if (k < textSize && text[k] == '\n') {
                        length++;
                    }
                }
            }
            pre++;
            processBreaks = true;
            state = STATE_PASS;
            return;
        }

        if (isIgnoreTag(name)) {
            /*
             * A self-closing tag (e.g. a script with a src attribute in
             * XHTML) has got no content and no closing tag, so it
             * mustn't start an ignored section.
             */
            final boolean selfClosing =
                    length >= 2 && text[position + length - 2] == '/';

            if (!selfClosing) {
                ignoreTag++;
            }
        }
    }

    /**
     * @param s the tag name
     * @return  true if it's one of h1 ... h6, ignoring the case
     */
    private static boolean isHeadingTag(final String s) {
        return
                   TAG_H1.equalsIgnoreCase(s)
                || TAG_H2.equalsIgnoreCase(s)
                || TAG_H3.equalsIgnoreCase(s)
                || TAG_H4.equalsIgnoreCase(s)
                || TAG_H5.equalsIgnoreCase(s)
                || TAG_H6.equalsIgnoreCase(s)
                ;
    }

    /**
     * @param s the tag name
     * @return  true if the content of the tag is not to be shown
     */
    private static boolean isIgnoreTag(final String s) {
        return
                   "head".equalsIgnoreCase(s)
                || "style".equalsIgnoreCase(s)
                || "form".equalsIgnoreCase(s)
                || "frameset".equalsIgnoreCase(s)
                || "map".equalsIgnoreCase(s)
                || "script".equalsIgnoreCase(s)
                || "object".equalsIgnoreCase(s)
                || "applet".equalsIgnoreCase(s)
                || "noscript".equalsIgnoreCase(s)
                ;
    }
}